Spaces:

likhonsheikh
/

code-interpreter-sandbox

Sleeping

App Files Files Community

code-interpreter-sandbox / examples /data_analysis_example.py

likhonsheikh

Initial commit: Advanced Code Interpreter Sandbox

523f6c3 verified 4 months ago

raw

history blame contribute delete

2.57 kB

	"""
	Data Analysis Example
	Demonstrates data manipulation and visualization capabilities
	"""

	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns
	from datetime import datetime, timedelta

	# Build a reproducible synthetic dataset: one row per calendar day starting
	# 2023-01-01, holding a random-walk value and a randomly drawn category label.
	np.random.seed(42)  # fixed seed so every run produces the same numbers
	n_days = 365
	dates = pd.date_range(start='2023-01-01', periods=n_days, freq='D')
	# Random walk: running sum of standard-normal steps, shifted up by 100.
	daily_steps = np.random.randn(n_days)
	values = daily_steps.cumsum() + 100
	# Drawn after the steps so the random stream is consumed in the same order.
	categories = np.random.choice(['A', 'B', 'C'], n_days)
	df = pd.DataFrame({'date': dates, 'value': values, 'category': categories})

	# Compute the summary tables first, then print the whole report in order.
	category_counts = df['category'].value_counts()  # rows per category label
	month_number = df['date'].dt.month  # 1-12, taken from each row's date
	monthly_avg = df['value'].groupby(month_number).mean()  # mean value per month

	# Report header: a 60-character rule above and below the title.
	banner = "=" * 60
	print(banner)
	print("DATA ANALYSIS EXAMPLE")
	print(banner)

	# Dataset overview: row count, covered date span, distinct category labels.
	print("\n📊 Dataset Overview:")
	print(f"Total records: {len(df)}")
	print(f"Date range: {df['date'].min()} to {df['date'].max()}")
	print(f"Categories: {df['category'].unique()}")

	# describe() summarizes the DataFrame's columns (count, mean, spread, quantiles).
	print("\n📈 Basic Statistics:")
	print(df.describe())

	# How many rows fall into each category.
	print("\n📋 Category Distribution:")
	print(category_counts)

	# Average value for each calendar month.
	print("\n⏰ Time Series Analysis:")
	print(monthly_avg)

	# Draw a 2x2 dashboard on a single 15x10-inch figure. Each plt.subplot call
	# selects one grid cell as the current axes for the pyplot calls after it.
	plt.figure(figsize=(15, 10))

	# Cell 1 (top-left): value plotted against calendar date.
	plt.subplot(2, 2, 1)
	plt.plot(df['date'], df['value'])
	plt.title('Time Series Data')
	plt.xlabel('Date')
	plt.ylabel('Value')
	plt.xticks(rotation=45)  # rotate the date tick labels by 45 degrees

	# Cell 2 (top-right): histogram of the values in 30 bins.
	plt.subplot(2, 2, 2)
	plt.hist(df['value'], bins=30, alpha=0.7, color='skyblue')
	plt.title('Value Distribution')
	plt.xlabel('Value')
	plt.ylabel('Frequency')

	# Cell 3 (bottom-left): one box per category showing its value spread.
	plt.subplot(2, 2, 3)
	sns.boxplot(data=df, x='category', y='value')
	plt.title('Value by Category')
	plt.xlabel('Category')
	plt.ylabel('Value')

	# Cell 4 (bottom-right): value against row index, one colored series per category.
	plt.subplot(2, 2, 4)
	colors = {'A': 'red', 'B': 'blue', 'C': 'green'}
	# sort=False yields the groups in order of first appearance, which fixes
	# the order of the legend entries.
	for category, subset in df.groupby('category', sort=False):
	    plt.scatter(subset.index, subset['value'],
	                c=colors[category], label=category, alpha=0.6)
	plt.title('Scatter Plot by Category')
	plt.xlabel('Index')
	plt.ylabel('Value')
	plt.legend()

	# Let matplotlib adjust spacing between the cells, then display the figure.
	plt.tight_layout()
	plt.show()

	# Advanced analysis: trend strength, a smoothed recent level, and the
	# overall change between the first and last observation.
	print("\n🔍 Advanced Analysis:")

	# Correlation between the values and their row position, used here as a
	# stand-in for time. Series.corr aligns its argument as a Series, so the
	# index is converted with to_series() first; passing df.index directly
	# raises "TypeError: unsupported type".
	correlation = df['value'].corr(df.index.to_series())
	print(f"Correlation with time: {correlation:.4f}")

	# Rolling statistics: mean over a 30-row window; iloc[-1] picks the value
	# for the most recent window.
	rolling_mean = df['value'].rolling(window=30).mean()
	print(f"30-day rolling mean (latest): {rolling_mean.iloc[-1]:.2f}")

	# Growth rate: percent change from the first value to the last one.
	growth_rate = (df['value'].iloc[-1] - df['value'].iloc[0]) / df['value'].iloc[0] * 100
	print(f"Total growth rate: {growth_rate:.2f}%")

	print("\n✅ Data analysis complete!")