I've been teaching SQL for data science for years, and the #1 question I get is: "How does this actually work in practice?" This guide answers that question with real examples.
Table of Contents
Getting Started with SQL for Data Science
Data Science starts with understanding your tools. Let's set up and explore the fundamentals.
import pandas as pd
import numpy as np
# Build a small employee table, one keyword argument per column
data = dict(
    name=['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    age=[28, 34, 22, 31, 27],
    salary=[75000, 92000, 55000, 88000, 71000],
    department=['Engineering', 'Marketing', 'Engineering', 'Sales', 'Marketing'],
)
df = pd.DataFrame(data)

# Show the table, then its (rows, columns) shape and the summary
# statistics that describe() computes for the numeric columns
print(df)
print(f"\nShape: {df.shape}")
print(f"\nBasic stats:\n{df.describe()}")
Data Analysis Techniques
# Filtering and grouping
# The boolean mask keeps only the rows whose department is 'Engineering'.
engineers = df[df['department'] == 'Engineering']
print(f"Engineers:\n{engineers}")

# Group by department
# agg() takes a single dict mapping column -> aggregation(s). Because
# 'salary' asks for a list of functions, the result has two-level column
# labels such as ('salary', 'mean').
dept_stats = df.groupby('department').agg({
    'salary': ['mean', 'min', 'max'],
    'age': 'mean',
    'name': 'count',
}).round(0)
print(f"\nDepartment stats:\n{dept_stats}")

# Add calculated columns
# method='min' gives tied salaries the same whole-number rank, so the
# astype(int) cast never truncates a fractional (averaged) rank.
df['salary_rank'] = df['salary'].rank(ascending=False, method='min').astype(int)
df['above_avg'] = df['salary'] > df['salary'].mean()
print(f"\nWith rankings:\n{df}")

# Sorting
# nlargest() returns the rows with the highest salaries, largest first.
top_earners = df.nlargest(3, 'salary')[['name', 'salary', 'department']]
print(f"\nTop 3 earners:\n{top_earners}")
Data Visualization
import matplotlib.pyplot as plt
# Create a figure with three side-by-side subplots
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Bar chart — Salary by person
axes[0].barh(df['name'], df['salary'], color='#00d4aa')
axes[0].set_xlabel('Salary ($)')
axes[0].set_title('Salary by Employee')

# Pie chart — Department distribution
# Three colours are listed, one per department in the sample data.
dept_counts = df['department'].value_counts()
axes[1].pie(
    dept_counts,
    labels=dept_counts.index,
    autopct='%1.0f%%',
    colors=['#00d4aa', '#16213e', '#2d3748'],
)
axes[1].set_title('Department Distribution')

# Scatter plot — Age vs Salary
axes[2].scatter(df['age'], df['salary'], s=100, c='#00d4aa', edgecolors='white')
# Label each point with the employee's name, nudged 5 points up and right
# so the text does not sit on top of the marker.
for name, age, salary in zip(df['name'], df['age'], df['salary']):
    axes[2].annotate(
        name,
        (age, salary),
        textcoords="offset points",
        xytext=(5, 5),
    )
axes[2].set_xlabel('Age')
axes[2].set_ylabel('Salary ($)')
axes[2].set_title('Age vs Salary')

plt.tight_layout()
# Write the image before show(), while the figure is still open.
plt.savefig('analysis.png', dpi=150, bbox_inches='tight')
plt.show()
Real-World Project
Let's work through a real-world scenario: analyzing a year of daily sales data.
# Sales data analysis pipeline
import pandas as pd
from datetime import datetime, timedelta
import random
# Generate sample sales data
# Seed NumPy's global random generator so every run draws the same
# synthetic year of data.
np.random.seed(42)
dates = pd.date_range('2025-01-01', periods=365, freq='D')
sales_data = pd.DataFrame({
    'date': dates,
    # Normally distributed daily revenue; clip(500) floors it at 500.
    'revenue': np.random.normal(5000, 1500, 365).clip(500),
    'orders': np.random.poisson(50, 365),
    'category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], 365),
})

# Monthly aggregation
# A MonthEnd offset object is passed instead of the string alias 'M':
# pandas 2.2 deprecated that alias in favour of 'ME', while the offset
# object produces the same month-end bins on older and newer releases.
monthly = sales_data.set_index('date').resample(pd.offsets.MonthEnd()).agg({
    'revenue': 'sum',
    'orders': 'sum',
})
monthly['avg_order_value'] = monthly['revenue'] / monthly['orders']
print("Monthly Summary:")
print(monthly.round(2))

# Best performing category
# Each row of sales_data is one day, so 'count' is the number of days
# assigned to a category and 'mean' is its average daily revenue.
cat_perf = sales_data.groupby('category')['revenue'].agg(['sum', 'mean', 'count'])
cat_perf.columns = ['total_revenue', 'avg_daily_revenue', 'days']
print(f"\nCategory Performance:\n{cat_perf.round(2)}")
Next Steps
- Learn pandas deeply — It's the foundation of data analysis in Python
- Practice with real datasets — Kaggle has thousands of free datasets
- Learn SQL — Most data lives in databases
- Study statistics — Mean, median, standard deviation, correlation
- Explore machine learning — scikit-learn is the natural next step
AM
Arjun Mehta
Full-Stack Developer & Technical Writer at DRIXO
Full-Stack Developer & Technical Writer at DRIXO
Full-stack developer with 5+ years of experience in Python and JavaScript. I love breaking down complex concepts into simple, practical tutorials. When I'm not coding, you'll find me contributing to open-source projects.
Comments