Python for Data Science Cheatsheet

Language: Python | Reference Guide

# 1. NumPy: Numerical Data

import numpy as np

# --- Building arrays ---
a = np.asarray([1, 2, 3]) # Wrap an existing Python list
b = np.arange(0, 10, 2) # Start at 0, stop before 10, step by 2
c = np.linspace(0, 1, num=5) # 5 evenly spaced points, both endpoints included
d = np.zeros(shape=(2, 3)) # 2 rows x 3 columns, all zeros

# --- Inspecting an array ---
print(f"Shape: {a.shape}") # (3,)
# int64 on most platforms (NumPy's default integer; int32 on Windows before NumPy 2.0)
print(f"Data type: {a.dtype}")

# --- Element-wise (vectorized) arithmetic ---
arr1, arr2 = np.array([1, 2, 3]), np.array([4, 5, 6])
print(f"Sum: {arr1 + arr2}") # [5 7 9]
print(f"Squared: {arr1 ** 2}") # [1 4 9]

# --- Picking out elements ---
print(f"First element: {arr1[0]}") # 1
print(f"Slice: {arr1[1:]}") # [2 3]

# --- Reductions over the whole array ---
print(f"Mean: {arr1.mean()}") # 2.0
print(f"Max: {arr1.max()}") # 3

# 2. Pandas: Series

import pandas as pd

# A Series pairs a one-dimensional run of values with an index of labels.
s = pd.Series(data=[10, 20, 30], index=list('abc'))
print(s)

# Looking up a single element
print(f"By label: {s['b']}") # 20
print(f"By position: {s.iloc[0]}") # 10

# 3. Pandas: DataFrame

import pandas as pd

# A DataFrame is a two-dimensional table: named columns sharing one row index.
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'Paris', 'London'],
}
df = pd.DataFrame(data=data)

# Quick ways to look at a DataFrame
print("--- Head ---")
print(df.head(n=2)) # Only the first 2 rows

print("\n--- Info ---")
df.info() # Prints dtypes and non-null counts itself (no print() needed)

print("\n--- Describe ---")
print(df.describe()) # Summary statistics for the numeric columns

# 4. Pandas: Reading & Writing Data

import pandas as pd

# Reading a CSV file into a DataFrame (example only; assumes 'data.csv' exists)
# df = pd.read_csv('data.csv')

# Writing a DataFrame to a CSV file (example only)
# df.to_csv('output.csv', index=False) # index=False leaves the row index out of the file

# The examples above are commented out so this section runs without touching the disk.
print("Pandas can read from/write to CSV, Excel, SQL, JSON, and more.")

# 5. Pandas: Selection & Indexing

import pandas as pd
# Sample data. 'City' is included so that the multi-column selection below
# (name_city) actually holds the columns its name promises.
data = {
    'Name': ['Alice', 'Bob'],
    'Age': [25, 30],
    'City': ['New York', 'Paris'],
}
df = pd.DataFrame(data)

# Selecting a single column (returns a Series)
ages = df['Age']

# Selecting multiple columns with a list of names (returns a DataFrame)
name_city = df[['Name', 'City']]

# --- .loc: Selection by label ---
# Select a whole row by its index label (returns a Series)
row_0 = df.loc[0]
# Select a single value by row label and column label
alice_age = df.loc[0, 'Age']

# --- .iloc: Selection by integer position ---
# Select a whole row by its position (returns a Series)
row_1 = df.iloc[1]

# --- Boolean Indexing ---
# Keep only the rows where the condition is True
older_than_25 = df[df['Age'] > 25]
print(older_than_25)

# 6. Pandas: Data Cleaning

import pandas as pd
import numpy as np

# Two columns, each with one missing entry (np.nan marks a missing value).
data = {'A': [1, 2, np.nan], 'B': [4, np.nan, 6]}
df = pd.DataFrame(data=data)

# Count the missing values in each column
print(df.isna().sum())

# Discard every row that has at least one missing value
df_dropped = df.dropna(axis=0, how='any')

# Replace every missing value with a fixed constant
df_filled = df.fillna(0)
print(df_filled)

# 7. Pandas: Grouping & Aggregation

import pandas as pd

data = {
    'Department': ['Sales', 'Sales', 'HR', 'HR', 'IT'],
    'Salary': [50000, 60000, 45000, 55000, 90000],
}
df = pd.DataFrame(data=data)

# Split the Salary column into one group per Department; both summaries reuse it
salary_groups = df.groupby('Department')['Salary']

# One statistic per group: the mean salary of each department
avg_salary_by_dept = salary_groups.mean()
print(avg_salary_by_dept)

# Several statistics at once: one output column per aggregation name
agg_df = salary_groups.agg(['mean', 'count', 'max'])
print(agg_df)

# 8. Basic Plotting with Matplotlib

import pandas as pd
import matplotlib.pyplot as plt

# pandas ships its own plotting helpers, and they draw with Matplotlib,
# the most widely used Python plotting library.

data = {'Year': [2020, 2021, 2022, 2023], 'Sales': [100, 120, 150, 140]}
df = pd.DataFrame(data=data)

# Line chart of Sales against Year
df.plot.line(x='Year', y='Sales', title='Annual Sales')
plt.ylabel('Sales (in millions)') # Labels the y-axis of the chart just drawn
plt.show() # Opens the figure window / renders the plot

Find more developer cheatsheets, guides, and resources at:
10xdev.blog/cheatsheets