
Python Data Wrangling Cheat Sheet


1. Importing Libraries and Loading Data

Pandas

import pandas as pd
 
# Load CSV
df = pd.read_csv("file.csv")
 
# Load Excel
df = pd.read_excel("file.xlsx")
 
# Load JSON
df = pd.read_json("file.json")

Polars

import polars as pl
 
# Load CSV
df = pl.read_csv("file.csv")
 
# Load JSON
df = pl.read_json("file.json")

Apache Spark

from pyspark.sql import SparkSession
 
spark = SparkSession.builder.appName("Data Wrangling").getOrCreate()
 
# Load CSV
df = spark.read.option("header", "true").csv("file.csv")
 
# Load JSON
df = spark.read.json("file.json")
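
All three readers accept options for types and parsing; a minimal sketch of commonly useful ones, assuming a CSV with a "date" column (the file name and column names are hypothetical):

# Pandas: parse dates and load only the columns you need
df = pd.read_csv("file.csv", parse_dates=["date"], usecols=["date", "col1", "col2"])

# Polars: infer the schema from more rows for safer type detection
df = pl.read_csv("file.csv", infer_schema_length=1000)

# Spark: infer column types instead of reading everything as strings
df = spark.read.option("header", "true").option("inferSchema", "true").csv("file.csv")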

2. Quick Data Inspection

Pandas

df.head()          # First 5 rows
df.info()          # Column dtypes, non-null counts, and memory usage
df.describe()      # Summary statistics
df.columns         # List of column names
df.shape           # Dimensions of the DataFrame

Polars

df.head(5)         # First 5 rows
df.schema          # Schema of the DataFrame
df.describe()      # Summary statistics
df.columns         # List of column names
df.shape           # Dimensions of the DataFrame

Apache Spark

df.show()          # First 20 rows
df.printSchema()   # Schema of the DataFrame
df.describe().show()  # Summary statistics
df.columns         # List of column names
df.count()         # Number of rows
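
A quick missing-value check fits naturally into inspection; a short sketch for Pandas and Polars:

# Pandas: count of missing values per column
df.isna().sum()

# Polars: count of nulls per column
df.null_count()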

3. Selecting and Filtering Data

Pandas

# Selecting Columns
df["column_name"]          # Single column
df[["col1", "col2"]]       # Multiple columns
 
# Filtering Rows
df[df["column"] > 10]      # Rows where column > 10
df[(df["col1"] > 10) & (df["col2"] < 5)]  # Multiple conditions

Polars

# Selecting Columns
df.select("column_name")   # Single column
df.select(["col1", "col2"])  # Multiple columns
 
# Filtering Rows
df.filter(pl.col("column") > 10)  # Rows where column > 10
df.filter((pl.col("col1") > 10) & (pl.col("col2") < 5))  # Multiple conditions

Apache Spark

# Selecting Columns
df.select("column_name").show()  # Single column
df.select("col1", "col2").show()  # Multiple columns
 
# Filtering Rows
df.filter(df["column"] > 10).show()  # Rows where column > 10
df.filter((df["col1"] > 10) & (df["col2"] < 5)).show()  # Multiple conditions
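
Membership and substring filters follow the same pattern; a brief sketch assuming a string column "col1":

# Pandas
df[df["col1"].isin(["a", "b"])]            # Rows where col1 is "a" or "b"
df[df["col1"].str.contains("pattern")]     # Rows where col1 contains a substring

# Polars
df.filter(pl.col("col1").is_in(["a", "b"]))

# Spark
df.filter(df["col1"].isin("a", "b")).show()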

4. Cleaning and Handling Missing Data

Pandas

# Drop Missing Values
df.dropna()                 # Drop rows with missing values
 
# Fill Missing Values
df.fillna(0)                # Replace missing values with 0
 
# Replace Values
df.replace("old_value", "new_value")

Polars

# Drop Missing Values
df.drop_nulls()
 
# Fill Missing Values
df.fill_null(0)
 
# Replace Values
df.with_columns(
    pl.when(pl.col("col1") == "old_value").then(pl.lit("new_value")).otherwise(pl.col("col1")).alias("col1")
)

Apache Spark

# Drop Missing Values
df.na.drop()
 
# Fill Missing Values
df.na.fill(0)
 
# Replace Values
df.withColumn("col1", when(df["col1"] == "old_value", "new_value").otherwise(df["col1"]))

5. Transforming Data

Pandas

# Add New Column
df["new_col"] = df["col1"] * 2
 
# Rename Columns
df.rename(columns={"old_name": "new_name"}, inplace=True)
 
# String Operations
df["col1"].str.upper()

Polars

# Add New Column
df.with_columns((pl.col("col1") * 2).alias("new_col"))
 
# Rename Columns
df.rename({"old_name": "new_name"})
 
# String Operations
df.with_columns(pl.col("col1").str.to_uppercase())

Apache Spark

# Add New Column
df.withColumn("new_col", df["col1"] * 2)
 
# Rename Columns
df.withColumnRenamed("old_name", "new_name")
 
# String Operations
df.withColumn("col1_upper", upper(df["col1"]))

6. Aggregating and Grouping

Pandas

# Group By
df.groupby("col1").mean(numeric_only=True)
 
# Aggregation
df.groupby("col1").agg({"col2": "sum", "col3": "max"})

Polars

# Group By
df.group_by("col1").agg(pl.col("col2").mean())

# Aggregation
df.group_by("col1").agg([pl.col("col2").sum(), pl.col("col3").max()])

Apache Spark

# Group By
df.groupBy("col1").mean().show()
 
# Aggregation
df.groupBy("col1").agg({"col2": "sum", "col3": "max"}).show()

7. Merging and Joining

Pandas

# Merge
pd.merge(df1, df2, on="key")
 
# Concatenate
pd.concat([df1, df2])

Polars

# Join
df1.join(df2, on="key")
 
# Concatenate
pl.concat([df1, df2])

Apache Spark

# Join
df1.join(df2, on="key", how="inner").show()
 
# Union
df1.union(df2).show()
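
All three libraries join inner by default; a short sketch of a left join on differently named keys (the key names are hypothetical):

# Pandas
pd.merge(df1, df2, left_on="id", right_on="user_id", how="left")

# Polars
df1.join(df2, left_on="id", right_on="user_id", how="left")

# Spark
df1.join(df2, df1["id"] == df2["user_id"], how="left").show()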

8. Exporting Data

Pandas

# Save to CSV
df.to_csv("output.csv", index=False)
 
# Save to Excel
df.to_excel("output.xlsx", index=False)

Polars

# Save to CSV
df.write_csv("output.csv")
 
# Save to JSON
df.write_json("output.json")

Apache Spark

# Save to CSV (Spark writes a directory of part files, not a single file)
df.write.option("header", "true").csv("output.csv")
 
# Save to Parquet
df.write.parquet("output.parquet")
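
Parquet is usually the better format for intermediate results since it preserves column types; a brief sketch (Pandas needs a Parquet engine such as pyarrow installed):

# Pandas
df.to_parquet("output.parquet", index=False)

# Polars
df.write_parquet("output.parquet")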
