Scala Data Wrangling Cheat Sheet
1. Importing Libraries
// Import required libraries for normal Scala
import scala.collection.mutable
import scala.collection.immutable
// For Apache Spark (if needed)
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
2. Creating and Loading Data
Normal Scala
// Create a List
val data = List(1, 2, 3, 4, 5)
// Create a Map
val mapData = Map("a" -> 1, "b" -> 2, "c" -> 3)
// Create a 2D List (like a table)
val table = List(
  List("Name", "Age", "City"),
  List("Alice", "25", "New York"),
  List("Bob", "30", "San Francisco")
)
// Reading files (close the Source when done to avoid leaking the file handle)
import scala.io.Source
val source = Source.fromFile("file.txt")
val lines = try source.getLines().toList finally source.close()
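The same pattern extends to simple delimited files. A minimal sketch that parses a hypothetical comma-separated people.csv into a 2D List:
val csvSource = Source.fromFile("people.csv") // hypothetical file
val rows: List[List[String]] =
  try csvSource.getLines().map(_.split(",").toList).toList
  finally csvSource.close()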
Apache Spark
val spark = SparkSession.builder()
  .appName("Data Wrangling Cheat Sheet")
  .getOrCreate()
// Enables the $"col" column syntax used throughout this sheet
import spark.implicits._
// Load CSV
val df = spark.read.option("header", "true").csv("file.csv")
// Load JSON (bind to a new name; two vals named df would clash in one scope)
val dfJson = spark.read.json("file.json")
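By default every CSV column is read as a string. A sketch that asks Spark to infer numeric types instead (at the cost of an extra pass over the file):
val dfTyped = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("file.csv")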
3. Inspecting Data
Normal Scala
// Inspect Lists
data.head // First element
data.tail // All elements except the first
data.take(3) // First 3 elements
data.length // Number of elements
// Inspect Maps
mapData.keys // Get all keys
mapData.values // Get all values
mapData.get("a") // Get value for key "a"
// Inspect 2D Lists
table.foreach(println) // Print each row
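To pull a single column out of the table, skip the header row and index into each row. A small sketch using the table defined above:
val ages = table.tail.map(row => row(1)) // List("25", "30")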
Apache Spark
df.show() // Show the first 20 rows
df.printSchema() // Print the schema of the DataFrame
df.columns // Get column names
df.count() // Count the number of rows
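For a quick statistical summary of the numeric columns, describe is handy:
df.describe().show() // count, mean, stddev, min, max per numeric column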
4. Selecting and Filtering Data
Normal Scala
// Filter a List
val filtered = data.filter(_ > 3) // Elements greater than 3
// Map Transformation
val squared = data.map(x => x * x) // Square each element
// Filter a 2D List
val filteredTable = table.filter(row => row(1) != "25") // Remove rows where Age = 25
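Filtering and projection combine naturally. A sketch that picks the names of everyone in New York from the table above:
val newYorkers = table.tail.filter(row => row(2) == "New York").map(_.head) // List("Alice")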
Apache Spark
// Select Columns
df.select("column_name").show()
// Filter Rows
df.filter($"column" > 10).show()
df.filter($"column" === "value").show()
5. Sorting and Reordering
Normal Scala
// Sort a List
val sorted = data.sorted // Ascending order
val descending = data.sortWith(_ > _) // Descending order
// Reorder Map (convert to List first)
val reordered = mapData.toList.sortBy(_._2) // Sort by values
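Negate the sort key for descending order, or sort by the key instead. A small sketch:
val byValueDesc = mapData.toList.sortBy(-_._2) // Sort by values, descending
val byKey = mapData.toList.sortBy(_._1) // Sort by keys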
Apache Spark
// Sort Rows
df.orderBy("col1").show()
df.orderBy($"col1".desc).show()
6. Aggregating and Grouping
Normal Scala
// Aggregations on List
val sum = data.sum // Sum of elements
val avg = data.sum / data.size.toDouble // Average
// Grouping
val grouped = data.groupBy(_ % 2) // Group by even/odd
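groupBy returns a Map from key to the matching elements; here that is Map(1 -> List(1, 3, 5), 0 -> List(2, 4)). Aggregating each group is a second map step:
val sumsByParity = data.groupBy(_ % 2).map { case (k, v) => k -> v.sum } // Map(1 -> 9, 0 -> 6)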
Apache Spark
// Group By and Aggregation
df.groupBy("col1").count().show()
df.groupBy("col1").agg(avg("col2"), max("col3")).show()
7. Transforming Data
Normal Scala
// Add new elements to List
val extended = data :+ 6 // Append 6 to the List
val prepended = 0 +: data // Prepend 0 to the List
// Replace elements in List
val replaced = data.map(x => if (x == 3) 99 else x) // Replace 3 with 99
// String Operations
val strings = List("apple", "banana", "cherry")
val uppercased = strings.map(_.toUpperCase)
val filteredStrings = strings.filter(_.contains("a"))
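The same map/filter pipeline can be written as a for-comprehension, which often reads better as the steps pile up:
val result = for (s <- strings if s.contains("a")) yield s.toUpperCase // List("APPLE", "BANANA")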
Apache Spark
// Add or Modify Columns
df.withColumn("new_col", $"col1" * 2).show()
df.withColumn("col1_upper", upper($"col1")).show()
8. Merging and Joining
Normal Scala
// Combine Lists
val combined = data ++ List(6, 7, 8) // Concatenate two Lists
// Merge Maps
val mergedMap = mapData ++ Map("d" -> 4, "e" -> 5) // Add new key-value pairs
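Note that ++ is right-biased: when both maps share a key, the right-hand value wins.
val merged = Map("a" -> 1) ++ Map("a" -> 99) // Map("a" -> 99)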
Apache Spark
// Inner Join
val joined = df1.join(df2, Seq("key"), "inner")
joined.show()
// Union
val unioned = df1.union(df2)
unioned.show()
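Other join types pass a different last argument; for example, a left join keeps every row of df1:
val leftJoined = df1.join(df2, Seq("key"), "left")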
9. Advanced Data Reshaping
Normal Scala
// Transpose a 2D List
val transposed = table.transpose
// Flatten a Nested List
val nested = List(List(1, 2), List(3, 4), List(5))
val flattened = nested.flatten
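flatMap fuses the map and flatten steps into a single pass:
val doubledFlat = nested.flatMap(_.map(_ * 2)) // List(2, 4, 6, 8, 10)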
Apache Spark
// Melting (Wide to Long)
val melted = df.selectExpr("id", "stack(2, 'col1', col1, 'col2', col2) as (variable, value)")
melted.show()
// Pivoting (Long to Wide)
val pivoted = df.groupBy("id").pivot("variable").agg(first("value"))
pivoted.show()
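If the pivot values are known up front, pass them explicitly so Spark can skip the extra pass that discovers them:
val pivotedFast = df.groupBy("id").pivot("variable", Seq("col1", "col2")).agg(first("value"))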
10. Debugging and Quick Checks
Normal Scala
// Check for Duplicates in a List
val duplicates = data.groupBy(identity).filter(_._2.size > 1).keys
// Quick size check
println(s"List size: ${data.size}")
Apache Spark
// Check for Duplicates
df.groupBy("col1").count().filter($"count" > 1).show()
// Schema Validation
df.schema.fields.foreach(println)
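The Spark counterpart of distinct is dropDuplicates, optionally scoped to specific columns:
df.dropDuplicates("col1").show() // Keep one row per col1 value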
Last updated on April 18, 2025