PySpark Functions
1. Select Columns
- Example
`df = df.select(
"customer_id",
"customer_name"
)`
2. Creating or Replacing a column
- Example
df = df.withColumn("always_one", F.lit(1))
df = df.withColumn("customer_id_copy",F.col("customer_id"))
3. Rename a column
df.withColumnRenamed(<former_column_name>, <new_column_name>)
- Example
df = df.withColumnRenamed("sap_product_code","product_code")
4. Creating columns
--Returning a Column that contains <value> in every row: F.lit(<value>)
-- Example
df = df.withColumn("test",F.lit(1))
-- Example for null values: you have to give a type to the column since None has no type
df = df.withColumn("null_column",F.lit(None).cast("string"))
5. If then else statements
F.when(<condition>, <column>).otherwise(<column>)
--Example
df = df.withColumn(
"new_column",
F.when(
F.col("source") == "OK",
F.lit("OneKey")
).when(
F.col("source") == "ABV_BC",
F.lit("Business Contact")
).otherwise(
F.lit("other source"))
6. Concatenating columns
F.concat(<column_1>, <column_2>, <column_3>, ...)
-- Example
df = df.withColumn(
"new_column",
F.cancat(
F.col("firstname"),
F.col("lastname")
)
)
7. Joining datasets
dataset_a.join(dataset_b, on="column_to_join_on", how="left")
- Example
customer_with_address = customer.join(address, on="customer_id", how="left")
- Example with multiple columns to join on
dataset_c = dataset_a.join(dataset_b, on=["customer_id", "territory", "product"], how="inner")
8. Grouping by
# Example
import pyspark.sql.functions as F
aggregated_calls = calls.groupBy("customer_id").
agg(
F.mean("duration").alias("mean_duration")
)
9. Pivoting
- Example
customer_specialty = specialty.groupBy("customer_id").pivot("priority")
10. Window functions
- Example
from pyspark.sql.window import Window
window = Window.partitionBy("l0_customer_id","address_id").orderBy(F.col("ordered_code_locale"))
ordered_code_locale = dataset.withColumn(
"order_code_locale_row",
F.row_number().over(window)
)
11. Iterating over columns
-- Example only with the column name
for column_name in dataset.columns:
-- Adds the "new_name_" prefix to all the columns of the dataset
dataset = dataset.withColumnRenamed(column_name, "new_name_{column_name}".format(column_name))
-- Example with the column types
for column_name, column_type in dataset.dtypes:
-- Replace all columns values by "Test"
dataset = dataset.withColumn(column_name, F.lit("Test"))
12. Iteration Dictionaries
# Define a dictionary
my_dictionary = {
"dog": "Alice",
"cat": "Johnny"
}
# Iterate through the dictionary
for animal, name in my_dictionary.items():
# Do something
print(animal, name)
# Iterate through the dictionary
for animal in my_dictionary.keys():
# Do something
print(animal)
# Iterate through the dictionary
for name in my_dictionary.values():
# Do something
print(name)
13. lists
my_list = [
"dog",
"cat"
]
# Iterate through the list
for animal in my_list:
# Do something
print(animal)
# Iterate through the list, and get the index of the current element
for index, animal in enumerate(my_list):
# Do something
print(index, animal)
Jasminelee