PySpark Functions

1. Select Columns

- Example

`df = df.select(
	"customer_id",
	"customer_name"
)`

2. Creating or Replacing a column

- Example

df = df.withColumn("always_one", F.lit(1))
df = df.withColumn("customer_id_copy",F.col("customer_id"))

3. Rename a column

df.withColumnRenamed(<former_column_name>, <new_column_name>)
- Example
df = df.withColumnRenamed("sap_product_code","product_code")

4. Creating columns

--Returning a Column that contains <value> in every row: F.lit(<value>)
-- Example
df = df.withColumn("test",F.lit(1))

-- Example for null values: you have to give a type to the column since None has no type
df = df.withColumn("null_column",F.lit(None).cast("string"))

5. If then else statements

F.when(<condition>, <column>).otherwise(<column>)
--Example
df = df.withColumn(
	"new_column",
	F.when(
		F.col("source") == "OK",
		F.lit("OneKey")
	).when(
		F.col("source") == "ABV_BC",
		F.lit("Business Contact")
	).otherwise(
		F.lit("other source"))

6. Concatenating columns

F.concat(<column_1>, <column_2>, <column_3>, ...)
-- Example
df = df.withColumn(
	"new_column",
	F.cancat(
		F.col("firstname"),
		F.col("lastname")
	)
)

7. Joining datasets

dataset_a.join(dataset_b, on="column_to_join_on", how="left")
- Example
customer_with_address = customer.join(address, on="customer_id", how="left")
- Example with multiple columns to join on
dataset_c = dataset_a.join(dataset_b, on=["customer_id", "territory", "product"], how="inner")

8. Grouping by

# Example
import pyspark.sql.functions as F
aggregated_calls = calls.groupBy("customer_id").
agg(
  F.mean("duration").alias("mean_duration")
)

9. Pivoting

- Example
customer_specialty = specialty.groupBy("customer_id").pivot("priority")

10. Window functions

- Example
from pyspark.sql.window import Window
window = Window.partitionBy("l0_customer_id","address_id").orderBy(F.col("ordered_code_locale"))
ordered_code_locale = dataset.withColumn(
	"order_code_locale_row",
	F.row_number().over(window)
)

11. Iterating over columns

-- Example only with the column name
for column_name in dataset.columns:
-- Adds the "new_name_" prefix to all the columns of the dataset 
  dataset = dataset.withColumnRenamed(column_name, "new_name_{column_name}".format(column_name))

-- Example with the column types
for column_name, column_type in dataset.dtypes:
-- Replace all columns values by "Test"
  dataset = dataset.withColumn(column_name, F.lit("Test"))

12. Iteration Dictionaries

# Define a dictionary
my_dictionary = {
  "dog": "Alice",
  "cat": "Johnny"
}

# Iterate through the dictionary
for animal, name in my_dictionary.items():
  # Do something
  print(animal, name)

# Iterate through the dictionary
for animal in my_dictionary.keys():
  # Do something
  print(animal)

# Iterate through the dictionary
for name in my_dictionary.values():
  # Do something
  print(name)

13. lists

my_list = [
  "dog",
  "cat"
]

# Iterate through the list
for animal in my_list:
  # Do something
  print(animal)

# Iterate through the list, and get the index of the current element
for index, animal in enumerate(my_list):
  # Do something
  print(index, animal)
posted @ 2024-05-31 13:29  白云~  阅读(24)  评论(0编辑  收藏  举报