
pyspark_test

PySpark Cookbook

Tomasz Drabas, Denny Lee

Version: 0.1

Date: 1/15/2018

import findspark
findspark.init()

import pyspark 
from pyspark import SparkContext, SparkConf
conf = SparkConf().setAppName("test").setMaster("local")
sc = SparkContext(conf=conf)

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
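
Since a SparkContext already exists at this point, the builder's getOrCreate() should wrap it rather than create a second one. A quick sanity check (a sketch; assumes Spark 2.x getOrCreate semantics):

# the session should reuse the SparkContext created above
print(spark.sparkContext is sc)   # expected: True
print(spark.version)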

Some data

dirty_data = spark.createDataFrame([
          (1,'Porsche','Boxster S','Turbo',2.5,4,22,None)
        , (2,'Aston Martin','Vanquish','Aspirated',6.0,12,16,None)
        , (3,'Porsche','911 Carrera 4S Cabriolet','Turbo',3.0,6,24,None)
        , (3,'General Motors','SPARK ACTIV','Aspirated',1.4,None,32,None)
        , (5,'BMW','COOPER S HARDTOP 2 DOOR','Turbo',2.0,4,26,None)
        , (6,'BMW','330i','Turbo',2.0,None,27,None)
        , (7,'BMW','440i Coupe','Turbo',3.0,6,23,None)
        , (8,'BMW','440i Coupe','Turbo',3.0,6,23,None)
        , (9,'Mercedes-Benz',None,None,None,None,27,None)
        , (10,'Mercedes-Benz','CLS 550','Turbo',4.7,8,21,79231)
        , (11,'Volkswagen','GTI','Turbo',2.0,4,None,None)
        , (12,'Ford Motor Company','FUSION AWD','Turbo',2.7,6,20,None)
        , (13,'Nissan','Q50 AWD RED SPORT','Turbo',3.0,6,22,None)
        , (14,'Nissan','Q70 AWD','Aspirated',5.6,8,18,None)
        , (15,'Kia','Stinger RWD','Turbo',2.0,4,25,None)
        , (16,'Toyota','CAMRY HYBRID LE','Aspirated',2.5,4,46,None)
        , (16,'Toyota','CAMRY HYBRID LE','Aspirated',2.5,4,46,None)
        , (18,'FCA US LLC','300','Aspirated',3.6,6,23,None)
        , (19,'Hyundai','G80 AWD','Turbo',3.3,6,20,None)
        , (20,'Hyundai','G80 AWD','Turbo',3.3,6,20,None)
        , (21,'BMW','X5 M','Turbo',4.4,8,18,121231)
        , (22,'GE','K1500 SUBURBAN 4WD','Aspirated',5.3,8,18,None)
    ], ['Id','Manufacturer','Model','EngineType','Displacement',
        'Cylinders','FuelEconomy','MSRP'])

Handling duplicates

Exact duplicates

# do we have any rows that are duplicated?
dirty_data.count(), dirty_data.distinct().count()
(22, 21)
# what row is duplicated?
(
    dirty_data
    .groupby(dirty_data.columns)
    .count()
    .filter('count > 1')
    .show()
)
+---+------------+---------------+----------+------------+---------+-----------+----+-----+
| Id|Manufacturer|          Model|EngineType|Displacement|Cylinders|FuelEconomy|MSRP|count|
+---+------------+---------------+----------+------------+---------+-----------+----+-----+
| 16|      Toyota|CAMRY HYBRID LE| Aspirated|         2.5|        4|         46|null|    2|
+---+------------+---------------+----------+------------+---------+-----------+----+-----+

# remove the duplicated rows
full_removed = dirty_data.dropDuplicates()
full_removed.count()
21

Only ID differs

# compare row counts after dropping the Id column
no_ids = (
    full_removed
    .select([col for col in full_removed.columns if col != 'Id'])
)

no_ids.count(), no_ids.distinct().count()
(21, 19)
# what row is duplicated?
(
    full_removed
    .groupby([col for col in full_removed.columns if col != 'Id'])
    .count()
    .filter('count > 1')
    .show()
)
+------------+----------+----------+------------+---------+-----------+----+-----+
|Manufacturer|     Model|EngineType|Displacement|Cylinders|FuelEconomy|MSRP|count|
+------------+----------+----------+------------+---------+-----------+----+-----+
|         BMW|440i Coupe|     Turbo|         3.0|        6|         23|null|    2|
|     Hyundai|   G80 AWD|     Turbo|         3.3|        6|         20|null|    2|
+------------+----------+----------+------------+---------+-----------+----+-----+

# remove records that differ only by Id
id_removed = full_removed.dropDuplicates(
    subset = [col for col in full_removed.columns if col != 'Id']
)
# count
id_removed.count()
19

Duplicated IDs

# are there any duplicated IDs?
import pyspark.sql.functions as fn

id_removed.agg(
      fn.count('Id').alias('CountOfIDs')
    , fn.countDistinct('Id').alias('CountOfDistinctIDs')
).show()
+----------+------------------+
|CountOfIDs|CountOfDistinctIDs|
+----------+------------------+
|        19|                18|
+----------+------------------+

# what's duplicated?
(
    id_removed
    .groupby('Id')
    .count()
    .filter('count > 1')
    .show()
)
+---+-----+
| Id|count|
+---+-----+
|  3|    2|
+---+-----+

(
    id_removed
    .filter('Id = 3')
    .show()
)
+---+--------------+--------------------+----------+------------+---------+-----------+----+
| Id|  Manufacturer|               Model|EngineType|Displacement|Cylinders|FuelEconomy|MSRP|
+---+--------------+--------------------+----------+------------+---------+-----------+----+
|  3|General Motors|         SPARK ACTIV| Aspirated|         1.4|     null|         32|null|
|  3|       Porsche|911 Carrera 4S Ca...|     Turbo|         3.0|        6|         24|null|
+---+--------------+--------------------+----------+------------+---------+-----------+----+

new_id = (
    id_removed
    .select(
        [fn.monotonically_increasing_id().alias('Id')] + 
        [col for col in id_removed.columns if col != 'Id'])
)

new_id.show()
+-------------+------------------+--------------------+----------+------------+---------+-----------+------+
|           Id|      Manufacturer|               Model|EngineType|Displacement|Cylinders|FuelEconomy|  MSRP|
+-------------+------------------+--------------------+----------+------------+---------+-----------+------+
|   8589934592|    General Motors|         SPARK ACTIV| Aspirated|         1.4|     null|         32|  null|
| 188978561024|     Mercedes-Benz|             CLS 550|     Turbo|         4.7|        8|         21| 79231|
| 197568495616|     Mercedes-Benz|                null|      null|        null|     null|         27|  null|
| 206158430208|Ford Motor Company|          FUSION AWD|     Turbo|         2.7|        6|         20|  null|
| 438086664192|               BMW|COOPER S HARDTOP ...|     Turbo|         2.0|        4|         26|  null|
| 523986010112|      Aston Martin|            Vanquish| Aspirated|         6.0|       12|         16|  null|
| 721554505728|        Volkswagen|                 GTI|     Turbo|         2.0|        4|       null|  null|
| 764504178688|               Kia|         Stinger RWD|     Turbo|         2.0|        4|         25|  null|
| 919123001344|               BMW|                330i|     Turbo|         2.0|     null|         27|  null|
| 944892805120|           Porsche|           Boxster S|     Turbo|         2.5|        4|         22|  null|
| 970662608896|        FCA US LLC|                 300| Aspirated|         3.6|        6|         23|  null|
|1030792151040|           Hyundai|             G80 AWD|     Turbo|         3.3|        6|         20|  null|
|1039382085632|               BMW|          440i Coupe|     Turbo|         3.0|        6|         23|  null|
|1116691496960|            Nissan|   Q50 AWD RED SPORT|     Turbo|         3.0|        6|         22|  null|
|1211180777472|               BMW|                X5 M|     Turbo|         4.4|        8|         18|121231|
|1331439861760|            Nissan|             Q70 AWD| Aspirated|         5.6|        8|         18|  null|
|1606317768704|           Porsche|911 Carrera 4S Ca...|     Turbo|         3.0|        6|         24|  null|
|1614907703296|            Toyota|     CAMRY HYBRID LE| Aspirated|         2.5|        4|         46|  null|
|1700807049216|                GE|  K1500 SUBURBAN 4WD| Aspirated|         5.3|        8|         18|  null|
+-------------+------------------+--------------------+----------+------------+---------+-----------+------+
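
The new IDs look odd because monotonically_increasing_id() encodes the partition ID in the upper bits of a 64-bit integer: the values are guaranteed unique and increasing, but not consecutive. If consecutive IDs are required, one alternative (a sketch; note that a single global window forces all rows through one partition) is a window-based row number:

from pyspark.sql import Window

w = Window.orderBy(fn.monotonically_increasing_id())
consecutive_id = id_removed.withColumn('Id', fn.row_number().over(w))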

Handling missing observations

Missing observations per row

(
    spark.createDataFrame(
        new_id.rdd.map(
           lambda row: (
                 row['Id']
               , sum([c is None for c in row])
           )
        )
        .filter(lambda el: el[1] > 1)
        .collect()
        ,
        ['Id', 'CountMissing']
    )
    .orderBy('CountMissing', ascending=False)
    .show()
)
+------------+------------+
|          Id|CountMissing|
+------------+------------+
|197568495616|           5|
|  8589934592|           2|
|919123001344|           2|
|721554505728|           2|
+------------+------------+
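
The same tally can stay entirely in the DataFrame API, avoiding the round trip through the RDD. A minimal sketch using .isNull():

missing_per_row = new_id.select(
      'Id'
    # sum of 0/1 null flags, one per column
    , sum(fn.col(c).isNull().cast('int') for c in new_id.columns).alias('CountMissing')
)
missing_per_row.filter('CountMissing > 1').orderBy('CountMissing', ascending=False).show()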

(
    new_id
    .where('Id == 197568495616')
    .show()
)
+------------+-------------+-----+----------+------------+---------+-----------+----+
|          Id| Manufacturer|Model|EngineType|Displacement|Cylinders|FuelEconomy|MSRP|
+------------+-------------+-----+----------+------------+---------+-----------+----+
|197568495616|Mercedes-Benz| null|      null|        null|     null|         27|null|
+------------+-------------+-----+----------+------------+---------+-----------+----+


merc_out = new_id.dropna(thresh=4)
new_id.count(), merc_out.count()
(19, 18)
(
    merc_out
    .where('Id == 197568495616')
    .show()
)
+---+------------+-----+----------+------------+---------+-----------+----+
| Id|Manufacturer|Model|EngineType|Displacement|Cylinders|FuelEconomy|MSRP|
+---+------------+-----+----------+------------+---------+-----------+----+
+---+------------+-----+----------+------------+---------+-----------+----+
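
The thresh=4 argument keeps only the rows with at least four non-null values; the Mercedes-Benz record had just three (Id, Manufacturer, and FuelEconomy), so it was dropped. The .dropna(...) method also accepts how and subset parameters; a small sketch of the variants (illustrative only):

# drop rows with any null in the listed columns
new_id.dropna(how='any', subset=['Cylinders', 'FuelEconomy'])

# drop rows only when every value is null
new_id.dropna(how='all')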

Missing observations per column

We also need to check whether there are columns with a particularly low incidence of useful information. There is a lot happening in the code below, so let's unpack it step by step.

Let's start with the inner list:

[
    (1 - (fn.count(c) / fn.count('*'))).alias(c + '_miss')
    for c in merc_out.columns
]

We loop through all the columns in the merc_out DataFrame and count how many non-missing values we find in each column. We then divide it by the total count of all the rows and subtract this from 1, so we get the percentage of missing values.

We imported pyspark.sql.functions as fn earlier in the chapter.

However, what we're actually doing here is not really calculating anything. The way Python stores this information, at this time, is just as a list of objects, or pointers, to certain operations. Only after we pass the list to the .agg(...) method does it get translated into PySpark's internal execution graph (which only gets executed when we call the .collect() action).

The .agg(...) method accepts a set of parameters, not as a list object, but as a comma-separated list of parameters. Therefore, instead of passing the list itself to the .agg(...) method, we prefixed it with the * operator, which unpacks the list and passes each of its elements as a separate parameter to the method.

The .collect() method will return a list of one element: a Row object with the aggregated information. We can transform the Row into a dictionary using the .asDict() method and then extract all the items from it. This results in a list of tuples, where the first element is the column name (we used the .alias(...) method to append '_miss' to each column) and the second element is the percentage of missing observations.

While looping through the elements of the sorted list, we just print them to the screen:

for k, v in sorted(
    merc_out.agg(*[
        (1 - (fn.count(c) / fn.count('*'))).alias(c + '_miss')
        for c in merc_out.columns
    ])
        .collect()[0]
        .asDict()
        .items()
    , key=lambda el: el[1]
    , reverse=True
):
    print(k, v)
MSRP_miss 0.8888888888888888
Cylinders_miss 0.11111111111111116
FuelEconomy_miss 0.05555555555555558
Id_miss 0.0
Manufacturer_miss 0.0
Model_miss 0.0
EngineType_miss 0.0
Displacement_miss 0.0

Well, it looks like most of the information in the MSRP column is missing. Thus, we can drop it, as it will not bring us any useful information:

no_MSRP = merc_out.select([col for col in merc_out.columns if col != 'MSRP'])
no_MSRP.show()
+-------------+------------------+--------------------+----------+------------+---------+-----------+
|           Id|      Manufacturer|               Model|EngineType|Displacement|Cylinders|FuelEconomy|
+-------------+------------------+--------------------+----------+------------+---------+-----------+
|   8589934592|    General Motors|         SPARK ACTIV| Aspirated|         1.4|     null|         32|
| 188978561024|     Mercedes-Benz|             CLS 550|     Turbo|         4.7|        8|         21|
| 206158430208|Ford Motor Company|          FUSION AWD|     Turbo|         2.7|        6|         20|
| 438086664192|               BMW|COOPER S HARDTOP ...|     Turbo|         2.0|        4|         26|
| 523986010112|      Aston Martin|            Vanquish| Aspirated|         6.0|       12|         16|
| 721554505728|        Volkswagen|                 GTI|     Turbo|         2.0|        4|       null|
| 764504178688|               Kia|         Stinger RWD|     Turbo|         2.0|        4|         25|
| 919123001344|               BMW|                330i|     Turbo|         2.0|     null|         27|
| 944892805120|           Porsche|           Boxster S|     Turbo|         2.5|        4|         22|
| 970662608896|        FCA US LLC|                 300| Aspirated|         3.6|        6|         23|
|1030792151040|           Hyundai|             G80 AWD|     Turbo|         3.3|        6|         20|
|1039382085632|               BMW|          440i Coupe|     Turbo|         3.0|        6|         23|
|1116691496960|            Nissan|   Q50 AWD RED SPORT|     Turbo|         3.0|        6|         22|
|1211180777472|               BMW|                X5 M|     Turbo|         4.4|        8|         18|
|1331439861760|            Nissan|             Q70 AWD| Aspirated|         5.6|        8|         18|
|1606317768704|           Porsche|911 Carrera 4S Ca...|     Turbo|         3.0|        6|         24|
|1614907703296|            Toyota|     CAMRY HYBRID LE| Aspirated|         2.5|        4|         46|
|1700807049216|                GE|  K1500 SUBURBAN 4WD| Aspirated|         5.3|        8|         18|
+-------------+------------------+--------------------+----------+------------+---------+-----------+

We still have two columns with some missing information. Let's do something about them.

Sparse missing observations

PySpark allows you to impute the missing observations. You can either pass a value that every null or None in your data will be replaced with, or you can pass a dictionary with different values for each column with missing observations. In this example, we will use the latter approach and will specify a ratio between the fuel economy and displacement, and between the number of cylinders and displacement.

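As a quick illustration of the two call styles (the replacement values here are made up):

# one value replaces every null in all compatible numeric columns
no_MSRP.fillna(0)

# per-column replacements, keyed by column name (hypothetical values)
no_MSRP.fillna({'Cylinders': 4, 'FuelEconomy': 22.0})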

First, let's create our dictionary:

multipliers = (
    no_MSRP
    .agg(
          fn.mean(
              fn.col('FuelEconomy') / 
              (
                  fn.col('Displacement') * fn.col('Cylinders')
              )
          ).alias('FuelEconomy')
        , fn.mean(
            fn.col('Cylinders') / 
            fn.col('Displacement')
        ).alias('Cylinders')
    )
).toPandas().to_dict('records')[0]

multipliers
{'FuelEconomy': 1.4957485048359973, 'Cylinders': 1.8353365984789105}

Here, we are effectively calculating our multipliers. In order to replace the missing values in the fuel economy, we will use the following formula:

\[ fuel\_economy\_mult = \frac{fuel\_economy}{displacement \cdot cylinders} \]

For the number of cylinders, we will use the following equation:

\[ cylinders\_mult = \frac{cylinders}{displacement} \]

Our preceding code uses these two formulas to calculate the multiplier for each row and then takes the average of these.

This is not going to be totally accurate but given the data we have, it should be accurate enough.

Here, we also present yet another way of creating a dictionary out of your (small!) Spark DataFrame: use the .toPandas() method to convert the Spark DataFrame to a pandas DataFrame. The pandas DataFrame has a .to_dict(...) method that will allow you to convert our data to a dictionary. The 'records' parameter instructs the method to convert each row to a dictionary where each key is the column name and the value is the corresponding record value.

Check out this link to read more about the .to_dict(...) method: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_dict.html.
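
A tiny standalone illustration of the 'records' orientation (values are made up):

import pandas as pd

pdf = pd.DataFrame({'FuelEconomy': [1.5], 'Cylinders': [1.8]})
pdf.to_dict('records')   # [{'FuelEconomy': 1.5, 'Cylinders': 1.8}]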

Let's use it now to impute our missing data:

imputed = (
    no_MSRP
    # convert the raw values into the ratios defined above
    .withColumn('FuelEconomy', fn.col('FuelEconomy') / fn.col('Displacement') / fn.col('Cylinders'))
    .withColumn('Cylinders', fn.col('Cylinders') / fn.col('Displacement'))
    # fill the missing ratios with the average multipliers
    .fillna(multipliers)
    # revert the columns to their original scales
    .withColumn('Cylinders', (fn.col('Cylinders') * fn.col('Displacement')).cast('integer'))
    .withColumn('FuelEconomy', fn.col('FuelEconomy') * fn.col('Displacement') * fn.col('Cylinders'))
)

imputed.show()
+-------------+------------------+--------------------+----------+------------+---------+------------------+
|           Id|      Manufacturer|               Model|EngineType|Displacement|Cylinders|       FuelEconomy|
+-------------+------------------+--------------------+----------+------------+---------+------------------+
|   8589934592|    General Motors|         SPARK ACTIV| Aspirated|         1.4|        2| 4.188095813540793|
| 188978561024|     Mercedes-Benz|             CLS 550|     Turbo|         4.7|        8|              21.0|
| 206158430208|Ford Motor Company|          FUSION AWD|     Turbo|         2.7|        5|16.666666666666668|
| 438086664192|               BMW|COOPER S HARDTOP ...|     Turbo|         2.0|        4|              26.0|
| 523986010112|      Aston Martin|            Vanquish| Aspirated|         6.0|       12|              16.0|
| 721554505728|        Volkswagen|                 GTI|     Turbo|         2.0|        4|11.965988038687978|
| 764504178688|               Kia|         Stinger RWD|     Turbo|         2.0|        4|              25.0|
| 919123001344|               BMW|                330i|     Turbo|         2.0|        3| 8.974491029015983|
| 944892805120|           Porsche|           Boxster S|     Turbo|         2.5|        4|              22.0|
| 970662608896|        FCA US LLC|                 300| Aspirated|         3.6|        6|              23.0|
|1030792151040|           Hyundai|             G80 AWD|     Turbo|         3.3|        6|              20.0|
|1039382085632|               BMW|          440i Coupe|     Turbo|         3.0|        6|23.000000000000004|
|1116691496960|            Nissan|   Q50 AWD RED SPORT|     Turbo|         3.0|        6|21.999999999999996|
|1211180777472|               BMW|                X5 M|     Turbo|         4.4|        8|              18.0|
|1331439861760|            Nissan|             Q70 AWD| Aspirated|         5.6|        8|              18.0|
|1606317768704|           Porsche|911 Carrera 4S Ca...|     Turbo|         3.0|        6|              24.0|
|1614907703296|            Toyota|     CAMRY HYBRID LE| Aspirated|         2.5|        4|              46.0|
|1700807049216|                GE|  K1500 SUBURBAN 4WD| Aspirated|         5.3|        8|              18.0|
+-------------+------------------+--------------------+----------+------------+---------+------------------+

First, we convert our original data so it also reflects the ratios we specified earlier. Next, we use the multipliers dictionary to fill in the missing values, and finally we revert the columns to their original state.

Note that each time we use the .withColumn(...) method with an existing column name, we overwrite the original column.

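A one-line sketch of that overwrite behavior:

df = spark.createDataFrame([(1.0,)], ['x'])
df.withColumn('x', fn.col('x') * 2).show()   # still a single column 'x', now holding 2.0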

Check out PySpark's documentation on the missing observation methods: https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrameNaFunctions

Handling outliers

Observations that differ greatly from the rest of the observations, that is, observations located in the long tail(s) of the data distribution, are outliers. In this recipe, we will learn how to locate and handle the outliers.

We will only be looking at the numerical variables: the displacement, the cylinders, and the fuel economy.

We loop through all these features and calculate the first and third quartiles using the .approxQuantile(...) method. The method takes the feature (column) name as its first parameter, the float (or list of floats) of quantiles to calculate as its second parameter, and the relative target precision as its third parameter (setting this value to 0 will find exact quantiles, but that can be very expensive).

The method returns a list of two values (in our case): Q1 and Q3. We then calculate the interquartile range and append the (feature_name, [lower_bound, upper_bound]) tuple to the cut_off_points list. After converting to a dictionary, our cut-off points are as follows:

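In formula form, these bounds are the standard Tukey fences; a value is flagged as an outlier when it falls outside:

\[ \big[\, Q_1 - 1.5 \cdot IQR,\; Q_3 + 1.5 \cdot IQR \,\big], \qquad IQR = Q_3 - Q_1 \]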

features = ['Displacement', 'Cylinders', 'FuelEconomy']
quantiles = [0.25, 0.75]

cut_off_points = []

for feature in features:
    quants = imputed.approxQuantile(feature, quantiles, 0.05)
    
    IQR = quants[1] - quants[0]
    cut_off_points.append((feature, [
        quants[0] - 1.5 * IQR,
        quants[1] + 1.5 * IQR,
    ]))
    
cut_off_points = dict(cut_off_points)
cut_off_points
{'Displacement': [-1.6000000000000005, 8.0],
 'Cylinders': [-2.0, 14.0],
 'FuelEconomy': [7.166666666666664, 32.50000000000001]}

So, now we can use these to flag our outlying observations. We will select only the Id column and then loop through our features to check whether they fall outside of the calculated bounds. Here's what we get:

# Next, we flag the outliers:
outliers = imputed.select(*['id'] + [
       (
           (imputed[f] < cut_off_points[f][0]) |
           (imputed[f] > cut_off_points[f][1])
       ).alias(f + '_o') for f in features
  ])
outliers.show()
+-------------+--------------+-----------+-------------+
|           id|Displacement_o|Cylinders_o|FuelEconomy_o|
+-------------+--------------+-----------+-------------+
|   8589934592|         false|      false|         true|
| 188978561024|         false|      false|        false|
| 206158430208|         false|      false|        false|
| 438086664192|         false|      false|        false|
| 523986010112|         false|      false|        false|
| 721554505728|         false|      false|        false|
| 764504178688|         false|      false|        false|
| 919123001344|         false|      false|        false|
| 944892805120|         false|      false|        false|
| 970662608896|         false|      false|        false|
|1030792151040|         false|      false|        false|
|1039382085632|         false|      false|        false|
|1116691496960|         false|      false|        false|
|1211180777472|         false|      false|        false|
|1331439861760|         false|      false|        false|
|1606317768704|         false|      false|        false|
|1614907703296|         false|      false|         true|
|1700807049216|         false|      false|        false|
+-------------+--------------+-----------+-------------+

So, we have two outliers in the fuel economy column. Let's check the records:

with_outliers_flag = imputed.join(outliers, on='Id')

(
    with_outliers_flag
    .filter('FuelEconomy_o')
    .select('Id', 'Manufacturer', 'Model', 'FuelEconomy')
    .show()
)

+-------------+--------------+---------------+-----------------+
|           Id|  Manufacturer|          Model|      FuelEconomy|
+-------------+--------------+---------------+-----------------+
|   8589934592|General Motors|    SPARK ACTIV|4.188095813540793|
|1614907703296|        Toyota|CAMRY HYBRID LE|             46.0|
+-------------+--------------+---------------+-----------------+

First, we join our imputed DataFrame with the outliers one, and then we filter on the FuelEconomy_o flag to select only our outlying records. Finally, we extract the most relevant columns to show:

So we have the SPARK ACTIV and the CAMRY HYBRID LE as the outliers. The SPARK ACTIV became an outlier because of our imputation logic: we had to impute its fuel economy values and, given that its engine's displacement is only 1.4 liters, our logic did not work out well. There are other ways you could impute the values, of course. The Camry, being a hybrid, is definitely an outlier in a dataset dominated by large and turbocharged engines; it should not surprise us to see it here.

Trying to build a machine learning model based on data with outliers can lead to some untrustworthy results or a model that does not generalize well, so we normally remove these from our dataset:

no_outliers = (
    with_outliers_flag
    .filter('!FuelEconomy_o')
    .select(imputed.columns)
)

no_outliers 

DataFrame[Id: bigint, Manufacturer: string, Model: string, EngineType: string, Displacement: double, Cylinders: int, FuelEconomy: double]

The preceding snippet simply keeps the records that are not flagged as outliers in our FuelEconomy_o column. That's it!

Check out this website for more information about outliers: http://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm

Exploring descriptive statistics

The book's discussion of this section can be found here: https://weread.qq.com/web/reader/25c32b30722ffe3f25ce067k69a32ca02cf69adc1e109af

descriptive_stats = no_outliers.describe(features)
descriptive_stats.show()

+-------+-----------------+-----------------+------------------+
|summary|     Displacement|        Cylinders|       FuelEconomy|
+-------+-----------------+-----------------+------------------+
|  count|               16|               16|                16|
|   mean|          3.44375|            6.125|19.600446608398165|
| stddev|1.354975399530683|2.276693508870558| 4.666647767373751|
|    min|              2.0|                3| 8.974491029015983|
|    max|              6.0|               12|              26.0|
+-------+-----------------+-----------------+------------------+
descriptive_stats_all = no_outliers.describe()
descriptive_stats_all.show()

+-------+--------------------+------------+-----+----------+-----------------+-----------------+------------------+
|summary|                  Id|Manufacturer|Model|EngineType|     Displacement|        Cylinders|       FuelEconomy|
+-------+--------------------+------------+-----+----------+-----------------+-----------------+------------------+
|  count|                  16|          16|   16|        16|               16|               16|                16|
|   mean|    9.19659872256E11|        null|300.0|      null|          3.44375|            6.125|19.600446608398165|
| stddev|4.396778949583304E11|        null|  NaN|      null|1.354975399530683|2.276693508870558| 4.666647767373751|
|    min|        188978561024|Aston Martin|  300| Aspirated|              2.0|                3| 8.974491029015983|
|    max|       1700807049216|  Volkswagen| X5 M|     Turbo|              6.0|               12|              26.0|
+-------+--------------------+------------+-----+----------+-----------------+-----------------+------------------+
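
On Spark 2.3 and later, the .summary(...) method extends .describe(...) with approximate percentiles, which gives the quartiles directly (a sketch under that version assumption):

no_outliers.select(features).summary('count', 'mean', '25%', '50%', '75%').show()
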
(
    no_outliers
    .select(features)
    .groupBy('Cylinders')
    .agg(*[
          fn.count('*').alias('Count')
        , fn.mean('FuelEconomy').alias('MPG_avg')
        , fn.mean('Displacement').alias('Disp_avg')
        , fn.stddev('FuelEconomy').alias('MPG_stdev')
        , fn.stddev('Displacement').alias('Disp_stdev')
    ])
    .orderBy('Cylinders')
).show()

+---------+-----+------------------+------------------+------------------+-------------------+
|Cylinders|Count|           MPG_avg|          Disp_avg|         MPG_stdev|         Disp_stdev|
+---------+-----+------------------+------------------+------------------+-------------------+
|        3|    1| 8.974491029015983|               2.0|               NaN|                NaN|
|        4|    4|21.241497009671995|             2.125| 6.413009924998989|0.24999999999999994|
|        5|    1|16.666666666666668|               2.7|               NaN|                NaN|
|        6|    5|              22.4|3.1799999999999997|1.5165750888103104|0.26832815729997467|
|        8|    4|             18.75|               5.0|               1.5| 0.5477225575051655|
|       12|    1|              16.0|               6.0|               NaN|                NaN|
+---------+-----+------------------+------------------+------------------+-------------------+

Computing correlations

(
    no_outliers
    .corr('Cylinders', 'Displacement')
)

0.9381829964408109
n_features = len(features)

corr = []

for i in range(0, n_features):
    temp = [None] * i

    for j in range(i, n_features):
        temp.append(no_outliers.corr(features[i], features[j]))
    corr.append([features[i]] + temp)

correlations = spark.createDataFrame(corr, ['Column'] + features)

correlations.show()

+------------+------------+------------------+--------------------+
|      Column|Displacement|         Cylinders|         FuelEconomy|
+------------+------------+------------------+--------------------+
|Displacement|         1.0|0.9381829964408109|-0.10757908872387652|
|   Cylinders|        null|               1.0|-0.04218546545035314|
| FuelEconomy|        null|              null|                 1.0|
+------------+------------+------------------+--------------------+
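
The .corr(...) method computes only the Pearson coefficient for a single pair of columns, hence the Python loop above. On Spark 2.2+, pyspark.ml can return the full matrix in one pass; a sketch under that assumption:

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

# pack the numeric features into a single vector column
assembler = VectorAssembler(inputCols=features, outputCol='corr_features')
vector_df = assembler.transform(no_outliers).select('corr_features')

# a DenseMatrix of all pairwise Pearson correlations
Correlation.corr(vector_df, 'corr_features').head()[0]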

Drawing histograms

histogram_MPG = (
    no_outliers
    .select('FuelEconomy')
    .rdd
    .flatMap(lambda record: record)
    .histogram(5)
)

for i in histogram_MPG:
    print(i)
    
histogram_MPG

[8.974491029015983, 12.379592823212786, 15.78469461740959, 19.189796411606395, 22.594898205803197, 26.0]
[2, 0, 5, 4, 5]
([8.974491029015983, 12.379592823212786, 15.78469461740959, 19.189796411606395, 22.594898205803197, 26.0], [2, 0, 5, 4, 5])
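
The RDD's .histogram(...) method returns a pair of lists: six bucket boundaries and five counts when asked for five evenly spaced buckets. It also accepts explicit boundaries; a sketch with made-up edges:

(
    no_outliers
    .select('FuelEconomy')
    .rdd
    .flatMap(lambda record: record)
    .histogram([5.0, 10.0, 15.0, 20.0, 25.0, 30.0, 50.0])
)
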
for i in range(len(histogram_MPG[0])-1):
    print('[' + str(round(histogram_MPG[0][i],2))
        + ',' + str(round(histogram_MPG[0][i+1],2))
          + ')'
         )

[8.97,12.38)
[12.38,15.78)
[15.78,19.19)
[19.19,22.59)
[22.59,26.0)
(
    spark
    .createDataFrame(
        [(bins, counts) 
         for bins, counts 
         in zip(
             histogram_MPG[0], 
             histogram_MPG[1]
         )]
        , ['bins', 'counts']
    )
    .createOrReplaceTempView('histogram_MPG')
)

%%sql -o hist_MPG -q
SELECT * FROM histogram_MPG

%%local
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

fig = plt.figure(figsize=(12,9))
ax = fig.add_subplot(1, 1, 1)
ax.bar(hist_MPG['bins'], hist_MPG['counts'], width=3)
ax.set_title('Histogram of fuel economy')

Text(0.5,1,'Histogram of fuel economy')


[Figure: histogram of fuel economy]

%%local
from bokeh.io import show
from bokeh.plotting import figure
from bokeh.io import output_notebook
output_notebook()

labels = [str(round(e, 2)) for e in hist_MPG['bins']]

p = figure(
    x_range=labels, 
    plot_height=350, 
    title='Histogram of fuel economy'
)

p.vbar(x=labels, top=hist_MPG['counts'], width=0.9)

show(p)


Visualizing interactions between features

scatter = (
    no_outliers
    .select('Displacement', 'Cylinders')
)

scatter.createOrReplaceTempView('scatter')

%%sql -o scatter_source -q
SELECT * FROM scatter

%%local
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

fig = plt.figure(figsize=(12,9))
ax = fig.add_subplot(1, 1, 1)
ax.scatter(
      list(scatter_source['Cylinders'])
    , list(scatter_source['Displacement'])
    , s = 200
    , alpha = 0.5
)

ax.set_xlabel('Cylinders')
ax.set_ylabel('Displacement')

ax.set_title('Relationship between cylinders and displacement')

Text(0.5,1,'Relationship between cylinders and displacement')


[Figure: relationship between cylinders and displacement]

%%local 
from bokeh.io import show
from bokeh.plotting import figure
from bokeh.io import output_notebook
output_notebook()
p = figure(title = 'Relationship between cylinders and displacement')
p.xaxis.axis_label = 'Cylinders'
p.yaxis.axis_label = 'Displacement'

p.circle(  list(scatter_source['Cylinders'])
         , list(scatter_source['Displacement'])
         , fill_alpha=0.2, size=10)

show(p)
