Spark2 探索性数据统计分析
data 数据源,请参考我的博客:http://www.cnblogs.com/wwxbi/p/6063613.html
import org.apache.spark.sql.DataFrameStatFunctions
import org.apache.spark.sql.functions._
相关系数
val df = Range(0,10,step=1).toDF("id").withColumn("rand1", rand(seed=10)).withColumn("rand2", rand(seed=27))
df: org.apache.spark.sql.DataFrame = [id: int, rand1: double ... 1 more field]

df.show
+---+-------------------+-------------------+
| id|              rand1|              rand2|
+---+-------------------+-------------------+
|  0|0.41371264720975787|  0.714105256846827|
|  1| 0.7311719281896606| 0.8143487574232506|
|  2| 0.9031701155118229| 0.5282207324381174|
|  3|0.09430205113458567| 0.4420100497826609|
|  4|0.38340505276222947| 0.9387162206758006|
|  5| 0.5569246135523511| 0.6398126862647711|
|  6| 0.4977441406613893| 0.9895498513115722|
|  7| 0.2076666106201438| 0.3398720242725498|
|  8| 0.9571919406508957|0.15042237695815963|
|  9| 0.7429395461204413| 0.7302723457066639|
+---+-------------------+-------------------+

df.stat.corr("rand1", "rand2", "pearson")
res24: Double = -0.10993962467082698
查看数据的统计分布情况
val colArray = Array("age", "yearsmarried", "religiousness", "education", "occupation", "rating")

// 查看数据的统计分布情况
val descrDF = data.describe("age", "yearsmarried", "religiousness", "education", "occupation", "rating")
descrDF: org.apache.spark.sql.DataFrame = [summary: string, age: string ... 5 more fields]

descrDF.selectExpr("summary", "round(age,2) as age", "round(yearsmarried,2) as yearsmarried", "round(religiousness,2) as religiousness", "round(education,2) as education", "round(occupation,2) as occupation", "round(rating,2) as rating").show(10, truncate = false)
+-------+-----+------------+-------------+---------+----------+------+
|summary|age  |yearsmarried|religiousness|education|occupation|rating|
+-------+-----+------------+-------------+---------+----------+------+
|count  |601.0|601.0       |601.0        |601.0    |601.0     |601.0 |
|mean   |32.49|8.18        |3.12         |16.17    |4.19      |3.93  |
|stddev |9.29 |5.57        |1.17         |2.4      |1.82      |1.1   |
|min    |17.5 |0.13        |1.0          |9.0      |1.0       |1.0   |
|max    |57.0 |15.0        |5.0          |20.0     |7.0       |5.0   |
+-------+-----+------------+-------------+---------+----------+------+
统计字段中元素的个数
// 统计字段中元素的个数
val fi = data.stat.freqItems(colArray)
fi: org.apache.spark.sql.DataFrame = [age_freqItems: array<double>, yearsmarried_freqItems: array<double> ... 4 more fields]

fi.printSchema()
root
 |-- age_freqItems: array (nullable = true)
 |    |-- element: double (containsNull = false)
 |-- yearsmarried_freqItems: array (nullable = true)
 |    |-- element: double (containsNull = false)
 |-- religiousness_freqItems: array (nullable = true)
 |    |-- element: double (containsNull = false)
 |-- education_freqItems: array (nullable = true)
 |    |-- element: double (containsNull = false)
 |-- occupation_freqItems: array (nullable = true)
 |    |-- element: double (containsNull = false)
 |-- rating_freqItems: array (nullable = true)
 |    |-- element: double (containsNull = false)

val f = fi.selectExpr(
     | "size(age_freqItems)",
     | "size(yearsmarried_freqItems)",
     | "size(religiousness_freqItems)",
     | "size(education_freqItems)",
     | "size(occupation_freqItems)",
     | "size(rating_freqItems)")
f: org.apache.spark.sql.DataFrame = [size(age_freqItems): int, size(yearsmarried_freqItems): int ... 4 more fields]

f.show(10, truncate = false)
+-------------------+----------------------------+-----------------------------+-------------------------+--------------------------+----------------------+
|size(age_freqItems)|size(yearsmarried_freqItems)|size(religiousness_freqItems)|size(education_freqItems)|size(occupation_freqItems)|size(rating_freqItems)|
+-------------------+----------------------------+-----------------------------+-------------------------+--------------------------+----------------------+
|9                  |8                           |5                            |7                        |7                         |5                     |
+-------------------+----------------------------+-----------------------------+-------------------------+--------------------------+----------------------+
集合字段的元素
// 集合字段的元素
val f1 = data.stat.freqItems(Array("age", "yearsmarried", "religiousness"))
f1: org.apache.spark.sql.DataFrame = [age_freqItems: array<double>, yearsmarried_freqItems: array<double> ... 1 more field]

f1.show(10, truncate = false)
+------------------------------------------------------+-----------------------------------------------+-------------------------+
|age_freqItems                                         |yearsmarried_freqItems                         |religiousness_freqItems  |
+------------------------------------------------------+-----------------------------------------------+-------------------------+
|[32.0, 47.0, 22.0, 52.0, 37.0, 17.5, 27.0, 57.0, 42.0]|[0.75, 0.125, 1.5, 0.417, 4.0, 7.0, 10.0, 15.0]|[2.0, 5.0, 4.0, 1.0, 3.0]|
+------------------------------------------------------+-----------------------------------------------+-------------------------+

// 对数组的元素排序
f1.selectExpr("sort_array(age_freqItems)", "sort_array(yearsmarried_freqItems)", "sort_array(religiousness_freqItems)").show(10, truncate = false)
+------------------------------------------------------+-----------------------------------------------+-----------------------------------------+
|sort_array(age_freqItems, true)                       |sort_array(yearsmarried_freqItems, true)       |sort_array(religiousness_freqItems, true)|
+------------------------------------------------------+-----------------------------------------------+-----------------------------------------+
|[17.5, 22.0, 27.0, 32.0, 37.0, 42.0, 47.0, 52.0, 57.0]|[0.125, 0.417, 0.75, 1.5, 4.0, 7.0, 10.0, 15.0]|[1.0, 2.0, 3.0, 4.0, 5.0]                |
+------------------------------------------------------+-----------------------------------------------+-----------------------------------------+

// 集合字段的元素
val f2 = data.stat.freqItems(Array("education", "occupation", "rating"))
f2: org.apache.spark.sql.DataFrame = [education_freqItems: array<double>, occupation_freqItems: array<double> ... 1 more field]

f2.show(10, truncate = false)
+-----------------------------------------+-----------------------------------+-------------------------+
|education_freqItems                      |occupation_freqItems               |rating_freqItems         |
+-----------------------------------------+-----------------------------------+-------------------------+
|[17.0, 20.0, 14.0, 16.0, 9.0, 18.0, 12.0]|[2.0, 5.0, 4.0, 7.0, 1.0, 3.0, 6.0]|[2.0, 5.0, 4.0, 1.0, 3.0]|
+-----------------------------------------+-----------------------------------+-------------------------+

// 对数组的元素排序
f2.selectExpr("sort_array(education_freqItems)", "sort_array(occupation_freqItems)", "sort_array(rating_freqItems)").show(10, truncate = false)
+-----------------------------------------+--------------------------------------+----------------------------------+
|sort_array(education_freqItems, true)    |sort_array(occupation_freqItems, true)|sort_array(rating_freqItems, true)|
+-----------------------------------------+--------------------------------------+----------------------------------+
|[9.0, 12.0, 14.0, 16.0, 17.0, 18.0, 20.0]|[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]   |[1.0, 2.0, 3.0, 4.0, 5.0]         |
+-----------------------------------------+--------------------------------------+----------------------------------+