Spark高级数据分析 · 2 数据分析

wget https://archive.ics.uci.edu/ml/machine-learning-databases/00210/donation.zip

数据清洗

cd /Users/erichan/garden/spark-1.6.0-bin-hadoop2.6/bin
./spark-shell --master local

// Path to the UCI record-linkage dataset (unzipped CSV block files).
val data ="/Users/erichan/AliDrive/ml_spark/data/linkage"
// Load every file under the directory as an RDD of text lines (sc is the spark-shell SparkContext).
val rawblocks = sc.textFile(data)
//rawblocks.count()
//res0: Long = 6552407
//val head = rawblocks.take(10)

// Each block file repeats the CSV header; drop any line containing the first header field "id_1".
val noheader = rawblocks.filter(l => !l.contains("id_1"))

// Sanity check: print any remaining lines that still look like a header row (expect none).
noheader.filter(l => l.contains("cmp_fname_c1")).foreach(println)

//noheader.count()
//res1: Long = 6552396

/**
 * One parsed record-linkage row: the two candidate record ids,
 * the nine comparison scores (NaN where the field was "?"),
 * and whether the pair is a known match.
 */
case class MatchData(
  id1: Int,
  id2: Int,
  scores: Array[Double],
  matched: Boolean
)

/** Converts a CSV field to Double, mapping the dataset's "?" missing-value marker to NaN. */
def toDouble(s: String): Double =
  if (s == "?") Double.NaN else s.toDouble
/**
 * Parses one comma-separated linkage record into a MatchData:
 * fields 0-1 are the record ids, fields 2-10 the nine comparison
 * scores ("?" becomes NaN via toDouble), field 11 the match flag.
 */
def parse(line: String) = {
  val fields = line.split(',')
  MatchData(
    fields(0).toInt,
    fields(1).toInt,
    fields.slice(2, 11).map(toDouble),
    fields(11).toBoolean
  )
}
// Parse every data line into a structured MatchData record.
val parsed = noheader.map(parse)

// Count how many candidate pairs are matches vs. non-matches.
val matchCounts = parsed.map(_.matched).countByValue()
// Materialize the counts as a local sequence for sorting/inspection.
val matchCountsSeq = matchCounts.toSeq

import java.lang.Double.isNaN
// Per-score-column summary statistics (count/mean/stdev/min/max via
// DoubleRDDFunctions.stats()), skipping NaN entries from missing fields.
// NOTE: one full pass over the data per column (9 passes total).
val stats = (0 until 9).map { i =>
  parsed.map(md => md.scores(i)).filter(d => !isNaN(d)).stats()
}
posted @ 2016-03-25 20:53  开心玩数据  阅读(733)  评论(0编辑  收藏  举报