spark教程-1

scala基本操作

scala> val input=sc.textFile("C:\\Users\\gwj\\Desktop\\cont.txt")
input: org.apache.spark.rdd.RDD[String] = C:\Users\gwj\Desktop\cont.txt MapPartitionsRDD[3] at textFile at <console>:23

scala> input.count()
res2: Long = 129

scala> val rdd = sc.parallelize(Array(1,2,2,4),4)
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[4] at parallelize at <console>:23

scala> rdd.count()
res3: Long = 4

scala> rdd.foreach(print)
2241
scala> rdd.foreach(println)
2
1
2
4
scala> val lines2=input.filter(line=>line.contains("知乎"))
lines2: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[5] at filter at <console>:23
scala> lines2.foreach(println)
有没有大佬知道现在什么免费的虚拟机好用啊啊? - 知乎
病毒会不会逃出虚拟机? - 知乎
大佬们,主要的开源虚拟化平台都有哪些啊。? - 知乎
虚拟机 VMware 和 VirtualBox 哪个更好用? - 知乎
推荐一个免费好用的虚拟机软件 - 知乎
虚拟机 VMware 和 VirtualBox 哪个更好用? - 知乎
VirtualBox - 知乎
知乎专栏
推荐一个免费好用的虚拟机软件 - 知乎
VMware下载、安装、卸载、使用 - 知乎

scala> val lines=sc.parallelize(Array("Hello","Spark","Hello","World"))
lines: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[7] at parallelize at <console>:23

scala> lines.foreach(println)
World
Hello
Spark
Hello

scala> val lines2=lines.map(word=>(word,1))
lines2: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[8] at map at <console>:23

scala> lines2.foreach(println)
(Spark,1)
(Hello,1)
(Hello,1)
(World,1)

scala> val lines3=lines.filter(word=>word.contains("ll"))
lines3: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[9] at filter at <console>:23

scala> lines3.foreach(print)
HelloHello
scala> lines3.foreach(println)
Hello
Hello

scala> val line4=lines.flatMap(word=>word.split(" "))
line4: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[10] at flatMap at <console>:23

scala> line4.foreach(prirnt)
<console>:24: error: not found: value prirnt
       line4.foreach(prirnt)
                     ^

scala> line4.foreach(print)
HelloSparkWorldHello

scala> line4.foreach(print)
HelloSparkWorldHello
scala> line4.foreach(println)
World
Hello
Hello
Spark

scala> val line5=line4.map(word=>(word,1))
line5: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[11] at map at <console>:23

scala> line5.foreach(println)
(World,1)
(Spark,1)
(Hello,1)
(Hello,1)

scala> val line6=sc.parallelize(Array("a","a","b","c"))
line6: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[12] at parallelize at <console>:23

scala> line6.foreach(println)
b
a
c
a

scala> val line7=line6.distinct()
line7: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[15] at distinct at <console>:23

scala> line7.foreach(println)
b
c
a

scala> val line8=line7.intersection(line6)
line8: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[21] at intersection at <console>:24

scala> line8.foreach(println)
c
a
b

scala> val line9=line7.subtract(line6)
line9: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[25] at subtract at <console>:24

scala> line9.foreach(println)

scala> val line10=line6.subtract(line7)
line10: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[29] at subtract at <console>:24

scala> line10.foreach(println)
posted @ 2023-08-31 16:39  aondw  阅读(7)  评论(0)  编辑  收藏  举报