Spark & Jupyter Notebook
In [3]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
sparkconf = SparkConf().setAppName("myspark")
# Alternatively, attach to a specific standalone master instead of the default:
#spark = SparkSession.builder.master("spark://localhost:7077").appName("test").getOrCreate()
spark = SparkSession.builder.config(conf=sparkconf).getOrCreate()
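Additional runtime settings can be chained onto the same kind of SparkConf before the session is built. A minimal sketch; the memory and core values are assumptions, not settings taken from this cluster:

# Hypothetical resource tuning via SparkConf (values are placeholders)
tunedconf = (SparkConf()
             .setAppName("myspark")
             .set("spark.executor.memory", "2g")
             .set("spark.executor.cores", "2"))
# spark = SparkSession.builder.config(conf=tunedconf).getOrCreate()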
In [4]:
print(spark)
<pyspark.sql.session.SparkSession object at 0x7f82ba80c978>
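One way to verify which settings the running session actually picked up is to read them back from the SparkContext:

# Inspect the effective configuration of the live session
print(spark.version)
print(spark.sparkContext.getConf().getAll())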
In [12]:
!ls ./data
sample1.csv sample.csv
In [7]:
!whoami
root
In [8]:
!klist
Ticket cache: FILE:/tmp/krb5cc_0
Default principal: weirenjie@FASTRETAILING-PRO.CN

Valid starting       Expires              Service principal
05/29/2019 10:54:45  05/30/2019 10:54:45  krbtgt/FASTRETAILING-PRO.CN@FASTRETAILING-PRO.CN
	renew until 06/05/2019 10:54:45
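The ticket above is valid for one day. If it lapses, it can be renewed from inside the notebook with kinit; the keytab path below is an assumption for illustration:

# Hypothetical keytab location -- replace with the real path
!kinit -kt /path/to/weirenjie.keytab weirenjie@FASTRETAILING-PRO.CN
!klist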
In [9]:
!hdfs dfs -ls /
Found 22 items
drwxrwxrwx   - hive     supergroup          0 2018-11-29 17:54 /app
drwxr-x--x   - nifi     nifi                0 2018-12-18 16:17 /archive
drwx--x--x   - hbase    supergroup          0 2019-02-25 12:04 /data
drwxr-x--x   - nifi     nifi                0 2018-12-18 16:18 /etl
drwxrwxrwx   - kettle   hive                0 2019-01-21 11:05 /event_tracking
drwxr-x--x   - kettle   hive                0 2018-12-13 10:57 /event_tracking_hk
drwxr-x--x   - kettle   hive                0 2018-10-24 14:19 /ftp
drwxr-x--x   - kettle   hive                0 2018-11-22 13:38 /ftp_hk
drwx------   - hbase    hbase               0 2019-03-14 19:28 /hbase
drwxrwxrwx   - insight  supergroup          0 2018-12-17 14:21 /home
drwxr-x--x   - kettle   hive                0 2019-01-21 11:06 /kafkadata
drwxr-x--x   - kettle   hive                0 2018-12-13 10:57 /kafkadata_hk
drwxrwxrwx   - nifi     nifi                0 2018-12-18 16:18 /model.db
drwxrwxrwx   - hdfs     supergroup          0 2019-05-15 22:55 /system
drwxrwxrwt   - hdfs     supergroup          0 2019-04-01 14:01 /tmp
drwxr-x--x   - uat-ecbi hive                0 2019-01-10 13:51 /uat_event_tracking
drwxr-x--x   - uat-ecbi hive                0 2019-01-18 14:12 /uat_event_tracking_hk
drwxr-x--x   - uat-ecbi hive                0 2019-01-08 17:19 /uat_ftp
drwxr-x--x   - uat-ecbi hive                0 2019-01-18 14:13 /uat_ftp_hk
drwxr-x--x   - uat-ecbi hive                0 2019-01-10 13:41 /uat_kafkadata
drwxr-x--x   - uat-ecbi hive                0 2019-01-18 14:13 /uat_kafkadata_hk
drwxr-xr-x   - hdfs     supergroup          0 2019-05-27 10:56 /user
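With a valid ticket, spark.read can also target an HDFS path directly instead of the local ./data directory. A sketch only; the HDFS path below is hypothetical:

# Hypothetical HDFS location; schema is inferred since none is defined yet
hdfsDF = (spark.read
          .option('header', 'false')
          .option('inferSchema', 'true')
          .csv('hdfs:///tmp/sample.csv'))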
In [28]:
from pyspark.sql.types import *
inputPath = "./data/"
# Since we know the data format already, let's define the schema to speed up processing (no need for Spark to infer schema)
csvSchema = StructType([
    StructField("plu", StringType(), True),
    StructField("sum_cd", IntegerType(), True),
    StructField("price", IntegerType(), True),
    StructField("gdept", StringType(), True)
])
# Static DataFrame representing the data in the CSV files
staticInputDF = (
    spark
    .read
    .format('csv')
    .schema(csvSchema)
    .option('header', 'false')
    .load(inputPath)
)
staticInputDF.show()
+-------------+-------+-----+-----+
|          plu| sum_cd|price|gdept|
+-------------+-------+-----+-----+
|2000011896645|5070756|   99|   24|
|2000011896645|3815352|  199|   25|
|2000011896645|4542009|   99|   24|
|2000011896645|5454821|   39|   37|
|2000011896645|5507537|  259|   23|
|2000011896645|5507538|  259|   23|
|2000011896645|5507540|  259|   23|
|2000011896645|5454813|   39|   37|
|2000011896645|3815606|  299|   25|
|2000011896645|4245109|  199|   24|
|2000011896645|5070770|   99|   24|
|2000011896645|4245107|  199|   24|
|2000011896645|4031104|  199|   24|
|2000011896645|5126211|   39|   34|
|2000011896645|4031094|  199|   24|
|2000011896645|4245106|  199|   24|
|2000011896645|5070765|   99|   24|
|2000011896645|4245104|  199|   24|
|2000011896645|5070754|   99|   24|
|2000011896645|4456272|   99|   34|
+-------------+-------+-----+-----+
only showing top 20 rows
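With the schema applied, ordinary DataFrame aggregations work directly on staticInputDF. A sketch (not output from this run) counting rows and averaging price per gdept:

from pyspark.sql import functions as F
# Per-department row count and average price
(staticInputDF
 .groupBy("gdept")
 .agg(F.count("*").alias("rows"), F.avg("price").alias("avg_price"))
 .show())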
In [30]:
staticInputDF.createOrReplaceTempView("static_counts")
In [33]:
#%sql select * from static_counts
dataFrame = spark.sql("select count(*) from static_counts")
dataFrame.show()
+--------+
|count(1)|
+--------+
|  649998|
+--------+
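The same total should be reachable without going through the temp view, straight from the DataFrame API:

# Equivalent count via the DataFrame API
print(staticInputDF.count())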