在最新的master分支上官方提供了Spark JDBC外部数据源的实现,先尝为快。
通过spark-shell测试:
import org.apache.spark.sql.SQLContext val sqlContext = new SQLContext(sc) import sqlContext._ val TBLS_JDBC_DDL = s""" |CREATE TEMPORARY TABLE spark_tbls |USING org.apache.spark.sql.jdbc |OPTIONS ( | url 'jdbc:mysql://hadoop000:3306/hive?user=root&password=root', | dbtable 'TBLS' |)""".stripMargin sqlContext.sql(TBLS_JDBC_DDL)
指定列查询:
sql("SELECT * FROM spark_tbls").collect.foreach(println)
[1,1423100397,1,0,spark,0,1,page_views,MANAGED_TABLE,A,D] [6,1423116106,1,0,spark,0,6,order_created,MANAGED_TABLE,B,E] [7,1423116131,1,0,spark,0,7,test_load1,MANAGED_TABLE,C,F] [8,1423116145,1,0,spark,0,8,order_picked,MANAGED_TABLE,null,null] [9,1423116160,1,0,spark,0,9,order_shipped,MANAGED_TABLE,null,null] [10,1423116168,1,0,spark,0,10,order_received,MANAGED_TABLE,null,null] [11,1423116179,1,0,spark,0,11,order_cancelled,MANAGED_TABLE,null,null] [12,1423116193,1,0,spark,0,12,order_tracking,MANAGED_TABLE,null,null] [13,1423116248,1,0,spark,0,13,order_tracking_join,MANAGED_TABLE,null,null] [14,1423116298,1,0,spark,0,14,click_log,MANAGED_TABLE,null,null] [15,1423116316,1,0,spark,0,15,ad_list,MANAGED_TABLE,null,null][16,1423116324,1,0,spark,0,16,ad_list_string,MANAGED_TABLE,null,null] [17,1423116338,1,0,spark,0,17,cookie_cats,MANAGED_TABLE,null,null]
查询表中指定列:
sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls").collect.foreach(println)
[1,page_views,MANAGED_TABLE] [6,order_created,MANAGED_TABLE] [7,test_load1,MANAGED_TABLE] [8,order_picked,MANAGED_TABLE] [9,order_shipped,MANAGED_TABLE] [10,order_received,MANAGED_TABLE] [11,order_cancelled,MANAGED_TABLE] [12,order_tracking,MANAGED_TABLE] [13,order_tracking_join,MANAGED_TABLE] [14,click_log,MANAGED_TABLE] [15,ad_list,MANAGED_TABLE] [16,ad_list_string,MANAGED_TABLE] [17,cookie_cats,MANAGED_TABLE]
指定查询条件查询:
sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID = 1").collect.foreach(println) [1,page_views,MANAGED_TABLE] sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID < 7").collect.foreach(println) [1,page_views,MANAGED_TABLE] [6,order_created,MANAGED_TABLE] sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID <= 7").collect.foreach(println) [1,page_views,MANAGED_TABLE] [6,order_created,MANAGED_TABLE] [7,test_load1,MANAGED_TABLE] sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID > 7").collect.foreach(println) [8,order_picked,MANAGED_TABLE] [9,order_shipped,MANAGED_TABLE] [10,order_received,MANAGED_TABLE] [11,order_cancelled,MANAGED_TABLE] [12,order_tracking,MANAGED_TABLE] [13,order_tracking_join,MANAGED_TABLE] [14,click_log,MANAGED_TABLE] [15,ad_list,MANAGED_TABLE] [16,ad_list_string,MANAGED_TABLE] [17,cookie_cats,MANAGED_TABLE] sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE FROM spark_tbls WHERE TBL_ID >= 7").collect.foreach(println) [7,test_load1,MANAGED_TABLE] [8,order_picked,MANAGED_TABLE] [9,order_shipped,MANAGED_TABLE] [10,order_received,MANAGED_TABLE] [11,order_cancelled,MANAGED_TABLE] [12,order_tracking,MANAGED_TABLE] [13,order_tracking_join,MANAGED_TABLE] [14,click_log,MANAGED_TABLE] [15,ad_list,MANAGED_TABLE] [16,ad_list_string,MANAGED_TABLE] [17,cookie_cats,MANAGED_TABLE] sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT FROM spark_tbls WHERE VIEW_EXPANDED_TEXT IS NULL").collect.foreach(println) [8,order_picked,MANAGED_TABLE,null] [9,order_shipped,MANAGED_TABLE,null] [10,order_received,MANAGED_TABLE,null] [11,order_cancelled,MANAGED_TABLE,null] [12,order_tracking,MANAGED_TABLE,null] [13,order_tracking_join,MANAGED_TABLE,null] [14,click_log,MANAGED_TABLE,null] [15,ad_list,MANAGED_TABLE,null] [16,ad_list_string,MANAGED_TABLE,null] [17,cookie_cats,MANAGED_TABLE,null] sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT FROM spark_tbls WHERE VIEW_EXPANDED_TEXT IS NOT NULL").collect.foreach(println) [1,page_views,MANAGED_TABLE,A] [6,order_created,MANAGED_TABLE,B] [7,test_load1,MANAGED_TABLE,C] sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT FROM spark_tbls WHERE TBL_ID>=7 AND TBL_ID <=10").collect.foreach(println) [7,test_load1,MANAGED_TABLE,C] [8,order_picked,MANAGED_TABLE,null] [9,order_shipped,MANAGED_TABLE,null] [10,order_received,MANAGED_TABLE,null]
多partition并行执行: 可以通过http://hadoop000:4040/jobs/的tasks数查看
val TBLS_PARTS_JDBC_DDL = s""" |CREATE TEMPORARY TABLE spark_tbls_parts |USING org.apache.spark.sql.jdbc |OPTIONS ( | url 'jdbc:mysql://hadoop000:3306/hive?user=root&password=root', | dbtable 'TBLS', | partitionColumn 'TBL_ID', | lowerBound '1', | upperBound '50', | numPartitions '3' |)""".stripMargin sqlContext.sql(TBLS_PARTS_JDBC_DDL) sql("SELECT TBL_ID,TBL_NAME,TBL_TYPE,VIEW_EXPANDED_TEXT FROM spark_tbls_parts WHERE VIEW_EXPANDED_TEXT IS NULL").collect.foreach(println)
[8,order_picked,MANAGED_TABLE,null] [9,order_shipped,MANAGED_TABLE,null] [10,order_received,MANAGED_TABLE,null] [11,order_cancelled,MANAGED_TABLE,null] [12,order_tracking,MANAGED_TABLE,null] [13,order_tracking_join,MANAGED_TABLE,null] [14,click_log,MANAGED_TABLE,null] [15,ad_list,MANAGED_TABLE,null] [16,ad_list_string,MANAGED_TABLE,null] [17,cookie_cats,MANAGED_TABLE,null] [21,emp,MANAGED_TABLE,null] [22,dept,MANAGED_TABLE,null]
多表关联查询:
val COLUMNS_V2_JDBC_DDL = s""" |CREATE TEMPORARY TABLE spark_column_v2 |USING org.apache.spark.sql.jdbc |OPTIONS ( | url 'jdbc:mysql://hadoop000:3306/hive?user=root&password=root', | dbtable 'COLUMNS_V2' |)""".stripMargin sqlContext.sql(COLUMNS_V2_JDBC_DDL) sql("SELECT CD_ID, COLUMN_NAME FROM spark_column_v2").collect.foreach(println)
[1,city_id] [1,end_user_id] [1,ip] [1,referer] [1,session_id] [1,track_time] [1,url] [6,event_time] [6,ordernumber] [7,id] [7,name] [8,event_time] [8,ordernumber] [9,event_time] [9,ordernumber] [10,event_time] [10,ordernumber] [11,event_time] [11,ordernumber] [12,order_cancelled_ts] [12,order_created_ts] [12,order_picked_ts] [12,order_received_ts] [12,order_shipped_ts] [12,ordernumber] [13,order_cancelled_ts] [13,order_created_ts] [13,order_picked_ts] [13,order_received_ts] [13,order_shipped_ts] [13,ordernumber] [14,ad_id] [14,cookie_id] [14,ts] [15,ad_id] [15,catalogs] [15,url] [16,ad_id] [16,catalogs] [16,url] [17,catalog] [17,cookie_id] [17,weight] [21,comm] [21,deptno] [21,empno] [21,ename] [21,hiredate] [21,job] [21,mgr] [21,sal] [22,deptno] [22,dname] [22,loc] sql("SELECT a.TBL_ID, a.TBL_NAME, a.TBL_TYPE, b.CD_ID, b.COLUMN_NAME FROM spark_tbls a join spark_column_v2 b on a.TBL_ID = b.CD_ID WHERE a.TBL_ID = 1").collect.foreach(println)
[1,page_views,MANAGED_TABLE,1,city_id] [1,page_views,MANAGED_TABLE,1,end_user_id] [1,page_views,MANAGED_TABLE,1,ip] [1,page_views,MANAGED_TABLE,1,referer] [1,page_views,MANAGED_TABLE,1,session_id] [1,page_views,MANAGED_TABLE,1,track_time] [1,page_views,MANAGED_TABLE,1,url] sql("SELECT a.TBL_ID, COUNT(b.CD_ID) FROM spark_tbls a join spark_column_v2 b on a.TBL_ID = b.CD_ID GROUP BY a.TBL_ID").collect.foreach(println)
[1,7] [6,2] [7,2] [8,2] [9,2] [10,2] [11,2] [12,6] [13,6] [14,3] [15,3] [16,3] [17,3] [21,8] [22,3]
通过spark-sql测试:
CREATE TEMPORARY TABLE spark_tbls USING org.apache.spark.sql.jdbc OPTIONS ( url 'jdbc:mysql://hadoop000:3306/hive?user=root&password=root', dbtable 'TBLS' ); SELECT * FROM spark_tbls;
CREATE TEMPORARY TABLE spark_tbls_parts USING org.apache.spark.sql.jdbc OPTIONS ( url 'jdbc:mysql://hadoop000:3306/hive?user=root&password=root', dbtable 'TBLS', partitionColumn 'TBL_ID', lowerBound '1', upperBound '50', numPartitions '3' ); SELECT * FROM spark_tbls_parts;
CREATE TEMPORARY TABLE spark_column_v2 USING org.apache.spark.sql.jdbc OPTIONS ( url 'jdbc:mysql://hadoop000:3306/hive?user=root&password=root', dbtable 'COLUMNS_V2' ); select * from spark_column_v2; SELECT a.TBL_ID, a.TBL_NAME, a.TBL_TYPE, b.CD_ID, b.COLUMN_NAME FROM spark_tbls a join spark_column_v2 b on a.TBL_ID = b.CD_ID WHERE a.TBL_ID = 1