Reading and writing MySQL with Spark RDDs
1. Reading MySQL data. When reading from MySQL with JdbcRDD you must pass lower and upper bounds for the query; both bounds are of type Long. A numeric primary key column is usually the best choice for the bounding column; if you bound on a time column instead, convert its values to Unix timestamps (a primary-key sketch follows the demo code below).
Reference demo: https://github.com/asker124143222/spark-demo
package com.home.spark

import java.sql.DriverManager
import java.time.{LocalDateTime, ZoneOffset}

import org.apache.spark.rdd.{JdbcRDD, RDD}
import org.apache.spark.{SparkConf, SparkContext}

object Ex_mysql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf(true).setMaster("local[*]").setAppName("spark mysql demo")
    val sc = new SparkContext(conf)

    val driverClassName = "com.mysql.jdbc.Driver"
    val url = "jdbc:mysql://localhost:3306/busdata?characterEncoding=utf8&useSSL=false"
    val user = "root"
    val password = "root"

    // createTime is a DATETIME column in MySQL; the two bound parameters are passed in as Unix timestamps
    val sql = "select userId,userName,name from user where createTime > from_unixtime(?) and createTime < from_unixtime(?)"

    val connection = () => {
      Class.forName(driverClassName)
      DriverManager.getConnection(url, user, password)
    }

    val startTime = LocalDateTime.of(2018, 11, 3, 0, 0, 0)
    val endTime = LocalDateTime.of(2018, 11, 4, 0, 0)

    // MySQL Unix timestamps are in seconds (10 digits); reduce Java's 13-digit millisecond timestamps by dividing by 1000
    val startTimeStamp = startTime.toInstant(ZoneOffset.ofHours(8)).toEpochMilli / 1000
    val endTimeStamp = endTime.toInstant(ZoneOffset.ofHours(8)).toEpochMilli / 1000

    println("startTime: " + startTime + ", endTime: " + endTime)
    println("startTime: " + startTimeStamp + ", endTime: " + endTimeStamp)

    // read
    val result: JdbcRDD[(Int, String, String)] = new JdbcRDD[(Int, String, String)](
      sc,
      connection,
      sql,
      startTimeStamp,
      endTimeStamp,
      2,
      rs => {
        val userId = rs.getInt(1)
        val userName = rs.getString(2)
        val name = rs.getString(3)
        // println(s"id:${userId},userName:${userName},name:${name}")
        (userId, userName, name)
      }
    )

    result.collect().foreach(println)

    sc.stop()
  }
}
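For the primary-key case mentioned in the introduction, a minimal sketch (reusing sc and the connection factory from the demo above) could look like the following. The userId range of 1 to 10000, the partition count of 4, and the column names are assumptions made for illustration only. JdbcRDD splits the [lowerBound, upperBound] range into numPartitions sub-ranges and substitutes each sub-range into the two ? placeholders, so every partition issues its own bounded query.

    // Sketch: bound directly on a numeric primary key instead of a timestamp.
    // The range 1..10000 and the 4 partitions are illustrative assumptions.
    val byKey: JdbcRDD[(Int, String)] = new JdbcRDD[(Int, String)](
      sc,
      connection,
      "select userId, userName from user where userId >= ? and userId <= ?",
      1L,      // lowerBound (assumed smallest userId)
      10000L,  // upperBound (assumed largest userId)
      4,       // numPartitions: the key range is split into 4 sub-ranges
      rs => (rs.getInt(1), rs.getString(2))
    )
    byKey.collect().foreach(println)

Because the bounds must be Long values, any non-numeric column has to be mapped to a number first, which is exactly why the demo converts createTime to a Unix timestamp.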
2. Writing to MySQL. To reduce the number of connections created, write with foreachPartition rather than foreach, so each partition opens one connection instead of one per record.
package com.home.spark

import java.sql.{DriverManager, PreparedStatement}
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

object Ex_mysql2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf(true).setMaster("local[*]").setAppName("spark mysql demo")
    val sc = new SparkContext(conf)

    val driverClassName = "com.mysql.jdbc.Driver"
    val url = "jdbc:mysql://localhost:3306/busdata?characterEncoding=utf8&useSSL=false"
    val user = "root"
    val password = "root"

    // build test rows to write
    val logBuffer = mutable.ListBuffer[(String, String, String, String, String, String)]()
    // use HH (24-hour clock), not hh, so afternoon times are formatted correctly
    val ofPattern = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
    for (i <- 1 to 100) {
      logBuffer += (("write" + i, "write test " + i, "localhost" + i,
        LocalDateTime.now().format(ofPattern), "spark", LocalDateTime.now().format(ofPattern)))
    }
    // logBuffer.foreach(println)

    val logRDD: RDD[(String, String, String, String, String, String)] = sc.makeRDD(logBuffer)

    // To reduce the number of connections created, use foreachPartition instead of foreach.
    // Drawback: any approach that hands a whole partition's iterator to one function carries OOM risk.
    logRDD.foreachPartition(logData => {
      Class.forName(driverClassName)
      val connection = DriverManager.getConnection(url, user, password)
      val sql = "insert into syslog(action, event, host, insertTime, userName, update_Time) values(?,?,?,?,?,?)"
      val statement: PreparedStatement = connection.prepareStatement(sql)
      try {
        logData.foreach {
          case (action, event, host, insertTime, userName, updateTime) =>
            statement.setString(1, action)
            statement.setString(2, event)
            statement.setString(3, host)
            statement.setString(4, insertTime)
            statement.setString(5, userName)
            statement.setString(6, updateTime)
            statement.executeUpdate()
        }
      } finally {
        if (statement != null) statement.close()
        if (connection != null) connection.close()
      }
    })

    sc.stop()
  }
}
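Beyond switching to foreachPartition, a common further optimization is JDBC batching, so each partition sends rows to MySQL in groups instead of issuing one executeUpdate per row. The sketch below reuses logRDD, driverClassName, url, user and password from the demo above; the batch size of 500 is an arbitrary illustrative choice, not something the original demo uses.

    // Sketch: batched inserts inside foreachPartition (batch size of 500 is an assumption).
    logRDD.foreachPartition(logData => {
      Class.forName(driverClassName)
      val connection = DriverManager.getConnection(url, user, password)
      connection.setAutoCommit(false) // commit once per batch instead of once per row
      val sql = "insert into syslog(action, event, host, insertTime, userName, update_Time) values(?,?,?,?,?,?)"
      val statement = connection.prepareStatement(sql)
      try {
        var count = 0
        logData.foreach { case (action, event, host, insertTime, userName, updateTime) =>
          statement.setString(1, action)
          statement.setString(2, event)
          statement.setString(3, host)
          statement.setString(4, insertTime)
          statement.setString(5, userName)
          statement.setString(6, updateTime)
          statement.addBatch()
          count += 1
          if (count % 500 == 0) { statement.executeBatch(); connection.commit() }
        }
        statement.executeBatch() // flush the final partial batch
        connection.commit()
      } finally {
        if (statement != null) statement.close()
        if (connection != null) connection.close()
      }
    })

With MySQL Connector/J, adding rewriteBatchedStatements=true to the JDBC URL lets the driver rewrite each batch into a single multi-row INSERT, which usually speeds this up further.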