spark以rdd方式读写mysql

1、读取mysql数据。从mysql读取的时候需要传入数据边界,数据类型是long,一般建议主键列,如果是时间列,需要转换成时间戳。

参考demo:https://github.com/asker124143222/spark-demo

package com.home.spark


import java.sql.DriverManager
import java.time.{LocalDateTime, ZoneOffset}

import org.apache.spark.rdd.{JdbcRDD, RDD}
import org.apache.spark.{SparkConf, SparkContext}


object Ex_mysql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf(true).setMaster("local[*]").setAppName("spark mysql demo")

    val sc = new SparkContext(conf)

    val driverClassName = "com.mysql.jdbc.Driver"
    val url = "jdbc:mysql://localhost:3306/busdata?characterEncoding=utf8&useSSL=false"
    val user = "root"
    val password = "root"

    //mysql里时间类型为datetime,传入的条件为时间戳
    val sql = "select userId,userName,name from user where createTime > from_unixtime(?) and createTime < from_unixtime(?)"


    val connection = () => {
      Class.forName(driverClassName)
      DriverManager.getConnection(url, user, password)
    }

    val startTime = LocalDateTime.of(2018, 11, 3, 0, 0, 0)
    val endTime = LocalDateTime.of(2018, 11, 4, 0, 0)
    //mysql的时间戳只有10位,需要把java里的13位时间戳降低精度,直接除以1000
    val startTimeStamp = startTime.toInstant(ZoneOffset.ofHours(8)).toEpochMilli / 1000
    val endTimeStamp = endTime.toInstant(ZoneOffset.ofHours(8)).toEpochMilli / 1000

    println("startTime: " + startTime + ", endTime: " + endTime)
    println("startTime: " + startTimeStamp + ", endTime: " + endTimeStamp)

    //读取
    val result: JdbcRDD[(Int, String, String)] = new JdbcRDD[(Int, String, String)](
      sc,
      connection,
      sql,
      startTimeStamp,
      endTimeStamp,
      2,
      rs => {

        val userId = rs.getInt(1)
        val userName = rs.getString(2)
        val name = rs.getString(3)
        //        println(s"id:${userId},userName:${userName},name:${name}")
        (userId, userName, name)
      }
    )
    result.collect().foreach(println)

    sc.stop()
  }

}

2、写mysql。减少连接创建次数,使用foreachPartition,而不是foreach

 

package com.home.spark

import java.sql.{DriverManager, PreparedStatement}
import java.time.LocalDateTime

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

object Ex_mysql2 {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf(true).setMaster("local[*]").setAppName("spark mysql demo")

    val sc = new SparkContext(conf)

    val driverClassName = "com.mysql.jdbc.Driver"
    val url = "jdbc:mysql://localhost:3306/busdata?characterEncoding=utf8&useSSL=false"
    val user = "root"
    val password = "root"


    //写入
    val logBuffer = mutable.ListBuffer[(String, String, String, String, String, String)]()
    import java.time.format.DateTimeFormatter
    val ofPattern = DateTimeFormatter.ofPattern("yyyy-MM-dd hh:mm:ss")
    for (i <- 1 to 100) {
      logBuffer.+=(("write" + i, "写入测试" + i, "localhost" + i, LocalDateTime.now().format(ofPattern), "spark", LocalDateTime.now().format(ofPattern)))
    }

    //    logBuffer.foreach(println)
    val logRDD: RDD[(String, String, String, String, String, String)] = sc.makeRDD(logBuffer)


    //为了减少连接创建次数,使用foreachPartition,而不是foreach
    //缺陷:所有按Partition方式传输整个迭代器的方式都有OOM的风险
    logRDD.foreachPartition(logData => {
      Class.forName(driverClassName)
      val connection = DriverManager.getConnection(url, user, password)
      val sql = "insert into syslog(action, event, host, insertTime, userName, update_Time) values(?,?,?,?,?,?)"
      val statement: PreparedStatement = connection.prepareStatement(sql)
      try {
        logData.foreach {
          case (action, event, host, insertTime, userName, updateTime) => {

            statement.setString(1, action)
            statement.setString(2, event)
            statement.setString(3, host)
            statement.setString(4, insertTime)
            statement.setString(5, userName)
            statement.setString(6, updateTime)
            statement.executeUpdate()
          }
        }
      }
      finally {
        if(statement!=null) statement.close()
        if(connection!=null) connection.close()
      }


      connection.close()
    }
    )

    sc.stop()
  }

}
posted @ 2019-12-24 16:48  我是属车的  阅读(2232)  评论(0编辑  收藏  举报