Spark:连接MySQL
spark-shell或spark-submit,载入jdbc的jar包:
--jars /mysql-connector-java-*-bin.jar
spark使用jdbc读写数据库:
//读数据库
import java.util.Properties
val prop = {
val p = new Properties()
p.put("driver", "com.mysql.jdbc.Driver")
p.put("url", "jdbc:mysql://10.18.2.3:3306/dbname?"+
"useUnicode=true&characterEncoding=utf8&rewriteBatchedStatements=true")
p.put("user", "username")
p.put("password", "****")
p
}
prop.put("dbtable", "tablename")
prop.put("dbtable", "(select * from tablename) tablename_")
import scala.collection.JavaConversions._
spark.read.format("jdbc").options(prop).load()
//写数据库:
import org.apache.spark.sql.SaveMode
df.write
.mode(SaveMode.Append)
.jdbc(prop.getProperty("url"), "tablename", prop)
rewriteBatchedStatements=true
(默认为false)这个选项会在写数据库的时候起作用,原理为spark会将一条数据转为一个insert语句,其设为true会将多条数据用一个insert语句插入。
相当于直接用jdbc中的.addBatch()
方法。
若要用jdbc提供的接口直接操作数据库(不用spark提供的接口):
import java.sql.DriverManager
import java.util.Properties
object JDBCUtil extends Serializable {
def apply(prop: Properties) = {
new JDBCUtil(prop)
}
}
class JDBCUtil private (prop: Properties) extends Serializable {
Class.forName(prop.getProperty("driver"))
val url = prop.getProperty("url")
private[this] val user = prop.getProperty("user")
private[this] val password = prop.getProperty("password")
def getConnect() = {
DriverManager.getConnection(url, user, password)
}
def getJDBCProp() = {
val prop_copy = new Properties()
prop_copy.putAll(prop)
prop_copy
}
}
// import org.apache.spark.SparkFiles
object DBUtil extends Serializable {
private[this] val jdbc_conf = {
val p = new Properties()
p.put("driver", "com.mysql.jdbc.Driver")
p.put("url", "jdbc:mysql://10.18.2.3:3306/dbname?" +
"useUnicode=true&characterEncoding=utf8&rewriteBatchedStatements=true")
p.put("user", "username")
p.put("password", "****")
p
}
//private[this] val jdbc_file = SparkFiles.get("route_analysis.properties")
private[this] val jdbcUtil = JDBCUtil(jdbc_conf)
val url = jdbcUtil.url
def conn = jdbcUtil.getConnect
def prop = jdbcUtil.getJDBCProp
import scala.collection.JavaConversions._
def options: scala.collection.mutable.Map[String, String] = prop
}
// 使用:
def update(df: DataFrame) {
val table = "tablename"
df.foreachPartition { part =>
val conn = DBUtil.conn
part.foreach { r =>
val col = r.getAs[String]("col")
//查询已有数据
val sql = s"""select col from ${table} where col = '${col}' """
val stmt_query = conn.createStatement()
val res = stmt_query.executeQuery(sql)
val stmt_new = conn.createStatement()
//如果结果集为空则插入一条记录
if (!res.first) {
val sql_insert = s"""insert into ${table} (col) values ('${col}') """
stmt_new.execute(sql_insert)
} else {
//若找到记录可以看看是否可更新ex_station字段。
val sql_update = s"""update ${table} set col = '${col}' where id = ${id} """
stmt_new.executeUpdate(sql_update)
}
res.close()
stmt_query.close()
stmt_new.close()
}
}
println(s"update表(${table})完成...")
}