Spark与mysql整合

一、需求:把最终结果存储在mysql中

1、UrlGroupCount1类

import java.net.URL
import java.sql.DriverManager

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
  * 把最终结果存储在mysql中
  */
object UrlGroupCount1 {
  def main(args: Array[String]): Unit = {
    //1.创建spark程序入口
    val conf: SparkConf = new SparkConf().setAppName("UrlGroupCount1").setMaster("local[2]")
    val sc: SparkContext = new SparkContext(conf)

    //2.加载数据
    val rdd1: RDD[String] = sc.textFile("e:/access.log")

    //3.将数据切分
    val rdd2: RDD[(String, Int)] = rdd1.map(line => {
      val s: Array[String] = line.split("\t")
      //元组输出
      (s(1), 1)
    })

    //4.累加求和
    val rdd3: RDD[(String, Int)] = rdd2.reduceByKey(_+_)

    //5.取出分组的学院
    val rdd4: RDD[(String, Int)] = rdd3.map(x => {
      val url = x._1
      val host: String = new URL(url).getHost.split("[.]")(0)
      //元组输出
      (host, x._2)
    })

    //6.根据学院分组
    val rdd5: RDD[(String, List[(String, Int)])] = rdd4.groupBy(_._1).mapValues(it => {
      //根据访问量排序 倒序
      it.toList.sortBy(_._2).reverse.take(1)
    })

    //7.把计算结果保存到mysql中
    rdd5.foreach(x => {
      //把数据写到mysql
      val conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/urlcount?charactorEncoding=utf-8","root", "root")
      //把spark结果插入到mysql中
      val sql = "INSERT INTO url_data (xueyuan,number_one) VALUES (?,?)"
      //执行sql
      val statement = conn.prepareStatement(sql)

      statement.setString(1, x._1)
      statement.setString(2, x._2.toString())
      statement.executeUpdate()
      statement.close()
      conn.close()
    })

    //8.关闭资源
    sc.stop()
  }
}

2、mysql创建数据库和表

CREATE DATABASE urlcount;
USE urlcount;

CREATE TABLE url_data(
    uid INT PRIMARY KEY AUTO_INCREMENT,
    xueyuan VARCHAR(50),
    number_one VARCHAR(200)
)

3、结果

二、Spark提供的连接mysql的方式--jdbcRDD

1、JdbcRDDDemo类

import java.sql.DriverManager

import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.{SparkConf, SparkContext}

/**
  * spark提供的连接mysql的方式
  * jdbcRDD
  */
object JdbcRDDDemo {
  def main(args: Array[String]): Unit = {
    //1.创建spark程序入口
    val conf: SparkConf = new SparkConf().setAppName("JdbcRDDDemo").setMaster("local[2]")
    val sc: SparkContext = new SparkContext(conf)

    //匿名函数
    val connection = () => {
      Class.forName("com.mysql.jdbc.Driver").newInstance()
      DriverManager.getConnection("jdbc:mysql://localhost:3306/urlcount?characterEncoding=utf-8","root", "root")
    }

    //查询数据
    val jdbcRdd: JdbcRDD[(Int, String, String)] = new JdbcRDD(
      //指定sparkContext
      sc,
      connection,
      "SELECT * FROM url_data where uid >= ? AND uid <= ?",
       //2个任务并行
      1, 4, 2,
      r => {
        val uid = r.getInt(1)
        val xueyuan = r.getString(2)
        val number_one = r.getString(3)
        (uid, xueyuan, number_one)
      }
    )

    val result: Array[(Int, String, String)] = jdbcRdd.collect()
    println(result.toBuffer)
    sc.stop()
  }
}

2、结果

 

posted on 2019-01-16 00:10    阅读(2054)  评论(0编辑  收藏  举报