If the table is large, reading it this way will trigger an OOM.
Originally the table name was written directly, which reads the whole table: .option("dbtable", "test_table")
It can be rewritten as: .option("dbtable", "(select * from test_table where dt >= '2017-05-01') as T")
PS: be sure to wrap the query in parentheses and give it an alias. The value of dbtable is treated as a table to select from, and the MySQL connector automatically appends conditions like where 1=1 after it, so without the parentheses you get a SQL syntax error.
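A minimal sketch of the fix in context (the URL and credentials are placeholders borrowed from the examples further down; only the dbtable value matters here):

import org.apache.spark.sql.SparkSession

// Push the filter into the dbtable subquery so MySQL only returns the
// matching rows, instead of streaming the whole table into Spark.
val spark = SparkSession.builder().appName("BigTableRead").master("local").getOrCreate()
val filteredDF = spark.read.format("jdbc")
  .option("url", "jdbc:mysql://localhost:3306/lzdcase")   // placeholder URL
  .option("user", "root")                                 // placeholder credentials
  .option("password", "root")
  // The parentheses and alias are required: Spark drops this string in where
  // a plain table name would go in its generated SELECT statement.
  .option("dbtable", "(select * from test_table where dt >= '2017-05-01') as T")
  .load()

If even the filtered result is too large for a single connection, the Spark JDBC source can also split the read across executors via the partitionColumn, lowerBound, upperBound, and numPartitions options; see the Spark docs linked below.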
Reference: https://blog.csdn.net/Damionew/article/details/103918688
See also the Spark docs: https://spark.apache.org/docs/latest/sql-getting-started.html
1. First, Maven needs the Spark dependencies and the MySQL driver:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.12</artifactId>
    <version>2.4.4</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.12</artifactId>
    <version>2.4.4</version>
</dependency>
<!-- MySQL JDBC driver -->
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>8.0.16</version>
</dependency>
Reading MySQL data (Scala), approach 1
import java.util.Properties

import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.functions._

object SparkSQLAndMySQL {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("SparkSQLAndMySQL")
      .master("local")
      .getOrCreate()

    // Read, method 1: spark.read.jdbc(url, table, connectionProperties)
    val connectionProperties = new Properties()
    connectionProperties.put("user", "root")
    connectionProperties.put("password", "root")
    val jdbcDF = spark.read.jdbc("jdbc:mysql://localhost:3306/lzdcase", "wktableallstatisc", connectionProperties)
    jdbcDF.show()

    import spark.implicits._

    // Read, method 2: format("jdbc") with options
    val jdbcDF2 = spark.read.format("jdbc")
      .option("url", "jdbc:mysql://localhost:3306/lzdcase")
      .option("dbtable", "afsa2staticstb")
      .option("user", "root")
      .option("password", "root")
      .load()
    jdbcDF2.show(2)
    jdbcDF2.groupBy("mawb")
      .agg(count(jdbcDF2("bookingNo")), sum(jdbcDF2("totalcouriers")))
      .show()

    // Write, method 1: def jdbc(url: String, table: String, connectionProperties: Properties)
    val jdbcProperties = new Properties()
    jdbcProperties.put("user", "root")
    jdbcProperties.put("password", "root")
    jdbcDF2.write.mode(SaveMode.Overwrite)
      .jdbc("jdbc:mysql://localhost:3306/lzdcase", "sparktest", jdbcProperties)

    // Write, method 2: format("jdbc") with options
    jdbcDF2.write.mode(SaveMode.Overwrite)
      .format("jdbc")
      .option("user", "root")
      .option("password", "root")
      .option("dbtable", "sparkwrite2")
      .option("url", "jdbc:mysql://localhost:3306/lzdcase")
      .save()

    // Write, method 3: set the column names and types used when creating the
    // table (the type names must be lowercase)
    jdbcDF2.write
      .option("createTableColumnTypes", "name varchar(200), salary int")
      .jdbc("jdbc:mysql://localhost:3306/db1", "employees2", jdbcProperties)

    spark.stop()
  }
}
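As a quick sanity check of the writes above, a hypothetical read-back of the sparktest table, reusing the spark session and jdbcProperties already defined in the block:

// Read the freshly written table back and display it.
val checkDF = spark.read.jdbc("jdbc:mysql://localhost:3306/lzdcase", "sparktest", jdbcProperties)
checkDF.show()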
Reading MySQL data (Java), approach 2
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class JavaSparkSqlDemo {
    public static void main(String[] args) {
        SparkSession sparkSession = SparkSession
                .builder()
                .appName("JavaSparkSqlDemo") // sets a name for the application
                .master("local")             // sets the Spark master URL to connect to
                .getOrCreate();              // gets an existing SparkSession or creates a new one

        // Configure the JDBC connection for the "user" table
        Dataset<Row> userDataset = sparkSession.read()
                .format("jdbc")
                .option("url", "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=utf-8")
                .option("dbtable", "user")
                .option("driver", "com.mysql.cj.jdbc.Driver")
                .option("user", "root")
                .option("password", "root")
                .load();
        Dataset<Row> roleDataset = sparkSession.read()
                .format("jdbc")
                .option("url", "jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=utf-8")
                .option("dbtable", "role")
                .option("driver", "com.mysql.cj.jdbc.Driver")
                .option("user", "root")
                .option("password", "root")
                .load();

        // The tables must be registered as temp views before SELECT and other
        // SQL operations work; otherwise Spark reports "not found in database
        // 'default'". (createOrReplaceTempView replaces the deprecated
        // registerTempTable.)
        userDataset.createOrReplaceTempView("user");
        roleDataset.createOrReplaceTempView("role");

        // SQL query. Note: every table used here must have been loaded via
        // option("dbtable", ...) and registered as a temp view above.
        Dataset<Row> sqlDF = sparkSession.sql(
                "SELECT t1.id, t1.name, t2.role FROM user t1 LEFT JOIN role t2 ON t1.id = t2.id");
        sqlDF.show();
    }
}