SparkSQL in Java
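参考地址:Spark SQL 官方文档 Getting Started 中的 "Starting Point: SparkSession" 一节(https://spark.apache.org/docs/latest/sql-getting-started.html#starting-point-sparksession)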
1.新建Maven项目,POM引入依赖
<dependencies>
<!-- Spark core; the _2.13 suffix selects the build for Scala 2.13 -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.13</artifactId>
<version>3.5.0</version>
</dependency>
<!-- Spark SQL (SparkSession, Dataset, DataFrame); keep version and Scala suffix in sync with spark-core -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.13</artifactId>
<version>3.5.0</version>
</dependency>
<!-- Explicitly pinned Jackson core version.
     NOTE(review): presumably added to resolve a conflict with the Jackson version Spark pulls in transitively - confirm it is still needed -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId>
<version>2.16.1</version>
</dependency>
<!-- SLF4J binding backed by reload4j (log4j 1.x compatible API), matching the log4j.properties file used in step 3 -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-reload4j</artifactId>
<version>2.0.9</version>
</dependency>
</dependencies>
2.项目添加Scala依赖库
3.在资源目录添加日志配置文件log4j.properties
# Root logger: level INFO, sending output to the "stdout" appender only
log4j.rootLogger=INFO, stdout
# Console appender; each line is formatted as: date level [logger] - message
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
# File appender writing to target/spring.log with the same pattern.
# NOTE(review): "logfile" is not listed on log4j.rootLogger above and no other logger in this
# file references it, so as written it is not attached; append ", logfile" to the rootLogger
# line if file output is wanted.
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n
4.添加相关测试数据文件 datas/user.json(路径相对于程序运行时的工作目录,文件中每行一个 JSON 对象)
{"username": "fanqi","age": 24}
{"username": "admin","age": 0}
{"username": "root","age": 100}
5.添加相关代码文件
package cn.coreqi;
import static org.apache.spark.sql.functions.col;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.*;
import java.util.Arrays;
/**
 * Minimal Spark SQL walkthrough in Java.
 *
 * <p>Loads {@code datas/user.json} into a DataFrame and queries it three ways:
 * with SQL over a temporary view, with the DataFrame DSL, and finally shows a
 * typed {@code Dataset<Integer>} built from an in-memory list.
 */
public class Main {

    /**
     * Runs the demo against a local Spark session and prints each result to stdout.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        // Build the Spark configuration: local mode ("local[*]") with the app name "sparkSql".
        SparkConf sparkConf = new SparkConf()
                .setMaster("local[*]")
                .setAppName("sparkSql");

        // try-with-resources closes the session even if one of the steps below throws,
        // instead of relying on a trailing close() call that an exception would skip.
        try (SparkSession spark = SparkSession
                .builder()
                .config(sparkConf)
                .getOrCreate()) {

            // Path is relative to the working directory of the running JVM.
            Dataset<Row> df = spark.read().json("datas/user.json");
            df.show();

            // DataFrame => SQL: register a temp view so the data can be queried by name.
            df.createOrReplaceTempView("user");
            spark.sql("select * from user").show();
            spark.sql("select avg(age) from user").show();

            // DataFrame => DSL: column selection and a column expression (age + 1).
            df.select("age", "username").show();
            df.select(col("age").plus(1)).show();

            // A DataFrame is a Dataset with the specific type parameter Row;
            // a Dataset can also carry other element types via an Encoder.
            Encoder<Integer> intEncoder = Encoders.INT();
            Dataset<Integer> ds = spark.createDataset(Arrays.asList(1, 2, 3), intEncoder);
            ds.show();
        }
    }
}