Oracle table size
60 GB; row count: 58,475,050
Submission parameters
spark-submit --master yarn --deploy-mode client \
--class com.xjb.Test \
--jars ojdbc5-11.2.0.3.jar \
--conf spark.yarn.executor.memoryOverhead=300M \
--num-executors 10 --executor-cores 2 --executor-memory 3724M \
etl.jar
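A quick sanity check on the resource math (my reading of these flags, not stated in the original): 10 executors × 2 cores gives 20 concurrent task slots, and each YARN container requests roughly 3724 MB of executor heap plus 300 MB of overhead, i.e. about 4 GB per executor and roughly 40 GB across the 10 executors.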
Program
package com.xjb

import org.apache.spark.sql._

object Test {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("Test")
      .config("spark.sql.parquet.writeLegacyFormat", true)
      .enableHiveSupport()
      .getOrCreate()

    // Total row count of the source table; used as the upper bound
    // for ROWNUM-based partitioning.
    val ct = 58475050

    // Wrap the table in a subquery that exposes ROWNUM as a numeric
    // column, so it can serve as the JDBC partition column.
    val sql_str = "(select a.*, ROWNUM rownum__rn from db1.tb1 a) b"

    val table = spark.read
      .format("jdbc")
      .option("driver", "oracle.jdbc.driver.OracleDriver")
      .option("url", "jdbc:oracle:thin:@10.18.2.3:1521:etc")
      .option("user", "***")
      .option("password", "***")
      .option("dbtable", sql_str)
      .option("fetchsize", 100000)
      .option("partitionColumn", "rownum__rn")
      .option("lowerBound", 0)
      .option("upperBound", ct)
      .option("numPartitions", 10)
      .load()
      .drop("rownum__rn") // helper column is not needed downstream

    // Rebalance into 100 tasks/files, then overwrite the Hive table.
    table
      .repartition(100)
      .write
      .mode(SaveMode.Overwrite)
      .saveAsTable("db1.tb1")
  }
}
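For reference, the four partitioning options translate into ten stride-based ROWNUM range predicates, one per JDBC partition. The standalone sketch below approximates Spark's internal JDBCRelation stride logic (an illustration under that assumption, not the actual Spark source); note that the first partition also picks up NULLs:

object PartitionSketch {
  def main(args: Array[String]): Unit = {
    val lower = 0L
    val upper = 58475050L // upperBound = row count
    val n = 10            // numPartitions

    // One stride per partition (integer division), as Spark computes it.
    val stride = upper / n - lower / n // 5847505
    val bounds = (0 to n).map(i => lower + i * stride)

    (0 until n).foreach { i =>
      val where =
        if (i == 0) s"rownum__rn < ${bounds(1)} or rownum__rn is null"
        else if (i == n - 1) s"rownum__rn >= ${bounds(i)}"
        else s"rownum__rn >= ${bounds(i)} and rownum__rn < ${bounds(i + 1)}"
      println(s"partition $i: WHERE $where")
    }
  }
}

Each of the ten read tasks then issues its own Oracle query with one of these WHERE clauses, so they scan disjoint ROWNUM ranges in parallel; repartition(100) afterwards simply rebalances the data into 100 write tasks (and output files) before saveAsTable.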
Execution time
25 minutes
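That works out to roughly 58,475,050 rows / 1,500 s ≈ 39,000 rows per second, or about 40 MB/s of source data, end to end.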