Accessing OSS from Spark on CDH 6.2
1. CDH 6.2 already ships with the JARs needed to access Alibaba Cloud OSS; they only need to be moved into the ./spark/jars directory:
cd /opt/cloudera/parcels/CDH/jars
mv aliyun-sdk-oss-2.8.3.jar ../lib/spark/jars/
mv hadoop-aliyun-3.0.0-cdh6.2.1.jar ../lib/spark/jars/
mv jdom-1.1.jar ../lib/spark/jars/
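Before writing any job, it is worth confirming that Spark actually picks these JARs up. A minimal check from spark-shell (a sketch; both class names come from the JARs moved above):

// Run inside spark-shell: each call throws ClassNotFoundException
// if the corresponding JAR did not make it onto the classpath.
Class.forName("org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem") // from hadoop-aliyun
Class.forName("com.aliyun.oss.OSSClient")                            // from aliyun-sdk-oss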
2. Test case
package org.shydow

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

/**
 * @author shydow
 * @date 2021-12-13
 */
object TestReadOSS {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    // hadoop-aliyun (one of the JARs moved above) provides this FileSystem implementation;
    // the EMR-specific com.aliyun.fs.oss.nat.NativeOssFileSystem class is not in those JARs.
    conf.set("spark.hadoop.fs.oss.impl", "org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem")
    // Constants is the author's credential holder (not shown here)
    conf.set("spark.hadoop.fs.oss.accessKeyId", Constants.ossAccessKeyId)
    conf.set("spark.hadoop.fs.oss.accessKeySecret", Constants.ossAccessKeySecret)
    conf.set("spark.hadoop.fs.oss.endpoint", Constants.ossEndpoint)

    val spark: SparkSession = SparkSession.builder()
      .config(conf)
      .appName("test-read-oss")
      .master("local[*]") // local master for testing; remove when submitting to YARN
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")

    val source: RDD[String] = sc.textFile("oss://bucket-name/...")
    val result: Array[String] = source.collect()
    result.foreach(println)

    spark.close()
  }
}
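The same settings can also be applied to an existing session through its Hadoop configuration, which is handy in spark-shell, and the file can be read with the Dataset API instead of an RDD. A minimal sketch, reusing the (hypothetical) Constants credential holder and a placeholder bucket path:

import org.apache.spark.sql.{Dataset, SparkSession}

object TestReadOSSDataset {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName("test-read-oss-df")
      .master("local[*]")
      .getOrCreate()

    // Keys set directly on hadoopConfiguration take no "spark.hadoop." prefix
    val hc = spark.sparkContext.hadoopConfiguration
    hc.set("fs.oss.impl", "org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem")
    hc.set("fs.oss.accessKeyId", Constants.ossAccessKeyId)         // hypothetical credential holder
    hc.set("fs.oss.accessKeySecret", Constants.ossAccessKeySecret)
    hc.set("fs.oss.endpoint", Constants.ossEndpoint)

    // Dataset API over the same bucket; the path is a placeholder
    val lines: Dataset[String] = spark.read.textFile("oss://bucket-name/...")
    lines.show(10, truncate = false)

    spark.close()
  }
}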
3. Submitting the job
spark-submit --master yarn \
--deploy-mode cluster \
--queue default \
--driver-memory 1g \
--num-executors 1 \
--executor-memory 1g \
--executor-cores 2 \
--class org.shydow.TestReadOSS ./read-oss.jar
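The OSS settings can also be supplied at submit time with --conf instead of being hardcoded in the JAR, which keeps credentials out of the build. A sketch of the same submission; the <...> fields are placeholders to replace with real values:

spark-submit --master yarn \
--deploy-mode cluster \
--queue default \
--conf spark.hadoop.fs.oss.impl=org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem \
--conf spark.hadoop.fs.oss.accessKeyId=<access-key-id> \
--conf spark.hadoop.fs.oss.accessKeySecret=<access-key-secret> \
--conf spark.hadoop.fs.oss.endpoint=<endpoint> \
--class org.shydow.TestReadOSS ./read-oss.jar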