CDH6.2的spark访问OSS

一、CDH6.2 中自带访问阿里云 OSS 所需的 jar 包,只需要将相应的 jar 放到 Spark 的 jars 目录(/opt/cloudera/parcels/CDH/lib/spark/jars)下即可:

# Make the OSS connector jars bundled with the CDH parcel visible to Spark.
# Copy rather than move, so the parcel's shared jars directory stays intact
# for anything else that resolves these jars from there.
cd /opt/cloudera/parcels/CDH/jars
cp aliyun-sdk-oss-2.8.3.jar ../lib/spark/jars/
cp hadoop-aliyun-3.0.0-cdh6.2.1.jar ../lib/spark/jars/
cp jdom-1.1.jar ../lib/spark/jars/

 

二、测试用例

package org.shydow

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

/**
 * Smoke test: reads a text file from Aliyun OSS through Spark and prints every line
 * on the driver.
 *
 * The OSS access key, secret and endpoint are taken from `Constants`.
 *
 * @author shydow
 * @date 2021-12-13
 */
object TestReadOSS {

  /** OSS path that is read when no command-line argument is supplied. */
  private val DefaultInputPath: String = "oss://buck-name/..."

  /**
   * Entry point.
   *
   * @param args optional; when present, `args(0)` is the OSS path to read instead of
   *             the built-in default
   */
  def main(args: Array[String]): Unit = {
    val inputPath: String = args.headOption.getOrElse(DefaultInputPath)

    val conf = new SparkConf()
      // File system implementation shipped in the hadoop-aliyun jar; the previously
      // configured com.aliyun.fs.oss.nat.NativeOssFileSystem is not part of that jar.
      .set("spark.hadoop.fs.oss.impl", "org.apache.hadoop.fs.aliyun.oss.AliyunOSSFileSystem")
      // NOTE(review): carried over unchanged; presumably only matters for local runs — confirm.
      .set("spark.hadoop.mapreduce.job.run-local", "true")
      .set("spark.hadoop.fs.oss.accessKeyId", Constants.ossAccessKeyId)
      .set("spark.hadoop.fs.oss.accessKeySecret", Constants.ossAccessKeySecret)
      .set("spark.hadoop.fs.oss.endpoint", Constants.ossEndpoint)
      // Fall back to local mode only when no master was supplied (e.g. when run from an
      // IDE). A hard-coded master would take precedence over `spark-submit --master yarn`.
      .setIfMissing("spark.master", "local[*]")

    val spark: SparkSession = SparkSession.builder()
      .config(conf)
      .appName("test-read-oss")
      .getOrCreate()

    // Release the session even if reading from OSS fails.
    try {
      val sc: SparkContext = spark.sparkContext
      sc.setLogLevel("WARN")

      val source: RDD[String] = sc.textFile(inputPath)
      // collect() pulls the whole file to the driver; acceptable for a small test file.
      source.collect().foreach(println)
    } finally {
      spark.close()
    }
  }
}

 

三、提交

# Submit the test job to YARN in cluster mode.
# Each continuation backslash must be the last character on its line.
# --class must be the fully-qualified name of the object that defines main().
spark-submit --master yarn \
  --deploy-mode cluster \
  --queue default \
  --driver-memory 1g \
  --num-executors 1 \
  --executor-memory 1g \
  --executor-cores 2 \
  --class org.shydow.TestReadOSS ./read-oss.jar

 

posted @ 2021-12-13 10:22  Shydow  阅读(231)  评论(0)  编辑  收藏  举报