Reading Hive data with PySpark 3.0.0 and importing it into ArangoDB via ArangoClient
A quick note on using PySpark to read data from Hive and bulk-import it into ArangoDB.
import os
import findspark

# Environment must be set before findspark.init() and the Spark session.
os.environ['HADOOP_USER_NAME'] = 'zhisan'
os.environ['SPARK_HOME'] = '/opt/spark'
os.environ['JAVA_HOME'] = '/opt/java'
os.environ['PYTHONPATH'] = '/home/zhisan/apps/conda/miniconda3/envs/py10/bin/python'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/home/zhisan/apps/conda/miniconda3/envs/py10/bin/python'
os.environ['PYSPARK_PYTHON'] = '/home/zhisan/apps/conda/miniconda3/envs/py10/bin/python'

# findspark.init() reads SPARK_HOME, so it must run after the env setup
# and before pyspark is imported.
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, concat
from arango import ArangoClient
spark = SparkSession.builder \
    .enableHiveSupport() \
    .appName("sparkSql") \
    .config("spark.master", "yarn") \
    .config("spark.submit.deployMode", "client") \
    .config("spark.driver.maxResultSize", "4G") \
    .getOrCreate()
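A quick sanity check that Hive support took effect, assuming the metastore is reachable (not part of the original flow):
# Should list the Hive databases if enableHiveSupport() is wired up.
spark.sql("SHOW DATABASES").show()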
# Get a handle to an Arango collection
def get_arango_collection(db_name, collection_name):
    client = ArangoClient(hosts="http://xxx:8529", request_timeout=60*60*2)
    db = client.db(db_name, username="xxx", password="xxx")
    collection = db.collection(collection_name)
    return collection
# Bulk-write one partition of documents into Arango
def write_to_arangodb(db_name, collection_name, documents):
    collection = get_arango_collection(db_name, collection_name)
    # foreachPartition passes an iterator; materialize it for import_bulk.
    collection.import_bulk(list(documents))
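If the job may be re-run against existing data, python-arango's import_bulk also takes an on_duplicate option; a sketch (option values as documented by python-arango, not used in the original flow):
# Replace documents whose _key already exists instead of raising an error.
# on_duplicate accepts "error" (default), "update", "replace" or "ignore".
collection.import_bulk(list(documents), on_duplicate="replace")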
# Main flow: empty the target collection, then reload it from Hive
db_name = "gs_enterprise_graph"
collection_name = "erp_rel_email_e"
hive_db = "gs"
hive_table = "erp_rel_email_e"
collection = get_arango_collection(db_name, collection_name)
collection.truncate()
hive_df = spark.sql(f"SELECT * FROM {hive_db}.{hive_table}")
# Build the ArangoDB edge attributes. Note: `lit(...) + col(...)` performs
# numeric addition in Spark and yields null for strings, so use concat().
hive_df = hive_df \
    .withColumn("_id", concat(lit(f"{collection_name}/"), col("object_key"))) \
    .withColumn("_key", col("object_key")) \
    .withColumn("_from", col("from_key")) \
    .withColumn("_to", col("to_key"))
# Convert each Row to a plain dict and write partition by partition, so every
# executor opens its own Arango connection (clients are not serializable).
df_dict = hive_df.rdd.map(lambda row: row.asDict())
df_dict.foreachPartition(lambda partition: write_to_arangodb(db_name, collection_name, partition))
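To confirm the load landed, a minimal check reusing the helper above (count() is a standard python-arango call; hive_df.count() re-runs the Spark job):
# Compare the Arango document count with the Hive row count.
collection = get_arango_collection(db_name, collection_name)
print(f"imported {collection.count()} docs, hive rows: {hive_df.count()}")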