PySpark 3.0.0: reading Hive data and importing it into ArangoDB with ArangoClient

A quick note on reading Hive data with PySpark and bulk-importing it into ArangoDB.

import os
import findspark

from arango import ArangoClient

# Set the environment before findspark.init(), which locates Spark via SPARK_HOME
os.environ['HADOOP_USER_NAME'] = 'zhisan'
os.environ['SPARK_HOME'] = '/opt/spark'
os.environ['JAVA_HOME'] = '/opt/java'
os.environ['PYTHONPATH'] = '/home/zhisan/apps/conda/miniconda3/envs/py10/bin/python'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/home/zhisan/apps/conda/miniconda3/envs/py10/bin/python'
os.environ['PYSPARK_PYTHON'] = '/home/zhisan/apps/conda/miniconda3/envs/py10/bin/python'

findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, concat


# Build a Spark session on YARN (client mode) with Hive metastore support
spark = SparkSession.builder \
    .enableHiveSupport() \
    .appName("sparkSql") \
    .config("spark.master", "yarn") \
    .config("spark.submit.deployMode", "client") \
    .config("spark.driver.maxResultSize", "4G") \
    .getOrCreate()
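
Before touching ArangoDB it is worth confirming that Hive support is actually active; a minimal sanity check (the gs database name matches the one used below):

# Quick check that the session can see the Hive metastore
spark.sql("SHOW DATABASES").show()
spark.sql("SHOW TABLES IN gs").show()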

# Get a handle on an ArangoDB collection
def get_arango_collection(db_name, collection_name):
    client = ArangoClient(hosts="http://xxx:8529", request_timeout=60*60*2)
    db = client.db(db_name, username="xxx", password="xxx")
    collection = db.collection(collection_name)
    return collection

# Bulk-write a batch of documents into ArangoDB
def write_to_arangodb(db_name, collection_name, documents):
    # foreachPartition hands over an iterator; import_bulk needs a concrete list
    documents = list(documents)
    if documents:
        collection = get_arango_collection(db_name, collection_name)
        collection.import_bulk(documents)
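
If the job may be re-run against a non-empty collection, python-arango's import_bulk also takes an on_duplicate option ("error", "update", "replace" or "ignore"); a sketch of an idempotent variant of the call above:

# Sketch: replace documents that already exist instead of raising an error
collection.import_bulk(documents, on_duplicate="replace", halt_on_error=False)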

# Target ArangoDB collection and the Hive table it is loaded from
db_name = "gs_enterprise_graph"
collection_name = "erp_rel_email_e"
hive_db = "gs"
hive_table = "erp_rel_email_e"

# Start from an empty collection (truncate drops all existing documents)
collection = get_arango_collection(db_name, collection_name)
collection.truncate()

hive_df = spark.sql(f"SELECT * FROM {hive_db}.{hive_table}")

# Build the ArangoDB system fields. Note concat(), not +: the + operator
# does numeric addition on columns and would yield null for strings.
hive_df = hive_df \
    .withColumn("_id", concat(lit(f"{collection_name}/"), col("object_key"))) \
    .withColumn("_key", col("object_key")) \
    .withColumn("_from", col("from_key")) \
    .withColumn("_to", col("to_key"))

# Convert each Row to a plain dict so it serializes cleanly to JSON
df_dict = hive_df.rdd.map(lambda row: row.asDict())
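
A quick look at the generated fields before writing can save a wasted import; for an edge collection, _from and _to must be full document handles of the form "collection/key":

# Spot-check the ArangoDB system fields
hive_df.select("_id", "_key", "_from", "_to").show(5, truncate=False)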

df_dict.foreachPartition(lambda partition: write_to_arangodb(db_name, collection_name, partition))
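
On large tables, one import_bulk call per partition can mean a very large HTTP payload. A common refinement, shown here as a sketch (the batch size of 10000 is an arbitrary assumption), is to chunk each partition and reuse one connection per partition:

from itertools import islice

def write_partition_batched(db_name, collection_name, partition, batch_size=10000):
    # One connection per partition, reused across all of its batches
    collection = get_arango_collection(db_name, collection_name)
    it = iter(partition)
    while True:
        batch = list(islice(it, batch_size))
        if not batch:
            break
        collection.import_bulk(batch)

df_dict.foreachPartition(lambda p: write_partition_batched(db_name, collection_name, p))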
