Yes, I've finished a simple web + Spark + Hive + Hadoop backend with basic CRUD (create/read/update/delete) endpoints.
The backend code is below.
It uses PySpark, which is much faster than Hadoop MapReduce for this kind of job.
I'm also running it through a remote interpreter.
from flask import Flask, jsonify, request
from pyspark.sql import SparkSession
import os
from flask_cors import CORS
app = Flask(__name__)
CORS(app)
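# Point Spark at the Hadoop/YARN client configuration so master("yarn") can locate the cluster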
os.environ['HADOOP_CONF_DIR'] = '/export/server/hadoop-3.3.0/etc/hadoop'
os.environ['YARN_CONF_DIR'] = '/export/server/hadoop-3.3.0/etc/hadoop'
# Configure the SparkSession: run on YARN, enable Hive support, and point at the warehouse directory on HDFS
spark = SparkSession.builder \
.appName("Flask with PySpark and Hive") \
.master("yarn") \
.config("spark.sql.warehouse.dir", "hdfs://10.0.0.129:8020/user/hive/warehouse") \
.enableHiveSupport() \
.getOrCreate()
# Switch to the `itheima` database
spark.sql("USE itheima")
# Create: insert a row into the Hive table
@app.route('/create', methods=['POST'])
def create_data():
try:
data = request.json
if not isinstance(data, dict):
return jsonify({"error": "Invalid data format. Expected a JSON object."}), 400
required_fields = ['id', 'name', 'age']
if not all(field in data for field in required_fields):
return jsonify({"error": "Missing required fields"}), 400
# Validate data types
if not isinstance(data['id'], int) or not isinstance(data['name'], str) or not isinstance(data['age'], int):
return jsonify({"error": "Invalid data types"}), 400
# Insert via SQL; escape single quotes in `name` so user input can't break out of the string literal
safe_name = data['name'].replace("'", "\\'")
insert_query = f"INSERT INTO t_1 (id, name, age) VALUES ({data['id']}, '{safe_name}', {data['age']})"
spark.sql(insert_query)
return jsonify({"message": "Data inserted into Hive table 't_1'"}), 200
except Exception as e:
app.logger.error(f"Error occurred: {str(e)}")
return jsonify({"error": str(e)}), 500
# Read: query the rows in the Hive table
@app.route('/read', methods=['GET'])
def read_data():
# Query the Hive table
df = spark.sql("SELECT * FROM t_1")
# Convert the rows to JSON-serializable dicts
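# Note: collect() pulls every row back to the Flask process; fine for a small demo table,
# but add a LIMIT or pagination for anything larger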
data = df.collect()
result = [{"id": row["id"],"name": row["name"], "age": row["age"]} for row in data]
return jsonify(result)
# Update: rewrite the Hive table with the new row (the table doesn't support in-place UPDATE)
@app.route('/update', methods=['POST'])
def update_data():
data = request.json
if not data:
return jsonify({"error": "No data provided"}), 400
try:
# Reuse the globally initialized SparkSession; rebuilding it here would just return the same session
# Build a one-row DataFrame from the request, casting id and age to int
new_df = spark.createDataFrame(
[(int(data['id']), data['name'], int(data['age']))],
schema=['id', 'name', 'age']
)
# Expose the new row as a temp view so it can be referenced from SQL
new_df.createOrReplaceTempView("temp_table")
# Rewrite the target table: keep every row except the one with the updated id,
# then append the new row from the temp view
spark.sql(f"""
INSERT OVERWRITE TABLE t_1
SELECT id, name, age FROM t_1 WHERE id != {int(data['id'])}
UNION ALL
SELECT id, name, age FROM temp_table
""")
return jsonify({"message": "Data updated successfully"}), 200
except Exception as e:
app.logger.error(f"Error occurred: {str(e)}")
return jsonify({"error": str(e)}), 500
# Delete: remove rows from the Hive table
@app.route('/delete', methods=['POST'])
def delete_data():
condition = request.json.get('condition', {})
# Only an age threshold is supported, e.g. {"condition": {"age": 18}} deletes rows with age < 18
if "age" not in condition:
return jsonify({"error": "Missing 'age' in condition"}), 400
age_limit = int(condition["age"])
# The table doesn't support DELETE, so rewrite it keeping only the surviving rows
spark.sql(f"INSERT OVERWRITE TABLE t_1 SELECT id, name, age FROM t_1 WHERE age >= {age_limit}")
return jsonify({"message": f"Deleted data where age < {age_limit}"})
if __name__ == '__main__':
app.run(host='0.0.0.0', port=5000)
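To exercise the endpoints, a quick test client could look like the sketch below (assumptions: the service is reachable at http://localhost:5000 and the `requests` package is installed; the sample values are illustrative, not from the original setup).

import requests

BASE = "http://localhost:5000"  # assumed address of the Flask service

# Create a row, then read the whole table back
print(requests.post(f"{BASE}/create", json={"id": 1, "name": "Tom", "age": 20}).json())
print(requests.get(f"{BASE}/read").json())

# Rewrite the row with id=1, then delete every row with age < 18
print(requests.post(f"{BASE}/update", json={"id": 1, "name": "Tom", "age": 21}).json())
print(requests.post(f"{BASE}/delete", json={"condition": {"age": 18}}).json())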