模板文件
#!/usr/bin/python3
# coding=utf-8
"""Template script: a date helper and an HDFS path bootstrap helper."""
import datetime
import subprocess

# Application tag used by scripts built from this template.
# NOTE(review): the meaning of "AIS" is not visible here -- confirm with users.
APP = "AIS"


def get_yesterday():
    """Return the calendar date one day before today's local date.

    Returns:
        datetime.date: ``datetime.date.today()`` minus one day.
    """
    return datetime.date.today() - datetime.timedelta(days=1)


def check_hdfs_path(path):
    """Make sure ``path`` exists on HDFS, creating it when it is missing.

    Runs ``hadoop fs -test -e`` and, when that exits non-zero, runs
    ``hadoop fs -mkdir -p``.

    Args:
        path: HDFS path to test and, if necessary, create.

    Raises:
        subprocess.CalledProcessError: if the ``-mkdir`` command fails.
            The original ignored that exit status, so a failed create was
            indistinguishable from success.
    """
    probe = subprocess.run(['hadoop', 'fs', '-test', '-e', path])
    if probe.returncode != 0:
        subprocess.run(['hadoop', 'fs', '-mkdir', '-p', path], check=True)
ganglia启停脚本
#!/usr/bin/python3
# coding:utf-8
"""Start or stop the Ganglia services on the cluster hosts over ssh."""
import subprocess
import sys

# (host, systemd unit) pairs, in the order the commands are issued.
_UNITS = (
    ("hadoop102", "httpd"),
    ("hadoop102", "gmetad"),
    ("hadoop102", "gmond"),
    ("hadoop103", "gmond"),
    ("hadoop104", "gmond"),
)

# action -> (message on success, message on failure)
_MESSAGES = {
    'start': ("ganglia启动成功", "ganglia启动失败"),
    'stop': ("ganglia停止成功", "ganglia停止失败"),
}


def proc(key):
    """Run ``sudo systemctl <key> <unit>`` over ssh for every Ganglia unit.

    Args:
        key: ``'start'`` or ``'stop'``.

    Returns:
        bool: True when every remote command exited with status 0; False
        when any command failed or ``key`` is not a supported action.
    """
    if key not in _MESSAGES:
        # The original silently did nothing for an unknown action.
        print(f"未知参数: {key},仅支持 start 或 stop", file=sys.stderr)
        return False

    failed = []
    for host, unit in _UNITS:
        # An argument list avoids the needless local shell of shell=True.
        done = subprocess.run(["ssh", host, "sudo", "systemctl", key, unit])
        if done.returncode != 0:
            failed.append(f"{host}:{unit}")

    success_msg, failure_msg = _MESSAGES[key]
    if failed:
        # Report failures instead of always claiming success.
        print(f"{failure_msg}: {', '.join(failed)}", file=sys.stderr)
        return False
    print(success_msg)
    return True


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("参数过少,请重新调用")
        # A usage error is a failure; also avoid the site-only exit() helper.
        sys.exit(1)
    sys.exit(0 if proc(sys.argv[1]) else 1)
生成SQL Server的DataX的json文件
#!/usr/bin/python3
# coding=utf-8
"""Generate a DataX job file (sqlserverreader -> hdfswriter) for one table."""
import getopt
import json
import os
import sys

import pymssql

host = '192.168.64.144'
port = '1433'
username = "sa"
password = "000000"

# HDFS NameNode settings; adjust to the actual environment.
hdfs_nn_host = "hadoop102"
hdfs_nn_port = "8020"

output_path = "/opt/module/datax/job/import"
table = ""
database = "AIS20230330120124"

# SQL Server DATA_TYPE -> hdfswriter column type; unlisted types fall back to
# 'string' at lookup time.
# NOTE(review): SQL Server `float` defaults to 8 bytes -- confirm that mapping
# it to "float" rather than "double" is intended.
type_mapping = {
    'nvarchar': 'string',
    'bigint': 'bigint',
    "int": "bigint",
    "smallint": "bigint",
    "tinyint": "bigint",
    "decimal": "string",
    "double": "double",
    "float": "float",
    "binary": "string",
    "char": "string",
    "varchar": "string",
    "datetime": "string",
    "time": "string",
    "timestamp": "string",
    "date": "string",
    "text": "string"
}


def get_table_meta():
    """Read the column names and types of the module-level ``table``.

    Returns:
        tuple[list, list]: ``(reader_column, writer_column)`` where
        ``reader_column`` is the list of column names and ``writer_column``
        is a list of ``{"name": ..., "type": ...}`` dicts, both in ordinal
        column order.
    """
    # Pass the configured port; the original defined it but never used it
    # for the metadata connection.
    conn = pymssql.connect(user=username, password=password, server=host,
                           port=port, database=database)
    try:
        cursor = conn.cursor()
        # Bind the table name as a parameter instead of formatting it into
        # the SQL text, and fix the column order explicitly.
        cursor.execute(
            "SELECT COLUMN_NAME, DATA_TYPE FROM INFORMATION_SCHEMA.COLUMNS "
            "WHERE TABLE_NAME = %s ORDER BY ORDINAL_POSITION",
            (table,),
        )
        rows = cursor.fetchall()
        cursor.close()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    reader_column = [name for name, _ in rows]
    writer_column = [
        {"name": name, "type": type_mapping.get(data_type, 'string')}
        for name, data_type in rows
    ]
    return reader_column, writer_column


def get_json():
    """Write ``<database>.<table>.json`` into ``output_path``.

    Uses the module-level ``database`` and ``table`` settings.

    Raises:
        ValueError: if ``table`` is empty or no columns were found for it.
            The original wrote a job file with an empty column list.
    """
    if not table:
        raise ValueError("source table is required (-t / --sourcetbl)")

    reader_column, writer_column = get_table_meta()
    if not reader_column:
        raise ValueError(f"no columns found for table {table!r} in {database!r}")

    reader = {
        "name": "sqlserverreader",
        "parameter": {
            "username": username,
            "password": password,
            "column": reader_column,
            "splitPk": "",
            "connection": [
                {
                    "table": [table],
                    "jdbcUrl": [
                        f"jdbc:sqlserver://{host}:{port};database={database};encrypt=true"
                        f";trustServerCertificate=true; "
                    ]
                }
            ]
        }
    }
    writer = {
        "name": "hdfswriter",
        "parameter": {
            "column": writer_column,
            "compress": "gzip",
            "defaultFS": f"hdfs://{hdfs_nn_host}:{hdfs_nn_port}",
            "fieldDelimiter": "\t",
            "fileName": table,
            "fileType": "text",
            # Literal placeholder written to the job file as-is.
            "path": "${targetdir}",
            "writeMode": "append"
        }
    }
    datax_json = {
        "job": {
            "setting": {"speed": {"channel": 1}},
            "content": [{"reader": reader, "writer": writer}]
        }
    }

    os.makedirs(output_path, exist_ok=True)
    target = os.path.join(output_path, ".".join([database, table, "json"]))
    with open(target, 'w') as f:
        json.dump(datax_json, f)


if __name__ == '__main__':
    try:
        options, arguments = getopt.getopt(sys.argv[1:], 'd:t:', ['sourcedb=', 'sourcetbl='])
    except getopt.GetoptError as err:
        print(err, file=sys.stderr)
        sys.exit(2)
    for opt_name, opt_value in options:
        if opt_name in ('-d', '--sourcedb'):
            database = opt_value
        if opt_name in ('-t', '--sourcetbl'):
            table = opt_value
    try:
        get_json()
    except ValueError as err:
        print(err, file=sys.stderr)
        sys.exit(1)
生成MySQL的DataX的json文件
# coding=utf-8
"""Generate a DataX job file (mysqlreader -> hdfswriter) for one table."""
import getopt
import json
import os
import sys

import MySQLdb

# MySQL settings; adjust to the actual environment.
mysql_host = "hadoop102"
mysql_port = "3306"
mysql_user = "root"
mysql_passwd = "000000"

# HDFS NameNode settings; adjust to the actual environment.
hdfs_nn_host = "hadoop102"
hdfs_nn_port = "8020"

# Directory the generated job files are written to.
output_path = "/opt/module/datax/job/import"

# MySQL DATA_TYPE (lower-cased) -> hdfswriter column type.
_HIVE_TYPES = {
    "bigint": "bigint",
    "int": "bigint",
    "smallint": "bigint",
    "tinyint": "bigint",
    "decimal": "string",
    "double": "double",
    "float": "float",
    "binary": "string",
    "char": "string",
    "varchar": "string",
    "datetime": "string",
    "time": "string",
    "timestamp": "string",
    "date": "string",
    "text": "string"
}


def get_connection():
    """Open a MySQLdb connection using the module-level MySQL settings."""
    return MySQLdb.connect(host=mysql_host, port=int(mysql_port),
                           user=mysql_user, passwd=mysql_passwd)


def get_mysql_meta(database, table):
    """Return ``(COLUMN_NAME, DATA_TYPE)`` rows for a table, in ordinal order.

    Args:
        database: schema name matched against ``TABLE_SCHEMA``.
        table: table name matched against ``TABLE_NAME``.
    """
    connection = get_connection()
    try:
        cursor = connection.cursor()
        sql = ("SELECT COLUMN_NAME,DATA_TYPE from information_schema.COLUMNS "
               "WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s ORDER BY ORDINAL_POSITION")
        cursor.execute(sql, [database, table])
        fetchall = cursor.fetchall()
        cursor.close()
    finally:
        # Release the connection even when the query fails.
        connection.close()
    return fetchall


def get_mysql_columns(database, table):
    """Return the table's column names as a list.

    A real list is required: on Python 3 ``map()`` returns an iterator that
    ``json.dump`` cannot serialize.
    """
    return [row[0] for row in get_mysql_meta(database, table)]


def get_hive_columns(database, table):
    """Return ``{"name", "type"}`` dicts describing the writer columns.

    Types missing from the mapping fall back to ``"string"`` (the original
    raised ``KeyError``), matching the SQL Server generator's behaviour.
    """
    return [
        {"name": row[0], "type": _HIVE_TYPES.get(row[1].lower(), "string")}
        for row in get_mysql_meta(database, table)
    ]


def generate_json(source_database, source_table):
    """Write ``<source_database>.<source_table>.json`` into ``output_path``.

    Raises:
        ValueError: if no columns are found for the requested table. The
            original wrote a job file with an empty column list.
    """
    reader_columns = get_mysql_columns(source_database, source_table)
    if not reader_columns:
        raise ValueError(
            "no columns found for table %r in database %r"
            % (source_table, source_database))

    job = {
        "job": {
            "setting": {
                "speed": {
                    "channel": 3
                },
                "errorLimit": {
                    "record": 0,
                    "percentage": 0.02
                }
            },
            "content": [{
                "reader": {
                    "name": "mysqlreader",
                    "parameter": {
                        "username": mysql_user,
                        "password": mysql_passwd,
                        "column": reader_columns,
                        "splitPk": "",
                        "connection": [{
                            "table": [source_table],
                            "jdbcUrl": ["jdbc:mysql://" + mysql_host + ":" + mysql_port + "/" + source_database]
                        }]
                    }
                },
                "writer": {
                    "name": "hdfswriter",
                    "parameter": {
                        "defaultFS": "hdfs://" + hdfs_nn_host + ":" + hdfs_nn_port,
                        "fileType": "text",
                        # Literal placeholder written to the job file as-is.
                        "path": "${targetdir}",
                        "fileName": source_table,
                        "column": get_hive_columns(source_database, source_table),
                        "writeMode": "append",
                        "fieldDelimiter": "\t",
                        "compress": "gzip"
                    }
                }
            }]
        }
    }

    if not os.path.exists(output_path):
        os.makedirs(output_path)
    target = os.path.join(output_path, ".".join([source_database, source_table, "json"]))
    with open(target, "w") as f:
        json.dump(job, f)


def main(args):
    """Parse ``-d/--sourcedb`` and ``-t/--sourcetbl`` and generate the job file."""
    source_database = ""
    source_table = ""

    try:
        options, arguments = getopt.getopt(args, 'd:t:', ['sourcedb=', 'sourcetbl='])
    except getopt.GetoptError as err:
        sys.stderr.write("%s\n" % err)
        sys.exit(2)
    for opt_name, opt_value in options:
        if opt_name in ('-d', '--sourcedb'):
            source_database = opt_value
        if opt_name in ('-t', '--sourcetbl'):
            source_table = opt_value

    if not source_database or not source_table:
        sys.stderr.write("both -d/--sourcedb and -t/--sourcetbl are required\n")
        sys.exit(1)
    try:
        generate_json(source_database, source_table)
    except ValueError as err:
        sys.stderr.write("%s\n" % err)
        sys.exit(1)


if __name__ == '__main__':
    main(sys.argv[1:])
启停flume的脚本代码
#!/usr/bin/python3
# coding=utf-8
"""Start or stop the Flume log-collection agents on the cluster over ssh."""
import subprocess
import sys

import psutil  # NOTE(review): unused in this script -- candidate for removal.

_HOSTS = ('hadoop102', 'hadoop103')

# Sent to the remote shell as ONE string so the redirection and the trailing
# `&` take effect on the remote host. With shell=True they were applied to the
# local ssh client instead.
_START_COMMAND = (
    "nohup /opt/module/flume/bin/flume-ng agent -n a1 -c /opt/module/flume/conf/ -f "
    "/opt/module/flume/job/file_to_kafka.conf >/dev/null 2>&1 &"
)


def _stop_agents(host):
    """Kill every process on ``host`` whose ``jps -m`` line mentions the job file."""
    listing = subprocess.run(['ssh', host, 'jps', '-m'], capture_output=True, text=True)
    ok = listing.returncode == 0
    for line in listing.stdout.splitlines():
        if "file_to_kafka" not in line:
            continue
        pid = line.split()[0]  # jps prints the process id first
        if subprocess.run(['ssh', host, 'kill', '-9', pid]).returncode != 0:
            ok = False
    return ok


def proc(key):
    """Start or stop the agent on every configured host.

    Args:
        key: ``'start'`` or ``'stop'``.

    Returns:
        bool: True when every ssh command exited with status 0; False when
        any failed or ``key`` is not a supported action.
    """
    if key not in ('start', 'stop'):
        # The original silently did nothing for an unknown action.
        print(f"未知参数: {key},仅支持 start 或 stop", file=sys.stderr)
        return False

    all_ok = True
    for host in _HOSTS:
        if key == 'start':
            print(f"---------------{host} 节点,日志采集开启------------------------")
            if subprocess.run(['ssh', host, _START_COMMAND]).returncode != 0:
                all_ok = False
        else:
            print(f"----------------{host} 节点,日志采集关闭----------------------------------------")
            if not _stop_agents(host):
                all_ok = False
    return all_ok


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('参数过少,请重新调用')
        # A usage error is a failure; also avoid the site-only exit() helper.
        sys.exit(1)
    sys.exit(0 if proc(sys.argv[1]) else 1)
启停kafka的脚本文件
#!/usr/bin/python3
# coding=utf-8
"""Start or stop the Kafka brokers on the cluster hosts over ssh."""
import subprocess
import sys

_HOSTS = ("hadoop102", "hadoop103", "hadoop104")


def proc(key):
    """Run the Kafka start or stop script on every broker host.

    Args:
        key: ``"start"`` or ``"stop"``.

    Returns:
        bool: True when every ssh command exited with status 0; False when
        any failed or ``key`` is not a supported action.
    """
    if key not in ("start", "stop"):
        # The original silently did nothing for an unknown action.
        print(f"未知参数: {key},仅支持 start 或 stop", file=sys.stderr)
        return False

    all_ok = True
    for host in _HOSTS:
        if key == "start":
            print(f"--------------{host} 开启kafka---------------------")
            command = ["ssh", host, "/opt/module/kafka/bin/kafka-server-start.sh", "-daemon",
                       "/opt/module/kafka/config/server.properties"]
        else:
            print(f"------------------{host} 停止kafka-----------------------------")
            command = ['ssh', host, '/opt/module/kafka/bin/kafka-server-stop.sh']
        # subprocess.run waits for completion like Popen(...).communicate().
        if subprocess.run(command).returncode != 0:
            all_ok = False
    return all_ok


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print("参数过少,重新调用")
        # A usage error is a failure; also avoid the site-only exit() helper.
        sys.exit(1)
    sys.exit(0 if proc(sys.argv[1]) else 1)
xsync
#!/bin/bash
# Copy each given file or directory to the same absolute path on every
# cluster host.

# 1. Require at least one path argument.
if [ $# -lt 1 ]
then
    echo Not Enough Arguement!
    # A usage error must not report success.
    exit 1
fi

# 2. Visit every host in the cluster.
for host in hadoop102 hadoop103 hadoop104
do
    echo ==================== $host ====================
    # 3. Send every requested path in turn; "$@" keeps arguments that
    #    contain whitespace intact.
    for file in "$@"
    do
        # 4. Only existing paths are sent.
        if [ -e "$file" ]
        then
            # 5. Physical (symlink-resolved) absolute parent directory.
            pdir=$(cd -P -- "$(dirname -- "$file")" && pwd)
            # 6. Final path component.
            fname=$(basename -- "$file")
            # The remote shell re-parses the command, so escape the path.
            ssh "$host" "mkdir -p $(printf '%q' "$pdir")"
            rsync -av "$pdir/$fname" "$host:$pdir"
        else
            echo "$file does not exists!"
        fi
    done
done
xcall.sh
#! /bin/bash
# Run the given command line on every cluster host over ssh.

# With no command, ssh would open an interactive login shell on each host in
# turn; refuse instead.
if [ $# -lt 1 ]
then
    echo "Usage: $0 <command> [args...]" >&2
    exit 1
fi

for i in hadoop102 hadoop103 hadoop104
do
    echo --------- $i ----------
    # "$*" joins all arguments into the single command string ssh sends.
    ssh "$i" "$*"
done
zk.sh
#!/bin/bash
# Start, stop, or query ZooKeeper on every cluster host over ssh.

# run_on_all ACTION LABEL
#   Run `zkServer.sh ACTION` on each host, printing a banner with LABEL first.
run_on_all() {
    local action=$1 label=$2
    for i in hadoop102 hadoop103 hadoop104
    do
        echo ---------- zookeeper $i $label ------------
        ssh "$i" "/opt/module/zookeeper-3.5.7/bin/zkServer.sh $action"
    done
}

case $1 in
"start")
    run_on_all start 启动
;;
"stop")
    run_on_all stop 停止
;;
"status")
    run_on_all status 状态
;;
*)
    # The original silently did nothing for a missing or unknown action.
    echo "Usage: $0 {start|stop|status}" >&2
    exit 1
;;
esac
hdp.sh
#!/bin/bash
# Start or stop the Hadoop cluster (HDFS, YARN, history server) over ssh.

if [ $# -lt 1 ]
then
    echo "No Args Input..."
    # A usage error must not report success.
    exit 1
fi

case $1 in
"start")
    echo " =================== 启动 hadoop集群 ==================="

    echo " --------------- 启动 hdfs ---------------"
    ssh hadoop102 "/opt/module/hadoop-3.1.3/sbin/start-dfs.sh"
    echo " --------------- 启动 yarn ---------------"
    ssh hadoop103 "/opt/module/hadoop-3.1.3/sbin/start-yarn.sh"
    echo " --------------- 启动 historyserver ---------------"
    ssh hadoop102 "/opt/module/hadoop-3.1.3/bin/mapred --daemon start historyserver"
;;
"stop")
    echo " =================== 关闭 hadoop集群 ==================="

    # Services are stopped in the reverse of the start order.
    echo " --------------- 关闭 historyserver ---------------"
    ssh hadoop102 "/opt/module/hadoop-3.1.3/bin/mapred --daemon stop historyserver"
    echo " --------------- 关闭 yarn ---------------"
    ssh hadoop103 "/opt/module/hadoop-3.1.3/sbin/stop-yarn.sh"
    echo " --------------- 关闭 hdfs ---------------"
    ssh hadoop102 "/opt/module/hadoop-3.1.3/sbin/stop-dfs.sh"
;;
*)
    echo "Input Args Error..."
    # An unrecognised action is a failure, not a success.
    exit 1
;;
esac