使用 Python 读取 HDFS 上的数据，并筛选（类似 grep）出需要的记录。
代码如下:
import subprocess

# Records are "^"-separated; a record needs at least this many fields
# before the fixed positions used below can be indexed safely.
MIN_FIELDS = 120


def select_tls_fields(line):
    """Return the trimmed output record for one raw input line, or None.

    *line* is a ``bytes`` line as read from the ``hdfs dfs -text`` pipe.
    It is kept only when it has at least ``MIN_FIELDS`` "^"-separated
    fields, field 6 is ``6``, field 25 is ``SSL`` and field 107 is
    non-empty.
    NOTE(review): field 6 == "6" is presumably the IP protocol number for
    TCP and field 107 a TLS attribute -- confirm against the record schema.

    The returned record is fields 0-31, field 95 and fields 105-118 joined
    by "^" and terminated by a newline. The input's own trailing newline
    lives in the last field (index >= 119), which is never copied, so the
    newline is appended explicitly.
    """
    fields = line.split(b"^")
    if len(fields) < MIN_FIELDS:
        return None
    if fields[6] != b"6" or fields[25] != b"SSL" or not fields[107]:
        return None
    kept = fields[:32] + [fields[95]] + fields[105:119]
    return b"^".join(kept) + b"\n"


def export_hour(day, hour, year=2018, month=10):
    """Filter one hour of HDFS data into a local ``tls-metadata-*.txt`` file.

    Runs ``hdfs dfs -text`` on ``/data/<year>/<month>/<day>/<hour>/*.snappy``
    and writes every record accepted by ``select_tls_fields`` to
    ``tls-metadata-<year>-<month>-<day>-<hour>.txt`` in the current
    directory. Returns the exit status of the ``hdfs`` process.
    """
    filename = "tls-metadata-%04d-%02d-%02d-%02d.txt" % (year, month, day, hour)
    path = "/data/%04d/%02d/%02d/%02d/*.snappy" % (year, month, day, hour)
    # Argument list instead of a shell string: the glob is handed to hdfs
    # untouched rather than being offered to the local shell first.
    args = ["hdfs", "dfs", "-text", path]
    print(" ".join(args))

    proc = subprocess.Popen(args, stdout=subprocess.PIPE)
    try:
        # Bytes in, bytes out: the pipe yields bytes on Python 3, and the
        # data's encoding is unknown, so nothing is decoded or re-encoded.
        with open(filename, "wb") as out:
            for line in proc.stdout:
                try:
                    record = select_tls_fields(line)
                    if record is not None:
                        out.write(record)
                except Exception as e:
                    # Best effort: report the offending line and keep going.
                    print(e, "fuck error", line)
    finally:
        # Always release the pipe and reap the child, even on failure.
        proc.stdout.close()
        returncode = proc.wait()

    if returncode != 0:
        print("hdfs exited with status %d for %s" % (returncode, path))
    return returncode


def main():
    """Export every hour of 2018-10-24 through 2018-10-29."""
    for day in range(24, 30):
        for hour in range(0, 24):
            export_hour(day, hour)


if __name__ == "__main__":
    main()