Scraping Spark job information from the Spark UI with Python
# coding=utf-8
from __future__ import unicode_literals

import csv
import sys
from datetime import datetime

import requests
from pyquery import PyQuery as PQ

# Entry point of the Spark standalone master's web UI.
DEFAULT_URL = "http://172.16.67.11:8080"
def get_app_detail(table):
    """Scrape the application links and return (link, id, name) per job."""
    # Skip the first 10 anchors: they point at the 10 workers.
    applications = table("tr>td>a[href]")[10:]
    # Adjacent pairs of anchors belong to the same application.
    info = [(e.text, e.attrib["href"]) for e in applications]
    jobs = zip_list(info, 2)
    # Each pair is ((appid, appid_href), (appname, appname_href));
    # keep the absolute link, the app id and the app name.
    job_info = [(DEFAULT_URL + "/" + i[0][1], i[0][0], i[1][0]) for i in jobs]
    return job_info
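
# Illustrative shape of the return value (the id and name are made up):
#   [("http://172.16.67.11:8080/app?appid=app-20151113xxxx",
#     "app-20151113xxxx", "my-spark-app"), ...]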
def get_app_info(table):
    """Collect the per-application details: every 6 adjacent <td> cells
    describe one job. The first 50 cells are skipped because there are
    10 workers, each rendered with 5 <td> cells.
    """
    tds = table("tr > td")[50:]
    # lxml gives None for cells without direct text, so guard before strip().
    info = [td.text.strip() for td in tds if td.text and td.text.strip()]
    jobs = zip_list(info, 6)
    return jobs
def zip_list(m, k):
    """Group adjacent k elements of m into tuples (leftovers are dropped)."""
    group_adjacent = lambda a, n: zip(*([iter(a)] * n))
    return group_adjacent(m, k)
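
# How the grouping works: zip() receives the *same* iterator k times, so each
# output tuple consumes k consecutive elements. For example:
#   >>> zip_list([1, 2, 3, 4, 5, 6], 2)
#   [(1, 2), (3, 4), (5, 6)]
#   >>> zip_list(["a", "b", "c", "d", "e"], 2)  # the trailing "e" is dropped
#   [('a', 'b'), ('c', 'd')]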
def flat(tree):
    """Flatten an arbitrarily nested list/tuple into a flat list, recursively."""
    res = []
    for i in tree:
        if isinstance(i, (list, tuple)):
            res.extend(flat(i))
        else:
            res.append(i)
    return res
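
# For example:
#   >>> flat([1, 2, [3, 4], [[5, 6], [7, 8]]])
#   [1, 2, 3, 4, 5, 6, 7, 8]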
def ishms(l):
    """Rank a duration string by its unit: hours > minutes > seconds.
    The returned letters ('z' > 'y' > 'x' > 'a') only serve as sort keys."""
    if 'h' in l:
        return 'z'
    elif 'min' in l:
        return 'y'
    elif 's' in l:
        return 'x'
    else:
        return 'a'
def sorted_list(m):
    """Sort rows by duration, descending: the first key is the time unit
    (via ishms), the second is the numeric part of the duration field."""
    sort_func = lambda x: (ishms(x[-1]), float(x[-1].split(" ")[0]))
    return sorted(m, key=sort_func, reverse=True)
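
# The UI renders durations like "2.1 h", "34 min" or "53 s". For example:
#   >>> sorted_list([("a", "53 s"), ("b", "2.1 h"), ("c", "34 min")])
#   [('b', '2.1 h'), ('c', '34 min'), ('a', '53 s')]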
def write_to_csv(filename, tuplelist):
    """Write the result rows to a CSV file with a header row.
    Python 2's csv module cannot write unicode directly, so every field
    is encoded to UTF-8 first."""
    with open(filename, 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['applink', 'appid', 'appname', 'cores', 'memory',
                         'stm', 'user', 'state', 'duration'])
        for row in tuplelist:
            writer.writerow([field.encode('utf-8') for field in row])
def diff_time(stm, etm):
    """Difference in seconds between timestamps like "2015-08-16 01:28:33"."""
    stm = datetime.strptime(stm, "%Y-%m-%d %H:%M:%S")
    etm = datetime.strptime(etm, "%Y-%m-%d %H:%M:%S")
    # .seconds would ignore full days, so use total_seconds() instead.
    return (etm - stm).total_seconds()
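
# For example (one day plus one second apart):
#   >>> diff_time("2015-08-16 01:28:33", "2015-08-17 01:28:34")
#   86401.0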
def get_job_info(dom, starttime, endtime):
    """Join app links with app details, sort by duration, and write the jobs
    whose start time falls inside [starttime, endtime] to a CSV file."""
    apps = get_app_detail(dom)
    job_detail = get_app_info(dom)
    # Pair each (link, id, name) tuple with its 6-field detail tuple,
    # then flatten the nesting into one 9-field row per job.
    job_info = zip(apps, job_detail)
    detail = [tuple(flat(i)) for i in job_info]
    sorted_detail = sorted_list(detail)
    # Keep only the jobs whose start time lies in the requested window.
    result = []
    for line in sorted_detail:
        (applink, appid, appname, cores, memory,
         stm, user, state, duration) = line
        # Normalise "/" date separators to "-" so the lexicographic
        # comparison against the "YYYY-MM-DD hh:mm:ss" bounds is valid.
        stm = stm.replace("/", "-")
        if starttime <= stm <= endtime:
            result.append(line)
    write_to_csv("./SparkJobInfo.csv", result)
def main():
    # Fetch the master page and parse it into a queryable DOM tree.
    rsp = requests.get(DEFAULT_URL)
    dom = PQ(rsp.text)
    # Take the time window from the command line when given,
    # otherwise fall back to a hard-coded window.
    if len(sys.argv) == 3:
        get_job_info(dom, sys.argv[1], sys.argv[2])
    else:
        get_job_info(dom, "2015-11-13 02:00:00", "2015-11-14 02:00:00")
if __name__ == '__main__':
    main()
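
To run this against your own cluster, point DEFAULT_URL at the standalone master's web UI (port 8080 by default) and pass the time window on the command line, e.g. `python spark_job_info.py "2015-11-13 02:00:00" "2015-11-14 02:00:00"` (the file name here is just an example). Note that the offsets 10 and 50 are tied to this cluster's 10 workers; adjust them if your worker count differs.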