# Scrape Spark job information from the Spark master web UI (python抓取SparkUI中job信息)

#coding=utf-8

from __future__ import unicode_literals

import codecs
import json
import sys
import re
import csv
import xlsxwriter
from datetime import datetime
import logging
logging.basicConfig(level = logging.INFO)

import requests
from pyquery import PyQuery as PQ
from operator import itemgetter, attrgetter

DEFAULT_URL = "http://172.16.67.11:8080"

def get_app_detail(table):
    """Scrape application links from the master page.

    Returns a list of (absolute app link, app id, app name) triples.
    The first 10 ``<a href>`` cells belong to worker rows and are skipped.
    NOTE(review): the count of 10 assumes exactly 10 workers — confirm.
    """
    # Drop the worker links; the remaining anchors alternate between
    # the app-id link and the app-name link of each job.
    applications = table("tr>td>a[href]")[10:]

    # Two adjacent (text, href) tuples describe one job.
    info = [(e.text, e.attrib["href"]) for e in applications]
    jobs = zip_list(info, 2)

    # Each pair looks like ((appid, appid_href), (appname, appname_href));
    # keep the absolute link, the app id and the app name.
    job_info = [(DEFAULT_URL + "/" + i[0][1], i[0][0], i[1][0]) for i in jobs]
    return job_info

def get_app_info(table):
    """Scrape per-application detail cells, grouped six per job.

    The first 50 ``<td>`` cells belong to the worker rows (10 workers with
    5 cells each) and are skipped. Every 6 consecutive non-empty cell texts
    describe one job. Returns an iterable of 6-tuples.
    """
    tds = table("tr > td")[50:]
    # A cell with no text has .text == None — treat it as an empty string
    # instead of crashing, and drop empty values entirely.
    info = [(td.text or "").strip() for td in tds if (td.text or "").strip()]
    # Group adjacent runs of 6 values: zip() pulls from the same iterator
    # six times per output tuple (same trick as zip_list).
    return zip(*[iter(info)] * 6)

def zip_list(m, k):
    """Group adjacent runs of *k* elements of *m* into tuples.

    Leftover elements (when len(m) is not a multiple of k) are dropped,
    matching zip() semantics.
    """
    # zip() pulls k times from the SAME iterator, so each output tuple
    # contains k adjacent elements of m.
    return zip(*[iter(m)] * k)

#打平列表,递归的方法
# Flatten an arbitrarily nested list/tuple into a flat list (recursive).
def flat(tree):
    """Return a flat list of all leaf elements of *tree*, in order."""
    result = []
    for node in tree:
        # Recurse into nested sequences; anything else is a leaf.
        result.extend(flat(node) if isinstance(node, (list, tuple)) else [node])
    return result

def ishms(l):
    """Rank a duration string by its time unit: hours > minutes > seconds.

    Returns a sentinel letter chosen so plain string comparison orders
    units correctly: 'z' (h) > 'y' (min) > 'x' (s) > 'a' (no unit).
    """
    # Order matters: check 'min' before bare 's' would also match 'min'?
    # No — 'min' contains no 's'; but 'h' must come first regardless.
    if 'h' in l:
        return 'z'
    elif 'min' in l:
        return 'y'
    elif 's' in l:
        return 'x'
    else:
        return 'a'

def sorted_list(m):
    """Sort job rows by duration, longest first.

    Each row's last field is a duration string like "2.3 min". The primary
    sort key is the time unit (h > min > s, via ishms) and the secondary
    key is the numeric part; the result is in descending order.
    """
    def _key(row):
        duration = row[-1]
        return (ishms(duration), float(duration.split(" ")[0]))
    return sorted(m, key=_key, reverse=True)

def write_to_csv(filename, tuplelist):
    """Write job rows to *filename* as CSV with a fixed header row.

    The original used the Python 2-only ``file()`` builtin and never closed
    the handle on error; ``with open(...)`` fixes both. ``newline=""`` is
    the documented way to open a file for the csv module on Python 3.
    """
    with open(filename, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['applink', 'appid', 'appname', 'cores', 'memory', 'stm', 'user', 'state', 'duration'])
        writer.writerows(tuplelist)

def diff_time(stm, etm):
    """Return the difference in seconds between two "%Y-%m-%d %H:%M:%S" times.

    Uses total_seconds() instead of .seconds: .seconds is only the
    seconds-within-a-day component, so it silently drops whole days and
    is wrong for negative spans.
    """
    stm = datetime.strptime(stm, "%Y-%m-%d %H:%M:%S")
    etm = datetime.strptime(etm, "%Y-%m-%d %H:%M:%S")
    return int((etm - stm).total_seconds())

def get_job_info(dom, starttime, endtime):
    """Build job rows from the parsed master page, sort them by duration,
    keep only jobs whose start time falls in [starttime, endtime], and
    write the result to ./SparkJobInfo.csv.
    """
    apps = get_app_detail(dom)
    details = get_app_info(dom)

    # Recursively flatten each (app-triple, detail-6-tuple) pair into
    # one flat 9-tuple per job.
    def _flatten(x):
        if isinstance(x, (list, tuple)):
            return [leaf for part in x for leaf in _flatten(part)]
        return [x]

    rows = [tuple(_flatten(pair)) for pair in zip(apps, details)]
    ranked = sorted_list(rows)

    # Filter by the job start time (field 5); the UI uses "/" as the date
    # separator, normalize to "-" so string comparison with the bounds works.
    selected = []
    for row in ranked:
        start = row[5].replace("/", "-")
        if starttime <= start <= endtime:
            selected.append(row)

    write_to_csv("./SparkJobInfo.csv", selected)

def main():
    """Fetch the Spark master UI front page and dump job info to CSV."""
    page = requests.get(DEFAULT_URL)
    doc = PQ(page.text)

    # Fixed time window; could instead read the bounds from the command line:
    # get_job_info(doc, sys.argv[1], sys.argv[2])
    get_job_info(doc, "2015-11-13 02:00:00", "2015-11-14 02:00:00")


# Run the scraper only when executed directly (not on import).
if __name__ == '__main__':
  main()

 

posted @ 2015-11-18 15:17  l_g1990  阅读(435)  评论(0编辑  收藏  举报