解析HTML文件

  1 #!/usr/bin/env python3
  2 
  3 # -*- coding: UTF-8 -*-
  4 
  5 from bs4 import BeautifulSoup
  6 import operator
  7 import os,shutil
  8 import re
  9 
 10 def processhtml(item):
 11   html_path = item
 12   with open(html_path) as fp:
 13     soup = BeautifulSoup(fp, "html.parser")
 14   return soup
 15 
 16 def IsComputer(soup_arg):
 17   soup = soup_arg
 18   result = False
 19   try:
 20     value = soup.find('input', {'name':'资源类型'}).get('value')
 21     if value == '主机':
 22       print('资源类型:主机')
 23       result = True
 24     elif value == '数据库':
 25       print('资源类型:数据库')
 26     else:
 27       print('资源类型:其他')
 28   except:
 29     pass
 30   return result
 31 
 32 def IsAgree(soup_arg):
 33   soup = soup_arg
 34   result = False
 35   try:
 36     for row in soup.findAll('tr'):
 37       cells = row.findAll('td')
 38       if len(cells) == 4:
 39         if cells[1].findChild("font") != None:
 40           nStr = ""
 41           nStr = nStr.join(cells[0].string)
 42           target = ['帐号管理人员处理']
 43           if (operator.eq(nStr.split(), target)):
 44             print(cells[1].font.string)
 45           if (operator.eq(nStr.split(), target) and cells[1].font.string == '同意'):
 46             print("满足条件为:%s && 审批意见(同意)" % nStr.split()[0])
 47             result = True
 48   except IndexError as e:
 49     pass
 50   return result
 51 
 52 def IsIntersect(soup_arg):
 53   soup = soup_arg
 54   result = False
 55   try:
 56     value = soup.find('input', {'name':'239385_资源名称'}).get('value')
 57     temp_list = re.split('[、:\n]', value)
 58     hosts_list = []
 59     hosts_list.clear()
 60     for hostlist in temp_list:
 61       if re.search('[a-z]', hostlist):
 62         print(hostlist)
 63         hosts_list.append(hostlist)
 64     hosts_set = set(hosts_list)
 65     if target_hosts.intersection(hosts_set):
 66       print('非空,有交集')
 67       result = True
 68     else:
 69       print("空,无交集")
 70   except:
 71     pass
 72   return result
 73 
 74 def IsIntersect2(soup_arg):
 75   soup = soup_arg
 76   result = False
 77   try:
 78     value = soup.find('input', {'name':'所在的硬件设备/软件平台'}).get('value')
 79     temp_list = re.split('[、:\n]', value)
 80     hosts_list = []
 81     hosts_list.clear()
 82     for hostlist in temp_list:
 83       if re.search('[a-z]', hostlist):
 84         hosts_list.append(hostlist)
 85     hosts_set = set(hosts_list)
 86     if target_hosts.intersection(hosts_set):
 87       print('非空,有交集')
 88       result = True
 89     else:
 90       print("空,无交集")
 91   except:
 92     pass
 93   return result
 94 
 95 if __name__ == '__main__':
 96   target_hosts = {'cmszsoaa', 'cmszsoab', 'cmszdcss', 'cmszicss', 'cmsznpsa', 'cmsznpsb', 'cmszinta', 'cmszintb',
 97           'cmszdpsa', 'cmszdpsb', 'mcbsoaa', 'mcbsoab', 'mcbinta', 'mcbintb', 'mcbdpsa', 'mcbdpsb',
 98           'mcbnpsa', 'mcbnpsb', 'mcbdcss', 'mcbicss', 'newdcss', 'newicss'}
 99 
100   work_dir = '/root/XmlOut/'
101   target_dir = '/root/AccountOut/'
102 
103   for parent, dirnames, filenames in os.walk(work_dir, followlinks=True):
104     for filename in filenames:
105       file_path = os.path.join(parent, filename)
106       print("filename with full path: %s" % file_path)
107       soup = processhtml(file_path)
108       flag1 = IsComputer(soup)
109       flag2 = IsAgree(soup)
110       flag3 = IsIntersect(soup)
111       flag4 = IsIntersect2(soup)
112       if (flag1 and flag2 and (flag3 or flag4)):
113         print('%s, ok----' % (file_path))
114         shutil.copy(file_path, target_dir)
posted @ 2018-08-08 17:33  东宫得臣  阅读(1111)  评论(0)  编辑  收藏  举报