python获取112报修单数据

Python + requests + BeautifulSoup

使用 requests 登录并爬取 112 报修单信息,用 BeautifulSoup 解析页面后,将结果写入文本文件保存。

  1 #!/usr/bin/python
  2 # -*- coding: utf-8 -*-
  3 import requests
  4 import sys
  5 import http.cookiejar as cookielib
  6 from bs4 import BeautifulSoup
  7 import re
  8 import time 
  9 import random
 10 
# Module-level HTTP session used by every request made in this script.
s = requests.session()
# Replace the default cookie jar with a file-backed LWP cookie jar so that
# the cookies can be written to "LoginCookies-112.txt" with s.cookies.save().
s.cookies = cookielib.LWPCookieJar(filename = "LoginCookies-112.txt")
 13 
 14 def login_post():
 15     userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
 16     header = {'Origin':'http://112.efoxconn.com','Referer':'http://112.efoxconn.com/Home/Index','User-Agent':userAgent}
 17     
 18     login_url = 'http://112.efoxconn.com/Login/Index'
 19     login_data = {'username':'H2605177','password':'123'}
 20 
 21     r = s.post(login_url,data=login_data,headers = header,allow_redirects = False)
 22     time.sleep(random.random()*2)
 23     login_cookies = s.cookies
 24     cok = requests.utils.dict_from_cookiejar(login_cookies)
 25     #print(cok)
 26     #print(login_cookies)
 27     #print(r.text)
 28     print("登陆状态:",r.status_code)
 29     s.cookies.save()
 30 
 31 def get_bynum_1(url,num): #获取承办人信息、申请人信息、满意度信息。
 32     userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
 33     header = {'Host':'112.efoxconn.com','Referer':'http://112.efoxconn.com/UserCenter/ByMe','User-Agent':userAgent}
 34     bynum_url= "http://112.efoxconn.com"+url+"/"+num
 35     r1= s.get(bynum_url)
 36     time.sleep(random.random()*2)
 37     soup1 = BeautifulSoup(r1.text, features='html.parser')
 38     #re_11 = soup1.td.get_text()
 39     re_12 = soup1.find_all("td",class_=re.compile("value"),limit=16)
 40     list_1 = []
 41     for i in range (len(re_12)):
 42         re_13 = re_12[i].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","")
 43         if i == 4:
 44             list_1.append(re_13)
 45         elif i == 5:
 46             list_1.append(re_13)
 47         elif i == 6:
 48             list_1.append(re_13)
 49         elif i == 7:
 50             list_1.append(re_13)
 51         elif i == 11:
 52             list_1.append(re_13)
 53         elif i == 12:
 54             list_1.append(re_13)
 55         elif i == 13:
 56             list_1.append(re_13)
 57         #elif i == 15:
 58             #list_1.append(re_13)
 59 
 60     re_12 = soup1.find_all("td",attrs={"class":"r-value","colspan":"7"})#同时使用多个属性值过滤出满意度评价。
 61     if re_12:
 62         re_13 = re_12[0].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","") 
 63         list_1.append(re_13)
 64     else:
 65         list_1.append("未评价")
 66     #re_12 = re.findall(r'<td.*?>(.*?)</td>',re_11,re.S|re.M)#使用re.findall模块和正则表达式,匹配过滤出目标字符串中的<td>标签内容
 67     #re_12 = re.findall(r'(?<=<td>).+?(?=</td>)',re_11,re.S|re.M)
 68     return list_1
 69 
 70 def get_bynum_2(url,num): #获取申请内容信息
 71     userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
 72     header = {'Host':'112.efoxconn.com','Referer':'http://112.efoxconn.com/UserCenter/ByMe','User-Agent':userAgent}
 73     bynum_url= "http://112.efoxconn.com"+url+"/"+num
 74     r1= s.get(bynum_url)
 75     time.sleep(random.random()*2)
 76     soup1 = BeautifulSoup(r1.text, features='html.parser')
 77     #re_11 = soup1.td.get_text()
 78     list_2 = []
 79     re_12 = soup1.find_all("td",class_=re.compile("nettype-frame-title")) #获取通信单-申请信息标题
 80     for i in range (len(re_12)):
 81         re_13 = re_12[i].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","")
 82         if i == 0:
 83             list_2.append(re_13)
 84 
 85     re_12 = soup1.find_all("div",class_=re.compile("model-header")) #获取特殊单-申请信息标题
 86     for i in range (len(re_12)):
 87         re_13 = re_12[i].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","")
 88         if i == 0:
 89             list_2.append(re_13)
 90 
 91     re_122 = soup1.find_all("td",class_=re.compile("color-blue r-value")) #获取通信单数量(新增分机、迁移分机)
 92     for i in range (len(re_122)):
 93         re_13 = re_122[i].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","")
 94         if i == 0:
 95             list_2.append(re_13)
 96 
 97     re_122 = soup1.find_all("span",style=re.compile("color: blue; font-weight: bold;")) #获取通信单报修数量(网点不通、电话不通)
 98     for i in range (len(re_122)):
 99         re_13 = re_122[i].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","")
100         if i == 0:
101             list_2.append(re_13)
102 
103     re_122 = soup1.find_all("div",style=re.compile("padding-left: 3px;")) #获取通信单功能设定类标题信息
104     for i in range (len(re_122)):
105         re_13 = re_122[i].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","")
106         if i == 0:
107             list_2.append(re_13)
108 
109     re_122 = soup1.find_all("div",class_=re.compile("tb-memo")) #获取通信单《其他》标题下的需求内容描述,特殊单《其他》标题下需求描述
110     for i in range (len(re_122)):
111         re_13 = re_122[i].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","")
112         if i == 0:
113             list_2.append(re_13)
114 
115     re_122 = soup1.find_all("td",class_=re.compile("r-value tb-memo"),limit=2) #获取特殊单需求描述
116     for i in range (len(re_122)):
117         re_13 = re_122[i].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","")
118         if i == 1:
119             list_2.append(re_13)
120 
121     re_122 = soup1.find_all("td",rowspan="1",limit=2) #获领料单信息 
122     for i in range (len(re_122)):
123         re_13 = re_122[i].get_text().replace("\r","").replace("\n","").replace("\000","").replace(" ","")
124         if i == 0:
125             list_2.append(re_13)
126         elif i ==1:
127             list_2.append(re_13)
128 
129     return list_2
130 
def get_bynum_3(url, num):
    """Fetch the approval progress rows (approver and time) for one order.

    Args:
        url: Path of the order page (for example ``"/Network/Sign"``); it is
            appended to the site root.
        num: Order number, appended to ``url`` as the last path segment.

    Returns:
        A list with one entry per ``<tr>`` whose class matches ``"pro-item"``;
        each entry is the list of cleaned texts of that row's ``<td>`` cells.
    """
    userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    header = {
        'Host': '112.efoxconn.com',
        'Referer': 'http://112.efoxconn.com/UserCenter/ByMe',
        'User-Agent': userAgent,
    }
    bynum_url = "http://112.efoxconn.com" + url + "/" + num
    # The headers used to be built but never sent; pass them like the other
    # requests in this script do.
    r1 = s.get(bynum_url, headers=header)
    # Random pause of up to two seconds after the request.
    time.sleep(random.random() * 2)
    soup1 = BeautifulSoup(r1.text, features='html.parser')

    def clean(tag):
        # Strip carriage returns, newlines, NUL characters and spaces.
        return tag.get_text().replace("\r", "").replace("\n", "").replace("\000", "").replace(" ", "")

    # Query the cells on each row directly instead of serializing the row and
    # parsing it again as a separate document.
    rows = soup1.find_all("tr", class_=re.compile("pro-item"))
    return [[clean(cell) for cell in row.find_all("td")] for row in rows]
157 
def get_byme():
    """Query the "ByMe" order list and append one line per order to a file.

    Posts the query form through the module-level session ``s``, splits the
    bracketed part of the response text into per-order records, and for each
    record determines the detail-page path (``a``) and the order number
    (``b``). The data returned by ``get_bynum_1``, ``get_bynum_2`` and
    ``get_bynum_3`` for that order is then appended, ``~``-separated, as one
    line to ``my_check_list.txt``.

    Records for which no detail-page path can be found, or whose approval
    rows are too short to index, are skipped with a printed message.
    """
    userAgent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    header = {
        'Host': '112.efoxconn.com',
        'Referer': 'http://112.efoxconn.com/UserCenter/ByMe',
        'User-Agent': userAgent,
    }

    byme_url = 'http://112.efoxconn.com/UserCenter/ByMe'
    post_data = {'sonbr': '', 'formtype': '', 'startdate': '', 'enddate': '', 'page': '1', 'pagesize': '100'}

    r = s.post(byme_url, data=post_data, headers=header, allow_redirects=False)
    # Random pause of up to three seconds after the request.
    time.sleep(random.random() * 3)
    print("获取查询结果:", r.status_code)
    print('*************************************************************')

    # Extract the bracketed part of the response with a regular expression.
    found = re.findall(r"\[(.+)\]", r.text)
    if found:
        re_2 = re.split(r"\{|\},|\[\'|\'\]", str(found))
        # Drop the empty strings produced by the split.
        re_2 = [item for item in re_2 if item != '']
    else:
        # Without this guard str([]) == "[]" would be treated as one record.
        re_2 = []

    sign_marker = "/Sign"
    for i, record in enumerate(re_2):
        print("报修单序号:" + str(i))
        # Split the record into its fields.
        # NOTE(review): the fixed indices below assume a stable field order
        # in the response; confirm against the actual payload.
        re_3 = re.split(r"[,\"]", str(record))
        if re_3[1] == "SO_NBR":  # the field order identifies the layout
            # Reset per record so a value from the previous record is never
            # reused; the last field containing "/Sign" wins.
            a = None
            for field in re_3:
                if sign_marker in field:
                    a = field
            if a is None:
                print("未找到签核链接,跳过该单")
                continue
            b = re_3[3]
            print(a, b, re_3[13])
        else:
            a = re_3[13]
            b = re_3[18]
            print(a, b, re_3[28])

        list_1 = get_bynum_1(a, b)  # applicant information for this order
        list_2 = get_bynum_2(a, b)  # request content for this order
        list_3 = get_bynum_3(a, b)  # approval rows for this order
        try:
            list_4 = [list_3[-1][-3], list_3[1][-1], list_3[-2][2], list_3[-1][-1]]
        except IndexError:
            # Fewer rows or cells than expected (e.g. an unexpected page).
            print("签核记录不完整,跳过该单")
            continue

        # Interleave every value with the "~" separator.
        list_f = [a, "~", b, "~"]
        for xx in list_4 + list_1 + list_2:
            list_f.append(xx)
            list_f.append("~")
        # Turn the list into a string and strip the list punctuation.
        list_ft = str(list_f).replace("[", "").replace("'", "").replace("]", "").replace(",", "")
        # Append the record and its line break in a single write.
        with open('my_check_list.txt', 'a+', encoding='utf-8', errors='ignore') as f:
            f.write(list_ft + '\n')

    print('#############################################################')
237 
238 
if __name__ == "__main__":
    # Log in first: get_byme() sends its requests through the same session.
    login_post()
    get_byme()

 

posted @ 2019-03-07 08:26  he_ding  阅读(478)  评论(0编辑  收藏  举报