lxml_time_代理
import requests
from pyquery import PyQuery as pq
import json
import jsonpath
from lxml import etree
import os
import re
import time

# Sample HTML kept from earlier lxml/XPath experiments (not used below).
html = '''
<div>
    <ul>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''

# html = requests.get('http://news.4399.com/gonglue/lscs/kptj/').content.decode('gbk')
num = 0

# Earlier PyQuery-based image scraper, kept for reference.
# def pq方法(url):
#     global num
#     html = requests.get(url).content.decode('gbk')
#     doc = pq(html)
#     items = doc('#dq_list > li').items()
#     for item in items:
#         url = item.find('img').attr('lz_src')
#         num += 1
#         print(str(num), url)
#         url_content = requests.get(url).content
#         name = item.find('.kp-name').text()
#         with open('e:/py3/002/' + '{:0>4}'.format(str(num)) + name + '.jpg', 'wb') as file:
#             file.write(url_content)


def transformCodec(re_data):
    """gbk bytes -> unicode str; drop byte ranges the decoder reports as illegal and retry."""
    try:
        re_data = re_data.decode('gbk')
    except Exception as error:
        print(error)
        print('delete illegal string, try again...')
        # UnicodeDecodeError messages read "... can't decode bytes in position X-Y: illegal ...";
        # pull out X and Y, cut that slice away and decode the remainder recursively.
        pos = re.findall(r'decodebytesinposition([\d]+)-([\d]+):illegal', str(error).replace(' ', ''))
        if len(pos) == 1:
            re_data = re_data[0:int(pos[0][0])] + re_data[int(pos[0][1]):]
            re_data = transformCodec(re_data)
        return re_data
    return re_data


def lxml方法(url):
    """Fetch one page of the free proxy list and append "ip:port#name" lines to proxy.txt."""
    global num
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6756.400 QQBrowser/10.3.2473.400'}
    content = requests.get(url, headers=header).content
    html = content.decode('utf-8')
    r = etree.HTML(html)
    # items = r.xpath("//div[@class='box10-content']//ul[@id='dq_list']/li/a/img/@lz_src")
    items = r.xpath("//div[@id='list']/table//tr")
    for item in items:
        dl_ip = item.xpath("./td[1]/text()")      # IP address
        dl_port = item.xpath("./td[2]/text()")    # port
        dl_name = item.xpath("./td[5]/text()")    # location / provider name
        num += 1
        # Each xpath() call returns a list; guard against rows with no matching cell (e.g. the header row).
        dl_ip = dl_ip[0] + ":" if len(dl_ip) >= 1 else ''
        dl_port = dl_port[0] + "#" if len(dl_port) >= 1 else ''
        dl_name = dl_name[0] if len(dl_name) >= 1 else ''
        with open("proxy.txt", 'a', encoding='utf-8') as file:
            file.write('{}{}{}\n'.format(dl_ip, dl_port, dl_name))


if __name__ == '__main__':
    # Start proxy.txt with a YYYY_MM_DD date header, then crawl pages 1-10 of the list.
    with open("proxy.txt", 'w', encoding='utf-8') as file:
        file.write(str(time.localtime()[0]) + '_' + str(time.localtime()[1]) + '_' + str(time.localtime()[2]) + '_采集:\n')
    for i in range(1, 11):
        print('第' + str(i) + '次:\n')  # "page i"
        url2 = r'https://www.kuaidaili.com/free/inha/' + str(i) + r'/'
        print(url2)
        lxml方法(url2)
        time.sleep(5)  # be polite between page requests

    print(str(num) + ' ok!')


# Create output directories e:\py3\001 ... e:\py3\099 (disabled)
'''
for dirnum in range(1, 100):
    dirnum2 = '{:0>3}'.format(str(dirnum))
    mkpath = "e:\\py3\\{}\\".format(dirnum2)
    print(mkpath)
    print('已存在!') if os.path.exists(mkpath) else os.makedirs(mkpath)
'''
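
# Sketch (an assumption, not part of the scraper above): how the generated proxy.txt could be
# consumed. Lines after the date header look like "ip:port#name"; the hypothetical
# load_proxies() below strips the trailing name, and the loop routes a test request through
# each address via requests' proxies parameter. Kept disabled in the same style as the
# directory-creation block above.
'''
def load_proxies(path='proxy.txt'):
    proxies = []
    with open(path, encoding='utf-8') as f:
        for line in f.readlines()[1:]:          # skip the date header line
            addr = line.split('#')[0].strip()   # keep only ip:port
            if addr:
                proxies.append(addr)
    return proxies

for addr in load_proxies():
    try:
        # httpbin.org/ip echoes the caller's IP, so it shows whether the proxy was actually used
        r = requests.get('http://httpbin.org/ip', proxies={'http': 'http://' + addr}, timeout=5)
        print(addr, r.json())
    except Exception as error:
        print(addr, 'failed:', error)
'''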