Crawling Xici (xicidaili.com) proxies with four Python threads
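The script below crawls the four proxy listing categories on xicidaili.com in four threads, checks every scraped proxy for reachability, and stores the working ones in a MySQL table. It assumes the iptables database and its iptable table already exist, and the table must be seeded with at least one working proxy, because each thread draws its first proxy from it before crawling. A minimal one-time setup sketch, assuming pymysql and the credentials used in the script; the column types are assumptions inferred from the code (an integer id and an ip_port string such as "192.168.1.2:8080"):

import pymysql

connection = pymysql.connect(host='127.0.0.1', port=3306,
                             user='root', password='root', charset='utf8')
try:
    with connection.cursor() as cursor:
        # Database and table names come from the crawler script below;
        # the column definitions are assumptions
        cursor.execute('CREATE DATABASE IF NOT EXISTS iptables DEFAULT CHARSET utf8')
        cursor.execute('USE iptables')
        cursor.execute('CREATE TABLE IF NOT EXISTS iptable ('
                       ' id INT AUTO_INCREMENT PRIMARY KEY,'
                       ' ip_port VARCHAR(32) NOT NULL)')
    connection.commit()
finally:
    connection.close()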
import requests
from bs4 import BeautifulSoup
import lxml                 # parser backend used by BeautifulSoup below
import telnetlib            # reachability check for proxies (removed from the stdlib in Python 3.13)
import pymysql.cursors
import random
import threading


BASEURL = 'http://www.xicidaili.com/'  # Xici home page
# Listing URLs for the site's four proxy categories
urls = [BASEURL + 'nn/', BASEURL + 'nt/', BASEURL + 'wn/', BASEURL + 'wt/']

# Request headers; a User-Agent is required
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

# proxies = {'https': 'http://123.57.85.224:80', 'http': 'http://123.57.85.224:80'}


# Open a MySQL connection and return it together with a cursor
def get_cc():
    connection = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                                 password='root', db='iptables', charset='utf8',
                                 cursorclass=pymysql.cursors.DictCursor)
    cursor = connection.cursor()
    return connection, cursor


# Save one ip_port to the database
def save_ip_port(ip_port):
    connection, cursor = get_cc()
    try:
        # Parameterized query instead of string concatenation
        cursor.execute('insert into iptable(ip_port) values(%s)', (ip_port,))
    except Exception:
        print('Failed to save ' + ip_port)
    else:
        connection.commit()
    connection.close()


# Fetch a random ip_port from the database
def get_ip_port():
    connection, cursor = get_cc()
    cursor.execute('select id, ip_port from iptable')
    id_list = cursor.fetchall()  # list of dicts, one per stored proxy
    connection.close()
    i = random.randint(0, len(id_list) - 1)
    id_num = id_list[i]['id']
    ip_port = id_list[i]['ip_port']
    return id_num, ip_port  # e.g. (3, '192.168.1.2:8080')


# Delete a banned proxy from the database. The original deleted by id,
# but main() below only knows the ip_port of the dead proxy, so this
# version deletes by the ip_port value instead.
def del_ip_port(ip_port):
    connection, cursor = get_cc()
    try:
        cursor.execute('delete from iptable where ip_port = %s', (ip_port,))
    except Exception:
        print('Failed to delete ' + ip_port)
    else:
        connection.commit()
    connection.close()


# Build a requests-style proxies dict from an ip_port such as '192.168.2.45:8088'
def get_proxies(ip_port):
    proxy_ip = 'http://' + ip_port
    proxy_ips = 'https://' + ip_port
    return {'https': proxy_ips, 'http': proxy_ip}


# Get the highest page number of one category URL (/nn/, /nt/, ...)
def get_max_pagenum(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, 'lxml')
    max_pagenum = soup.find('div', attrs={'class': 'pagination'}).find_all('a')[-2].string
    return int(max_pagenum)


# Check whether a proxy is reachable; every scraped ip_port is verified
# and saved only if the check passes, otherwise it is discarded
def verifyProxyList(ip_port):
    # proxies = {"http": "http://" + ip_port}
    host, port = ip_port.split(':')
    try:
        # res = requests.get('http://www.baidu.com', headers=headers, proxies=proxies, timeout=5.0)
        telnetlib.Telnet(host, port=int(port), timeout=5)
    except Exception:
        print('---Failure: ' + ip_port)
    else:
        # A Redis-style store would also work here instead of MySQL
        save_ip_port(ip_port)
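# --- Added sketch, not in the original post: the commented-out requests
# call above hints at a stricter check than the telnet probe, which only
# proves that the port accepts connections. This fetches a page through
# the proxy and requires an OK status. The name verify_by_request is
# new; www.baidu.com, headers and the 5-second timeout come from the
# original code.
def verify_by_request(ip_port):
    proxies = {'http': 'http://' + ip_port}
    try:
        res = requests.get('http://www.baidu.com', headers=headers,
                           proxies=proxies, timeout=5.0)
        return res.status_code == requests.codes.ok
    except requests.RequestException:
        return False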
# Scrape one listing page (/nn/1, /nn/2, ...) and verify every proxy on it.
# Returns the proxies dict to use for the next request: if the current
# proxy appears to be banned, it is deleted from the table and a fresh
# one is drawn and returned (the original swapped the proxy but never
# propagated it back to the caller).
def main(url, proxies):
    try:
        response = requests.get(url, headers=headers, proxies=proxies, timeout=5.0)

        if response.status_code != requests.codes.ok:  # e.g. 503 means the IP was banned
            old_ip_port = proxies['http'][len('http://'):]
            del_ip_port(old_ip_port)
            id_num, ip_port = get_ip_port()
            proxies = get_proxies(ip_port)
            print('Switched proxy to: ' + str(proxies))
            return proxies

        soup = BeautifulSoup(response.content, 'lxml')
        results = soup.find_all('tr')
        for result in results[1:]:  # skip the first row: its cells are th, not td
            tdlist = result.find_all('td')
            ip_port = tdlist[1].string + ':' + tdlist[2].string
            verifyProxyList(ip_port)
    except Exception:
        print('Request failed: ' + url)
    return proxies


class myThread(threading.Thread):
    def __init__(self, threadID, name, url):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.url = url

    def run(self):
        print('Running thread: ' + self.name)
        id_num, ip_port = get_ip_port()  # seed proxy drawn from the table
        proxies = get_proxies(ip_port)
        max_pagenum = get_max_pagenum(self.url)
        for i in range(1, max_pagenum + 1):  # + 1 so the last page is included
            url = self.url + str(i)  # self.url already ends with '/'
            proxies = main(url, proxies)


# Build the Xici proxy pool with four threads, one per category
if __name__ == '__main__':
    t1 = myThread(1, "Thread-1", urls[0])
    t2 = myThread(2, "Thread-2", urls[1])
    t3 = myThread(3, "Thread-3", urls[2])
    t4 = myThread(4, "Thread-4", urls[3])
    t1.start()
    t2.start()
    t3.start()
    t4.start()
    t1.join()
    t2.join()
    t3.join()
    t4.join()
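Once the pool has been populated, other crawlers can draw from it by reusing get_ip_port and get_proxies above. A minimal usage sketch under the same assumptions as the script; fetch_with_pool and the idea of deleting a failing proxy on the spot are illustrative, not part of the original post:

def fetch_with_pool(url):
    id_num, ip_port = get_ip_port()  # random proxy from the pool
    proxies = get_proxies(ip_port)
    try:
        return requests.get(url, headers=headers, proxies=proxies, timeout=5.0)
    except requests.RequestException:
        del_ip_port(ip_port)  # assume the proxy is dead and drop it
        return None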