Crawling mzitu through proxies
# Import libraries

import os
import requests
from bs4 import BeautifulSoup
import time
from config import *
import random
import re
from requests import ConnectionError
# Build the request headers for mzitu

def res_headers():
    headers = {
        'User-Agent': random.choice(USER_AGENT_LIST),
        'Referer': random.choice(REFERER_LIST),
    }
    return headers
# Build a single user-agent header

def get_header():
    headers = {
        'User-Agent': random.choice(USER_AGENT_LIST)
    }
    return headers
# Fetch the proxy list, then let check_ip return a usable proxy

def get_proxy_list():
    ip_list = []
    base_url = 'https://www.xicidaili.com/wt/'
    header = get_header()
    actual_url = base_url + str(random.randint(1, 300))
    try:
        res = requests.get(url=actual_url, headers=header)
        if res.status_code == 200:
            html = res.text
            # pull ip/port pairs out of the listing page
            pattern = r'(\d+\.\d+\.\d+\.\d+)</td>\s*<td>(\d+)'
            re_list = re.findall(pattern, html)
            for ip_port in re_list:
                ip_port = ip_port[0] + ':' + ip_port[1]
                ip_list.append(ip_port)
            return check_ip(ip_list)
        else:
            return get_proxy_list()
    except ConnectionError:
        return get_proxy_list()
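One caveat: on any failure get_proxy_list() calls itself, so a long streak of bad responses from xicidaili ends in a RecursionError. A minimal sketch of the same logic as a bounded retry loop (get_proxy_list_safe and max_retries are my own names, not from the original post):

def get_proxy_list_safe(max_retries=5):
    # retry a bounded number of times instead of recursing
    for _ in range(max_retries):
        actual_url = 'https://www.xicidaili.com/wt/' + str(random.randint(1, 300))
        try:
            res = requests.get(url=actual_url, headers=get_header(), timeout=8)
        except ConnectionError:
            continue
        if res.status_code != 200:
            continue
        re_list = re.findall(r'(\d+\.\d+\.\d+\.\d+)</td>\s*<td>(\d+)', res.text)
        ip_list = ['%s:%s' % (ip, port) for ip, port in re_list]
        if ip_list:
            return check_ip(ip_list)
    return None  # caller falls back to a direct connection (proxies=None)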
# Check for a working proxy

def check_ip(ip_list):
    # validate a random candidate directly against the target site
    url_baidu = 'https://www.mzitu.com/'
    proxy_ip = 'http://' + random.choice(ip_list)
    proxy_ip_dic = {
        'http': proxy_ip
    }
    header = get_header()
    try:
        res = requests.get(url_baidu, headers=header, proxies=proxy_ip_dic, timeout=8)
        if res.status_code == 200:
            return proxy_ip_dic
    except ConnectionError:
        return get_proxy_list()
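check_ip() only tests one random candidate from the freshly scraped list, so a single dead IP throws the whole batch away. A hedged alternative of mine that walks the list and returns the first proxy that actually reaches the target:

def first_working_proxy(ip_list, timeout=8):
    test_url = 'https://www.mzitu.com/'
    for ip_port in ip_list:
        proxy_dic = {'http': 'http://' + ip_port}
        try:
            res = requests.get(test_url, headers=get_header(),
                               proxies=proxy_dic, timeout=timeout)
            if res.status_code == 200:
                return proxy_dic
        except requests.RequestException:
            continue  # dead proxy, try the next one
    return None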
# Request a page

def get_page(url):
    headers = res_headers()
    # create a session
    s = requests.session()
    s.keep_alive = False
    # fetch the page
    res = s.get(url, headers=headers)
    html = res.text
    return html
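As far as I can tell, requests.Session has no keep_alive attribute, so the s.keep_alive = False line above has no effect; sending a Connection: close header is what actually disables keep-alive. A minimal sketch of mine (get_page_safe is a hypothetical name) that does that and adds transport-level retries via requests' HTTPAdapter:

from requests.adapters import HTTPAdapter

def get_page_safe(url, retries=3):
    s = requests.session()
    # retry failed HTTPS connections a few times at the transport level
    s.mount('https://', HTTPAdapter(max_retries=retries))
    headers = res_headers()
    headers['Connection'] = 'close'  # actually disables keep-alive
    res = s.get(url, headers=headers, timeout=10)
    return res.text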
# Collect the detail-page URL of every gallery on the archive page

def get_all_girls(url):
    html = get_page(url)
    # build the soup
    soup = BeautifulSoup(html, 'html.parser')
    # grab every 'a' tag under class_='archives'
    total_info = soup.find(class_='archives').find_all('a')
    # walk the 'a' tags and read each 'href'
    all_list = []
    for girls_info in total_info:
        link_url = girls_info['href']
        all_list.append(link_url)
    return all_list
# Collect every image URL of one gallery

def get_girl_all_page(url):
    print('collecting all image urls for this gallery')
    html = get_page(url)
    soup = BeautifulSoup(html, 'html.parser')
    # the second-to-last 'a' in class_='pagenavi' holds the page count in its 'span'
    max_page = soup.find(class_='pagenavi').find_all('a')[-2].find('span').string
    title = soup.find(class_='main-title').string
    # loop over the detail pages and read the 'src' of each 'img' tag
    headers = res_headers()
    proxy = get_proxy_list()
    pic_url_list = []
    for i in range(int(max_page)):
        page_url = url + "/%s" % (i + 1)
        html = requests.get(page_url, headers=headers, proxies=proxy).text
        soup = BeautifulSoup(html, 'html.parser')
        pic_url = soup.find('img').get('src')
        pic_url_list.append(pic_url)
        time.sleep(0.1)
    download_Pic(title, pic_url_list)
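The chained soup.find(class_='pagenavi').find_all('a')[-2] lookup raises AttributeError or IndexError the moment a page renders differently (for example when a proxy returns an error page). A small defensive helper, sketched under the assumption that a missing pagination bar means a one-page gallery:

def safe_max_page(soup):
    # fall back to a single page when the pagination bar is missing or odd
    nav = soup.find(class_='pagenavi')
    if nav is None:
        return 1
    links = nav.find_all('a')
    if len(links) < 2:
        return 1
    span = links[-2].find('span')
    try:
        return int(span.string)
    except (TypeError, ValueError, AttributeError):
        return 1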
# Download the images into a folder named after the gallery title

def download_Pic(title, pic_url_list):
    # create the target folder
    os.mkdir(title)
    headers = res_headers()
    proxy = get_proxy_list()
    # running file number
    j = 1
    # download each image
    for item in pic_url_list:
        # build the file path and name
        filename = '%s/%s.jpg' % (title, str(j))
        print('downloading....%s : NO.%s' % (title, str(j)))
        with open(filename, 'wb') as f:
            img = requests.get(item, headers=headers, proxies=proxy).content
            f.write(img)
        j += 1
        time.sleep(10)
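os.mkdir(title) raises FileExistsError when a run is resumed, and a gallery title may contain characters Windows refuses in a path. A hedged sketch (safe_dir is my own helper name) that sanitizes the title and tolerates an existing folder:

def safe_dir(title):
    # strip characters that are illegal in Windows paths
    clean = re.sub(r'[\\/:*?"<>|]', '_', title).strip()
    os.makedirs(clean, exist_ok=True)  # no error if the folder already exists
    return clean

download_Pic would then write into safe_dir(title) instead of title.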
# Main program

if __name__ == '__main__':
    url = "https://www.mzitu.com/all"
    pic_list = get_all_girls(url)
    for i in pic_list:
        get_girl_all_page(i)
#config.py
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.21.8 (KHTML, like Gecko) Version/4.0.4 Safari/531.21.10",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12",
    "Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.8 (KHTML, like Gecko) Chrome/4.0.302.2 Safari/532.8",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.186 Safari/535.1",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.54 Safari/535.2",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
    "Mozilla/5.0 (Macintosh; U; Mac OS X Mach-O; en-US; rv:2.0a) Gecko/20040614 Firefox/3.0.0",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.0.3) Gecko/2008092414 Firefox/3.0.3",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5",
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.14) Gecko/20110218 AlexaToolbar/alxf-2.0 Firefox/3.6.14",
    "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10.5; en-US; rv:1.9.2.15) Gecko/20110303 Firefox/3.6.15",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
]

REFERER_LIST = [
    'https://www.mzitu.com/215756',
    'https://www.mzitu.com/201236',
    'https://www.mzitu.com/214521',
    'https://www.mzitu.com/200253',
    'https://www.mzitu.com/214751',
    'https://www.mzitu.com/199934',
    'https://www.mzitu.com/214404',
    'https://www.mzitu.com/199190',
    'https://www.mzitu.com/214261',
    'https://www.mzitu.com/199970',
]
After about an hour of crawling, my own IP got banned by the proxy site. Hahaha.
Traceback (most recent call last):
  File "D:\Python38\lib\site-packages\urllib3\connection.py", line 156, in _new_conn
    conn = connection.create_connection(
  File "D:\Python38\lib\site-packages\urllib3\util\connection.py", line 84, in create_connection
    raise err
  File "D:\Python38\lib\site-packages\urllib3\util\connection.py", line 74, in create_connection
    sock.connect(sa)
TimeoutError: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:\Python38\lib\site-packages\urllib3\connectionpool.py", line 665, in urlopen
    httplib_response = self._make_request(
  File "D:\Python38\lib\site-packages\urllib3\connectionpool.py", line 376, in _make_request
    self._validate_conn(conn)
  File "D:\Python38\lib\site-packages\urllib3\connectionpool.py", line 994, in _validate_conn
    conn.connect()
  File "D:\Python38\lib\site-packages\urllib3\connection.py", line 334, in connect
    conn = self._new_conn()
  File "D:\Python38\lib\site-packages\urllib3\connection.py", line 168, in _new_conn
    raise NewConnectionError(
urllib3.exceptions.NewConnectionError: <urllib3.connection.VerifiedHTTPSConnection object at 0x000001E898850C40>: Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:\Python38\lib\site-packages\requests\adapters.py", line 439, in send
    resp = conn.urlopen(
  File "D:\Python38\lib\site-packages\urllib3\connectionpool.py", line 719, in urlopen
    retries = retries.increment(
  File "D:\Python38\lib\site-packages\urllib3\util\retry.py", line 436, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='www.mzitu.com', port=443): Max retries exceeded with url: /194229/30 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x000001E898850C40>: Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。'))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "D:/project/mzitu/spider.py", line 132, in <module>
    get_girl_all_page(i,proxy)
  File "D:/project/mzitu/spider.py", line 98, in get_girl_all_page
    html = requests.get(page_url,headers=res_headers(),proxies=proxy).text
  File "D:\Python38\lib\site-packages\requests\api.py", line 75, in get
    return request('get', url, params=params, **kwargs)
  File "D:\Python38\lib\site-packages\requests\api.py", line 60, in request
    return session.request(method=method, url=url, **kwargs)
  File "D:\Python38\lib\site-packages\requests\sessions.py", line 533, in request
    resp = self.send(prep, **send_kwargs)
  File "D:\Python38\lib\site-packages\requests\sessions.py", line 646, in send
    r = adapter.send(request, **kwargs)
  File "D:\Python38\lib\site-packages\requests\adapters.py", line 516, in send
    raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPSConnectionPool(host='www.mzitu.com', port=443): Max retries exceeded with url: /194229/30 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x000001E898850C40>: Failed to establish a new connection: [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。'))
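That traceback is exactly the failure mode the script cannot recover from: once the current proxy (or your own IP) dies mid-gallery, the bare requests.get inside get_girl_all_page raises and the whole run stops. A minimal sketch, assuming the functions above, that rotates to a fresh proxy instead of crashing (get_with_rotation is my own wrapper name):

def get_with_rotation(url, proxy, max_attempts=3):
    # retry with a freshly validated proxy each time the current one dies
    for _ in range(max_attempts):
        try:
            res = requests.get(url, headers=res_headers(),
                               proxies=proxy, timeout=10)
            return res, proxy
        except ConnectionError:
            proxy = get_proxy_list()  # rotate to a new proxy and try again
    raise ConnectionError('all %s attempts failed for %s' % (max_attempts, url))

Inside the page loop you would call res, proxy = get_with_rotation(page_url, proxy), so a replacement proxy is kept for the following pages.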