python之路 — Web Scraping: Getting Started with the requests Module
I. Scraping videos from xiaohuar.com
1. Sequential scraping
import requests
import re
import hashlib
import time
import os

downLoad_path = r'D:\DOWNLOAD'

def get_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response.text

def parse_index(index_contents):
    # pull every detail-page link off the index page
    detail_urls = re.findall('class="items".*?href="(.*?)"', index_contents, re.S)
    for detail_url in detail_urls:
        if not detail_url.startswith("http"):
            detail_url = "http://www.xiaohuar.com" + detail_url
        print(detail_url)
        yield detail_url

def parse_detail(detail_contents):
    # pull the video URL off the detail page
    movie_urls = re.findall('id="media".*?src="(.*?)"', detail_contents, re.S)
    if movie_urls:
        movie_url = movie_urls[0]
        if movie_url.endswith('mp4'):
            print(movie_url)
            yield movie_url

def down_load(movie_url):
    try:
        response = requests.get(movie_url)
        if response.status_code == 200:
            data = response.content
            # name the file with an md5 of timestamp + URL so names never collide
            m = hashlib.md5()
            m.update(str(time.time()).encode('utf-8'))
            m.update(movie_url.encode('utf-8'))
            file_path = os.path.join(downLoad_path, '%s.mp4' % m.hexdigest())
            with open(file_path, 'wb') as f:
                f.write(data)
                f.flush()
                print("download finished")
    except Exception:
        pass

def main():
    raw_url = "http://www.xiaohuar.com/list-3-{page_num}.html"
    for i in range(5):
        index_url = raw_url.format(page_num=i)
        index_contents = get_page(index_url)
        if index_contents is None:  # non-200 pages return None; skip them
            continue
        detail_urls = parse_index(index_contents)
        for detail_url in detail_urls:
            detail_contents = get_page(detail_url)
            if detail_contents is None:
                continue
            movie_urls = parse_detail(detail_contents)
            for movie_url in movie_urls:
                down_load(movie_url)

if __name__ == '__main__':
    main()
2. Concurrent scraping
Concurrent crawling with a thread pool:

import requests
import re
import hashlib
import time
import os
from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(50)
downLoad_path = r'D:\DOWNLOAD'

def get_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
    except Exception:
        pass

def parse_index(index_contents):
    # the callback receives a Future; .result() yields get_page()'s return value
    index_contents = index_contents.result()
    if index_contents is None:  # failed downloads return None; skip them
        return
    detail_urls = re.findall('class="items".*?href="(.*?)"', index_contents, re.S)
    for detail_url in detail_urls:
        if not detail_url.startswith("http"):
            detail_url = "http://www.xiaohuar.com" + detail_url
        print(detail_url)
        pool.submit(get_page, detail_url).add_done_callback(parse_detail)

def parse_detail(detail_contents):
    detail_contents = detail_contents.result()
    if detail_contents is None:
        return
    movie_urls = re.findall('id="media".*?src="(.*?)"', detail_contents, re.S)
    if movie_urls:
        movie_url = movie_urls[0]
        if movie_url.endswith('mp4'):
            print(movie_url)
            pool.submit(down_load, movie_url)

def down_load(movie_url):
    try:
        response = requests.get(movie_url)
        if response.status_code == 200:
            data = response.content
            m = hashlib.md5()
            m.update(str(time.time()).encode('utf-8'))
            m.update(movie_url.encode('utf-8'))
            file_path = os.path.join(downLoad_path, '%s.mp4' % m.hexdigest())
            with open(file_path, 'wb') as f:
                f.write(data)
                f.flush()
                print("download finished")
    except Exception:
        pass

def main():
    raw_url = "http://www.xiaohuar.com/list-3-{page_num}.html"
    for i in range(5):
        index_url = raw_url.format(page_num=i)
        pool.submit(get_page, index_url).add_done_callback(parse_index)

if __name__ == '__main__':
    main()
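The core idiom above is chaining pool.submit() with add_done_callback(): the callback is not handed the function's return value directly but a Future object, so it must call .result() to unwrap it (which is why parse_index and parse_detail start that way). A minimal, self-contained sketch of the pattern, with hypothetical names:

from concurrent.futures import ThreadPoolExecutor

pool = ThreadPoolExecutor(2)

def work(x):
    return x * 2

def on_done(future):
    # the callback receives a Future, not work()'s return value; unwrap it
    print(future.result())  # prints 42

pool.submit(work, 21).add_done_callback(on_done)
pool.shutdown(wait=True)  # wait for pending tasks before exiting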
II. Introduction to the requests module
# Overview: requests lets you simulate browser requests. Compared with the urllib we used
# before, its API is much more convenient (under the hood it simply wraps urllib3).
# Note: requests only downloads the page content; it does not execute JavaScript. For
# JS-rendered data you must analyze the target site yourself and issue further requests.
# Install: pip3 install requests
# Request methods: the most common are requests.get() and requests.post()

>>> import requests
>>> r = requests.get('https://api.github.com/events')
>>> r = requests.post('http://httpbin.org/post', data={'key': 'value'})
>>> r = requests.put('http://httpbin.org/put', data={'key': 'value'})
>>> r = requests.delete('http://httpbin.org/delete')
>>> r = requests.head('http://httpbin.org/get')
>>> r = requests.options('http://httpbin.org/get')

# Before studying requests in earnest, it helps to review the HTTP protocol first:
# http://www.cnblogs.com/linhaifeng/p/6266327.html
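For reference, the Response object returned by these calls exposes a handful of attributes used constantly below (all part of the documented requests API):

import requests

r = requests.get('https://api.github.com/events')
print(r.status_code)         # HTTP status code, e.g. 200
print(r.encoding)            # encoding used to decode r.text
print(r.content[:60])        # raw body as bytes
print(r.text[:60])           # body decoded to str
print(r.cookies.get_dict())  # cookies set by the response, as a dict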
III. GET requests with the requests module
Besides the required url argument, requests.get() accepts several other parameters. They are introduced below together with practical examples.
1. The headers parameter
We usually need to send request headers along with a request, since headers are the key to disguising the client as a browser. The most commonly useful header fields are Referer (large sites often check it to see where a request came from), User-Agent (client information), and Cookie (cookies do travel in the request headers, but requests handles them through a dedicated parameter, so they can be left out of headers={}). In the following example, a request to zhihu.com must carry client information in headers, or the request fails:
response = requests.get(
    "https://www.zhihu.com/explore",
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    },
)
print(response.status_code)
print(response.text)
2. The params parameter
This parameter is mainly used for GET requests that carry query parameters; it splices them into the request URL. Usage examples:
Method one:
import requests
from urllib.parse import urlencode

params = {
    "wd": "美女",
    "pn": 1,
}
url = "https://www.baidu.com/s?%s" % urlencode(params, encoding='utf-8')
print(url)
# result: https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3&pn=1

response = requests.get(
    url,
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    },
)
response.encoding = 'utf-8'
with open("test.html", "w", encoding="utf-8") as f:
    f.write(response.text)
As shown above, the parameters to append are listed as a dict. Although the browser's address bar may display some query parameters in Chinese, that is only how the browser renders them; the request path that actually goes over the wire is percent-encoded, which is why we run the parameters through urlencode.
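To see the round trip for yourself, urllib.parse.unquote reverses the percent-encoding. A small sketch using the same parameters as above:

from urllib.parse import urlencode, unquote

encoded = urlencode({"wd": "美女", "pn": 1}, encoding='utf-8')
print(encoded)           # wd=%E7%BE%8E%E5%A5%B3&pn=1
print(unquote(encoded))  # wd=美女&pn=1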
Method two (recommended):
response = requests.get(
    "https://www.baidu.com/s?",
    params={
        "wd": "美女",
        "pn": 1,
    },
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    },
)
# print(response.text)
with open("test.html", "w", encoding="utf-8") as f:
    f.write(response.text)
As shown above, because the requests module already wraps urllib, we do not need to encode and splice the parameters ourselves: passing them through params makes requests build the query string in the URL automatically.
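You can confirm what requests actually sent by inspecting response.url, a standard attribute of the Response object that holds the final, percent-encoded URL. A quick check against httpbin.org (which does not redirect):

import requests

r = requests.get('http://httpbin.org/get', params={"wd": "美女", "pn": 1})
print(r.url)  # http://httpbin.org/get?wd=%E7%BE%8E%E5%A5%B3&pn=1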
3. The cookies parameter
When simulating requests, cookies are an indispensable parameter most of the time. They can be sent in either of the following two ways.
Method one:
response = requests.get(
    'https://github.com/settings/emails',
    headers={
        'Cookie': 'user_session=TXvPZ4cH1z-aVuMnXOokwsQzYzg-yAdMT4oUbTCRfwZJyyE7',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    },
)
print('931880645@qq.com' in response.text)  # True
As shown above, the cookie can be put directly into the request headers, but since requests provides its own cookies parameter, we generally avoid this. The recommended approach is method two below.
Method two (recommended):
response = requests.get(
    'https://github.com/settings/emails',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    },
    cookies={
        'user_session': 'TXvPZ4cH1z-aVuMnXOokwsQzYzg-yAdMT4oUbTCRfwZJyyE7',
    },
)
print('931880645@qq.com' in response.text)  # True
IV. POST requests with the requests module
requests.post() is used exactly the same way as requests.get(); what is special is that requests.post() takes a data parameter that holds the request body. GET and POST requests compare as follows (a small sketch showing where data ends up follows the list):
#GET requests
HTTP's default request method is GET.
* No request body.
* The data must stay within about 1K!
* GET data is exposed in the browser's address bar.
Common GET operations:
1. Entering a URL directly in the browser's address bar always issues a GET request.
2. Clicking a hyperlink on a page also always issues a GET request.
3. A form submits with GET by default, but this can be changed to POST.

#POST requests
(1) The data does not appear in the address bar.
(2) There is no upper limit on the data size.
(3) There is a request body.
(4) Chinese characters in the request body are URL-encoded!
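To see where the data parameter ends up, here is a minimal sketch against httpbin.org, which echoes the request back; form fields sent via data= appear under "form" in the echoed body:

import requests

r = requests.post('http://httpbin.org/post', data={'key': 'value'})
print(r.json()['form'])  # {'key': 'value'} — the fields travelled in the request body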
The following examples simulate an automated login to GitHub:
Method one (handling cookies manually):
import requests
import re

# Step 1: GET https://github.com/login to obtain the unauthorized cookie and the authenticity_token
response = requests.get(
    'https://github.com/login',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    },
)
cookies = response.cookies.get_dict()
authenticity_token = re.findall('name="authenticity_token".*?value="(.*?)"', response.text, re.S)[0]

# Step 2: POST to https://github.com/session with the unauthorized cookie, the
# authenticity_token, and the account credentials to obtain the authorized cookie
response = requests.post(
    'https://github.com/session',
    cookies=cookies,
    headers={
        'Referer': 'https://github.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    },
    data={
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': authenticity_token,
        'login': 'your-username',
        'password': 'your-plaintext-password',
    },
    allow_redirects=False,  # do not follow the redirect issued after a successful login
)
login_cookies = response.cookies.get_dict()  # the authorized cookies from the login response

# Step 3: visit the target page with the authorized cookies
response = requests.get(
    'https://github.com/settings/emails',
    cookies=login_cookies,
    headers={
        'Referer': 'https://github.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    },
)
print('931880645@qq.com' in response.text)
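Note the role of allow_redirects=False in step 2: a successful login answers with a redirect, and the authorized Set-Cookie headers arrive on that redirect response. If requests were allowed to follow it, response.cookies would hold the cookies of the final page rather than those issued by the login response itself.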
Method two (automatic cookie handling with a session):
import requests
import re

session = requests.session()

# Step 1: GET https://github.com/login to obtain the unauthorized cookie and the authenticity_token
r1 = session.get(
    'https://github.com/login',
    headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    },
)
authenticity_token = re.findall('name="authenticity_token".*?value="(.*?)"', r1.text, re.S)[0]

# Step 2: POST to https://github.com/session with the authenticity_token and the
# account credentials to obtain the authorized cookie
r2 = session.post(
    'https://github.com/session',
    headers={
        'Referer': 'https://github.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    },
    data={
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': authenticity_token,
        'login': 'your-username',
        'password': 'your-plaintext-password',
    },
    allow_redirects=False,
)

# Step 3: visit the target page; the session attaches the stored cookies automatically
r3 = session.get(
    'https://github.com/settings/emails',
    headers={
        'Referer': 'https://github.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    },
)
print('931880645@qq.com' in r3.text)
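Because a requests.session() object remembers every cookie set by earlier responses and sends them with each subsequent request, all of the get_dict() bookkeeping from method one disappears. This is the idiomatic way to script a multi-step login with requests.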