requests
requests模块在处理爬虫时更加高效、快捷。
基于requests发起get请求
- 需求:爬取搜狗首页的数据
import requests

# Fetch the Sogou homepage and persist it to disk.

# 1. Specify the target URL.
url = 'https://www.sogou.com/'

# 2. Issue the GET request. A timeout keeps the script from hanging
#    forever if the server never responds (the original had none).
response = requests.get(url=url, timeout=10)

# Fail fast on HTTP errors instead of silently saving an error page.
response.raise_for_status()

# 3. `text` yields the response body decoded to str.
page_data = response.text
# print(page_data)

# 4. Persist the page; utf-8 matches the page's declared encoding.
with open('./sougou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_data)
response对象中其他重要属性
"""Demo of the most useful attributes on a requests Response object."""
import requests

target = 'https://www.sogou.com/'
resp = requests.get(url=target)

# `resp.content` would expose the raw bytes of the body; the attributes
# printed below expose response metadata instead.
print(resp.status_code)  # numeric HTTP status code
print(resp.headers)      # response header mapping
print(resp.url)          # the URL the request was actually sent to
requests模块如何处理带参数的get请求
- 方式一 需求:指定一个词条,获取搜狗搜索结果所对应的页面数据
"""Variant 1: bake the query parameters directly into the URL string."""
import requests

# requests accepts the non-ASCII query value as-is; no manual
# percent-encoding of the Chinese keyword is needed.
search_url = 'https://www.sogou.com/web?query=周杰伦&ie=utf8'
resp = requests.get(url=search_url)
html = resp.text

# Persist the result page for offline inspection.
with open('./zhou.html', 'w', encoding='utf-8') as fp:
    fp.write(html)
- 方式二
import requests

# Variant 2: let requests build the query string from a dict.
url = 'https://www.sogou.com/web'

# Parameters passed separately; requests URL-encodes them itself.
params = {
    'query': '周杰伦',
    'ie': 'utf-8',
}

# timeout added so a stalled server cannot hang the script.
response = requests.get(url=url, params=params, timeout=10)
# (The original evaluated `response.status_code` as a bare expression —
#  a no-op dead statement — which has been removed.)
print(response.text)
自定义请求头信息
"""Send a GET request with a spoofed browser User-Agent header."""
import requests

base = 'https://www.sogou.com/web'

# Query parameters, URL-encoded by requests.
query_params = {
    'query': '周杰伦',
    'ie': 'utf-8',
}

# Some sites reject the default python-requests UA; impersonate Chrome.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

reply = requests.get(url=base, params=query_params, headers=headers)
print(reply.status_code)
基于requests发起post请求
- 登录豆瓣 获取登录成功之后的页面数据
import requests

# Log in to Douban via POST and save the post-login page.

# url = 'https://accounts.douban.com/login'  # older login endpoint
url = "https://www.douban.com/accounts/login"

# Form payload for the login POST.
# NOTE(review): the original sent the first key as "sourse"; the Douban
# login form field is spelled "source" — fixed here.
data = {
    "source": "movie",
    "redir": "https://movie.douban.com/",
    "form_email": "15027900535",
    "form_password": "bobo@15027900535",
    "login": "登录",
}

# Impersonate a browser so the request is not rejected outright.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

# Issue the POST; timeout keeps the script from hanging indefinitely.
response = requests.post(url=url, data=data, headers=headers, timeout=10)

page_text = response.text
# print(page_text)

# Persist the post-login page.
with open('./douban.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
基于ajax的get请求
- 需求:抓取豆瓣电影上电影详情的数据
import requests

# Fetch Douban's movie chart via its AJAX JSON endpoint.
url = 'https://movie.douban.com/j/chart/top_list?'

# Query parameters expected by the AJAX endpoint.
params = {
    "type": "24",
    "interval_id": "100:90",
    "action": "",
    "start": "0",    # offset of the first record
    "limit": "10",   # number of records to fetch
}

# Browser-like UA so the endpoint serves the normal JSON response.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

# This is a GET request (the original comment mislabeled it as POST).
response = requests.get(url=url, params=params, headers=headers, timeout=10)

page_text = response.text
print(page_text)

# The response is UTF-8-encoded JSON; the original wrote it with
# encoding='GBK', which is exactly why the saved file came out garbled.
# Writing UTF-8 fixes the mojibake.
with open('./douban1.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print('ok')
基于ajax的post请求
- 需求:爬取肯德基
"""POST to KFC's store-locator AJAX endpoint and dump the raw reply."""
import requests

endpoint = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'

# Form fields the endpoint expects: search keyword plus paging controls.
payload = {
    "cname": '',
    'pid': '',
    'keyword': '上海',
    'pageIndex': '1',
    'pageSize': '10',
}

result = requests.post(url=endpoint, data=payload)
print(result.text)
综合项目实战
- 需求:爬取搜狗知乎某一词条对应一定页码范围表示的页面数据
"""Crawl a page range of Sogou-Zhihu search results for one keyword,
saving each result page as an HTML file under ./pages/."""
import requests
import os

# Make sure the output directory exists before writing anything.
if not os.path.exists('./pages'):
    os.mkdir('./pages')

word = input('enter a key-word:')
start_num = int(input('enter start num:').strip())
end_num = int(input('enter end num:').strip())

url = 'http://zhihu.sogou.com/zhihu?'

# Browser-like UA so the site serves normal result pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}

# Fetch and persist each page in the requested (inclusive) range.
for page in range(start_num, end_num + 1):
    query = {
        'query': word,
        'ie': 'utf-8',
        'page': page,
    }
    html = requests.get(url=url, params=query, headers=headers).text
    target = 'pages/' + word + str(page) + '.html'
    with open(target, 'w', encoding='utf-8') as fp:
        fp.write(html)
    print('第%d页写入成功' % page)