Web Scraping: the requests Module
Installing the requests module
pip install requests
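A quick way to confirm the install succeeded is to import the package and print its version (requests exposes a __version__ attribute):

import requests
print(requests.__version__)  # any version printing without an ImportError means the install worked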
Common methods, attributes, and parameters of the requests module
import requests

ret = requests.get(url='https://www.baidu.com')  # original signature: get(url, params=None, **kwargs)
ret.encoding = 'utf-8'  # the encoding used when decoding the response body
print(ret.content)  # response body, bytes
print(ret.text)  # response body, str
print(ret.url)  # the URL that was actually requested
print(ret.headers, type(ret.headers))  # response headers, <class 'requests.structures.CaseInsensitiveDict'>, behaves like a dict
print(ret.json())  # when the response Content-Type is JSON, this parses the JSON body

params = {  # query parameters carried in the GET request URL
    "keyword": "O98K",
}
header = {  # request headers
    "name": "SATH"
}
ret = requests.get(url='http://www.baidu.com', params=params, headers=header)
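A few other response attributes and request parameters come up constantly in practice; this is a minimal sketch (the timeout value is arbitrary):

import requests

ret = requests.get('https://www.baidu.com', timeout=5)  # timeout in seconds; without it a hung server blocks forever
print(ret.status_code)  # HTTP status code, e.g. 200
ret.raise_for_status()  # raises requests.HTTPError for 4xx/5xx responses
print(ret.cookies)  # cookies set by the server, a RequestsCookieJar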
data = {  # form parameters carried in a POST request
    "name": "sath"
}
ret = requests.post(url='http://www.baidu.com', data=data)  # the URL here is just a placeholder
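Worth noting alongside data=: requests.post also accepts a json= keyword that serializes the dict to a JSON body and sets the Content-Type header automatically. A minimal sketch against the httpbin.org echo service:

import requests

payload = {"name": "sath"}
requests.post('http://httpbin.org/post', data=payload)  # sends application/x-www-form-urlencoded: name=sath
requests.post('http://httpbin.org/post', json=payload)  # sends application/json: {"name": "sath"}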
Scraper example 1: fetch the page returned by a Sogou search for a given term
import requests

url = "https://www.sogou.com/web"
params = {
    "query": "apple"
}
# Inspecting the requests Sogou makes shows that the search keyword is
# submitted to https://www.sogou.com/web as a GET request, with the
# keyword carried in the "query" parameter.
ret = requests.get(url=url, params=params)
with open('./sogou.html', 'w', encoding='utf-8') as f:
    f.write(ret.text)
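Sogou, like most search engines, may answer a bare request with an anti-scraping or verification page; a hedged variant that presents a browser-like User-Agent and checks the status code before writing the file:

import requests

url = "https://www.sogou.com/web"
params = {"query": "apple"}
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36"
}
ret = requests.get(url=url, params=params, headers=header)
if ret.status_code == 200:  # only persist the page on a successful response
    with open('./sogou.html', 'w', encoding='utf-8') as f:
        f.write(ret.text)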
Scraper example 2: fetch movie details from the Douban movie category charts
import requests
from multiprocessing import Pool
import time

url = 'https://movie.douban.com/j/new_search_subjects'
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36"
}
movie_title_list = []

def get_movie(start):
    params = {
        "sort": "U",
        "tags": "",
        "start": start,
        "genres": "喜剧",  # the "comedy" genre
    }
    ret = requests.get(url=url, params=params, headers=header)
    if ret.headers['Content-Type'] == "application/json; charset=utf-8":
        data = ret.json()["data"]
        for movie in data:
            # note: this append happens in the worker process's own copy of the list
            movie_title_list.append(movie["title"])
            print(movie["title"])

if __name__ == '__main__':
    p = Pool(20)
    start = time.time()
    for n in range(0, 10000, 20):
        p.apply_async(get_movie, args=(n,))
    p.close()
    p.join()
    print(time.time() - start)  # about 14s, acceptable
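Because each Pool worker is a separate process, the appends to movie_title_list above happen in the workers' own copies; the parent's list stays empty and only the print output is visible. One way to actually gather the titles in the parent, sketched here with a smaller page range, is to return them from the worker and read the AsyncResult objects:

import requests
from multiprocessing import Pool

url = 'https://movie.douban.com/j/new_search_subjects'
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36"
}

def get_movie(start):
    params = {"sort": "U", "tags": "", "start": start, "genres": "喜剧"}
    ret = requests.get(url=url, params=params, headers=header)
    if ret.headers.get('Content-Type') == "application/json; charset=utf-8":
        return [movie["title"] for movie in ret.json()["data"]]
    return []

if __name__ == '__main__':
    p = Pool(20)
    results = [p.apply_async(get_movie, args=(n,)) for n in range(0, 200, 20)]
    p.close()
    p.join()
    movie_title_list = [title for r in results for title in r.get()]  # collected in the parent process
    print(movie_title_list)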
Scraper example 3: fetch restaurant data for a specified location from the KFC store locator
import requests
import json

url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36"
}
data = {  # form body of the POST request
    "cname": "",
    "pid": "",
    "keyword": "邯郸",  # the city to search, here Handan
    "pageIndex": "1",
    "pageSize": "10",
}
# the endpoint also expects op=keyword in the query string
ret = requests.post(url=url, headers=header, data=data, params={"op": "keyword"})
res = json.loads(ret.text)  # equivalent to ret.json()
print(res, type(res))
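The same request can be paged by incrementing pageIndex. A sketch that collects the first few pages of parsed JSON (the page count is arbitrary, and each page's internal structure is kept as-is rather than assumed):

import requests

url = "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36"
}

pages = []
for page in range(1, 6):  # first 5 pages; adjust as needed
    data = {
        "cname": "",
        "pid": "",
        "keyword": "邯郸",
        "pageIndex": str(page),
        "pageSize": "10",
    }
    ret = requests.post(url=url, headers=header, data=data, params={"op": "keyword"})
    pages.append(ret.json())  # keep each page's parsed JSON
print(len(pages))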
Scraper example 4: scraping information from the drug administration (药监局) portal
import requests
from multiprocessing import Pool

url = "http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36"
}

# first collect the record IDs from the paginated list endpoint
ids = []
for page in range(20, 250):
    data = {
        "on": "true",
        "page": page,
        "pageSize": "15",
        "productName": "",
        "conditionType": "1",
        "applyname": "",
        "applysn": "",
    }
    ret = requests.post(url=url, headers=header, data=data)
    if ret.headers['Content-Type'] == "application/json;charset=UTF-8":
        res = ret.json()["list"]
        for n in res:
            ids.append(n['ID'])

# then fetch each record's detail by ID from a second endpoint
url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'

def func(k):
    data = {
        "id": k
    }
    r = requests.post(url=url, headers=header, data=data)
    if r.headers['Content-Type'] == "application/json;charset=UTF-8":
        print(r.json()["businessPerson"])

if __name__ == '__main__':
    p = Pool(14)
    for k in ids:
        p.apply_async(func, args=(k,))  # args must be passed as a tuple
    p.close()
    p.join()
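The ID-collection loop above fires a couple of hundred POSTs at the same host; a requests.Session reuses the underlying TCP connection and can carry shared headers, which usually makes such loops noticeably faster. A minimal sketch of the same list request through a session:

import requests

session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3423.2 Safari/537.36"
})

url = "http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList"
for page in range(1, 4):  # a few pages, just to illustrate
    data = {"on": "true", "page": page, "pageSize": "15",
            "productName": "", "conditionType": "1",
            "applyname": "", "applysn": ""}
    ret = session.post(url, data=data)  # the TCP connection is reused across iterations
    print(ret.status_code)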