requests
requests的get和post常用参数:
- url
- headers
- data/params
- proxies
基础使用
1.搜狗主页
import requests

# Target: the Sogou home page.
url = 'https://www.sogou.com/'

# Issue a plain GET and take the decoded body as text.
response = requests.get(url=url)
page_text = response.text
print(page_text)

# Keep a local copy of the page.
with open('./sougou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)

print('over!')
2.搜索
import requests

# Keyword to search for, read from the console.
wd = input('enter your word:')

url = 'https://www.sogou.com/web?'
# Send a desktop Firefox User-Agent string with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0'
}
# The keyword travels as the ``query`` URL parameter.
param = {'query': wd}

response = requests.get(url=url, params=param, headers=headers)
# Manually override the encoding used to decode the response body.
response.encoding = 'utf-8'
page_text = response.text

# Save the result page as "<keyword>.html".
file_name = wd + '.html'
with open(file_name, 'w', encoding='utf-8') as fp:
    fp.write(page_text)

print(file_name, '爬取成功!!!')
3.百度翻译
import requests

url = 'https://fanyi.baidu.com/sug'
# Send a desktop Chrome User-Agent string with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
}

# The word to look up is posted as the ``kw`` form field.
word = input('enter your word:')
data = {'kw': word}

response = requests.post(url=url, data=data, headers=headers)

# ``.text`` would give the body as a string; ``.json()`` parses it into
# a Python object.
obj_json = response.json()
print(obj_json)
cookie相关
-
处理cookie的方式: ·手动处理:cookie有有效期,并且可能动态变化 ·自动处理:使用会话对象Session
-
Session用法: ·实例化一个会话对象:requests.Session() ·session的作用:
·发送请求 ·如果请求发送的过程中产生了cookie,则cookie会自动存储到session中
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
}
# JSON endpoint for the public timeline.
url = (
    'https://xueqiu.com/v4/statuses/public_timeline_by_category.json'
    '?since_id=-1&max_id=20351345&count=15&category=-1'
)

session = requests.Session()

# Visit the home page first: any cookie produced by this request is
# stored on the session automatically and sent with later requests.
session.get(url='https://xueqiu.com', headers=headers)

# Request the JSON endpoint through the same session.
timeline_response = session.get(url=url, headers=headers)
json_obj = timeline_response.json()
print(json_obj)
代理操作
- 代理网站
· 快代理
· 西刺代理
· goubanjia.com
- 匿名度:
· 透明:对方服务器知道你使用了代理,并且知道你的真实IP
· 匿名:对方服务器知道你使用了代理,但不知道你的真实IP
· 高匿:对方服务器不知道你使用了代理,也不知道你的真实IP
- 类型:
· http:该类型的代理只可以发起http协议头对应的请求
· https:该类型的代理只可以发起https协议头对应的请求
import requests
import random

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
}

# Proxy pools, one per proxy type. Every entry must be a dict mapping the
# scheme to "host:port" (note the colon: `{'http', '...'}` with a comma is
# a set literal and cannot be used as the `proxies` argument).
http = [
    {'http': '106.75.244.137:80'},
    {'http': '27.208.83.255:8060'},
]
https = [
    {'https': '114.99.54.65:8118'},
    {'https': '121.61.89.48:61234'},
]

url = 'https://www.baidu.com/s?wd=ip'

# Choose the pool that matches the URL scheme. `split(':')` returns a
# list, so the scheme is its first element.
scheme = url.split(':', 1)[0]
proxy_pool = https if scheme == 'https' else http

page_text = requests.get(
    url=url,
    headers=headers,
    proxies=random.choice(proxy_pool),
).text

with open('./IP.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
线程池
爬取梨视频
import requests
from multiprocessing.dummy import Pool
import re
from lxml import etree
import random


def get_videoData(url):
    """Download *url* and return the raw response body as bytes."""
    return requests.get(url=url, headers=headers).content


def saveVideo(data):
    """Write *data* to a new, randomly named ``.mp4`` file.

    The file is created in the current directory with exclusive-create
    mode, so a name that is already taken (by an earlier run or by another
    worker thread) is never overwritten; a new random name is drawn
    instead. Names come from 0..9999, so this loops forever if all of
    them already exist.
    """
    while True:
        name = str(random.randint(0, 9999)) + '.mp4'
        try:
            fp = open(name, 'xb')
        except FileExistsError:
            # Name collision: draw another random name.
            continue
        with fp:
            fp.write(data)
        break
    print(name, "下载成功!!!")


url = 'https://www.pearvideo.com/category_1'
# Regex that extracts the video source URL from a detail page.
ex = 'srcUrl="(.*?)",vdoUrl='
video_urls = []
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
}

# Collect the detail-page link of every video listed on the category page.
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
for li in li_list:
    hrefs = li.xpath('./div/a/@href')
    if not hrefs:
        # List item without a detail link: nothing to download.
        continue
    detail_url = 'https://www.pearvideo.com/' + hrefs[0]
    detail_page_text = requests.get(url=detail_url, headers=headers).text
    matches = re.findall(ex, detail_page_text, re.S)
    if not matches:
        # Detail page without a recognizable video URL: skip it rather
        # than abort the whole crawl.
        continue
    video_urls.append(matches[0])

# Download and save the videos with a pool of 4 worker threads, and always
# shut the pool down afterwards.
pool = Pool(4)
try:
    videoData_list = pool.map(get_videoData, video_urls)
    pool.map(saveVideo, videoData_list)
finally:
    pool.close()
    pool.join()
识别验证码
需要先在云打码、打码兔等打码平台注册,然后才能使用该平台提供的打码服务
网站提供的接口:

import http.client
import json
import mimetypes
import time
import urllib

import requests

######################################################################


class YDMHttp:
    """Client for the HTTP API at ``apiurl`` (a captcha-recognition service).

    Every call posts the account credentials plus a ``method`` name to
    ``apiurl`` and parses the reply as JSON. Methods that unwrap a reply
    return the reply's negative ``ret`` code on failure and ``-9001`` when
    the reply is empty.
    """

    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        """Store the credentials; ``appid`` is kept as a string."""
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def _payload(self, method, **extra):
        """Build the form fields for *method*: credentials plus *extra*."""
        data = {
            'method': method,
            'username': self.username,
            'password': self.password,
            'appid': self.appid,
            'appkey': self.appkey,
        }
        data.update(extra)
        return data

    @staticmethod
    def _unwrap(response, key):
        """Return ``response[key]``, a negative ``ret`` code, or -9001."""
        if not response:
            return -9001
        if response['ret'] and response['ret'] < 0:
            return response['ret']
        return response[key]

    def request(self, fields, files=None):
        """Post *fields* (and optional *files*) and return the parsed JSON.

        *files* maps form field names to paths of files to upload.
        """
        response = self.post_url(self.apiurl, fields, files)
        return json.loads(response)

    def balance(self):
        """Return the reply's ``balance`` value, or a negative error code."""
        return self._unwrap(self.request(self._payload('balance')), 'balance')

    def login(self):
        """Return the reply's ``uid`` value, or a negative error code."""
        return self._unwrap(self.request(self._payload('login')), 'uid')

    def upload(self, filename, codetype, timeout):
        """Upload the image at *filename*; return its ``cid`` or an error code."""
        data = self._payload(
            'upload', codetype=str(codetype), timeout=str(timeout)
        )
        return self._unwrap(self.request(data, {'file': filename}), 'cid')

    def result(self, cid):
        """Return the ``text`` of the reply for *cid*, or ``''`` if empty."""
        response = self.request(self._payload('result', cid=str(cid)))
        if not response:
            return ''
        return response['text'] or ''

    def decode(self, filename, codetype, timeout):
        """Upload an image and poll for its result.

        Polls once per second, at most *timeout* times.

        Returns:
            ``(cid, text)`` on success, ``(-3003, '')`` when no result
            arrived within the polling window, or ``(code, '')`` when the
            upload itself did not return a positive id.
        """
        cid = self.upload(filename, codetype, timeout)
        if cid <= 0:
            return cid, ''
        for _ in range(timeout):
            result = self.result(cid)
            if result != '':
                return cid, result
            time.sleep(1)
        return -3003, ''

    def report(self, cid):
        """Send a ``report`` call for *cid* with flag ``'0'``; return ``ret``."""
        response = self.request(self._payload('report', cid=str(cid), flag='0'))
        if response:
            return response['ret']
        return -9001

    def post_url(self, url, fields, files=None):
        """POST *fields* and the files named in *files*; return the body text.

        *files* maps form field names to file paths. The files are opened
        here and always closed again once the request has finished; the
        caller's mapping is left untouched.
        """
        opened = {}
        try:
            for key, path in (files or {}).items():
                opened[key] = open(path, 'rb')
            res = requests.post(url, files=opened or None, data=fields)
            return res.text
        finally:
            for handle in opened.values():
                handle.close()
将示例代码中的可执行程序封装成函数:

def transformCodeImg(imgPath, imgType):
    """Recognize the captcha image at *imgPath* via ``YDMHttp``.

    *imgType* is the captcha type code passed to the service (the original
    notes give 1004 as "4 alphanumeric characters"). Prints the values
    returned by the login and balance calls, then returns the recognized
    text from ``decode``. Returns ``None`` when the placeholder username
    is still in place.
    """
    # Ordinary user account name and password.
    username = 'tianqibucuo'
    password = '026035'
    # Software ID and key, taken from the developer console ("my software").
    appid = 9064
    # NOTE(review): the key ends with a space — confirm that is intended.
    appkey = '84b113d0c116041f882d78c5dced8472 '
    # Timeout in seconds.
    timeout = 30

    # Guard: refuse to run with the placeholder account name.
    if username == 'username':
        print('请设置好相关参数再测试')
        return None

    yundama = YDMHttp(username, password, appid, appkey)

    # Log in, then query the balance, printing each returned value.
    print('uid: %s' % yundama.login())
    print('balance: %s' % yundama.balance())

    # Recognize: image path, captcha type, timeout -> (cid, text).
    _, result = yundama.decode(imgPath, imgType, timeout)
    return result
对古诗词网模拟登录,获取验证码并让打码平台进行识别:
from lxml import etree
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
}
url = 'https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'

# One session for every request, so cookies carry over between them.
session = requests.Session()

# Load the login page and parse it.
page_text = session.get(url=url, headers=headers).text
tree = etree.HTML(page_text)

# Download the captcha image referenced by the page and save it locally.
captcha_path = tree.xpath('//*[@id="imgCode"]/@src')[0]
img_src = 'https://so.gushiwen.org' + captcha_path
img_data = session.get(url=img_src, headers=headers).content
with open('./yanzhengma.jpg', 'wb') as fp:
    fp.write(img_data)

# Hidden form fields that are posted back with the login form.
viewstate = tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
viewstate_generator = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]

# Have the recognition service read the captcha.
result = transformCodeImg('./yanzhengma.jpg', 1004)
print(result)

# Submit the login form through the same session.
post_url = 'https://so.gushiwen.org/user/login.aspx?from='
data = {
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewstate_generator,
    'from': '',
    'email': '2907183182@qq.com',
    'pwd': '026035',
    'code': result,
    'denglu': '登录',
}
login_response = session.post(url=post_url, headers=headers, data=data)
print(login_response.status_code)

# Save and print the page returned after the login attempt.
page_text = login_response.text
with open('./gushi.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print(page_text)