1. requests模块和BeautifulSoup的简单使用 参考文档:https://www.cnblogs.com/wupeiqi/articles/6283017.html
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
import uuid

import requests
from bs4 import BeautifulSoup

# Download every news-list image from autohome.com.cn into uniquely named
# local .jpg files.

# Fetch the news page.
response = requests.get('http://www.autohome.com.cn/news/')

# The page text decodes garbled under the default encoding; apparent_encoding
# detects the site's real encoding so response.text decodes correctly.
response.encoding = response.apparent_encoding

# Standard incantation: parse the HTML with the stdlib parser.
soup = BeautifulSoup(response.text, features='html.parser')

# Locate the article container by its id.
target = soup.find(id='auto-channel-lazyload-article')

# find_all returns a list of every <li> tag under the container.
li_list = target.find_all('li')

# For each list item, take its first <a> tag (find returns a single
# element, or None when absent).
for i in li_list:
    a = i.find('a')
    if a:
        img = a.find('img')
        # FIX: an <a> without an <img>, or an <img> without a src, previously
        # raised AttributeError / produced 'http:None'; skip those items.
        if img is None:
            continue
        img_url = img.attrs.get('src')
        if not img_url:
            continue

        # The site uses protocol-relative URLs ("//..."); prepend the scheme.
        img_url_http = 'http:' + img_url

        # Fetch the image bytes.
        img_response = requests.get(url=img_url_http)

        # uuid4 gives every downloaded image a unique file name.
        file_name = str(uuid.uuid4()) + '.jpg'

        # Write the raw bytes; each iteration produces one image file.
        with open(file_name, 'wb') as f1:
            f1.write(img_response.content)
requests常用方法:
get 发送get请求
post 发送post请求
delete 发送delete请求
patch 发送patch请求
put 发送put请求
session 获取session
requests的request方法中常用参数:(requests的大部分方法都是通过request封装而来的)
method # 指定访问网站的方式,如get,post,... url # 要请求的url headers # 伪装浏览器时指定的headers信息 data # 指定登录需要的参数的字典类型或者get方式传参数的一个字典类型 verify # 设置访问网站时忽略https的证书 proxies # 指定代理服务器 stream # 在下载数据时,指定为False,将数据一下子全部下载下来,指定为True,则分批下载数据 params # get方式发送数据时可以指定参数,以字典的方式 json # 通过请求体发送数据时,自动序列化成json格式的数据
requests的response对象的常用属性和方法:
response = requests.get() # 先生成一个对象 response.encoding # 获取当前访问的网站的编码格式 response.apparent_encoding # 识别当前网站的编码方式,并自动解码 response.text # 获取当前访问页面的文本信息 response.content # 获取当前访问网站的bytes类型的信息 response.cookies.get_dict() # 获取网站的cookies,并以字典形式获取
2.自动登录抽屉新热榜网站
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
import requests

# Log in to dig.chouti.com and print the server's status payload and cookies.

# Form data copied from the login request's form-data shown in the browser's
# network panel.
credentials = {
    'phone': '8615600120022',
    'password': 'xuyanpeng@1993',
    'oneMonth': 1,
}

# Impersonate a Chrome browser.
chrome_headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}

# POST the credentials to the site's login endpoint.
response = requests.post(
    url='https://dig.chouti.com/login',
    headers=chrome_headers,  # the spoofed browser identity
    data=credentials,        # the login form fields
)

# On success the body carries a JSON-ish status payload.
print(response.text)

# Cookies returned by the server, as a plain dict.
print(response.cookies.get_dict())

# Example output after a successful login:
# {"result":{"code":"9999", "message":"", "data":{"complateReg":"0","destJid":"cdu_54564948794"}}}
# {'gpsd': 'c7695c501f7f45e70004d6ee89fa2a3b', 'puid': '9819851ca0c86763d78f3b0cda1fbb69', 'JSESSIONID': 'aaaehxgnnK_t0deugHEFw'}
3.自动登录抽屉并对某条新闻自动点赞(两种方式),抽屉网站有点特殊,他们做了cookies的伪造,所以只能先获取一次cookies
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
import requests

# Log in to dig.chouti.com and up-vote one news item, passing cookies by
# hand.  The site forges its cookies on the very first visit, so the cookies
# from that first GET — not the ones returned by the login POST — are the
# ones that authenticate the vote.

credentials = {
    'phone': '8615600120022',
    'password': 'xuyanpeng@1993',
    'oneMonth': 1,
}
chrome_headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}

# First visit: capture the cookies issued before logging in.
first_visit = requests.get(
    url='https://dig.chouti.com/',
    headers=chrome_headers,
)
first_cookies = first_visit.cookies.get_dict()

# Log in, carrying the first-visit cookies for authentication.
login_response = requests.post(
    url='https://dig.chouti.com/login',
    headers=chrome_headers,
    data=credentials,
    cookies=first_cookies,
)

# Up-vote one specific item; only the 'gpsd' value from the first-visit
# cookies is needed here.
vote_response = requests.post(
    url='https://dig.chouti.com/link/vote?linksId=23903191',
    headers=chrome_headers,
    cookies={'gpsd': first_cookies['gpsd']},
)
print(vote_response.text)
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
# FIX: this snippet used requests.session() without importing requests, and
# had a stray prose sentence fused onto the last code line.
import requests

# Log in to dig.chouti.com and up-vote one news item using a session object.
# The session stores cookies across requests automatically, so none need to
# be passed by hand.

login_dict = {
    'phone': '8615600120022',
    'password': 'xuyanpeng@1993',
    'oneMonth': 1,
}
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}

# Obtain a session object first.
session = requests.session()

# Visit the page once so the session records the initial cookies.
response1 = session.get(
    url='https://dig.chouti.com/',
    headers=headers,
)

# Log in through the same session; it sends the stored cookies itself.
response2 = session.post(
    url='https://dig.chouti.com/login',
    headers=headers,
    data=login_dict,
)

# Up-vote through the session as well.
# NOTE(review): verify=False disables TLS certificate checking — fine for a
# demo, unsafe in production.
response3 = session.post(
    url='https://dig.chouti.com/link/vote?linksId=23901891',
    headers=headers,
    verify=False,
)
print(response3.text)
4.自动登录github,并获取项目下载地址
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
import requests
from bs4 import BeautifulSoup

# ############ Way 1: log in to GitHub by carrying cookies manually ############

# 1. GET the login page to obtain the CSRF token (authenticity_token) hidden
#    in the form, plus the first batch of cookies.
response = requests.get(url='https://github.com/login')
soup1 = BeautifulSoup(response.text, features='lxml')
csrf = soup1.find(name='input', attrs={'name': 'authenticity_token'})
authenticity_token = csrf.attrs.get('value')

response_cookie = response.cookies.get_dict()
response.close()

# 2. POST the token together with the user name and password.
login_dict = {
    'authenticity_token': authenticity_token,
    'commit': 'Sign in',
    'utf8': '',
    'login': '845601256@qq.com',
    'password': 'xuyanpeng@1993',
}
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}

response2 = requests.post(
    url='https://github.com/login',
    data=login_dict,
    headers=headers,
    cookies=response_cookie,
)

# Merge the cookies set at login into the first-request cookies.
response2_cookie = response2.cookies.get_dict()
response_cookie.update(response2_cookie)

# FIX: the merged authenticated cookies (and the browser headers) were built
# but never sent with this request, so the project page was fetched
# anonymously; pass them explicitly.
response3 = requests.get(
    url='https://github.com/HuaDD/salahi',
    headers=headers,
    cookies=response_cookie,
)

soup2 = BeautifulSoup(response3.text, features='lxml')

# Print every <input> value that looks like a URL (contains '//') — the
# project's download address appears among them.
val = soup2.find_all('input')
for i in val:
    target = i.attrs.get('value')
    if target and '//' in target:
        print(target)
![](https://images.cnblogs.com/OutliningIndicators/ContractedBlock.gif)
# FIX: this snippet used requests and BeautifulSoup without importing either.
import requests
from bs4 import BeautifulSoup

# ############ Way 2: log in to GitHub via a session object ############
session = requests.session()

# GET the login page: the session records the cookies automatically, and the
# page carries the CSRF token (authenticity_token) needed by the login form.
response1 = session.get(url='https://github.com/login')
soup1 = BeautifulSoup(response1.text, features='lxml')

csrf = soup1.find(name='input', attrs={'name': 'authenticity_token'})
authenticity_token = csrf.attrs.get('value')

login_dict = {
    'authenticity_token': authenticity_token,
    'commit': 'Sign in',
    'utf8': '',
    'login': '845601256@qq.com',
    'password': 'xuyanpeng@1993',
}
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}

# POST the credentials; the session sends and stores cookies itself, so the
# old manual cookie-dict merging (which was never used afterwards) is gone.
response2 = session.post(
    url='https://github.com/login',
    data=login_dict,
    headers=headers,
)

# Fetch the project page through the now-authenticated session.
response3 = session.get(url='https://github.com/HuaDD/Web')

soup = BeautifulSoup(response3.text, features='lxml')

# Print every <input> value that looks like a URL (contains '//').
val = soup.find_all('input')
for i in val:
    target = i.attrs.get('value')
    if target and '//' in target:
        print(target)