002 requests的使用方法以及xpath和beautifulsoup4提取数据
1、直接使用url、不携带headers的GET请求
import requests

# Plain GET request against Baidu, no custom headers attached.
target = 'http://www.baidu.com'
resp = requests.get(target)

# .content is the raw response body as bytes, so decode it ourselves
page = resp.content.decode()
print(page)

# .text is already a str (requests picks the encoding for us)
page = resp.text
print(page)
2、有headers的GET请求
requests的get参数,headers传入的参数是字典,不用转成字符串
import requests

# GET request that sends an explicit browser User-Agent header, then
# inspects the metadata requests exposes on the response object.
target = 'http://www.baidu.com'
ua_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}
resp = requests.get(target, headers=ua_headers)

# 1. headers that were actually sent with the request
print(resp.request.headers)
# 2. headers returned by the server
print(resp.headers)
# 3. HTTP status code of the response
print(resp.status_code)
# 4. cookies attached to the outgoing request
print(resp.request._cookies)
# 5. cookies the server set on the response
print(resp.cookies)
而 url = 'http://www.baidu.com/s?wd=你好' 中的汉字会被requests自动进行URL百分号编码,不需要再调用urllib等其他模块手动处理
import requests

# Chinese characters embedded directly in the URL are percent-encoded
# by requests automatically -- no extra encoding step is required.
target = 'http://www.baidu.com/s?wd=你好'
ua_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}
resp = requests.get(target, headers=ua_headers)
print(resp.content.decode())
import requests

# Same search as above, but the query string is assembled by requests
# from the `params` dict instead of being hard-coded into the URL.
base = 'http://www.baidu.com/'
query = {
    'wd': '你好',
}
ua_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}
resp = requests.get(base, headers=ua_headers, params=query)
print(resp.content.decode())
import requests

# GET request routed through a free proxy server.
#
# Fix: the original proxy mapping only contained an 'https' key while the
# target URL uses plain http, so the proxy was silently never applied.
# It also lacked the scheme prefix that requests expects in proxy URLs.
# Mapping both schemes (with the scheme prefix) makes the proxy take
# effect regardless of the target URL's scheme.
url = 'http://baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}
params = {
    'wd': '你好'
}
free_proxy = {
    'http': 'http://153.232.156.201:8080',
    'https': 'http://153.232.156.201:8080',
}
# route the request through the proxy
response = requests.get(url, headers=headers, params=params, proxies=free_proxy)
data = response.content
print(data.decode())
3、有headers的POST请求
import requests

# POST a login form to the campus portal and save the returned page.
login_url = 'http://iclass.ncut.edu.cn/iclass/'
ua_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}
# form fields submitted in the POST body
form = {
    'login': '17159010225',
    'password': '******',
}
resp = requests.post(login_url, headers=ua_headers, data=form)
# write the raw response bytes out as an HTML file
with open('01 登录界面.html', 'wb') as fp:
    fp.write(resp.content)
4、使用xpath提取数据
import requests
from lxml import etree

# Scrape author names from qiushibaike's text section using XPath.
url = 'https://www.qiushibaike.com/text/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}
response = requests.get(url, headers=headers)
data = response.content

# 1. parse the raw bytes into an lxml element tree
xpath_data = etree.HTML(data)
# 2. select every post container under the main column
div_list = xpath_data.xpath('//div[@class="col1"]/div')
for div in div_list:
    # Fix: the original indexed [0] unconditionally, which raises
    # IndexError whenever a post has no matching author node
    # (e.g. an anonymous post). Guard against an empty result instead.
    names = div.xpath('.//div[@class="author clearfix"]/a[2]/h2/text()')
    if names:
        author = names[0].strip('\n')
        # print the author name
        print(author)
5、使用beautifulsoup提取信息
from bs4 import BeautifulSoup

# Minimal BeautifulSoup demo: parse a nav-menu snippet and read out a
# tag, its text, and one of its attributes.
html_doc = ''' <div id="menu" class="menu-bar menu clearfix" style="margin:0 10px"> <a href="/" target="_blank" rel="nofollow">热门</a> <a href="/hot/" target="_blank">24小时</a> <a href="/imgrank/" target="_blank">热图</a> <a id="highlight" href="/text/" target="_blank">文字</a> <a href="/history/" target="_blank">穿越</a> <a href="/pic/" target="_blank">糗图</a> <a href="/textnew/" target="_blank">新鲜</a> </div> '''

# 1. convert the markup into a BeautifulSoup tree (lxml backend)
soup = BeautifulSoup(html_doc, 'lxml')

# 2. pretty-printed version of the document (kept for reference)
result = soup.prettify()
# print(result)

# 3. first <a> tag in the document
print(soup.a)
# 4. text content of that tag
print(soup.a.string)
# 5. one of its attributes
print(soup.a['target'])
from bs4 import BeautifulSoup

# BeautifulSoup search-API demo: find/find_all plus CSS selectors.
html_doc = ''' <div id="menu" class="menu-bar menu clearfix" style="margin:0 10px"> <a href="/" target="_blank" rel="nofollow">热门</a> <a href="/hot/" target="_blank">24小时</a> <a href="/imgrank/" target="_blank">热图</a> <a id="highlight" href="/text/" target="_blank">文字</a> <a href="/history/" target="_blank">穿越</a> <a href="/pic/" target="_blank">糗图</a> <a href="/textnew/" target="_blank">新鲜</a> </div> '''

# 1. parse the markup (lxml backend)
soup = BeautifulSoup(html_doc, 'lxml')

# 2. the general query methods
# find -> the first tag matching the query
print(soup.find(name='a'))
print(soup.find(attrs={"target": '_blank'}))
# find_all -> list of matching tag objects (capped at 3 here)
print(soup.find_all(name='a', limit=3))
# select_one -> first match for a CSS selector
print(soup.select_one('.menu'))
# select -> list of all matches for a CSS selector
print(soup.select('#highlight'))
print(soup.select('a[target="_blank"]'))
6、其他常用知识点
查看response的编码格式
# Print the text encoding requests detected for the response.
# NOTE(review): `page_text` is presumably a requests.Response object —
# the name suggests a str, but str has no .encoding attribute; confirm
# against the code that produced it.
print(page_text.encoding)