爬虫基础-requests模块使用样例
1. 基础
import requests

if __name__ == "__main__":
    # 1. Target URL.
    # NOTE(review): the real Sogou domain is www.sogou.com — confirm the
    # "sougou" spelling is intentional before relying on the response.
    url = 'https://www.sougou.com/'
    # Browser-like User-Agent so the server treats this as a normal visit.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # 2. Send the request; requests.get() returns a Response object.
    response = requests.get(url=url, headers=headers)
    # 3. Response body accessors and their matching persistence patterns:
    #    text    (str)    -> fp.write(page_text)
    #    json()  (object) -> json.dump(list_data, fp=fp, ensure_ascii=False)
    #    content (bytes)  -> with open(img_path, 'wb') as fp: fp.write(img_data)
    page_text = response.text
    print(page_text)
    # 4. Persist to disk. The with-statement closes the file automatically,
    # so the original explicit fp.close() inside the block was redundant.
    with open('./sougou.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print('爬取数据结束')
2. 人人网模拟登录-携带cookie
# -*- coding:utf-8 -*-
import requests

# Login endpoint:
# http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2020731013655
#
# Personal profile page:
# http://www.renren.com/266059096/profile
login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2020731013655'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
# Login form payload captured from the browser.
# NOTE(review): account and (pre-hashed) password are hard-coded — avoid
# committing real credentials; load them from the environment instead.
data = {
    'email': 'wangshui898@sina.com',
    'icode': '',
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '43deb19ec2e1ea71941ecc309e593afec110876b6181334542f6b125ffdfd675',
    'rkey': '07eeb6b6485891ad0895d30f1a23db38',
    'f': 'http%3A%2F%2Fwww.renren.com%2F266059096'
}
# A Session object stores the cookies set by the login response and
# replays them automatically on subsequent requests.
session = requests.Session()
# Log in with a POST through the session so the auth cookie is retained.
response = session.post(url=login_url, data=data, headers=headers)
print('登录页状态码: ', response.status_code)
# detail_url = 'http://www.renren.com/266059096/profile'
detail_url = 'http://www.renren.com/266059096'
# Fetch the profile page; the session carries the login cookie for us.
detail_page = session.get(url=detail_url, headers=headers)
detail_page_text = detail_page.text
print('个人主页状态码: ', detail_page.status_code)
# Persist the profile page; the with-statement guarantees the file is
# closed even if the write raises (the original used manual open/close).
with open('./renren_detail.html', 'w', encoding='utf-8') as fp:
    fp.write(detail_page_text)
3. 肯德基餐厅位置查询
# -*- coding:utf-8 -*-
import requests
import json

if __name__ == '__main__':
    # KFC store-locator AJAX endpoint (returns JSON).
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        'cookie': 'route-cell=ksa; ASP.NET_SessionId=2y4hig2mluq5mty4dgai3i4k; Hm_lvt_1039f1218e57655b6677f30913227148=1597132083,1597208991; Hm_lpvt_1039f1218e57655b6677f30913227148=1597208991; SERVERID=891dda8157e24744f56aa53dc4ec1dc1|1597209068|1597208989'
    }
    kw = input('请输入查询地址: ')
    # Query-string parameters: keyword search, first page, 10 results.
    param = {
        'op': 'keyword',
        'cname': '',
        'pid': '',
        'keyword': kw,
        'pageIndex': '1',
        'pageSize': '10'
    }
    response = requests.get(url=url, params=param, headers=headers)
    # The endpoint responds with JSON; .json() parses it into a dict.
    dic_data = response.json()
    fileName = kw + '.json'
    # ensure_ascii=False keeps Chinese text readable in the output file.
    # The with-statement closes the file (the original used manual close).
    with open(fileName, 'w', encoding='utf-8') as fp:
        json.dump(dic_data, fp=fp, ensure_ascii=False)
    print('爬取完毕')
4. 抓取糗事百科图片-正则方式
# -*- coding:utf-8 -*-
import requests
import re
import os

# Goal: download every image from the "qiutu" (funny pictures) section
# of qiushibaike.com, pages 1 through 12.
if __name__ == "__main__":
    # Create the output directory; exist_ok avoids the racy
    # check-then-create of the original os.path.exists + os.mkdir pair.
    os.makedirs('./qiutuLibs', exist_ok=True)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    # Generic URL template; %d is filled in with the page number.
    url = 'https://www.qiushibaike.com/imgrank/page/%d/'
    for pageNum in range(1, 13):
        # URL for this specific page.
        new_url = format(url % pageNum)
        # Fetch the whole listing page.
        page_text = requests.get(url=new_url, headers=headers).text
        # Extract each image's src attribute — the (.*?) group below.
        ex = r'<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
        # re.S lets '.' match newlines so the pattern spans whole divs
        # (re.M = multi-line anchors, re.I = case-insensitive).
        img_src_list = re.findall(ex, page_text, re.S)
        for img_src in img_src_list:
            # src values are protocol-relative ("//..."), so prepend https:.
            img_link = 'https:' + img_src
            img_data = requests.get(url=img_link, headers=headers).content
            # File name = last path segment of the image URL.
            img_name = img_link.split('/')[-1]
            img_path = './qiutuLibs/' + img_name
            # 'wb' because .content is raw bytes; the with-statement closes
            # the file (the original's explicit fp.close() was redundant).
            with open(img_path, 'wb') as fp:
                fp.write(img_data)
            print(img_name, '下载成功!!!')
5. 下载梨视频mp4-正则方式
import requests
from lxml import etree
import re
from multiprocessing.dummy import Pool  # NOTE(review): unused here — presumably reserved for a thread-pool variant; confirm before removing

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
# Fetch the category listing and extract each video's detail-page URL
# and display name.
url = 'https://www.pearvideo.com/category_5'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
for li in li_list:
    detail_url = 'https://www.pearvideo.com/' + li.xpath('.//a[@class="vervideo-lilink actplay"]/@href')[0]
    detail_name = li.xpath('.//a/div[@class="vervideo-title"]/text()')[0] + '.mp4'
    # Request the detail page and pull the real .mp4 URL out of the
    # embedded JavaScript with a regex.
    print(detail_name, '开始下载')
    detail_page = requests.get(url=detail_url, headers=headers).text
    # Fixed regex: the original '(.*\.mp4?)' made the trailing "4" optional
    # (so ".mp" also matched) and the greedy '.*' could overshoot past the
    # first srcUrl. Non-greedy '(.*?\.mp4)' captures exactly the video URL.
    ex = r'srcUrl="(.*?\.mp4)",vdoUrl=srcUrl'
    video_url = re.findall(ex, detail_page)[0]
    video_content = requests.get(video_url, headers=headers).content
    # 'wb' for raw bytes; with-statement guarantees the file is closed
    # (the original used manual open/close).
    with open(detail_name, 'wb') as fp:
        fp.write(video_content)
    print(detail_name, '下载完成')