1 - Web Scraping: From Basics to Advanced
Crawlers
Getting Started with Crawlers
urllib
from urllib import request

url = 'http://www.baidu.com'
# User-Agent: pretend to be a real browser so the server is less likely to block the request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
# Send the request with urllib.request
# Build a request object
req = request.Request(url=url, headers=headers)
# Send the request
response = request.urlopen(req)
# Response data
# print(response.read())  # raw bytes
print(response.read().decode())  # decode into a string
# print(response.info())  # response headers
# print(response.status)  # status code
Simulating a Baidu search with urllib
from urllib import request
from urllib import parse

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}


def baidu_search(params, key):
    # Baidu search URL
    url = f'https://www.baidu.com/s?{params}'
    # Send the request
    req = request.Request(url, headers=headers)
    res = request.urlopen(req)
    content = res.read().decode()
    print(content)
    # Save the scraped page
    with open(f'{key}.html', 'w', encoding='utf-8') as fp:
        fp.write(content)
        fp.flush()
    '''
    If you send data to the server, the data argument must be a bytes object that actually
    holds data, otherwise it is None. A POST request must provide data; for a GET request,
    data stays None.
    data = bytes(parse.urlencode({"pro": "value"}, encoding="utf8"))
    response = request.urlopen("http://www.baidu.com", data=data)
    '''


if __name__ == '__main__':
    key = input('Enter the search keywords: ')
    params = {'wd': key}
    params = parse.urlencode(params)  # URL-encode the query so Chinese characters are legal in the URL
    # print(params)  # wd=%E5%91%A8%E6%9D%B0%E4%BC%A6
    baidu_search(params, key)
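The commented note above describes the data parameter needed for POST requests; here is a minimal standalone sketch of that urlencode-then-encode pattern (httpbin.org is only a convenient echo service used for illustration, not part of the original notes):
from urllib import request, parse

# Placeholder endpoint that simply echoes what it receives
url = 'http://httpbin.org/post'
# urlencode builds "key=value" pairs; encode() turns the string into the bytes urlopen requires
data = parse.urlencode({'kw': 'python'}).encode('utf-8')
req = request.Request(url, data=data, headers={'User-Agent': 'Mozilla/5.0'})
with request.urlopen(req) as res:
    print(res.status, res.read().decode())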
Scraping 51job with urllib
import re
from urllib import request

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
# Search-result URL
url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,Python,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
# Send the request
req = request.Request(url, headers=headers)
res = request.urlopen(req)
# Read the response
content = res.read().decode('gbk')
# Extract data with a regular expression
pattern = '"jobid_count":"(.*?)"'  # capture group
result = re.findall(pattern, content, re.S)  # re.S lets . match newlines as well
print(result)
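A tiny illustration of what the re.S flag changes (standalone, not tied to the 51job page):
import re

text = '"jobid_count":"123\n456"'
print(re.findall('"jobid_count":"(.*?)"', text))        # [] - '.' stops at the newline
print(re.findall('"jobid_count":"(.*?)"', text, re.S))  # ['123\n456'] - '.' now matches newlines too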
Downloading images with urllib
from urllib import request

# Download an image
request.urlretrieve(
    url='https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1600067380374&di=16addb0b6e336ab847a1403cebc09a43&imgtype=0&src=http%3A%2F%2Fgss0.baidu.com%2F-vo3dSag_xI4khGko9WTAnF6hhy%2Fzhidao%2Fpic%2Fitem%2Fb17eca8065380cd72cbb313da744ad34588281bd.jpg',
    filename='人民币.png'
)
request.urlcleanup()  # clear the cache left behind by urlretrieve
Scraping Douban movies with urllib
import json
from urllib import request
import pymysql

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
url = 'https://movie.douban.com/j/new_search_subjects?sort=T&range=0,10&tags=&start=0'
req = request.Request(url, headers=headers)
res = request.urlopen(req)
# JSON parsing: deserialize the response
# JSON requires double quotes
# JSON does not allow comments
content = res.read().decode()
result = json.loads(content)
'''
# 1. Save the movie data to a local txt file
movie_list = result['data']
for movie in movie_list:
    title = movie['title']
    url = movie['url']
    with open('douban.txt', 'a', encoding='utf-8') as fp:
        s = str((title, url)) + '\n'
        fp.write(s)
        fp.flush()
'''
# 2. Store the movie data in MySQL
# Connect to MySQL
db = pymysql.connect(
    host='localhost', port=3306,
    user='root', password='nzw19940611',
    database='spider2003', charset='utf8mb4'
)
cur = db.cursor()  # cursor: executes SQL
# Run the SQL
movie_list = result['data']
for movie in movie_list:
    title = movie['title']
    url = movie['url']
    try:
        # SQL statement
        sql = 'insert into tb_douban_movie(movie_title, url) values("%s", "%s")' % (title, url)
        cur.execute(sql)
        db.commit()  # commit the transaction
    except Exception as e:
        print('Insert failed:', e)
        db.rollback()  # roll back
print('--Finished inserting into MySQL--')
# content = eval(res.read().decode())
# for i in range(len(content['data'])):
#     with open('豆瓣.txt', 'a', encoding='utf-8') as fp:
#         fp.write(content['data'][i]['title'] + '\n')
#         fp.flush()
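A side note on the insert above: pymysql can also fill in the values itself via query parameters, which avoids quoting problems when a title contains quotes. A sketch of the loop body, reusing the db, cur, title and url defined above:
# Same insert, but letting pymysql escape the values (placeholders are %s without quotes)
sql = 'insert into tb_douban_movie(movie_title, url) values (%s, %s)'
try:
    cur.execute(sql, (title, url))
    db.commit()
except Exception as e:
    print('Insert failed:', e)
    db.rollback()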
Using proxy IPs with urllib
import random
from urllib import request
import json

# First fetch proxy IPs from the Zhima proxy API
url = 'http://http.tiqu.alicdns.com/getip3?num=10&type=2&pro=0&city=0&yys=0&port=1&time=1&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions=&gm=4'
# Request the Zhima proxy API
response = request.urlopen(url)
content = response.read().decode()
# print(content)
# Parse the JSON and extract ip and port
result = json.loads(content)
ip_list = result['data']
# Format each IP and collect them in proxy_list
proxy_list = []
for ip in ip_list:
    ip_dict = {
        'http': f'http://{ip["ip"]}:{ip["port"]}'
    }
    proxy_list.append(ip_dict)
# print(proxy_list)  # {'http': 'http://58.218.92.13:6905'}......
# User-Agent pool
UserAgentList = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1",
    "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Mobile Safari/537.36"
]
# Pick a random proxy IP
proxy = random.choice(proxy_list)
# Pick a random User-Agent
ua = random.choice(UserAgentList)
# Use the proxy IP and the UA
proxy_handler = request.ProxyHandler(proxies=proxy)  # build a proxy handler
opener = request.build_opener(proxy_handler)  # build an opener that routes through the proxy
# Send the request
req = request.Request('http://www.baidu.com')
req.add_header('User-Agent', ua)  # random UA
# Open the url/request with the proxy-enabled opener
response = opener.open(req)  # equivalent to request.urlopen()
res = response.read().decode()
print(res)
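If every later urlopen() call should also go through the proxy, urllib lets you install the opener globally. A small sketch reusing the opener built above:
# Register the proxy-enabled opener as the global default,
# so plain request.urlopen() calls are routed through the proxy as well
request.install_opener(opener)
response = request.urlopen('http://www.baidu.com')
print(response.status)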
requests basics
import requests

# GET request
'''
response = requests.get('http://www.baidu.com')
# print(response)  # <Response [200]>
print(response.text)  # body decoded to a string (requests guesses the encoding)
print(response.content)  # raw bytes
# print(response.json())  # parse the body as JSON
# print(response.headers)  # response headers
# print(response.cookies)  # cookies set by the response
# print(response.status_code)  # status code
'''
'''
# GET request: Baidu search
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
url = 'https://www.baidu.com/s?wd=hello'
response = requests.get(url, headers=headers)
print(response.text)
'''
# POST request: Youdao translation
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
kw = input('Enter the word to translate: ')
# data holds the POST form parameters
data = {
    "i": kw,
    "from": "AUTO",
    "to": "AUTO",
    "smartresult": "dict",
    "client": "fanyideskweb",
    "salt": "16000738465941",
    "sign": "bf2e220fb6fe0ec8e03524a390dc0b5c",
    "lts": "1600073846594",
    "bv": "e915c77f633538e8cf44c657fe201ebb",
    "doctype": "json",
    "version": "2.1",
    "keyfrom": "fanyi.web",
    "action": "FY_BY_CLICKBUTTION"
}
response = requests.post(url, data=data, headers=headers)
result = response.json()  # parse the JSON response into a dict
src = result['translateResult'][0][0]['src']
tgt = result['translateResult'][0][0]['tgt']
print(src, tgt)
bs4 and XPath
Using proxies with requests
import random
import requests

'''
58.218.200.228:9150
58.218.200.223:4432
58.218.200.226:8256
58.218.200.228:7837
58.218.200.223:8915
'''
# Proxy pool: requests expects a {scheme: proxy_url} mapping with lowercase scheme keys
proxy_list = [
    {"http": "http://58.218.200.228:9150"},
    {"http": "http://58.218.200.223:4432"},
    {"http": "http://58.218.200.226:8256"},
    {"http": "http://58.218.200.228:7837"},
    {"http": "http://58.218.200.223:8915"}
]
# Pick a random proxy
proxy = random.choice(proxy_list)
# Use the proxy
res = requests.get('http://www.baidu.com', proxies=proxy)
print(res.text)
Using sessions with requests
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
url = 'https://list.jd.com/list.html?cat=1318%2C12099%2C9756&page=1&s=1&click=0'
# Create a session
session = requests.session()
# Requests sent through the session keep the conversation alive and store cookies
response = session.get(url, headers=headers)
print(response.text)
# Later requests made with the same session automatically carry the cookies collected so far
url2 = 'https://list.jd.com/listNew.php?cat=1318%2C12099%2C9756&page=2&s=27&scrolling=y&log_id=1600483717480.6970&tpl=3_M&isList=1&show_items='
response2 = session.get(url2, headers=headers)
print(response2.text)
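To see what the session has accumulated, its cookie jar can be dumped as a dict. A small sketch reusing the session above ('__jda' is just an example cookie name, not something the page is guaranteed to set):
# The session's cookie jar keeps everything the two responses set
print(session.cookies.get_dict())
# Individual values can be read like a mapping, with a default when missing
print(session.cookies.get('__jda', 'not set'))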
Using cookies with requests
import requests

url = 'http://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
cookies = {
    "PSTM": "1600136817",
    "BDRCVFR[feWj1Vr5u3D]": "mk3SLVN4HKm",
    "BAIDUID": "E922D90277D06E37B8B783C0082C650A:FG=1",
    "delPer": "0",
    "BD_CK_SAM": "1",
    "PSINO": "6",
    "H_PS_PSSID": "7506_32606_1424_7605_32116_31709_26350",
    "BIDUPSID": "89E6649E57A3DC9DABE613D88595BA0D",
    "BDORZ": "B490B5EBF6F3CD402E515D22BCDA1598",
    "BD_UPN": "12314753",
    "COOKIE_SESSION": "16_0_2_5_3_11_0_0_0_2_0_0_67596_0_0_0_1600136510_0_1600136818%7C5%230_0_1600136818%7C1",
    "H_PS_645EC": "3fcbYEWAxGp5VGowaCXsud%2BK436DuYp%2Bu6fs%2FUwAz9UFcCyuSSHqbS7CSMLQBpsMjeN%2F"
}
response = requests.get(url, headers=headers, cookies=cookies)
# print(response.text)
# print(response.cookies)
# Convert the CookieJar returned by the server into a plain dict
cookie_dict = requests.utils.dict_from_cookiejar(response.cookies)
print(cookie_dict)
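The reverse conversion also exists in requests.utils; a small sketch of a round trip (the cookie names and values here are only placeholders):
import requests

# Hypothetical dict, e.g. one previously saved with dict_from_cookiejar()
saved = {'sessionid': 'abc123', 'token': 'xyz'}
# Rebuild a CookieJar from the dict...
jar = requests.utils.cookiejar_from_dict(saved)
# ...and attach it to a request (or to a Session via session.cookies.update(saved))
res = requests.get('http://www.baidu.com', cookies=jar)
print(res.status_code)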
Basic usage of bs4
from bs4 import BeautifulSoup
# Install BeautifulSoup4:
# pip install beautifulsoup4
# Also install the lxml HTML parser: pip install lxml
html_doc = """
<html>
<head>
    <title>呵呵</title>
</head>
<body>
<p class="title">
    <b>哈哈</b>
</p>
<p class="story">Once upon a time there were three little sisters; and their names were
    <a href="first" class="sister" id="link1">first</a>,
    <a href="second" class="sister" id="link2">second</a> and
    <a href="third" class="sister" id="link3">third</a>;
</p>
<p class="story">end</p>
</body>
</html>
"""
# Using bs4
# Create the BeautifulSoup object
soup = BeautifulSoup(html_doc, 'lxml')
# print(soup)
# print(type(soup))
# Tag access
# print(soup.head)
# print(type(soup.head))  # <class 'bs4.element.Tag'>
# print(soup.title)  # the title tag
# print(soup.b)  # 哈哈
# print(soup.body.p.b)
# Attributes
# print(soup.p.attrs)  # {'class': ['title']}, all attributes of the first p
# print(soup.a.attrs)  # {'href': 'first', 'class': ['sister'], 'id': 'link1'}, all attributes of the first a
# print(soup.a.attrs['href'])  # a single attribute value
# Text content: text is usually the safer choice
# print(soup.b.string)  # 哈哈
# print(soup.b.text)  # 哈哈
# print(soup.p.string)  # None
# print(soup.p.text)  # 哈哈
# find_all(): all matching nodes
# print(soup.find_all('p'))  # all p nodes
# print(soup.find_all('p')[2])
# Search by attribute
# print(soup.find_all('p', attrs={'class': 'story'}))
# print(soup.find_all('a', attrs={'id': 'link1'}))
# print(soup.find_all('a', id='link1'))
# print(soup.find_all('a', limit=2))  # the first two a tags
# print(soup.find_all(['a', 'b']))  # all a tags and all b tags
# CSS selectors
# soup.select()
# print(soup.select('p'))  # tag selector
# print(soup.select('#link2'))  # id selector
# print(soup.select('.sister'))  # class selector
# print(soup.select('p #link3'))  # descendant selector
# Get all the text in the document
print(soup.get_text())
Parsing stock fund data with bs4
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
# Stock fund listing page
url = 'http://quote.stockstar.com/fund/stock.shtml'
response = requests.get(url, headers=headers)
content = response.content.decode('gb2312')
# print(content)
# Parse the page with bs4
soup = BeautifulSoup(content, 'lxml')
tr_list = soup.select('#datalist tr')
# print(tr_list)
for tr in tr_list:
    s_code = tr.find_all('td')[0].a.text  # fund code
    s_name = tr.find_all('td')[1].a.text  # fund name
    s_unit = tr.find_all('td')[2].text  # unit net value
    s = str((s_code, s_name, s_unit)) + '\n'
    with open('fund.txt', 'a', encoding='utf-8') as fp:
        fp.write(s)
        fp.flush()
Basic usage of XPath
# XPath support comes from lxml
# pip install lxml
from lxml import etree

html_doc = """
<html>
<head>
    <title>呵呵</title>
</head>
<body>
<ul>
    <li class="item" id="box1">
        <a href="aabb">打仗1</a>
    </li>
    <li class="item" id="box2">
        <a href="aabbcc">打仗2</a>
    </li>
    <li class="item" id="box3">
        <a href="bbccdd">打仗3</a>
    </li>
    <li class="item" id="box4">
        <a href="ddee">打仗4</a>
    </li>
</ul>
<p class="item">
    <a href="aabb">打仗5</a>
</p>
</body>
</html>
"""
# Using XPath
# Create the etree object
mytree = etree.HTML(html_doc)
# print(mytree)  # <Element html at 0x1feda822e08>
# print(type(mytree))  # <class 'lxml.etree._Element'>
# /: child node
# //: any descendant node
# print(mytree.xpath('/html'))  # the html tag
# print(mytree.xpath('/html/head'))  # the head tag
# print(mytree.xpath('/html/body/ul/li'))  # all li tags
# print(mytree.xpath('//li'))  # all li tags
# print(mytree.xpath('//li')[1])  # the second li tag, as an etree element
# print(mytree.xpath('//li[2]/@id'))
# text(): text content
# li_list = mytree.xpath('//li')
# for li in li_list:
#     # the leading . means "current node" and must not be omitted
#     content = li.xpath('./a/text()')  # text content
#     attr = li.xpath('./@id')  # attribute value
#     print(content, attr)
# Predicates: adding conditions
# A predicate is written inside []
# print(mytree.xpath('//li[1]/a/text()'))  # ['打仗1']
# print(mytree.xpath('//li[last()]/a/text()'))  # ['打仗4']
# print(mytree.xpath('//li[last()-1]/a/text()'))  # ['打仗3'], second to last
# print(mytree.xpath('//li[position()<3]/a/text()'))  # ['打仗1', '打仗2']
# print(mytree.xpath('//li[position()>=3]/a/text()'))  # ['打仗3', '打仗4']
# print(mytree.xpath('//li[@id="box1"]/a/text()'))  # ['打仗1']
# print(mytree.xpath('//li[@class="item"]/a/text()'))  # ['打仗1', '打仗2', '打仗3', '打仗4']
# * wildcard
# print(mytree.xpath('//*[@class="item"]/a/text()'))  # ['打仗1', '打仗2', '打仗3', '打仗4', '打仗5']
# | union
# print(mytree.xpath('//li[@class="item"]/a/text() | //p[@class="item"]/a/text()'))  # ['打仗1', '打仗2', '打仗3', '打仗4', '打仗5']
# contains()
# print(mytree.xpath('//li/a[contains(@href, "aa")]/text()'))  # ['打仗1', '打仗2']
print(mytree.xpath('//li/a[contains(text(), "2")]/text()'))  # ['打仗2']
Parsing stock fund data with XPath
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
# Stock fund listing page
url = 'http://quote.stockstar.com/fund/stock.shtml'
response = requests.get(url, headers=headers)
content = response.content.decode('gb2312')
# print(content)
# Parse the page with XPath
mytree = etree.HTML(content)
tr_list = mytree.xpath('//tbody[@id="datalist"]/tr')
for i, tr in enumerate(tr_list):
    f_code = tr.xpath('./td[1]/a/text()')[0]
    f_name = tr.xpath('./td[2]/a/text()')[0]
    f_unit = tr.xpath('./td[3]/text()')[0]
    # Append to a csv file, writing the header row only once
    with open('fund.csv', 'a', encoding='gb2312') as fp:
        if i == 0:
            fp.write('基金代码,基金名称,单位净值\n')
        f = f'{f_code},{f_name},{f_unit}\n'
        fp.write(f)
        fp.flush()
Selenium and CAPTCHA cracking
Cracking CAPTCHAs with Chaojiying
import requests
from hashlib import md5


class Chaojiying_Client(object):

    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: image bytes
        codetype: CAPTCHA type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id: image ID of the wrongly solved CAPTCHA
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()


if __name__ == '__main__':
    chaojiying = Chaojiying_Client('lotuslaw', '******', '908114')
    # Replace the soft_id with the software ID generated under User Center >> Software ID
    img = open('../a.jpg', 'rb').read()
    # Replace a.jpg with the path of your local CAPTCHA image
    print(chaojiying.PostPic(img, 1902))
    # 1902 is the CAPTCHA type
Basic usage of Selenium
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Create the browser driver
# You can point Selenium at a driver path explicitly,
# or simply put chromedriver.exe next to python.exe
driver = webdriver.Chrome()
# Open a page
driver.get('http://www.baidu.com')
# Page source
# print(driver.page_source)
# Closing
# time.sleep(5)
# driver.close()  # close only the current window
# driver.quit()  # quit and close all windows
# Baidu Tieba
driver.get('https://tieba.baidu.com/index.html')
'''
# Locate elements
wd1 = driver.find_element_by_id('wd1')
# wd1.send_keys('美女', Keys.ENTER)  # type into the input box and press Enter automatically
time.sleep(2)
wd1.send_keys('美女')
# Click the button
btn = driver.find_element_by_xpath('//a[@class="search_btn search_btn_enter_ba j_enter_ba"]')
# btn.click()
# Read text content and attribute values
print(btn.get_attribute('innerText'))  # innerText, innerHTML
print(wd1.get_attribute('value'))  # value typed into the input box
'''
# Execute JavaScript
time.sleep(3)
# Run a JS snippet
# driver.execute_script('window.scrollBy(0, 5000)')
for i in range(5):
    driver.execute_script('window.scrollBy(0,5000)')
    time.sleep(2)
# Screenshot
# driver.save_screenshot('tieba.png')
Logging in to Zhihu with Selenium
import time
from selenium import webdriver

# Zhihu login page
url = 'https://www.zhihu.com/signin?next=%2F'
# Open the page
driver = webdriver.Chrome()
driver.get(url)
time.sleep(2)
# Click the QQ login button
driver.find_element_by_xpath('//*[@class="Button Login-socialButton Button--plain"][2]').click()
# Wait 10 seconds to scan the QR code and log in manually
time.sleep(10)
# Refresh the page
driver.refresh()
# Dump the page and the cookies
print(driver.page_source)
print(driver.get_cookies())
Headless browser
from selenium import webdriver
from selenium.webdriver import ChromeOptions

# Once the flow works in a visible browser, switch to headless mode to save resources
options = ChromeOptions()
options.add_argument('--headless')  # headless mode
options.add_argument('--disable-gpu')  # disable the GPU
# Create the driver
driver = webdriver.Chrome(options=options)
driver.get('http://www.baidu.com')
print(driver.page_source)
Setting a proxy in Selenium
from selenium import webdriver

options = webdriver.ChromeOptions()
# Set a proxy IP
options.add_argument('--proxy-server=http://58.218.200.226:8256')
# Create the driver
driver = webdriver.Chrome(options=options)
driver.get('http://www.baidu.com')
print(driver.page_source)
Logging in to Renren by cracking its CAPTCHA with Chaojiying
from Day03.chaojiying import chaojiying
import requests
import random

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36"
}


def get_code():
    url = 'http://icode.renren.com/getcode.do?t=web_login&rnd=' + str(random.random())
    # Fetch the CAPTCHA image
    res = session.get(url, headers=headers)
    content = res.content  # image bytes
    # Crack it with Chaojiying
    cjy = chaojiying.Chaojiying_Client('lotuslaw', '******', '908114')
    code = cjy.PostPic(content, 1902)
    # print(code)
    return code


def login(code):
    # Login endpoint captured from the browser (submit a wrong password once to see the request)
    login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2020831616448'
    login_data = {
        "email": "18566218480",
        "icode": code,
        "origURL": "http://www.renren.com/home",
        "domain": "renren.com",
        "key_id": "1",
        "captcha_type": "web_login",
        "password": "88d7f48bf698c0f1b0dcca94bfb40361c6c82ced70f8cbf0619d725e0341d2e5",
        "rkey": "e8d80414c49ceb424291126858ee6226",
        "f": ''
    }
    # Send the login request
    res = session.post(login_url, data=login_data, headers=headers)
    content = res.text
    print(content)


# Visit the personal home page after logging in
def user_center():
    url = 'http://www.renren.com/480089210/profile'
    res = session.get(url, headers=headers)
    print(res.text)


if __name__ == '__main__':
    session = requests.session()
    code = get_code()
    login(code)
    user_center()
Scrapy Basics
- Scrapy framework overview
  - Scrapy is an application framework written in pure Python for crawling websites and extracting structured data; it is used in a very wide range of scenarios
  - You only need to customize a few modules to get a working crawler that grabs page content and images, which makes it very convenient
  - Scrapy uses the Twisted asynchronous networking framework (its main rival is Tornado) to handle network communication, which speeds up downloads without you having to implement the asynchronous machinery yourself, and it exposes all kinds of middleware hooks so you can flexibly meet almost any requirement
- Scrapy framework components
  - Scrapy Engine
    - Handles the communication, signals and data transfer between the Spider, Item Pipeline, Downloader and Scheduler
  - Scheduler
    - Receives the Requests sent over by the Engine, organizes and enqueues them in a certain order, and hands them back to the Engine when it asks for them
  - Downloader
    - Downloads all the Requests sent by the Scrapy Engine and returns the Responses it obtains to the Engine, which passes them on to the Spider for processing
  - Spider
    - Processes all Responses, extracts the data needed for the Item fields, and submits any URLs that need to be followed back to the Engine, which sends them to the Scheduler again
  - Item Pipeline
    - The place where Items produced by the Spider are post-processed (detailed parsing, filtering, storage, and so on)
  - Downloader Middlewares
    - Components you can customize to extend the download functionality
  - Spider Middlewares
    - Components you can customize to extend and operate on the communication between the Engine and the Spider (i.e. the Responses going into the Spider and the Requests coming out of it)
- Installation and usage
  - Installation
    - pip install scrapy -i https://pypi.douban.com/simple
  - Usage
    - Create a project
      - cd into the directory where the project should live
      - scrapy startproject meiju
    - Create a spider
      - cd into the project directory
      - scrapy genspider meijuSpider meijutt.tv
      - meijuSpider is the spider file name
      - meijutt.tv is the domain of the site to crawl
    - Project files
      - scrapy.cfg
        - Project configuration, mainly a base configuration for the Scrapy command-line tool (the settings that actually matter for crawling live in settings.py)
      - items.py
        - Defines the data storage templates for structured data, similar to Django's Model
      - pipelines.py
        - Data-processing behaviour, e.g. persisting the structured data
      - settings.py
        - Configuration file: recursion depth, concurrency, download delay, etc.
      - spiders
        - Spider directory: create files here and write the crawling rules
    - Define the Item
      - class MeijuItem(scrapy.Item):
            name = scrapy.Field()
    - Write the spider
      - Set the start URL
        - start_urls = ['http://www.meijutt.tv/new100.html']
      - Process the data
        - def parse(self, response):
              item = MeijuItem()
              item['name'] = name
              yield item
    - Enable an Item Pipeline component in settings.py
      - ITEM_PIPELINES = {
            'meiju.pipelines.MeijuPipeline': 300,
        }
    - Write the Pipeline to store the extracted Items
      - Quick export
        - From the run file / command line:
          - scrapy crawl meijuSpider -o meiju.json
          - scrapy crawl meijuSpider -o meiju.csv
          - scrapy crawl meijuSpider -o meiju.xml
    - Run the spider
      - From the command line
        - scrapy crawl meijuSpider
        - scrapy crawl meijuSpider --nolog
      - Or create a run file, start.py
        - # Run the scrapy command to start the spider
          scrapy.cmdline.execute(['scrapy', 'crawl', 'mymeiju'])
          # scrapy.cmdline.execute(['scrapy', 'crawl', 'mymeiju', '--nolog'])
          scrapy.cmdline.execute('scrapy crawl mymeiju'.split())
          # scrapy.cmdline.execute('scrapy crawl mymeiju --nolog'.split())
Scraping the meijutt.tv American-TV site with Scrapy
- mymeiju.py
import scrapy
from ..items import MeijuItem


class MymeijuSpider(scrapy.Spider):
    # Spider name: must be unique
    name = 'mymeiju'
    # Allowed domains
    allowed_domains = ['meijutt.tv']
    # Start URLs: crawled automatically as soon as the project starts
    start_urls = ['https://www.meijutt.tv/new100.html']

    # Parsing method:
    # 1. Called automatically with the response once each request in start_urls has completed
    def parse(self, response, **kwargs):
        print('*' * 100)
        # print(response)
        # print(type(response))  # <class 'scrapy.http.response.html.HtmlResponse'>
        # print(response.text)  # text content
        # print(response.body)  # raw bytes
        # print(response.json())  # parse JSON
        print('*' * 100)
        # Parse the data with XPath
        li_list = response.xpath('//ul[@class="top-list fn-clear"]/li')
        for li in li_list:
            # Three ways to get the content:
            # name = li.xpath('./h5/a/text()').get()
            # name = li.xpath('./h5/a/text()')[0].extract()
            # name = li.xpath('./h5/a/text()').extract_first()
            # name = li.xpath('./span/text()').getall()  # all matches, as a list
            name = li.xpath('./h5/a/text()').get()  # series title
            state = li.xpath('./span[1]/font/text()').get()  # state: episode count
            mjzm = li.xpath('./span[2]/em/text()').get()  # subtitles
            mjjq = li.xpath('./span[3]/text()').get()  # genre
            mjtv = li.xpath('./span[4]/text()').get()  # TV station
            mjtime = li.xpath('./div[last()]/font/text()').get()  # update time
            if not mjtime:
                mjtime = li.xpath('./div[last()]/text()').get()
            # print(name)
            # item: wrap each record
            # item = MeijuItem()
            # item['name'] = name  # dot access is not supported
            item = MeijuItem(
                name=name, state=state, mjzm=mjzm,
                mjjq=mjjq, mjtv=mjtv, mjtime=mjtime
            )
            # A generator is both an iterator and an iterable
            yield item
            # The yielded item is passed into the pipelines; two things are required:
            # 1. yield item inside the parse method
            # 2. configure ITEM_PIPELINES in settings
        # yield can return two kinds of values:
        # 1. an Item
        # 2. a Request / FormRequest
- items.py
import scrapy


# Item: similar to a Model in Django
class MeijuItem(scrapy.Item):
    name = scrapy.Field()
    state = scrapy.Field()
    mjzm = scrapy.Field()
    mjjq = scrapy.Field()
    mjtv = scrapy.Field()
    mjtime = scrapy.Field()
- pipelines.py
from itemadapter import ItemAdapter


# Pipeline: dedicated to storing the data
class MeijuPipeline:

    # Called automatically once when the spider starts
    def open_spider(self, spider):
        pass
        # Open the file
        # self.fp = open('meiju.txt', 'a', encoding='utf-8')
        # print('Crawl started......')

    # Called automatically when the spider closes
    def close_spider(self, spider):
        pass
        # Close the file
        # self.fp.close()
        # print('Crawl finished!')

    # process_item: called many times (once per yielded item)
    def process_item(self, item, spider):
        # print(spider.name)  # spider name
        # print(f'item:{item}', type(item))
        # Write to a file
        # with open('meiju.txt', 'a', encoding='utf-8') as fp:
        #     fp.write(str(item) + '\n')
        # self.fp.write(str(item) + '\n')
        # print(f'{item["name"]} written')
        return item
- settings.py
BOT_NAME = 'meiju'
SPIDER_MODULES = ['meiju.spiders']
NEWSPIDER_MODULE = 'meiju.spiders'
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'meiju.pipelines.MeijuPipeline': 300,
}
- start.py
import scrapy.cmdline

# Run the scrapy command to start the spider
# scrapy.cmdline.execute(['scrapy', 'crawl', 'mymeiju'])
# scrapy.cmdline.execute(['scrapy', 'crawl', 'mymeiju', '--nolog'])
# Using split
# scrapy.cmdline.execute('scrapy crawl mymeiju'.split())
# scrapy.cmdline.execute('scrapy crawl mymeiju --nolog'.split())
# Quick export to a file of a given format
# Supported formats: ('json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle')
# scrapy.cmdline.execute('scrapy crawl mymeiju -o meiju2.json'.split())
scrapy.cmdline.execute('scrapy crawl mymeiju -o meiju2.csv'.split())
Scraping Dangdang with Scrapy
- dangdang_spider.py
import scrapy
from ..items import DangdangItem
class DangdangSpiderSpider(scrapy.Spider):
name = 'dangdang_spider'
allowed_domains = ['dangdang.com']
start_urls = ['http://category.dangdang.com/pg1-cp01.01.02.00.00.00.html']
def parse(self, response, **kwargs):
li_list = response.xpath('//ul[@id="component_59"]/li')
for li in li_list:
book_name = li.xpath('./a/@title').get()
book_price = li.xpath('./p[@class="price"]/span[@class="search_now_price"]/text()').get()
book_author = li.xpath('./p[@class="search_book_author"]/span[1]/a/text()').get()
book_publishers = li.xpath('./p[@class="search_book_author"]/span[3]/a/text()').get()
book_star = li.xpath('./p[@class="search_star_line"]/span/span/@style').get()[6:-1]
book_comment = li.xpath('./p[4]/a/text()').get()
book_picture = li.xpath('./a/img/@data-original')
if book_picture:
book_picture = book_picture.get()
else:
book_picture = li.xpath('./a/img/@src').get()
print(book_picture)
item = DangdangItem(
book_name=book_name,
book_price=book_price,
book_author=book_author,
book_publishers=book_publishers,
book_star=book_star,
book_comment=book_comment,
book_picture=book_picture
)
yield item
- items.py
import scrapy
class DangdangItem(scrapy.Item):
book_name = scrapy.Field()
book_price = scrapy.Field()
book_author = scrapy.Field()
book_publishers = scrapy.Field()
book_star = scrapy.Field()
book_comment = scrapy.Field()
book_picture = scrapy.Field()
- pipelines.py
import pymysql
class DangdangPipeline:
def open_spider(self, spider):
print('开始爬取')
self.db = pymysql.connect(
host='localhost',
port=3306,
user='root',
password='******',
database='spider2003',
charset='utf8'
)
self.cur = self.db.cursor()
def close_spider(self, spider):
print('爬取结束')
self.cur.close()
self.db.close()
def process_item(self, item, spider):
# item['name'].replace('"', "'") # 单引号替换双引号
sql = 'insert into dangdang(book_name, book_price, book_author, book_publishers, book_star, book_comment, book_picture) values ("%s", "%s", "%s", "%s", "%s", "%s", "%s")' % (item['book_name'], item['book_price'], item['book_author'], item['book_publishers'], item['book_star'], item['book_comment'], item['book_picture'])
try:
self.cur.execute(sql)
self.db.commit()
except Exception as e:
print(e)
self.db.rollback()
return item
- settings.py
BOT_NAME = 'dangdang'
SPIDER_MODULES = ['dangdang.spiders']
NEWSPIDER_MODULE = 'dangdang.spiders'
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
'dangdang.pipelines.DangdangPipeline': 300,
}
- start.py
import scrapy.cmdline
scrapy.cmdline.execute('scrapy crawl dangdang_spider'.split())
Scrapy Intermediate
- Scrapy Shell
  - Overview
    - The Scrapy shell is an interactive console that lets you try out and debug scraping code without starting a spider
  - Start the Scrapy shell
    - scrapy shell "https://hr.tencent.com/position.php?&start=0#a"
  - Then work interactively
    - response.text, etc.
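A short sketch of what such a shell session might look like (the selectors are illustrative placeholders, not taken from the actual page):
# Inside the Scrapy shell, `response` is already bound to the fetched page
response.status                               # HTTP status code
response.text[:200]                           # start of the decoded body
response.xpath('//title/text()').get()        # page title via XPath
response.css('a::attr(href)').getall()        # all link targets via CSS
fetch('https://hr.tencent.com/position.php?&start=10#a')  # fetch() loads another URL into `response`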
- Selectors
  - Scrapy Selectors have built-in support for XPath and CSS selector expressions
  - A Selector has four basic methods (xpath is the most common)
    - xpath()
      - Takes an XPath expression and returns a SelectorList of all matching nodes
    - extract()
      - Serializes the matched nodes to unicode strings and returns them as a list; extract_first() returns just the first one
    - css()
      - Takes a CSS expression and returns a SelectorList of all matching nodes; the syntax is the same as soup.select() in BeautifulSoup4
    - re()
      - Extracts data with a regular expression and returns a list of unicode strings
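A small sketch exercising those four methods on a made-up HTML snippet:
from scrapy.selector import Selector

# Tiny, invented HTML just to demonstrate the Selector API
html = '<ul><li class="item"><a href="/a">A1</a></li><li class="item"><a href="/b">B2</a></li></ul>'
sel = Selector(text=html)

print(sel.xpath('//li/a/text()').extract())        # ['A1', 'B2']
print(sel.xpath('//li/a/text()').extract_first())  # 'A1'
print(sel.css('li.item a::attr(href)').extract())  # ['/a', '/b']
print(sel.xpath('//a/text()').re(r'\d+'))          # ['1', '2']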
- The Spider class
  - Overview
    - A Spider class defines how a certain site (or group of sites) is crawled: the crawling actions (e.g. whether to follow links) and how to extract structured data (Items) from the page content. In other words, the Spider is where you define the crawling behaviour and the parsing of a particular page (or pages)
    - scrapy.Spider is the most basic spider class; every spider you write must inherit from it
  - Main methods and their call order
    - __init__()
      - Initializes the spider name and the start_urls list
    - start_requests()
      - Calls make_requests_from_url() to generate Request objects, which Scrapy downloads, returning responses
    - parse(self, response)
      - Parses the response and returns Items or Requests (the latter need a callback). Items go to the Item Pipeline for persistence, while Requests are downloaded by Scrapy and handled by the specified callback (parse() by default); this loop continues until all data has been processed
  - Main attributes and methods
    - name
      - A string naming the spider; it must be unique
    - allowed_domains
      - An optional list of the domains the spider is allowed to crawl
    - start_urls
      - The initial tuple/list of URLs; when no specific URLs are given, the spider starts crawling from this list
    - start_requests(self)
      - Must return an iterable containing the first Requests the spider will crawl (the default implementation uses the URLs in start_urls)
      - Called when the spider starts crawling and no specific start URLs were given
    - parse(self, response)
      - The default callback for Requests whose URL did not specify one; it processes the response and produces Items and/or further Request objects
    - log(self, message[, level, component])
      - Logs a message via scrapy.log.msg()
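A minimal sketch pulling those attributes and methods together (the spider name, domain and selectors are placeholders):
import scrapy


class QuotesSpider(scrapy.Spider):
    # Unique spider name
    name = 'quotes'
    # Optional: restrict crawling to these domains
    allowed_domains = ['example.com']
    # URLs the default start_requests() turns into the first Requests
    start_urls = ['http://example.com/page/1']

    def parse(self, response, **kwargs):
        # Default callback: extract data...
        for title in response.xpath('//h2/text()').getall():
            yield {'title': title}
        # ...and follow a "next page" link if there is one
        next_page = response.xpath('//a[@rel="next"]/@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)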
- The CrawlSpider class
  - Overview
    - CrawlSpider is a subclass of Spider
    - The plain Spider class is designed to crawl only the pages in start_urls
    - CrawlSpider adds rules that provide a convenient mechanism for following links, so it is better suited to extracting links from crawled pages and continuing to crawl them
    - This is what makes automatic pagination possible (see the sketch after this list)
  - LinkExtractors
    - Overview
      - The purpose of a LinkExtractor is to extract links
      - Every LinkExtractor has a single public method, extract_links(), which takes a Response object and returns a list of scrapy.link.Link objects
    - Main parameters
      - allow
        - URLs matching this regular expression (or list of expressions) are extracted; if empty, everything matches
      - deny
        - URLs matching this regular expression (or list of expressions) are never extracted
      - allow_domains
        - Domains whose links will be extracted
      - deny_domains
        - Domains whose links will never be extracted
      - restrict_xpaths
        - XPath expressions that, together with allow, restrict the region of the page links are taken from
  - rules
    - Overview
      - rules contains one or more Rule objects; each Rule defines a particular behaviour for crawling the site
      - If several Rules match the same link, the first one defined in the list wins
    - Main parameters
      - link_extractor
        - A LinkExtractor object defining which links to extract
      - callback
        - The callback invoked for every link obtained from link_extractor; it receives a response as its first argument (avoid using parse itself)
      - follow
        - A boolean specifying whether links extracted from responses matched by this rule should be followed
        - follow=True
          - Follow: links on the child pages that match the rule are crawled as well, automatically
      - process_links
        - The name of a spider method called with the list of links extracted by link_extractor; mainly used for filtering
      - process_request
        - The name of a spider method called for every Request this rule extracts (used to filter Requests)
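A minimal CrawlSpider sketch under these rules (the site and patterns are placeholders; the Qiushibaike project later in these notes is a real instance):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class PagingSpider(CrawlSpider):
    name = 'paging'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/list/page/1/']

    rules = [
        Rule(
            # Follow pagination links like /list/page/2/, but only those inside the pager element
            LinkExtractor(allow=(r'/list/page/\d+/',), restrict_xpaths=('//div[@class="pager"]',)),
            callback='parse_item',   # do not name the callback 'parse' in a CrawlSpider
            follow=True              # keep following matching links found on the new pages
        )
    ]

    def parse_item(self, response):
        for row in response.xpath('//div[@class="row"]'):
            yield {'text': row.xpath('./text()').get()}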
- The Robots protocol
  - Overview
    - The Robots protocol (also called the crawler or robot protocol), formally the Robots Exclusion Protocol, is how a website tells search engines which pages may be crawled and which may not
    - robots.txt is a plain text file. When a crawler visits a site it first checks for robots.txt in the site root; if the file exists, the crawler determines its scope from it, and if not, the crawler can reach every page that is not password-protected
  - Usage
    - To ignore the robots protocol in Scrapy, change ROBOTSTXT_OBEY = True to False
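Outside Scrapy, the standard library can read a robots.txt for you; a small sketch (the URL being checked is only an example):
from urllib import robotparser

# Download and parse the site's robots.txt, then ask whether a URL may be fetched
rp = robotparser.RobotFileParser()
rp.set_url('https://www.baidu.com/robots.txt')
rp.read()
print(rp.can_fetch('*', 'https://www.baidu.com/s?wd=python'))  # True or False, depending on the rules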
- Deep crawling
  - Crawl a page, extract links, follow them, crawl those pages, extract more links, follow them again......
  - scrapy.Request crawls asynchronously:
    - yield scrapy.Request(
          url=href,                     # the link to follow
          callback=self.parse_detail,   # callback invoked with the response once the request succeeds
          meta={'name': name}           # data passed along into parse_detail
      )
  - Retrieve the novel title in the callback; it is handed down level by level:
    - name = response.meta['name']
  - Hand the data to the pipeline:
    - yield BiqugeItem(name=name, zj_name=zj_name, zj_content=zj_content)
- Pagination by looping
  - # Crawl the next page
    if self.page <= 100:
        print(f'---开始爬取{self.page}页---')
        self.page = self.page + 1
        url = 'http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_%d.shtml' % self.page
        yield scrapy.Request(url, callback=self.parse)
Scraping the Biquge novel site with Scrapy
- biquege_spider.py
import requests
import scrapy
from ..items import BiqugeItem
class BiqugeSpiderSpider(scrapy.Spider):
name = 'biquge_spider'
allowed_domains = ['biquge5200.cc']
start_urls = ['https://www.biquge5200.cc/xuanhuanxiaoshuo/']
# 爬取笔趣阁的首页
def parse(self, response, **kwargs):
# 解析数据
li_list = response.xpath('//div[@class="l"]/ul/li')
for li in li_list:
name = li.xpath('./span[@class="s2"]/a/text()').get() # 小说名
href = li.xpath('./span[@class="s2"]/a/@href').get() # 小说链接
# requests:同步
# print(len(requests.get(href).text))
# print('-' * 100)
# 异步:scrapy.Request
# 请求小说详情页
yield scrapy.Request(
url=href, # url链接
callback=self.parse_detail, # 回调函数:请求成功后的响应
meta={'name': name} # 传入到parse_detail中的数据
)
# 详情页
def parse_detail(self, response):
# 取出小说名
name = response.meta['name']
# 解析数据
dd_list = response.xpath('//div[@id="list"]/dl/dd')
for dd in dd_list:
zj_name = dd.xpath('./a/text()').get() # 章节名称
zj_href = dd.xpath('./a/@href').get() # 章节内容链接
# 请求每个章节的小说内容
yield scrapy.Request(
url=zj_href,
callback=self.parse_content,
meta={'name': name, 'zj_name': zj_name}
)
# 小说内容页
def parse_content(self, response):
# 取出小说名及章节名
name = response.meta['name']
zj_name = response.meta['zj_name']
# 解析数据
p_list = response.xpath('//*[@id="content"]/p/text()').getall()
zj_content = '\n'.join(p_list)
# item
# 将数据传入管道
yield BiqugeItem(name=name, zj_name=zj_name, zj_content=zj_content)
- items.py
import scrapy
class BiqugeItem(scrapy.Item):
name = scrapy.Field()
zj_name = scrapy.Field()
zj_content = scrapy.Field()
- pipelines.py
import os
from itemadapter import ItemAdapter
class BiqugePipeline:
    # def __init__(self):
    #     self.path = r'C:\Users\86188\Desktop\Spider\Day05\scrapy_project\biquge\books'

    def process_item(self, item, spider):
        # Create the novel's folder if it does not exist yet, then always append the chapter
        if not os.path.isdir("books/%s" % item['name']):
            os.mkdir("books/%s" % item['name'])
        with open('books/%s/%s.txt' % (item["name"], item["zj_name"]), 'a', encoding='utf-8') as fp:
            fp.write(item["zj_content"])
            fp.flush()
        print(f'item:{item["name"]}-{item["zj_name"]}')
        return item
- settings.py
BOT_NAME = 'biquge'
SPIDER_MODULES = ['biquge.spiders']
NEWSPIDER_MODULE = 'biquge.spiders'
USER_AGENT = 'biquge (+http://www.yourdomain.com)'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
ITEM_PIPELINES = {
'biquge.pipelines.BiqugePipeline': 300,
}
- start.py
import scrapy.cmdline
scrapy.cmdline.execute('scrapy crawl biquge_spider'.split())
Scraping JD with Scrapy
- jd_spider.py
import scrapy
from selenium import webdriver
from ..items import JdItem
'''
The Referer header is one of the common anti-scraping checks.
Its value tells the server which page you navigated from.
For example, when I request Taobao product reviews, the referer is the product detail page,
showing I asked for those reviews from that page; without a referer the reviews are not returned.
from fake_useragent import UserAgent
# pretend to be a browser
ua = UserAgent()
headers = {'User-Agent': ua.random}  # usually this is enough, but hotlink-protected images also need the following
# The idea: tell the image server that you are loading the image from its host page
headers = {'User-Agent': ua.random, 'Referer': 'put the image host page here'}
# then pass headers into the subsequent requests calls
'''
class JdSpiderSpider(scrapy.Spider):
name = 'jd_spider'
allowed_domains = ['jd.com']
start_urls = [
# 'https://list.jd.com/list.html?cat=1318%2C12099%2C9756&page=1&s=1&click=0',
# 'https://list.jd.com/listNew.php?cat=1318%2C12099%2C9756&page=4&s=79&scrolling=y&log_id=1600660067305.2410&tpl=3_M&isList=1&show_items=',
'https://list.jd.com/listNew.php?cat=1318%2C12099%2C9756&page=6&s=131&scrolling=y&log_id=1600661434422.8716&tpl=3_M&isList=1&show_items='
]
page1 = 1
# page2 = 2
s1 = 1
# s2 = 27
def parse(self, response, **kwargs):
# driver = webdriver.Chrome()
# driver.execute_script('window.scrollBy(0,10000)')
li_list = response.xpath('//li[@class="gl-item"]')
print(len(li_list))
for li in li_list:
shoes_name = li.xpath('./div/div[@class="p-img"]/a/@title').get()
shoes_price = li.xpath('./div/div[@class="p-price"]/strong/i/text()').get()
shoes_picture = li.xpath('./div/div[@class="p-img"]/a/img/@data-lazy-img').get()
print(shoes_name, shoes_price, shoes_picture)
yield JdItem(shoes_name=shoes_name, shoes_price=shoes_price, shoes_picture=shoes_picture)
# driver.close()
# if self.page1 <= 10:
# # if self.page2 <= 200:
# print(f'---开始爬取{self.page1}页---')
# # print(f'---开始爬取{self.page2}页---')
# self.page1 = self.page1 + 2
# self.s1 = self.s1 + 52
# # self.page2 = self.page2 + 2
# # self.s2 = self.s2 + 52
# url = f'https://list.jd.com/list.html?cat=1318%2C12099%2C9756&page={self.page1}&s={self.s1}&click=0'
# # url = f'https://list.jd.com/listNew.php?cat=1318%2C12099%2C9756&page={self.page2}&s={self.s2}&scrolling=y&log_id=1600431181482.2679&tpl=3_M&isList=1&show_items='
#
#
# yield scrapy.Request(url, callback=self.parse)
- items.py
import scrapy
class JdItem(scrapy.Item):
shoes_name = scrapy.Field()
shoes_price = scrapy.Field()
shoes_picture = scrapy.Field()
- pipelines.py
import pymysql
from itemadapter import ItemAdapter
class JdPipeline:
def open_spider(self, spider):
print('连接数据库')
self.db = pymysql.connect(
user='root', password='******',database='spider2003'
)
self.cur = self.db.cursor()
def close_spider(self, spider):
print('关闭连接')
self.cur.close()
self.db.close()
def process_item(self, item, spider):
sql = 'insert into jd (shoes_name, shoes_price, shoes_picture) values ("%s", "%s", "%s")' % (item['shoes_name'], item['shoes_price'], item['shoes_picture'])
try:
self.cur.execute(sql)
self.db.commit()
except Exception as e:
print(e)
self.db.rollback()
return item
- settings.py
BOT_NAME = 'jd'
SPIDER_MODULES = ['jd.spiders']
NEWSPIDER_MODULE = 'jd.spiders'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'referer': 'https://list.jd.com/list.html?cat=1318%2C12099%2C9756&page=3&s=53&click=0'
}
ITEM_PIPELINES = {
'jd.pipelines.JdPipeline': 300,
}
- start.py
import scrapy.cmdline
# scrapy.cmdline.execute('scrapy crawl jd_spider --nolog'.split())
scrapy.cmdline.execute('scrapy crawl jd_spider'.split())
Scraping Qiushibaike with Scrapy
- qsbk_spider.py
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from ..items import QiushibaikeItem
# 导入日志模块
import logging
# 配置日志输出格式
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(module)s - %(message)s" # 设置输出格式
DATE_FORMAT = "%Y/%m/%d %H:%M:%S" # 设置时间格式
logging.basicConfig(filename='qsbk.log', filemode='a+', format=LOG_FORMAT, datefmt=DATE_FORMAT)
class QsbkSpiderSpider(CrawlSpider):
# class QsbkSpiderSpider(scrapy.Spider):
name = 'qsbk_spider'
allowed_domains = ['qiushibaike.com']
start_urls = ['https://www.qiushibaike.com/text/page/1/']
rules = [
Rule(
LinkExtractor(
allow=('/text/page/\d+/',),
restrict_xpaths=('//ul[@class="pagination"]',)
),
callback="parse_item",
follow=True
)
]
def parse_item(self, response, **kwargs):
div_list = response.xpath('//div[@class="col1 old-style-col1"]/div')
for div in div_list:
author = div.xpath('./div[@class="author clearfix"]/a[2]/h2/text()').get()
content = div.xpath('./a[@class="contentHerf"]/div/span/text()').getall() # 有br换行时,要用getall,但是要处理结果
logging.info(f'download:{author}')
yield QiushibaikeItem(author=author, content=content)
- items.py
import scrapy
class QiushibaikeItem(scrapy.Item):
author = scrapy.Field()
content = scrapy.Field()
- pipelines.py
import os
import random
from itemadapter import ItemAdapter
class QiushibaikePipeline:
def process_item(self, item, spider):
with open('cross_talk/%s-%f.txt' % (item['author'].replace('\n', ''), random.random()), 'w', encoding='utf-8') as fp:
fp.write((''.join(item['content'])).replace('\n', ''))
fp.flush()
return item
- settings.py
BOT_NAME = 'qiushibaike'
SPIDER_MODULES = ['qiushibaike.spiders']
NEWSPIDER_MODULE = 'qiushibaike.spiders'
USER_AGENT = 'qiushibaike (+http://www.yourdomain.com)'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
ITEM_PIPELINES = {
'qiushibaike.pipelines.QiushibaikePipeline': 300,
}
- start.py
import scrapy.cmdline
# scrapy.cmdline.execute('scrapy crawl qsbk_spider --nolog'.split())
scrapy.cmdline.execute('scrapy crawl qsbk_spider'.split())
Scraping Sina News with Scrapy
- news_spider.py
import scrapy
from ..items import SinaNewsItem
class NewsSpiderSpider(scrapy.Spider):
name = 'news_spider'
allowed_domains = ['sina.com.cn']
start_urls = ['http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_1.shtml']
# 自定义类属性
page = 1
def parse(self, response, **kwargs):
li_list = response.xpath('//ul[@class="list_009"]/li')
for li in li_list:
news = li.xpath('./a/text()').get()
news_time = li.xpath('./span/text()').get()
news_link = li.xpath('./a/@href').get()
item = SinaNewsItem(
news=news,
news_time=news_time,
news_link=news_link,
)
yield item
# 爬取下一页
if self.page <= 100:
print(f'---开始爬取{self.page}页---')
self.page = self.page + 1
url = 'http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_%d.shtml' % self.page
yield scrapy.Request(url, callback=self.parse)
- items.py
import scrapy
class SinaNewsItem(scrapy.Item):
news = scrapy.Field()
news_time = scrapy.Field()
news_link = scrapy.Field()
- pipelines.py
import pymysql
from itemadapter import ItemAdapter
class SinaNewsPipeline:
def open_spider(self, spider):
print('开始爬取')
self.db = pymysql.connect(
host='localhost',
port=3306,
user='root',
password='******',
database='spider2003',
charset='utf8'
)
self.cur = self.db.cursor()
def close_spider(self, spider):
print('爬取结束')
self.cur.close()
self.db.close()
def process_item(self, item, spider):
news = item['news']
news_time = item['news_time']
news_link = item['news_link']
try:
sql = 'insert into sina_news(news, news_time, news_link) values ("%s", "%s", "%s")' % (news, news_time, news_link)
self.cur.execute(sql)
self.db.commit()
except Exception as e:
print(e)
self.db.rollback()
return item
- settings.py
BOT_NAME = 'sina_news'
SPIDER_MODULES = ['sina_news.spiders']
NEWSPIDER_MODULE = 'sina_news.spiders'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
ITEM_PIPELINES = {
'sina_news.pipelines.SinaNewsPipeline': 300,
}
- start.py
import scrapy.cmdline
scrapy.cmdline.execute('scrapy crawl news_spider'.split())
Scrapy Advanced
- Logging
  - Scrapy's built-in log support
    - Adding the lines below anywhere in settings.py makes the output much cleaner
      - LOG_ENABLED = True          # enable logging
        LOG_FILE = "mySpider.log"   # log file name
        LOG_LEVEL = "INFO"          # log level
  - Log levels
    - Scrapy provides 5 logging levels
      - CRITICAL - critical errors
      - ERROR - regular errors
      - WARNING - warning messages
      - INFO - informational messages
      - DEBUG - debugging messages
  - logging settings
    - The following settings in settings.py configure logging
      - LOG_ENABLED
        - Default: True, enables logging
      - LOG_ENCODING
        - Default: 'utf-8', encoding used for logging
      - LOG_FILE
        - Default: None, file name for the log output, created in the current directory
      - LOG_LEVEL
        - Default: 'DEBUG', the minimum level to log
    - Scrapy's own log module has been deprecated by Scrapy
  - Using Python's built-in logging module
    - import logging

      LOG_FORMAT = "%(asctime)s - %(levelname)s - %(module)s - %(message)s"  # output format
      DATE_FORMAT = "%Y/%m/%d %H:%M:%S"  # timestamp format
      logging.basicConfig(filename='sina.log', filemode='a+', format=LOG_FORMAT, datefmt=DATE_FORMAT)
      logging.warning('something went wrong')
- settings configuration
  - Overview
    - Scrapy settings let you customize the behaviour of all Scrapy components, including the core, extensions, pipelines and spiders
  - Settings
    - BOT_NAME
      - Default: 'scrapybot'
      - The name of the bot implemented by this project (also the project name); used to build the default User-Agent and for logging
      - Filled in automatically when you create a project with startproject
    - CONCURRENT_ITEMS
      - Default: 100
      - Maximum number of items processed concurrently in the Item Processor (i.e. the Item Pipeline) per response
    - CONCURRENT_REQUESTS
      - Default: 16
      - Maximum number of concurrent requests performed by the Scrapy downloader
    - CONCURRENT_REQUESTS_PER_DOMAIN
      - Default: 8
      - Maximum number of concurrent requests to any single domain
    - CONCURRENT_REQUESTS_PER_IP
      - Default: 0
      - Maximum number of concurrent requests to any single IP
      - If non-zero, CONCURRENT_REQUESTS_PER_DOMAIN is ignored and this setting is used instead
    - DEFAULT_REQUEST_HEADERS
      - DEFAULT_REQUEST_HEADERS = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en',
        }
      - Default headers used for Scrapy HTTP requests, populated by DefaultHeadersMiddleware
    - DEPTH_LIMIT
      - Default: 0
      - Maximum allowed crawl depth; 0 means no limit
    - DOWNLOADER
      - Default: 'scrapy.core.downloader.Downloader'
      - The downloader used for crawling
    - DOWNLOADER_MIDDLEWARES
      - Default: {}
      - Dict of downloader middlewares enabled in the project and their order
    - DOWNLOADER_MIDDLEWARES_BASE
      - Default: the dict of downloader middlewares enabled by default in Scrapy; never modify this setting in a project
    - DOWNLOAD_DELAY
      - Default: 0. Time the downloader waits before fetching the next page of the same website; useful for throttling the crawl and easing the load on the server. Fractional values are supported
    - DOWNLOAD_TIMEOUT
      - Default: 180
      - Downloader timeout, in seconds
    - ITEM_PIPELINES
      - Default: {}
      - Dict of pipelines enabled in the project and their order; empty by default. The values are arbitrary but conventionally kept in the 0-1000 range, and lower values run first
    - ITEM_PIPELINES_BASE
      - Default: {}
      - Dict of pipelines enabled by default in the project; never modify this setting, modify ITEM_PIPELINES instead
    - LOG_ENABLED
      - Default: True
      - Whether logging is enabled
    - LOG_ENCODING
      - Default: 'utf-8'
      - Encoding used for logging
    - LOG_FILE
      - Default: None
      - File name for the log output; if None, standard error is used
    - LOG_LEVEL
      - Default: 'DEBUG'
      - Available levels: CRITICAL, ERROR, WARNING, INFO, DEBUG
    - LOG_STDOUT
      - Default: False
      - If True, all standard output (and errors) of the process is redirected to the log; for example, print('hello') would show up in the Scrapy log
    - REDIRECT_MAX_TIMES
      - Default: 20
      - Maximum number of redirects allowed per request; beyond that, the request simply returns whatever it got. For some tasks the Firefox default is used
    - ROBOTSTXT_OBEY
      - Default: True
      - If enabled, Scrapy respects robots.txt policies
    - SCHEDULER
      - Default: 'scrapy.core.scheduler.Scheduler'
      - The scheduler used for the crawl
    - SPIDER_MIDDLEWARES_BASE
      - Dict of spider middlewares enabled by default; never modify this setting, modify SPIDER_MIDDLEWARES instead
    - SPIDER_MODULES
      - Default: []
      - List of modules where Scrapy looks for spiders
    - URLLENGTH_LIMIT
      - Default: 2083
      - Maximum length of crawled URLs
    - USER_AGENT
      - Default: "Scrapy/VERSION (+http://scrapy.org)"
      - Default User-Agent used for crawling, unless overridden
    - REACTOR_THREADPOOL_MAXSIZE
      - Thread pool size, 10 by default
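A small settings.py sketch combining a few of the knobs above (the values are illustrative, not recommendations):
# settings.py (excerpt) - illustrative values only
BOT_NAME = 'myproject'

ROBOTSTXT_OBEY = False          # ignore robots.txt
CONCURRENT_REQUESTS = 8         # fewer parallel requests than the default 16
DOWNLOAD_DELAY = 1.5            # wait 1.5 s between pages of the same site
DOWNLOAD_TIMEOUT = 30           # give up on a download after 30 s
DEPTH_LIMIT = 3                 # do not follow links deeper than 3 levels

LOG_FILE = 'myproject.log'
LOG_LEVEL = 'INFO'

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

ITEM_PIPELINES = {
    'myproject.pipelines.MyPipeline': 300,   # lower number = earlier in the chain
}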
- Custom middleware
  - Middleware hooks
    - process_request(self, request, spider)
      - Called for every request passing through the downloader middleware
    - process_response(self, request, response, spider)
      - Called when the downloader has finished the HTTP request and is passing the response back to the engine
  - Writing your own
    - Create the middleware classes
      - # Random User-Agent
        class RandomUserAgent(object):
            def process_request(self, request, spider):
                useragent = random.choice(USER_AGENTS)
                request.headers.setdefault("User-Agent", useragent)

        # Random proxy IP
        class RandomProxy(object):
            def process_request(self, request, spider):
                proxy = random.choice(PROXIES)
                request.meta['proxy'] = "http://" + proxy['ip_port']
    - Configure the middleware
      - Finally, register your downloader middleware classes in DOWNLOADER_MIDDLEWARES in settings.py
      - DOWNLOADER_MIDDLEWARES = {
            'baidu.middlewares.BaiduDownloaderMiddleware': 543,  # register the middleware
            'baidu.middlewares.UADownloaderMiddleware': 300,
            'baidu.middlewares.ProxyDownloaderMiddleware': 200,
        }
        USER_AGENTS = [
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5"
        ]
        PROXIES = [
            {'ip_port': '58.218.200.214:8730'},
            {'ip_port': '58.218.200.247:2359'},
            {'ip_port': '58.218.200.248:8503'},
            {'ip_port': '58.218.200.229:4612'},
            {'ip_port': '58.218.200.214:5570'},
            {'ip_port': '58.218.200.214:8801'},
        ]
- POST requests
  - If the very first request is a POST
    - Comment out the start_urls attribute and override the start_requests method
      - def start_requests(self):
            yield scrapy.FormRequest(
                url='http://fanyi.baidu.com/sug',
                formdata={'kw': 'wolf'},
                callback=self.parse_item
            )
  - If the first request is not a POST
    - response = requests.post("http://www.baidu.com/", data=data, headers=headers)
      yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse_item)
Scraping Xinpianchang (comprehensive project)
- xpc_spider.py
import scrapy
from ..items import *
class XpcSpiderSpider(scrapy.Spider):
name = 'xpc_spider'
allowed_domains = ['xinpianchang.com']
start_urls = ['https://www.xinpianchang.com/channel/index/sort-like?from=navigator']
def parse(self, response, **kwargs):
# 解析数据
# 视频列表数据
li_list = response.xpath('//ul[@class="video-list"][1]/li')
for li in li_list:
# 作品id
pid = li.xpath('./@data-articleid').get()
# 作品标题
title = li.xpath('./div/div[1]/a/p/text()').get()
# 缩略图
thumbnail = li.xpath('./a/img/@_src').get()
category_list = li.xpath('.//div[@class="new-cate"]/span[@class="fs_12 fw_300 c_b_9"]/text()').getall()
# 分类
category = '|'.join(category_list)
category = category.replace(' ', '').replace('\n', '').replace('\t', '')
# 发布时间
created_at = li.xpath('.//p[@class="fs_12"]/text()').get()
# item
item = PostsItem()
item['pid'] = pid
item['title'] = title
item['thumbnail'] = thumbnail
item['category'] = category
item['created_at'] = created_at
# 进入详情页
post_url = f'https://www.xinpianchang.com/a{pid}?from=ArticleList'
request = scrapy.Request(url=post_url, callback=self.post_detail)
request.meta['post_item'] = item
yield request
# 作品详情页
def post_detail(self, response):
post_item = response.meta.get('post_item')
pid = post_item['pid']
# 解析数据
# 作品描述
description_list = response.xpath('//p[@class="desc line-hide fs_14 c_b_3 fw_300 line-hide-3"]/text()').getall()
description = ''.join(description_list)
description = description.replace(' ', '').replace('\n', '').replace('\t', '')
post_item['description'] = description
# 播放次数
play_counts = response.xpath('//i[@class="fs_12 fw_300 c_b_6 v-center play-counts"]/@data-curplaycounts').get()
post_item['play_counts'] = play_counts
# 点赞次数
like_counts = response.xpath('//span[@class="v-center like-counts fs_12 c_w_f fw_300"]/@data-counts').get()
post_item['like_counts'] = like_counts
# 视频数据
# video_url = 'https://mod-api.xinpianchang.com/mod/api/v2/media/ryM1l4365Wzwod2V?appKey=61a2f329348b3bf77&extend=userInfo%2CuserStatus'
vid = response.xpath('//a[@class="collection-star hollow-star"]/@data-vid').get()
video_url = f'https://mod-api.xinpianchang.com/mod/api/v2/media/{vid}?appKey=61a2f329348b3bf77&extend=userInfo%2CuserStatus'
# 请求视频数据
request = scrapy.Request(url=video_url, callback=self.video_detail)
request.meta['post_item'] = post_item
yield request
# 创作者数据
li_list = response.xpath('//div[@class="filmplay-creator right-section"]/ul/li')
for li in li_list:
# 创作者id
cid = li.xpath('./a/@data-userid').get()
# item
composer_item = ComposersItem()
composer_item['cid'] = cid
# 创作者url
composer_url = li.xpath('./a/@href').get()
composer_url = 'https://www.xinpianchang.com/' + composer_url
# 访问创作者详情页
request2 = scrapy.Request(url=composer_url, callback=self.composer_detail)
request2.meta['composer_item'] = composer_item
yield request2
# 版权/角色数据
cr_item = CopyrightsItem()
cr_item['pcid'] = f'{pid}_{cid}'
cr_item['pid'] = pid
cr_item['cid'] = cid
cr_item['roles'] = li.xpath('.//span[@class="roles fs_12 fw_300 c_b_9"]/text()').get()
yield cr_item
# 评论数据
comment_url = f'https://app.xinpianchang.com/comments?resource_id={pid}&type=article&page=1&per_page=24'
yield scrapy.Request(
url=comment_url,
callback=self.comment_detail
)
# 视频数据
def video_detail(self, response):
post_item = response.meta.get('post_item')
# 解析数据
content = response.json()
# 视频预览图
preview = content['data']['cover']
# 视频链接
video = content['data']['resource']['progressive'][0]['url']
# 视频格式
video_format = content['data']['resource']['progressive'][0]['mime']
# 视频时长
duration = content['data']['duration']
# item
post_item['preview'] = preview
post_item['video'] = video
post_item['video_format'] = video_format
post_item['duration'] = duration
# print(post_item)
yield post_item
# 创作者详情页
def composer_detail(self, response):
composer_item = response.meta.get('composer_item')
# banner图
banner = response.xpath('//div[@class="banner-wrap"]/@style').get()
banner = banner[banner.find('(')+1: -1]
# 用户头像
avatar = response.xpath('//div[@class="banner-wrap"]/div/span/img/@src').get()
# 是否加V
verified = response.xpath('//div[@class="banner-wrap"]/div/span/span[contains(@class, "author-v")]').get()
verified = 'yes' if verified else 'no'
# 名字
name = response.xpath('//p[@class="creator-name fs_26 fw_600 c_b_26"]/text()').get()
# 自我介绍
intro = response.xpath('//p[@class="creator-desc fs_14 fw_300 c_b_3 line-hide-1"]/text()').get()
# 被点赞次数
like_counts = response.xpath('//span[@class="like-counts fw_600 v-center"]/text()').get()
like_counts = like_counts.replace(',', '')
# 被关注数量
fans_counts = response.xpath('//span[@class="fans-counts fw_600 v-center"]/text()').get()
fans_counts = fans_counts.replace(',', '')
# 关注数量
follow_counts = response.xpath('//span[@class="follow-wrap"]/span[@class="fw_600 v-center"]/text()').get()
follow_counts = follow_counts.replace(',', '')
# 所在位置
location = response.xpath('//span[@class="icon-location v-center"]/following-sibling::*/text()').get()
location = location if location else ''
# 职业
career = response.xpath('//span[@class="icon-career v-center"]/following-sibling::*/text()').get()
career = career if career else ''
# item
composer_item['banner'] = banner
composer_item['avatar'] = avatar
composer_item['verified'] = verified
composer_item['name'] = name
composer_item['intro'] = intro
composer_item['like_counts'] = like_counts
composer_item['fans_counts'] = fans_counts
composer_item['follow_counts'] = follow_counts
composer_item['location'] = location
composer_item['career'] = career
yield composer_item
# 评论数据
def comment_detail(self, response):
content = response.json()
comment_list = content['data']['list']
for comment in comment_list:
# 评论其他评论的数量
reply = comment.get('referer')
if reply:
reply = reply.get('id')
else:
reply = 0
item = CommentsItem(
commentid=comment['id'],
pid=comment['resource_id'],
cid=comment['userid'],
avatar=comment['userInfo']['avatar'],
uname=comment['userInfo']['username'],
created_at=comment['addtime'],
content=comment['content'],
like_counts=comment['count_approve'],
reply=reply
)
yield item
- items.py
from scrapy import Item, Field
# 作品
class PostsItem(Item):
table_name = 'posts' # 表名
pid = Field()
title = Field()
thumbnail = Field()
preview = Field()
video = Field()
video_format = Field()
category = Field()
duration = Field()
created_at = Field()
description = Field()
play_counts = Field()
like_counts = Field()
class ComposersItem(Item):
table_name = 'composers' # 表名
cid = Field()
banner = Field()
avatar = Field()
verified = Field()
name = Field()
intro = Field()
like_counts = Field()
fans_counts = Field()
follow_counts = Field()
location = Field()
career = Field()
class CommentsItem(Item):
table_name = 'comments' # 表名
commentid = Field()
pid = Field()
cid = Field()
avatar = Field()
uname = Field()
created_at = Field()
content = Field()
like_counts = Field()
reply = Field()
# 版权:作者在作品中的角色
class CopyrightsItem(Item):
table_name = 'copyrights' # 表名
pcid = Field()
pid = Field()
cid = Field()
roles = Field()
- pipelines.py
import pymysql
from itemadapter import ItemAdapter
class XpcPipeline:
def open_spider(self, spider):
print('---开始存入MySQL---')
self.db = pymysql.connect(user='root', password='nzw19940611', database='xpc_2020')
self.cur = self.db.cursor()
def close_spider(self, spider):
print('---存入MySQL结束---')
self.cur.close()
self.db.close()
def process_item(self, item, spider):
# 表名
table_name = item.table_name
keys = list(item.keys())
values = list(item.values())
# 所有字段组成的字符串
key_str = ','.join(["`%s`" % key for key in keys])
# 所有的值组成的字符串
# value_str = ','.join(['"%s"' % value for value in values])
value_str = ','.join(["%s"] * len(values))
# 如果key冲突,则用新数据更新旧数据
update_str = ','.join(["`{}`=%s".format(key) for key in keys])
# sql
sql = 'insert into `{}` ({}) values ({}) on duplicate key update {}'.format(
table_name,
key_str,
value_str,
update_str
)
# 执行sql
self.cur.execute(sql, values*2)
self.db.commit()
print(f'---插入成功:{table_name}---')
return item
- settings.py
BOT_NAME = 'xpc'
SPIDER_MODULES = ['xpc.spiders']
NEWSPIDER_MODULE = 'xpc.spiders'
USER_AGENT = 'xpc (+http://www.yourdomain.com)'
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
ITEM_PIPELINES = {
'xpc.pipelines.XpcPipeline': 300,
}
- start.py
import scrapy.cmdline
scrapy.cmdline.execute('scrapy crawl xpc_spider --nolog'.split())
# scrapy.cmdline.execute('scrapy crawl xpc_spider'.split())