爬虫
爬虫
介绍
# 爬虫流程
模拟发送http请求 ---> 解析数据(清洗数据) ---> 入库
# 百度、谷歌...(大爬虫)
百度搜索:输入关键字 ---> 搜的是百度的数据库 ---> 页面展示 ---> 点击具体内容 ---> 网页跳转
seo优化:主动让百度爬到你
sem:花钱做广告买关键词
# 爬虫协议
哪部分允许爬取,哪部分不允许爬取(https://www.csdn.net/robots.txt)
# python中爬虫相关内容
模拟发送http请求(requests,selenium) ---> 解析数据(清洗数据)(json、bs4...) --->入库
(文件、mysql、redis、Excel、MongoDB)
反爬:
封ip --- 代理池
封账号 --- cookie池
请求头中带特殊校验 --- 相应破解出那些字段
数据加密 --- js解析出加密方式,自行组装数据
html --- css反爬,字体反爬
requests库介绍
# requests模块,基于urllib3封装,方便的发出http请求
# pip install requests
requests发送get请求
- 普通请求
# Plain GET request: fetch a page and print its body.
res = requests.get('https://www.cnblogs.com/xiaoyuanqujing/articles/11805698.html')
print(res.text)  # response body as text

# Search baidu for user-supplied keywords; a realistic User-Agent and Host
# header are required or baidu serves a bot-detection page instead of results.
search = input('请输入要搜索的内容:')
res = requests.get('https://www.baidu.com/s?wd=' + search,
                   headers={
                       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36',
                       'Host': 'www.baidu.com',
                   })
print(res.text)
# Save the result page so it can be opened in a browser.
with open('search.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
- 携带参数
import requests

# `params` is url-encoded and appended to the query string automatically,
# which is cleaner than concatenating it into the URL by hand.
response = requests.get('https://www.sogou.com/web',
                        headers={
                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
                        }, params={'query': '美女'})
print(response.text)
# Save the result page for inspection in a browser.
with open('search.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
# url编码和解码
from urllib.parse import quote, unquote

# quote percent-encodes non-ASCII text for use in URLs:
#   quote('美女') -> '%E7%BE%8E%E5%A5%B3'
# unquote reverses it back to the original text.
res = unquote('%E7%BE%8E%E5%A5%B3')
print(res)

from urllib.parse import urlencode

# urlencode converts a whole dict into a percent-encoded query string.
res = urlencode({'wd': '美女', 'age': 19}, encoding='utf-8')
print(res)
- 携带请求头
# 如果被做了反爬,但是用浏览器可以,一定是模拟的不像
header={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36',
'Cookie':'BIDUPSID=185720F0FEA0DC697147E75D48AFB1D8; PSTM=1593942899; BAIDUID=185720F0FEA0DC69D0675C2EEDB05721:SL=0:NR=10:FG=1; sug=3; ORIGIN=0; bdime=0; sugstore=1; BD_UPN=12314753; __yjs_duid=1_61812ebe639caffca8271e1786971c8b1617936053918; BDUSS=lhDOTR6OWU0UWdmcFBLTDdxRUlqQXJtdFFjajlxfjFhVUpRLTNDNEd0VW51Uk5oSVFBQUFBJCQAAAAAAAAAAAEAAACwPo3XwM~E0Lqiyc-6o9Cjx~gAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACcs7GAnLOxgW; BDUSS_BFESS=lhDOTR6OWU0UWdmcFBLTDdxRUlqQXJtdFFjajlxfjFhVUpRLTNDNEd0VW51Uk5oSVFBQUFBJCQAAAAAAAAAAAEAAACwPo3XwM~E0Lqiyc-6o9Cjx~gAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACcs7GAnLOxgW; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDSFRCVID_BFESS=uQLOJeC62GhrmIcHWL4ru7XUvDmJR3TTH6aoUKcPVTiblG6zzmh3EG0Pbf8g0K4bdMXhogKK0eOTHkuF_2uxOjjg8UtVJeC6EG0Ptf8g0f5; H_BDCLCKID_SF_BFESS=tRk8oK-aJKvbfP0kKno_MtCsqxby26n9-Rb9aJ5y-J7nhMTz5Mn1DT_OQl_fXpQq5m3ion3vQpbZ8h5D34vW-fLRDmct-p5MQ26xKl0MLPbcsU5nBU4VhnkD2fnMBMPj5mOnaIQc3fAKftnOM46JehL3346-35543bRTLnLy5KJYMDFCjTA-D6QyeUbQa4JWHD6QB4TaajrjDnCrBPjUXUI82h5y05OkbmteaU3PJMnhMUna54ovynKZDnORXx745j5b-bA-Bh3tfKJKbPQ63ML1Db3JqP7M0aQtsCouan3oepvoD-oc3MvByPjdJJQOBKQB0KnGbUQkeq8CQft20b0EeMtjW6LEJJkO_D_atKvDqTrP-trf5DCShUFsWPKJB2Q-XPoO3KJZfqRhyhJIjpk0jn7P-tQiW5cpoMbgylRM8P3y0bb2DUA1y4vpK-ogQgTxoUJ2fnRJEUcGqj5Ah--ebPRiJPQ9Qg-qahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0hD0wjT0-DjcM-Uv05-PXKCJ0X458HJOoDDvFqfbcy4LdjG5NeRvbLnc7-hRu2PKboM5Cbxbmj4Pu3-Aq54RIL5505tnqtMcNb-0xeJrhQfbQ0bjuqP-jW5Ta-qI-HR7JOpkxbfnxy-P0QRPH-Rv92DQMVU52QqcqEIQHQT3m5-5bbN3ht6IHJJIq_I82JIvbfP0k5R35hnjH-UIs-lorB2Q-5KL-3bnKDqTnyhJdjbD0jn7P-f3LWHue-UbdJJjoSqvn0hjxMtDjQNjEhtr3t2TxoUJt5DnJhhkm-4OYW-kebPRiJPQ9QgbWLlQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0M5DK0HPonHjAKDjvP; delPer=0; BD_CK_SAM=1; PSINO=3; BAIDUID_BFESS=185720F0FEA0DC69D0675C2EEDB05721:SL=0:NR=10:FG=1; H_PS_PSSID=34300_34335_34273_31254_34377_33848_34092_34107_34111_26350_34360; 
COOKIE_SESSION=3698_4_8_9_18_23_0_1_7_3_0_11_2964_0_0_0_1628046671_1627963945_1628050426%7C9%23101_72_1627963943%7C9; H_PS_645EC=4707Tvcdepk6pvKnFnabHvwqrLGAFZiyVAOXDTeK8IdAgRrAQD714rlnFSA; BA_HECTOR=0ha5al208l240k2hf91ggk5e30q'}
# Search with the logged-in headers defined above (header carries UA + cookie),
# then save the page so it can be checked in a browser.
res = requests.get('https://www.baidu.com/s?wd=帅哥', headers=header)
print(res.text)
with open('search.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
- 携带cookie
# Request the homepage with (optionally) a logged-in session cookie in the
# headers; with the 'cookie' entry commented out the site sees an anonymous
# visitor, so the check below prints False.
res = requests.get(
    'http://www.aa7a.cn/',
    headers={
        # 'cookie': '<paste a logged-in session cookie value here>',
    })
# The account email only appears in the page when the cookie marks us as logged in.
print('616564099@qq.com ' in res.text)
requests发送post请求
- 自动登录某网站
# Log in by POSTing the same form fields the site's login form submits
# (the 'act': 'act_login' field selects the login action server-side).
res = requests.post('http://www.aa7a.cn/user.php', data={
'username': '616564099@qq.com',
'password': 'lqz123',
'captcha': 'zxv7',
'remember': 1,
'ref': 'http://www.aa7a.cn/',
'act': 'act_login'
})
# print(res.text)
# Grab the cookies the successful login response set.
cookie=res.cookies # CookieJar object
print(cookie)
# Reuse the login cookie on the next request so the session stays authenticated.
res2=requests.get('http://www.aa7a.cn/',cookies=cookie)
# res2=requests.get('http://www.aa7a.cn/')
# True when the page shows the logged-in user's email.
print('616564099@qq.com' in res2.text)
- body体中携带数据
### 6 body体携带数据
# res = requests.post('',data={}) # urlencoded方式
# res = requests.post('',json='json格式字符串') # application/json方式
# res = requests.post('',json='',headers={
# 'content-type': 'application/json;charset=utf-8'
# })
- response属性、编码问题,获取二进制,解析json
## 7 response属性,
# respone=requests.get('http://www.aa7a.cn/')
#
#
# print(respone.text) # 响应体的字符串
# print('----------------------------------')
# print(respone.content) # 响应体的二进制(图片,视频,页面)
# print('----------------------------------')
# print(respone.status_code) # 响应的状态码
# print(respone.headers) # 响应头
# print(respone.cookies) # 返回的cookie
# print(respone.cookies.get_dict()) # cookieJar对象转成字典
# print(respone.cookies.items()) # 相当于字典的items
#
# print(respone.url) # 当次请求地址
# print(respone.history) # 重定向过才有值
#
# print(respone.encoding) # 响应的编码格式
#关闭:response.close()
# from contextlib import closing
# with closing(requests.get('xxx',stream=True)) as response:
# for line in response.iter_content():
# pass
# 8 编码问题,
# 可能会遇到打印respone.text出现乱码,在浏览器页面中看不会出现乱码
# respone=requests.get('http://www.aa7a.cn/')
# # respone.encoding='gbk' # 修改编码方式
# respone.encoding=respone.apparent_encoding # 页面使用的编码方式
# print(respone.text) # 响应体的字符串
# 9 获取二进制,
res=requests.get('http://www.aa7a.cn/data/afficheimg/20201102gophex.png')
# print(res.content)
# with open('致命诱惑.png','wb') as f:
# f.write(res.content)
# with open('致命诱惑.png','wb') as f:
# for line in res.iter_content(1024):
# f.write(line)
# 10 解析json
# import json
# res=requests.get('https://api.luffycity.com/api/v1/course/category/actual/?courseType=actual')
# # print(json.loads(res.text))
#
# print(res.json())
爬取视频
# 爬取视频
#https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=5&start=0
# import re
# res=requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=5&start=0')
#
# # print(res.text)
# # 如果使用bs4,非常简单
#
# video_list=re.findall('<a href="(.*?)" class="vervideo-lilink actplay">',res.text)
# # print(video_list)
# for video in video_list:
# video_url='https://www.pearvideo.com/'+video
# # print(video_url)
# video_id=video.split('_')[-1]
#
# header={
# 'Referer':video_url
# }
#
# res2=requests.get('https://www.pearvideo.com/videoStatus.jsp?contId=%s&mrd=0.5165499193941832'%video_id,headers=header)
#
# video_f_url=res2.json()['videoInfo']['videos']['srcUrl']
# video_real_url=video_f_url.replace(video_f_url.rsplit('/')[-1].split('-')[0], 'cont-%s' % video_id)
# print(video_real_url)
#
# res3=requests.get(video_real_url)
# with open('%s.mp4'%video_id,'wb') as f:
# for line in res3.iter_content(1024):
# f.write(line)
# 分析过程稿
# referer:上一次访问的地址,可以做图片防盗链
# header={
# 'Referer': 'https://www.pearvideo.com/video_1737590'
# }
#
# res=requests.get('https://www.pearvideo.com/videoStatus.jsp?contId=1737590&mrd=0.5165499193941832',headers=header)
# print(res.text)
## 可以播放的视频
# 'https://video.pearvideo.com/mp4/short/20210729/cont-1736870-15732687-hd.mp4'
# ## 不可以播放的视频
# 'https://video.pearvideo.com/mp4/short/20210729/1628062847275-15732687-hd.mp4'
#
#
# 'https://video.pearvideo.com/mp4/short/20210729/ cont-1736870 -15732687-hd.mp4'
# 'https://video.pearvideo.com/mp4/short/20210729/ 1628062847275 -15732687-hd.mp4'
#
# s='https://video.pearvideo.com/mp4/short/20210729/ 1628062847275 -15732687-hd.mp4'
# s.replace(s.rsplit('/')[-1].split('-')[0],'cont-%s'%video_id)
补充
# 长链转短链服务
核心:重定向
requests高级用法
- 补充
1 正向代理和反向代理
正向代理:代理客户端
反向代理:代理服务端(nginx就是反向代理服务器)
2 requests使用的代理:正向代理
- SSL Cert Verification(了解)
## 不验证证书
# import requests
# respone=requests.get('https://www.12306.cn',verify=False) #不验证证书,报警告,返回200
# print(respone.status_code)
#
#
# ## 携带证书
# import requests
# respone=requests.get('https://www.12306.cn',
# cert=('/path/server.crt',
# '/path/key'))
# print(respone.status_code)
- 使用代理
# import requests
# proxies = {
# 'http':'http://117.69.230.132:3256',
# }
# respone=requests.get('https://www.12306.cn',
# proxies=proxies)
#
# print(respone.status_code)
# import requests
# proxies = {
# 'http':'http://117.69.230.132:3256',
# }
# # respone=requests.get('http://127.0.0.1:8000',proxies=proxies)
# respone=requests.get('http://127.0.0.1:8000')
#
# print(respone.text)
# import requests
# proxies = {
# 'http':'http://103.228.245.98:3128',
# }
# respone=requests.get('http://101.133.225.166:8888/',proxies=proxies)
# # respone=requests.get('http://101.133.225.166:8888/')
#
# print(respone.text)
## 如果你有很多代理,每次发请求,随机取一个代理ip,发送,这样我们的ip就不会被封
### 花钱买
### 白嫖
# import requests
# res=requests.get('http://demo.spiderpy.cn/get/').json()['proxy']
# print(res)
#
# proxies = {
# 'https':'https://%s'%res,
# }
# print()
#
# respone=requests.get('http://www.baidu.com',proxies=proxies)
# # respone=requests.get('http://www.baidu.com')
# print(respone.text)
### 借助于第三方,自己搭建(读一读人家源码)
#https://github.com/jhao104/proxy_pool
- 超时时间
# respone=requests.get('https://www.baidu.com',timeout=0.0001)
- 认证(像老款路由器的登录)
# import requests
# from requests.auth import HTTPBasicAuth
# r=requests.get('xxx',auth=HTTPBasicAuth('user','password'))
# print(r.status_code)
- 异常处理
# import requests
# from requests.exceptions import * #可以查看requests.exceptions获取异常类型
#
# try:
# r=requests.get('http://www.baidu.com',timeout=0.00001)
# # except ReadTimeout:
# # print('===:')
# # except ConnectionError: #网络不通
# # print('-----')
# # except Timeout:
# # print('aaaaa')
#
# except Exception:
# print('Error')
- 文件上传
# import requests
# files={'myfile':open('1 自动处理cookie.py','rb')}
# respone=requests.post('http://127.0.0.1:8000/upload_file/',files=files)
# print(respone.text)
抽屉自动点赞
import requests

header = {
    'Cookie': '',  # paste a logged-in dig.chouti.com cookie here or votes are rejected
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'
}
# Vote a single article by id:
# res=requests.post('https://dig.chouti.com/link/vote',data={'linkId':'31857081'},headers=header)
# print(res.text)

# Up-vote everything on the 24h hot list: fetch the article ids from the JSON
# feed, then POST one vote per id.
res = requests.get('https://dig.chouti.com/top/24hr?_=1628136305346', headers=header).json()
for item in res['data']:
    article_id = item['id']  # renamed from `id` to avoid shadowing the builtin
    res = requests.post('https://dig.chouti.com/link/vote', data={'linkId': '%s' % article_id}, headers=header)
    print(res.text)
爬取汽车之家新闻
# pip3 install beautifulsoup4
import requests
from bs4 import BeautifulSoup

# Walk the paginated news listing and print title / summary / image / link
# for every article on each page.
for i in range(1, 100):
    res = requests.get('https://www.autohome.com.cn/news/%s/#liststart' % i)
    # First arg: markup to parse; second: parser name.
    # 'html.parser' ships with bs4; 'lxml' (pip3 install lxml) is faster.
    soup = BeautifulSoup(res.text, 'html.parser')
    # soup = BeautifulSoup(res.text, 'lxml')
    # find_all returns every matching tag.
    ul_list = soup.find_all(name='ul', class_='article')
    for ul in ul_list:
        li_list = ul.find_all(name='li')
        for li in li_list:
            h3 = li.find(name='h3')
            if h3:  # ad/placeholder <li> items have no <h3>; skip them
                title = h3.text  # tag_object.text gives the tag's text content
                desc = li.find(name='p').text
                img_url = li.find(name='img')['src']
                # image URLs are protocol-relative (//...); make them absolute
                if not img_url.startswith('http'):
                    img_url = 'https:' + img_url
                url = 'https:' + li.find(name='a')['href']
                print(url)
                print('''
新闻标题:%s
新闻摘要:%s
新闻图片:%s
新闻地址:%s
''' % (title, desc, img_url, url))
bs4遍历文档树
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story <span>lqz</span></b><span>egon</span></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup=BeautifulSoup(html_doc,'html.parser')
# res=soup.prettify() # 美化
# print(res)
#1、用法
# html=soup.html
# title=soup.html.head.title
# title=soup.title
# print(title)
#2、获取标签的名称 ---> 标签对象.name
# a=soup.body.a
# a=soup.a.name
# print(a)
# print(soup.body.name)
#3、获取标签的属性 ---->标签对象['标签名']
# href=soup.body.a['href']
# attrs=soup.body.a.attrs # 所有属性,---》字典
# href=soup.body.a.attrs['href']
# print(attrs['class'])
# c=soup.p.attrs['class']
# print(c)
#4、获取标签的内容
# res=soup.b.text # 拿到当前标签子子孙所有的text
# res=soup.p.text
# res=soup.p.string # 当前标签有且只有一个文本内容才能拿出来
# res=soup.b.string # 当前标签有且只有一个文本内容才能拿出来
# res=soup.p.strings # 把子子孙放到生成器中
#
# print(list(res))
#5、嵌套选择
# res=soup.html.body.p
# print(type(res)) # bs4.element.Tag
from bs4.element import Tag
####了解
#6、子节点、子孙节点
# print(soup.p.contents) #p下所有子节点,放到列表中
# print(soup.p.children) #得到一个迭代器,包含p下所有子节点
# for i,child in enumerate(soup.p.children):
# print(i,child)
# print(soup.p.descendants) #获取子孙节点,p下所有的标签都会选择出来
# for i,child in enumerate(soup.p.descendants):
# print(i,child)
#7、父节点、祖先节点
# print(soup.a.parent) #获取a标签的父节点
# print(soup.body.parent)
# print(soup.a.parents) #找到a标签所有的祖先节点,父亲的父亲,父亲的父亲的父亲...
# print(list(soup.a.parents))
# print(len(list(soup.a.parents)))
#8、兄弟节点
# print(soup.a.next_sibling) #下一个兄弟
# print(soup.a.previous_sibling) #上一个兄弟
#
# print(list(soup.a.next_siblings)) #下面的兄弟们=>生成器对象
# print(list(soup.a.previous_siblings)) #上面的兄弟们=>生成器对象
bs4搜索文档树
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body id='body'>
<p class="title"><b>The Dormouse's story <span>lqz</span></b><span>egon</span></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup=BeautifulSoup(html_doc,'html.parser')
# 搜索文档树 find find_all
# 五种过滤器: 字符串、正则表达式、列表、True、方法
##### 字符串
# res=soup.find(name='body')
# res=soup.find(name='p',class_='story')
# 查找id为link2的标签
# res=soup.find(id='link2',name='a',class_='sister',href='http://example.com/lacie')
# res=soup.find(href='http://example.com/lacie')
# print(res)
# res=soup.find(attrs={'class':['sister']})
# print(res)
#### 正则表达式
import re
# res=soup.find_all(name=re.compile('^b')) #找出b开头的标签,结果有body和b标签
# res=soup.find(name=re.compile('^b'))
# res=soup.find_all(class_=re.compile('^s'))
# res=soup.find_all(href=re.compile('^http'))
# res=soup.find_all(id=re.compile('^l'))
# print(res)
####列表、
# res=soup.find_all(name=['body','b'])
# res=soup.find_all(id=['link1','link2'])
# res=soup.find_all(attrs={'id':['link1','link2']})
#
# print(res)
# True、
# links=soup.find_all(href=True)
# print(links)
# res=soup.find_all(name=True)
# res=soup.find_all(id=True)
# print(res)
#方法
# def has_class_but_no_id(tag):
# return tag.has_attr('class') and not tag.has_attr('id')
#
# print(len(soup.find_all(name=has_class_but_no_id)))
# Grab every image on the page: <img> tags carry their URL in the `src`
# attribute, so filter on src=True. (The original filtered on href=True,
# which <img> never has, so it always returned an empty list.)
soup.find_all(name='img', src=True)
## 建议 遍历文档树和搜索文档树混用
# soup.body.div.find
### 其他参数 find,find_all
#limit
# soup.find()
# res=soup.find_all(name='a',href=True,limit=2) # 限制获取的条数
# print(res)
# recursive 是否递归查找
# res=soup.find_all(name='a',recursive=False)
# res=soup.find_all(name='html',recursive=False)
# print(res)
css选择器(与xpath是通用的)
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title">
<b>The Dormouse's story <p>asdfasdf</p></b>
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<div class='panel-1'>
<ul class='list' id='list-1'>
<li class='element'>Foo</li>
<li class='element'>Bar</li>
<li class='element'>Jay</li>
</ul>
<ul class='list list-small' id='list-2'>
<li class='element'><h1 class='yyyy'>Foo</h1></li>
<li class='element xxx'>Bar</li>
<li class='element'>Jay</li>
</ul>
</div>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup=BeautifulSoup(html_doc,'html.parser')
'''
#id
.类名
标签
标签>标签
标签 标签
'''
# res=soup.p.select('.sister') # 使用css选择器
# res=soup.p.select('#link1') # 使用css选择器
# res=soup.select('body>p') # 使用css选择器 body的子标签p
res=soup.select('body p') # 使用css选择器 body的子子孙孙标签p
print(len(res))
### css选择器是通用的:bs4,lxml解析也可以是css选择器
##css选择器不会写怎么办?
'#maincontent > div:nth-child(3) > table > tbody > tr:nth-child(13) > td:nth-child(3)'
## xpath选择
'//*[@id="maincontent"]/div[2]/table/tbody/tr[18]/td[2]'
selenium使用
# 如果使用requests模块,发送请求获取的数据不全,它不能执行js
# selenium:可以使用代码控制模拟人操作浏览器
## 操作某个浏览器,就需要有浏览器驱动
# http://npm.taobao.org/mirrors/chromedriver/ 谷歌驱动的淘宝镜像站
# 谷歌浏览器版本要跟驱动版本对应
## 92.0.4515.131 下载相应版本驱动,放到项目代码中
# pip3 install selenium
# from selenium import webdriver
# import time
# # 打开一个谷歌浏览器
# bro=webdriver.Chrome(executable_path='chromedriver.exe')
#
# #地址栏中输入百度
# bro.get('https://www.cnblogs.com/')
#
# time.sleep(2)
#
# print(bro.page_source) #当前页面的html内容
#
# bro.close() # 关闭浏览器
# import requests
#
# res=requests.get('https://dig.chouti.com/',headers={
# 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
# })
# print(res.text)
基本使用
from selenium import webdriver
import time
# Browser object: launches Chrome through the chromedriver binary next to the script.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.implicitly_wait(10) # implicit wait: element lookups retry for up to 10s before failing
bro.get('https://www.baidu.com/')
# sub_button=bro.find_element_by_css_selector('#s-top-loginbtn')
sub_button = bro.find_element_by_id('s-top-loginbtn') # prefer find-by-id when the element has one
# Click the login button.
sub_button.click()
# Switch the dialog to username/password login.
user_btn = bro.find_element_by_xpath('//*[@id="TANGRAM__PSP_11__footerULoginBtn"]')
# user_btn=bro.find_element_by_id('TANGRAM__PSP_11__footerULoginBtn')
user_btn.click()
username = bro.find_element_by_id('TANGRAM__PSP_11__userName')
password = bro.find_element_by_id('TANGRAM__PSP_11__password')
# Type into the input boxes.
username.send_keys('6666666@qq.com')
password.send_keys('lqz12345')
sumbit_btn = bro.find_element_by_id('TANGRAM__PSP_11__submit')
time.sleep(3)
sumbit_btn.click()
time.sleep(3)
bro.close()
无头浏览器
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Configure a "headless" Chrome: full browser engine, no visible window.
chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000') # browser resolution
chrome_options.add_argument('--disable-gpu') # workaround recommended by the Chrome docs to avoid a rendering bug
chrome_options.add_argument('--hide-scrollbars') # hide scrollbars on unusual pages
chrome_options.add_argument('blink-settings=imagesEnabled=false') # skip image loading for speed
chrome_options.add_argument('--headless') # no visible UI; required on Linux hosts without a display
driver=webdriver.Chrome(executable_path='chromedriver.exe',chrome_options=chrome_options)
driver.get('https://www.baidu.com')
print(driver.page_source)
driver.close()
获取元素位置,属性,大小
from selenium import webdriver
import time
driver=webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://kyfw.12306.cn/otn/resources/login.html')
driver.implicitly_wait(10)
user_login=driver.find_element_by_css_selector('.login-hd-account>a')
user_login.click()
time.sleep(2)
img=driver.find_element_by_id('J-loginImg')
print(img)
print(img.id) # selenium's internal element id, not the HTML id attribute -- ignore it
print(img.tag_name) # tag name, e.g. 'img'
print('-----')
print(img.location) # top-left coordinates of the element on the page
print(img.size) # element width/height
# Read HTML attributes off the element with get_attribute.
# print(img.get_attribute('src'))
print(img.get_attribute('class'))
driver.close()
等待元素被加载
from selenium import webdriver
# Two ways to wait for elements to be loaded:
# explicit waits: a per-element condition (see the note string below)
# implicit wait: a single line that applies to every element lookup
driver=webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')
'''
# 两种等待方式
# 显示等待(忽略掉)
wait=WebDriverWait(driver,10)
wait.until(EC.presence_of_element_located((By.ID,'content_left')))
contents=browser.find_element(By.CSS_SELECTOR,'#content_left')
# 隐式等待:
-driver.implicitly_wait(10)
-driver.find_element_by_css_selector()
-只需要写一句话,等待所有要获取的标签
'''
driver.implicitly_wait(10)
print(driver.page_source)
# Every later lookup now retries until the element appears, up to 10s.
driver.close()
元素操作
from selenium import webdriver
import time
driver=webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')
driver.implicitly_wait(10)
# Click / clear / type operations on input elements.
input_search=driver.find_element_by_id('kw')
input_search.send_keys('美女') # type text into the box
time.sleep(3)
input_search.clear() # clear the box
time.sleep(2)
input_search.send_keys('性感美女')
time.sleep(2)
btn=driver.find_element_by_id('su')
btn.click() # click the search button
time.sleep(10)
driver.close()
执行js
from selenium import webdriver
import time
driver=webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('http://127.0.0.1:8000/')
driver.implicitly_wait(10)
driver.execute_script("name='egon';") # 这里面写js代码
driver.execute_script("alert(name)") # 这里面写js代码
time.sleep(5)
# driver.close()
切换选项卡
import time
from selenium import webdriver
browser=webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')
print(browser.window_handles) #获取所有的选项卡
# browser.switch_to_window(browser.window_handles[1])
# browser.switch_to_window(browser.window_handles[1])
browser.switch_to.window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(5)
# browser.switch_to_window(browser.window_handles[0])
browser.switch_to.window(browser.window_handles[0])
browser.get('https://www.sina.com.cn')
browser.close()
模拟前进后退
import time
from selenium import webdriver
browser=webdriver.Chrome(executable_path='chromedriver.exe')
browser.get('https://www.baidu.com')
browser.get('https://www.taobao.com')
browser.get('http://www.sina.com.cn/')
browser.back()
time.sleep(3)
browser.forward()
browser.close()
异常处理
from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException

browser = webdriver.Chrome()
try:
    browser.get('http://www.baidu.com')
except Exception as e:
    # selenium raises the specific exception types imported above;
    # a broad catch keeps this demo simple.
    print(e)
finally:
    # Always release the browser process, even when the request failed.
    browser.close()
selenium登录cnblogs获取cookie
#selenium登录cnblogs获取cookie
from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException
import time
import json
browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.implicitly_wait(10)
#### 登录过程
# try:
# browser.get('http://www.cnblogs.com')
# submit_btn=browser.find_element_by_link_text('登录') # a标签的内容
# submit_btn.click()
#
# username=browser.find_element_by_id('mat-input-0')
# password=browser.find_element_by_id('mat-input-1')
# username.send_keys('616564099@qq.com')
# password.send_keys('1111')
# input('等会')
# sub_btn=browser.find_element_by_css_selector('body > app-root > mat-sidenav-container > mat-sidenav-content > div > div > app-sign-in > app-content-container > div > div > div > form > div > button > span.mat-button-wrapper')
# sub_btn.click()
#
# # 人工参与,滑动
# input('等会')
#
# # 获取到登录后的cookie
# print(browser.get_cookies())
#
# with open('cookie.json','w') as f:
# json.dump(browser.get_cookies(),f)
#
#
# except Exception as e:
# print(e)
# finally:
# browser.close()
### 不登录了,把cookie写入浏览器
# browser.get('http://www.cnblogs.com')
# with open('cookie.json','r') as f:
# cookie=json.load(f)
# time.sleep(5)
# for item in cookie: # 设置cookie必须用字典,cookie的json文件是列表,所以用循环往里放
# browser.add_cookie(item)
#
#
#
# browser.refresh() # 刷新页面
#
# time.sleep(5)
#
# browser.close()
抽屉半自动点赞
from selenium import webdriver
import json
import time
#### 登录过程
# bro=webdriver.Chrome(executable_path='chromedriver.exe')
# bro.implicitly_wait(10)
# bro.get('https://dig.chouti.com/')
# try:
# sub_btn=bro.find_element_by_id('login_btn')
# print(sub_btn)
#
# # sub_btn.click() # 报错
# bro.execute_script('arguments[0].click();',sub_btn)
#
# # username=bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-body > div.form-item.login-item.clearfix.phone-item.mt24 > div.input-item.input-item-short.left.clearfix > input')
# username=bro.find_element_by_css_selector('div.input-item>input.login-phone')
# username.send_keys('18953675221')
# # password=bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div.form-item.login-item.clearfix.mt24 > div')
# password = bro.find_element_by_css_selector('div.input-item>input.pwd-password-input')
# password.send_keys('lqz123')
#
# time.sleep(3)
# btn=bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div:nth-child(4) > button')
#
# btn.click()
#
# input('等')
#
# with open('chouti.json','w') as f:
# json.dump(bro.get_cookies(),f)
#
#
#
#
# finally:
# bro.close()
import requests

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.implicitly_wait(10)
bro.get('https://dig.chouti.com/')
# Scroll to the bottom of the page so lazily-loaded articles render.
bro.execute_script('window.scrollTo(0, document.body.scrollHeight);')
# bro.find_elements_by_css_selector('.link-item')

# Rebuild a requests-compatible cookie dict from the cookie list selenium
# saved to chouti.json during the (commented-out) login run above.
cookie = {}
with open('chouti.json', 'r') as f:
    res = json.load(f)
for item in res:
    cookie[item['name']] = item['value']
print(cookie)  # cookie dict usable by requests

div = bro.find_element_by_class_name('link-con')
time.sleep(2)
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
div_list = div.find_elements_by_class_name('link-item')
for div in div_list:
    article_id = div.get_attribute('data-id')
    print(article_id)
    # Vote via requests (faster than clicking), reusing the browser's login cookies.
    res = requests.post('https://dig.chouti.com/link/vote', data={'linkId': article_id}, cookies=cookie, headers=header)
    print(res.text)
bro.close()
打码平台使用
# 人工破解
# 图像识别模块---》数字,字母组合
# 验证码破解平台---》云打码,超级鹰
-给它一张图片---》结果返回 (收费的)
#!/usr/bin/env python
# coding:utf-8
import requests
from hashlib import md5
class Chaojiying_Client(object):
    """Minimal client for the Chaojiying captcha-solving HTTP API."""

    def __init__(self, username, password, soft_id):
        self.username = username
        # The API authenticates with the MD5 hex digest of the password,
        # never the plain text.
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        # Credential fields sent with every API call.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Submit a captcha image for solving and return the API's JSON reply.

        im: raw image bytes
        codetype: puzzle type id, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files,
                          headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly-solved captcha (for refund); im_id comes from PostPic's reply."""
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()
if __name__ == '__main__':
    # Replace with your own account and the software ID generated in the user center.
    chaojiying = Chaojiying_Client('306334678', 'lqz12345', '903641')
    im = open('a.jpg', 'rb').read()  # local image path; on Windows you may need //
    print(chaojiying.PostPic(im, 1902))  # 1902 = captcha type, see the site's price table
xpath使用
1 一门在html中查找数据的语言
2 记住的语法:
/ 取当前路径下的xx
// 取所有路径下的xx
. 当前路径
.. 上一层
@ 取属性
4 lxml解析模块提供的xpath
doc='''
<html>
<head>
<base href='http://example.com/' />
<title>Example website</title>
</head>
<body>
<div id='images'>
<a href='image1.html' name='sss'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
<a href='image2.html' name='lqz'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
<a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
<a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
<a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
<a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
</div>
</body>
</html>
'''
from lxml import etree
# 传入要解析的内容
html=etree.HTML(doc)
# res=html.xpath('//body')
# print(res)
# 1 所有节点
# a=html.xpath('//*')
# 2 指定节点(结果为列表)
# a=html.xpath('//head')
# 3 子节点,子孙节点
# a=html.xpath('//div/a')
# a=html.xpath('//body//a') #无数据
# a=html.xpath('//body//a')
# 4 父节点
# a=html.xpath('//body//a[@href="image1.html"]/..')
# a=html.xpath('//body//a')
# a=html.xpath('//body//a[@href="image1.html"]')
# a=html.xpath('//body//a[1]/..')
# 也可以这样
# a=html.xpath('//body//a[1]/parent::*')
# a=html.xpath('//body//a[1]/parent::p')
# 5 属性匹配
# a=html.xpath('//a[@href="image1.html"]')
# a=html.xpath('//a[@name="sss"]')
# 6 文本获取 text()
# a=html.xpath('//a[@href="image1.html"]/text()')
# a=html.xpath('//a/text()')
# 7 属性获取
# a=html.xpath('//a/@href')
# a=html.xpath('//a[1]/@name')
# # 注意从1 开始取(不是从0)
# a=html.xpath('//body//a[2]/@href')
# 8 属性多值匹配
# a 标签有多个class类,直接匹配就不可以了,需要用contains
# a=html.xpath('//a[@class="li"]')
# a=html.xpath('//a[contains(@class,"li")]')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 9 多属性匹配
# a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
# a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 10 按序选择
# a=html.xpath('//a[2]/text()')
# a=html.xpath('//a[2]/@href')
# a=html.xpath('//a[2]/@name')
# 取最后一个
# a=html.xpath('//a[last()]/@href')
# 位置小于3的
# a=html.xpath('//a[position()<3]/@href')
# 倒数第二个
# a=html.xpath('//a[last()-2]/@href')
# 11 节点轴选择
# ancestor:祖先节点
# 使用了* 获取所有祖先节点
# a=html.xpath('//a/ancestor::*')
# # 获取祖先节点中的div
# a=html.xpath('//a/ancestor::div')
# attribute:属性值
# a=html.xpath('//a[1]/attribute::*')
# child:直接子节点
# a=html.xpath('//a[1]/child::*')
# a=html.xpath('//a[1]/child::img/@src')
# descendant:所有子孙节点
# a=html.xpath('//a[6]/descendant::*')
# following:当前节点之后所有节点
# a=html.xpath('//a[1]/following::*')
# a=html.xpath('//a[1]/following::*[1]/@href')
# following-sibling:当前节点之后同级节点
# a=html.xpath('//a[1]/following-sibling::*')
# a=html.xpath('//a[1]/following-sibling::a')
# a=html.xpath('//a[1]/following-sibling::*[2]/text()')
# a=html.xpath('//a[1]/following-sibling::*[2]/@href')
# Every example query above is commented out, so `a` would be undefined here
# and this line raised NameError. Bind a default demo query so it always works;
# uncomment any example above to try it instead.
a = html.xpath('//a/@href')
print(a)
自动登录12306
from selenium import webdriver
import base64
from PIL import Image
import time
from chaojiying import Chaojiying_Client
from selenium.webdriver import ActionChains
# 不让程序检测出是用驱动控制
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument("--disable-blink-features=AutomationControlled")
bro=webdriver.Chrome(executable_path='./chromedriver.exe',chrome_options=options)
bro.get('https://kyfw.12306.cn/otn/resources/login.html')
bro.implicitly_wait(10)
# 把窗口设置全屏
bro.maximize_window()
try:
username_login_btn=bro.find_element_by_css_selector('.login-hd-account>a')
username_login_btn.click()
username=bro.find_element_by_id('J-userName')
password=bro.find_element_by_id('J-password')
login_btn=bro.find_element_by_id('J-login')
username.send_keys('liuqingzheng')
password.send_keys('lqz12345')
img_code=bro.find_element_by_id('J-loginImg')
print(img_code.size)
print(img_code.location)
# 获取验证码图片的两种方案
# 方案一:整体截图,根据位置抠出验证码图片
# bro.save_screenshot('main.png') # 对整个页面进行截图,main.png
#
# location=img_code.location
# size=img_code.size
# print(location)
# print(size)
# #验证码的坐标
# img_tu = (int(location['x']), int(location['y']), int(location['x'] + size['width']), int(location['y'] + size['height']))
# #使用pillow打开截图
# img=Image.open('./main.png')
# #从截图中按照位置扣除验证码
# code_img=img.crop(img_tu)
# # 把扣出来的图,保存到本地
# code_img.save('./code2.png')
# 方案二:把图片的base64编码转成图片保存到本地
img_base64=img_code.get_attribute('src')
img_base64_real=img_base64.split(',')[-1]
img_1=base64.b64decode(img_base64_real)
with open('code.jpg','wb') as f:
f.write(img_1)
# 调用超级鹰,完成验证码破解
# 调用超级鹰识别
chaojiying = Chaojiying_Client('306334678', 'lqz12345', '903641') # 用户中心>>软件ID 生成一个替换 96001
im = open('code.jpg', 'rb').read() # 本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
res=chaojiying.PostPic(im, 9004) # 1902 验证码类型 官方网站>>价格体系 3.4+版 print 后要加()
# 123,155|42,135|11,77---->[[123,155],[42,135],[11,77]]
print(res)
result=res['pic_str']
# The solver returns coordinates as '123,155|42,135|11,77' (or a single 'x,y'
# pair); convert to [[123,155],[42,135],[11,77]] for the ActionChains clicks.
# split('|') on a pipe-free string yields the whole string as one element, so
# one loop covers both the multi-point and single-point cases.
all_list = []
for pair in result.split('|'):
    xy = pair.split(',')
    all_list.append([int(xy[0]), int(xy[1])])
print(all_list)
### 在页面中点击破解的图案
#点击 [[123,155],[42,135],[11,77]]
for item in all_list:
ActionChains(bro).move_to_element_with_offset(img_code,item[0],item[1]).click().perform()
time.sleep(1)
time.sleep(5)
login_btn.click()
time.sleep(1)
# 滑动滑块
span=bro.find_element_by_id('nc_1_n1z')
ActionChains(bro).drag_and_drop_by_offset(span, 300, 0).perform()
time.sleep(30)
print(bro.get_cookies())
except Exception as e:
print(e)
finally:
bro.close()
bro.quit() # 关闭整个浏览器
愿君前程似锦,归来仍是少年