爬虫
爬虫
介绍
# 爬虫流程
模拟发送http请求 ---> 解析数据(清洗数据) ---> 入库
# 百度、谷歌...(大爬虫)
百度搜索:输入关键字 ---> 搜的是百度的数据库 ---> 页面展示 ---> 点击具体内容 ---> 网页跳转
seo优化:主动让百度爬到你
sem:花钱做广告买关键词
# 爬虫协议
哪部分允许爬取,哪部分不允许爬取(https://www.csdn.net/robots.txt)
# python中爬虫相关内容
模拟发送http请求(requests,selenium) ---> 解析数据(清洗数据)(json、bs4...) --->入库
(文件、mysql、redis、Excel、MongoDB)
反爬:
封ip --- 代理池
封账号 --- cookie池
请求头中带特殊校验 --- 相应破解出那些字段
数据加密 --- js解析出加密方式,自行组装数据
html --- css反爬,字体反爬
requests库介绍
# requests模块,基于urllib3封装,方便的发出http请求
# pip install requests
requests发送get请求
- 普通请求
# Plain GET request: fetch a page and print its body.
res = requests.get('https://www.cnblogs.com/xiaoyuanqujing/articles/11805698.html')
print(res.text)  # response body as text

# Search baidu for user-supplied keywords; a realistic User-Agent and Host
# header are required or baidu serves a bot-detection page instead of results.
search = input('请输入要搜索的内容:')
res = requests.get('https://www.baidu.com/s?wd=' + search,
                   headers={
                       'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36',
                       'Host': 'www.baidu.com',
                   })
print(res.text)
# Save the result page so it can be opened in a browser.
with open('search.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
- 携带参数
import requests

# `params` is url-encoded and appended to the query string automatically,
# which is cleaner than concatenating it into the URL by hand.
response = requests.get('https://www.sogou.com/web',
                        headers={
                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36',
                        }, params={'query': '美女'})
print(response.text)
# Save the result page for inspection in a browser.
with open('search.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
# url编码和解码
from urllib.parse import quote, unquote

# quote percent-encodes non-ASCII text for use in URLs:
#   quote('美女') -> '%E7%BE%8E%E5%A5%B3'
# unquote reverses it back to the original text.
res = unquote('%E7%BE%8E%E5%A5%B3')
print(res)

from urllib.parse import urlencode

# urlencode converts a whole dict into a percent-encoded query string.
res = urlencode({'wd': '美女', 'age': 19}, encoding='utf-8')
print(res)
- 携带请求头
# 如果被做了反爬,但是用浏览器可以,一定是模拟的不像
header={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36',
'Cookie':'BIDUPSID=185720F0FEA0DC697147E75D48AFB1D8; PSTM=1593942899; BAIDUID=185720F0FEA0DC69D0675C2EEDB05721:SL=0:NR=10:FG=1; sug=3; ORIGIN=0; bdime=0; sugstore=1; BD_UPN=12314753; __yjs_duid=1_61812ebe639caffca8271e1786971c8b1617936053918; BDUSS=lhDOTR6OWU0UWdmcFBLTDdxRUlqQXJtdFFjajlxfjFhVUpRLTNDNEd0VW51Uk5oSVFBQUFBJCQAAAAAAAAAAAEAAACwPo3XwM~E0Lqiyc-6o9Cjx~gAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACcs7GAnLOxgW; BDUSS_BFESS=lhDOTR6OWU0UWdmcFBLTDdxRUlqQXJtdFFjajlxfjFhVUpRLTNDNEd0VW51Uk5oSVFBQUFBJCQAAAAAAAAAAAEAAACwPo3XwM~E0Lqiyc-6o9Cjx~gAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACcs7GAnLOxgW; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDSFRCVID_BFESS=uQLOJeC62GhrmIcHWL4ru7XUvDmJR3TTH6aoUKcPVTiblG6zzmh3EG0Pbf8g0K4bdMXhogKK0eOTHkuF_2uxOjjg8UtVJeC6EG0Ptf8g0f5; H_BDCLCKID_SF_BFESS=tRk8oK-aJKvbfP0kKno_MtCsqxby26n9-Rb9aJ5y-J7nhMTz5Mn1DT_OQl_fXpQq5m3ion3vQpbZ8h5D34vW-fLRDmct-p5MQ26xKl0MLPbcsU5nBU4VhnkD2fnMBMPj5mOnaIQc3fAKftnOM46JehL3346-35543bRTLnLy5KJYMDFCjTA-D6QyeUbQa4JWHD6QB4TaajrjDnCrBPjUXUI82h5y05OkbmteaU3PJMnhMUna54ovynKZDnORXx745j5b-bA-Bh3tfKJKbPQ63ML1Db3JqP7M0aQtsCouan3oepvoD-oc3MvByPjdJJQOBKQB0KnGbUQkeq8CQft20b0EeMtjW6LEJJkO_D_atKvDqTrP-trf5DCShUFsWPKJB2Q-XPoO3KJZfqRhyhJIjpk0jn7P-tQiW5cpoMbgylRM8P3y0bb2DUA1y4vpK-ogQgTxoUJ2fnRJEUcGqj5Ah--ebPRiJPQ9Qg-qahQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0hD0wjT0-DjcM-Uv05-PXKCJ0X458HJOoDDvFqfbcy4LdjG5NeRvbLnc7-hRu2PKboM5Cbxbmj4Pu3-Aq54RIL5505tnqtMcNb-0xeJrhQfbQ0bjuqP-jW5Ta-qI-HR7JOpkxbfnxy-P0QRPH-Rv92DQMVU52QqcqEIQHQT3m5-5bbN3ht6IHJJIq_I82JIvbfP0k5R35hnjH-UIs-lorB2Q-5KL-3bnKDqTnyhJdjbD0jn7P-f3LWHue-UbdJJjoSqvn0hjxMtDjQNjEhtr3t2TxoUJt5DnJhhkm-4OYW-kebPRiJPQ9QgbWLlQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0M5DK0HPonHjAKDjvP; delPer=0; BD_CK_SAM=1; PSINO=3; BAIDUID_BFESS=185720F0FEA0DC69D0675C2EEDB05721:SL=0:NR=10:FG=1; H_PS_PSSID=34300_34335_34273_31254_34377_33848_34092_34107_34111_26350_34360; 
COOKIE_SESSION=3698_4_8_9_18_23_0_1_7_3_0_11_2964_0_0_0_1628046671_1627963945_1628050426%7C9%23101_72_1627963943%7C9; H_PS_645EC=4707Tvcdepk6pvKnFnabHvwqrLGAFZiyVAOXDTeK8IdAgRrAQD714rlnFSA; BA_HECTOR=0ha5al208l240k2hf91ggk5e30q'}
# Search with the logged-in headers defined above (header carries UA + cookie),
# then save the page so it can be checked in a browser.
res = requests.get('https://www.baidu.com/s?wd=帅哥', headers=header)
print(res.text)
with open('search.html', 'w', encoding='utf-8') as f:
    f.write(res.text)
- 携带cookie
# Request the homepage with (optionally) a logged-in session cookie in the
# headers; with the 'cookie' entry commented out the site sees an anonymous
# visitor, so the check below prints False.
res = requests.get(
    'http://www.aa7a.cn/',
    headers={
        # 'cookie': '<paste a logged-in session cookie value here>',
    })
# The account email only appears in the page when the cookie marks us as logged in.
print('616564099@qq.com ' in res.text)
requests发送post请求
- 自动登录某网站
# Log in by POSTing the same form fields the site's login form submits
# (the 'act': 'act_login' field selects the login action server-side).
res = requests.post('http://www.aa7a.cn/user.php', data={
'username': '616564099@qq.com',
'password': 'lqz123',
'captcha': 'zxv7',
'remember': 1,
'ref': 'http://www.aa7a.cn/',
'act': 'act_login'
})
# print(res.text)
# Grab the cookies the successful login response set.
cookie=res.cookies # CookieJar object
print(cookie)
# Reuse the login cookie on the next request so the session stays authenticated.
res2=requests.get('http://www.aa7a.cn/',cookies=cookie)
# res2=requests.get('http://www.aa7a.cn/')
# True when the page shows the logged-in user's email.
print('616564099@qq.com' in res2.text)
- body体中携带数据
### 6 body体携带数据
# res = requests.post('',data={}) # urlencoded方式
# res = requests.post('',json='json格式字符串') # application/json方式
# res = requests.post('',json='',headers={
# 'content-type': 'application/json;charset=utf-8'
# })
- response属性、编码问题,获取二进制,解析json
## 7 response属性,
# respone=requests.get('http://www.aa7a.cn/')
#
#
# print(respone.text) # 响应体的字符串
# print('----------------------------------')
# print(respone.content) # 响应体的二进制(图片,视频,页面)
# print('----------------------------------')
# print(respone.status_code) # 响应的状态码
# print(respone.headers) # 响应头
# print(respone.cookies) # 返回的cookie
# print(respone.cookies.get_dict()) # cookieJar对象转成字典
# print(respone.cookies.items()) # 相当于字典的items
#
# print(respone.url) # 当次请求地址
# print(respone.history) # 重定向过才有值
#
# print(respone.encoding) # 响应的编码格式
#关闭:response.close()
# from contextlib import closing
# with closing(requests.get('xxx',stream=True)) as response:
# for line in response.iter_content():
# pass
# 8 编码问题,
# 可能会遇到打印respone.text出现乱码,在浏览器页面中看不会出现乱码
# respone=requests.get('http://www.aa7a.cn/')
# # respone.encoding='gbk' # 修改编码方式
# respone.encoding=respone.apparent_encoding # 页面使用的编码方式
# print(respone.text) # 响应体的字符串
# 9 获取二进制,
res=requests.get('http://www.aa7a.cn/data/afficheimg/20201102gophex.png')
# print(res.content)
# with open('致命诱惑.png','wb') as f:
# f.write(res.content)
# with open('致命诱惑.png','wb') as f:
# for line in res.iter_content(1024):
# f.write(line)
# 10 解析json
# import json
# res=requests.get('https://api.luffycity.com/api/v1/course/category/actual/?courseType=actual')
# # print(json.loads(res.text))
#
# print(res.json())
爬取视频
# 爬取视频
#https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=5&start=0
# import re
# res=requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=5&start=0')
#
# # print(res.text)
# # 如果使用bs4,非常简单
#
# video_list=re.findall('<a href="(.*?)" class="vervideo-lilink actplay">',res.text)
# # print(video_list)
# for video in video_list:
# video_url='https://www.pearvideo.com/'+video
# # print(video_url)
# video_id=video.split('_')[-1]
#
# header={
# 'Referer':video_url
# }
#
# res2=requests.get('https://www.pearvideo.com/videoStatus.jsp?contId=%s&mrd=0.5165499193941832'%video_id,headers=header)
#
# video_f_url=res2.json()['videoInfo']['videos']['srcUrl']
# video_real_url=video_f_url.replace(video_f_url.rsplit('/')[-1].split('-')[0], 'cont-%s' % video_id)
# print(video_real_url)
#
# res3=requests.get(video_real_url)
# with open('%s.mp4'%video_id,'wb') as f:
# for line in res3.iter_content(1024):
# f.write(line)
# 分析过程稿
# referer:上一次访问的地址,可以做图片防盗链
# header={
# 'Referer': 'https://www.pearvideo.com/video_1737590'
# }
#
# res=requests.get('https://www.pearvideo.com/videoStatus.jsp?contId=1737590&mrd=0.5165499193941832',headers=header)
# print(res.text)
## 可以播放的视频
# 'https://video.pearvideo.com/mp4/short/20210729/cont-1736870-15732687-hd.mp4'
# ## 不可以播放的视频
# 'https://video.pearvideo.com/mp4/short/20210729/1628062847275-15732687-hd.mp4'
#
#
# 'https://video.pearvideo.com/mp4/short/20210729/ cont-1736870 -15732687-hd.mp4'
# 'https://video.pearvideo.com/mp4/short/20210729/ 1628062847275 -15732687-hd.mp4'
#
# s='https://video.pearvideo.com/mp4/short/20210729/ 1628062847275 -15732687-hd.mp4'
# s.replace(s.rsplit('/')[-1].split('-')[0],'cont-%s'%video_id)
补充
# 长链转短链服务
核心:重定向
requests高级用法
- 补充
1 正向代理和反向代理
正向代理:代理客户端
反向代理:代理服务端(nginx就是反向代理服务器)
2 requests使用的代理:正向代理
- SSL Cert Verification(了解)
## 不验证证书
# import requests
# respone=requests.get('https://www.12306.cn',verify=False) #不验证证书,报警告,返回200
# print(respone.status_code)
#
#
# ## 携带证书
# import requests
# respone=requests.get('https://www.12306.cn',
# cert=('/path/server.crt',
# '/path/key'))
# print(respone.status_code)
- 使用代理
# import requests
# proxies = {
# 'http':'http://117.69.230.132:3256',
# }
# respone=requests.get('https://www.12306.cn',
# proxies=proxies)
#
# print(respone.status_code)
# import requests
# proxies = {
# 'http':'http://117.69.230.132:3256',
# }
# # respone=requests.get('http://127.0.0.1:8000',proxies=proxies)
# respone=requests.get('http://127.0.0.1:8000')
#
# print(respone.text)
# import requests
# proxies = {
# 'http':'http://103.228.245.98:3128',
# }
# respone=requests.get('http://101.133.225.166:8888/',proxies=proxies)
# # respone=requests.get('http://101.133.225.166:8888/')
#
# print(respone.text)
## 如果你有很多代理,每次发请求,随机取一个代理ip,发送,这样我们的ip就不会被封
### 花钱买
### 白嫖
# import requests
# res=requests.get('http://demo.spiderpy.cn/get/').json()['proxy']
# print(res)
#
# proxies = {
# 'https':'https://%s'%res,
# }
# print()
#
# respone=requests.get('http://www.baidu.com',proxies=proxies)
# # respone=requests.get('http://www.baidu.com')
# print(respone.text)
### 借助于第三方,自己搭建(读一读人家源码)
#https://github.com/jhao104/proxy_pool
- 超时时间
# respone=requests.get('https://www.baidu.com',timeout=0.0001)
- 认证(像老款路由器的登录)
# import requests
# from requests.auth import HTTPBasicAuth
# r=requests.get('xxx',auth=HTTPBasicAuth('user','password'))
# print(r.status_code)
- 异常处理
# import requests
# from requests.exceptions import * #可以查看requests.exceptions获取异常类型
#
# try:
# r=requests.get('http://www.baidu.com',timeout=0.00001)
# # except ReadTimeout:
# # print('===:')
# # except ConnectionError: #网络不通
# # print('-----')
# # except Timeout:
# # print('aaaaa')
#
# except Exception:
# print('Error')
- 文件上传
# import requests
# files={'myfile':open('1 自动处理cookie.py','rb')}
# respone=requests.post('http://127.0.0.1:8000/upload_file/',files=files)
# print(respone.text)
抽屉自动点赞
import requests

header = {
    'Cookie': '',  # paste a logged-in dig.chouti.com cookie here or votes are rejected
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'
}
# Vote a single article by id:
# res=requests.post('https://dig.chouti.com/link/vote',data={'linkId':'31857081'},headers=header)
# print(res.text)

# Up-vote everything on the 24h hot list: fetch the article ids from the JSON
# feed, then POST one vote per id.
res = requests.get('https://dig.chouti.com/top/24hr?_=1628136305346', headers=header).json()
for item in res['data']:
    article_id = item['id']  # renamed from `id` to avoid shadowing the builtin
    res = requests.post('https://dig.chouti.com/link/vote', data={'linkId': '%s' % article_id}, headers=header)
    print(res.text)
爬取汽车之家新闻
# pip3 install beautifulsoup4
import requests
from bs4 import BeautifulSoup

# Walk the paginated news listing and print title / summary / image / link
# for every article on each page.
for i in range(1, 100):
    res = requests.get('https://www.autohome.com.cn/news/%s/#liststart' % i)
    # First arg: markup to parse; second: parser name.
    # 'html.parser' ships with bs4; 'lxml' (pip3 install lxml) is faster.
    soup = BeautifulSoup(res.text, 'html.parser')
    # soup = BeautifulSoup(res.text, 'lxml')
    # find_all returns every matching tag.
    ul_list = soup.find_all(name='ul', class_='article')
    for ul in ul_list:
        li_list = ul.find_all(name='li')
        for li in li_list:
            h3 = li.find(name='h3')
            if h3:  # ad/placeholder <li> items have no <h3>; skip them
                title = h3.text  # tag_object.text gives the tag's text content
                desc = li.find(name='p').text
                img_url = li.find(name='img')['src']
                # image URLs are protocol-relative (//...); make them absolute
                if not img_url.startswith('http'):
                    img_url = 'https:' + img_url
                url = 'https:' + li.find(name='a')['href']
                print(url)
                print('''
新闻标题:%s
新闻摘要:%s
新闻图片:%s
新闻地址:%s
''' % (title, desc, img_url, url))
bs4遍历文档树
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story <span>lqz</span></b><span>egon</span></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup=BeautifulSoup(html_doc,'html.parser')
# res=soup.prettify() # 美化
# print(res)
#1、用法
# html=soup.html
# title=soup.html.head.title
# title=soup.title
# print(title)
#2、获取标签的名称 ---> 标签对象.name
# a=soup.body.a
# a=soup.a.name
# print(a)
# print(soup.body.name)
#3、获取标签的属性 ---->标签对象['标签名']
# href=soup.body.a['href']
# attrs=soup.body.a.attrs # 所有属性,---》字典
# href=soup.body.a.attrs['href']
# print(attrs['class'])
# c=soup.p.attrs['class']
# print(c)
#4、获取标签的内容
# res=soup.b.text # 拿到当前标签子子孙所有的text
# res=soup.p.text
# res=soup.p.string # 当前标签有且只有一个文本内容才能拿出来
# res=soup.b.string # 当前标签有且只有一个文本内容才能拿出来
# res=soup.p.strings # 把子子孙放到生成器中
#
# print(list(res))
#5、嵌套选择
# res=soup.html.body.p
# print(type(res)) # bs4.element.Tag
from bs4.element import Tag
####了解
#6、子节点、子孙节点
# print(soup.p.contents) #p下所有子节点,放到列表中
# print(soup.p.children) #得到一个迭代器,包含p下所有子节点
# for i,child in enumerate(soup.p.children):
# print(i,child)
# print(soup.p.descendants) #获取子孙节点,p下所有的标签都会选择出来
# for i,child in enumerate(soup.p.descendants):
# print(i,child)
#7、父节点、祖先节点
# print(soup.a.parent) #获取a标签的父节点
# print(soup.body.parent)
# print(soup.a.parents) #找到a标签所有的祖先节点,父亲的父亲,父亲的父亲的父亲...
# print(list(soup.a.parents))
# print(len(list(soup.a.parents)))
#8、兄弟节点
# print(soup.a.next_sibling) #下一个兄弟
# print(soup.a.previous_sibling) #上一个兄弟
#
# print(list(soup.a.next_siblings)) #下面的兄弟们=>生成器对象
# print(list(soup.a.previous_siblings)) #上面的兄弟们=>生成器对象
bs4搜索文档树
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body id='body'>
<p class="title"><b>The Dormouse's story <span>lqz</span></b><span>egon</span></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup=BeautifulSoup(html_doc,'html.parser')
# 搜索文档树 find find_all
# 五种过滤器: 字符串、正则表达式、列表、True、方法
##### 字符串
# res=soup.find(name='body')
# res=soup.find(name='p',class_='story')
# 查找id为link2的标签
# res=soup.find(id='link2',name='a',class_='sister',href='http://example.com/lacie')
# res=soup.find(href='http://example.com/lacie')
# print(res)
# res=soup.find(attrs={'class':['sister']})
# print(res)
#### 正则表达式
import re
# res=soup.find_all(name=re.compile('^b')) #找出b开头的标签,结果有body和b标签
# res=soup.find(name=re.compile('^b'))
# res=soup.find_all(class_=re.compile('^s'))
# res=soup.find_all(href=re.compile('^http'))
# res=soup.find_all(id=re.compile('^l'))
# print(res)
####列表、
# res=soup.find_all(name=['body','b'])
# res=soup.find_all(id=['link1','link2'])
# res=soup.find_all(attrs={'id':['link1','link2']})
#
# print(res)
# True、
# links=soup.find_all(href=True)
# print(links)
# res=soup.find_all(name=True)
# res=soup.find_all(id=True)
# print(res)
#方法
# def has_class_but_no_id(tag):
# return tag.has_attr('class') and not tag.has_attr('id')
#
# print(len(soup.find_all(name=has_class_but_no_id)))
# Grab every image on the page: <img> tags carry their URL in the `src`
# attribute, so filter on src=True. (The original filtered on href=True,
# which <img> never has, so it always returned an empty list.)
soup.find_all(name='img', src=True)
## 建议 遍历文档树和搜索文档树混用
# soup.body.div.find
### 其他参数 find,find_all
#limit
# soup.find()
# res=soup.find_all(name='a',href=True,limit=2) # 限制获取的条数
# print(res)
# recursive 是否递归查找
# res=soup.find_all(name='a',recursive=False)
# res=soup.find_all(name='html',recursive=False)
# print(res)
css选择器(与xpath是通用的)
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title">
<b>The Dormouse's story <p>asdfasdf</p></b>
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">
<span>Elsie</span>
</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
<div class='panel-1'>
<ul class='list' id='list-1'>
<li class='element'>Foo</li>
<li class='element'>Bar</li>
<li class='element'>Jay</li>
</ul>
<ul class='list list-small' id='list-2'>
<li class='element'><h1 class='yyyy'>Foo</h1></li>
<li class='element xxx'>Bar</li>
<li class='element'>Jay</li>
</ul>
</div>
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup=BeautifulSoup(html_doc,'html.parser')
'''
#id
.类名
标签
标签>标签
标签 标签
'''
# res=soup.p.select('.sister') # 使用css选择器
# res=soup.p.select('#link1') # 使用css选择器
# res=soup.select('body>p') # 使用css选择器 body的子标签p
res=soup.select('body p') # 使用css选择器 body的子子孙孙标签p
print(len(res))
### css选择器是通用的:bs4,lxml解析也可以是css选择器
##css选择器不会写怎么办?
'#maincontent > div:nth-child(3) > table > tbody > tr:nth-child(13) > td:nth-child(3)'
## xpath选择
'//*[@id="maincontent"]/div[2]/table/tbody/tr[18]/td[2]'
selenium使用
# 如果使用requests模块,发送请求获取的数据不全,它不能执行js
# selenium:可以使用代码控制模拟人操作浏览器
## 操作某个浏览器,就需要有浏览器驱动
# http://npm.taobao.org/mirrors/chromedriver/ 谷歌驱动的淘宝镜像站
# 谷歌浏览器版本要跟驱动版本对应
## 92.0.4515.131 下载相应版本驱动,放到项目代码中
# pip3 install selenium
# from selenium import webdriver
# import time
# # 打开一个谷歌浏览器
# bro=webdriver.Chrome(executable_path='chromedriver.exe')
#
# #地址栏中输入百度
# bro.get('https://www.cnblogs.com/')
#
# time.sleep(2)
#
# print(bro.page_source) #当前页面的html内容
#
# bro.close() # 关闭浏览器
# import requests
#
# res=requests.get('https://dig.chouti.com/',headers={
# 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
# })
# print(res.text)
基本使用
from selenium import webdriver
import time
# Browser object: launches Chrome through the chromedriver binary next to the script.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.implicitly_wait(10) # implicit wait: element lookups retry for up to 10s before failing
bro.get('https://www.baidu.com/')
# sub_button=bro.find_element_by_css_selector('#s-top-loginbtn')
sub_button = bro.find_element_by_id('s-top-loginbtn') # prefer find-by-id when the element has one
# Click the login button.
sub_button.click()
# Switch the dialog to username/password login.
user_btn = bro.find_element_by_xpath('//*[@id="TANGRAM__PSP_11__footerULoginBtn"]')
# user_btn=bro.find_element_by_id('TANGRAM__PSP_11__footerULoginBtn')
user_btn.click()
username = bro.find_element_by_id('TANGRAM__PSP_11__userName')
password = bro.find_element_by_id('TANGRAM__PSP_11__password')
# Type into the input boxes.
username.send_keys('6666666@qq.com')
password.send_keys('lqz12345')
sumbit_btn = bro.find_element_by_id('TANGRAM__PSP_11__submit')
time.sleep(3)
sumbit_btn.click()
time.sleep(3)
bro.close()
无头浏览器
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
# Configure a "headless" Chrome: full browser engine, no visible window.
chrome_options = Options()
chrome_options.add_argument('window-size=1920x3000') # browser resolution
chrome_options.add_argument('--disable-gpu') # workaround recommended by the Chrome docs to avoid a rendering bug
chrome_options.add_argument('--hide-scrollbars') # hide scrollbars on unusual pages
chrome_options.add_argument('blink-settings=imagesEnabled=false') # skip image loading for speed
chrome_options.add_argument('--headless') # no visible UI; required on Linux hosts without a display
driver=webdriver.Chrome(executable_path='chromedriver.exe',chrome_options=chrome_options)
driver.get('https://www.baidu.com')
print(driver.page_source)
driver.close()
获取元素位置,属性,大小
from selenium import webdriver
import time
driver=webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://kyfw.12306.cn/otn/resources/login.html')
driver.implicitly_wait(10)
user_login=driver.find_element_by_css_selector('.login-hd-account>a')
user_login.click()
time.sleep(2)
img=driver.find_element_by_id('J-loginImg')
print(img)
print(img.id) # selenium's internal element id, not the HTML id attribute -- ignore it
print(img.tag_name) # tag name, e.g. 'img'
print('-----')
print(img.location) # top-left coordinates of the element on the page
print(img.size) # element width/height
# Read HTML attributes off the element with get_attribute.
# print(img.get_attribute('src'))
print(img.get_attribute('class'))
driver.close()
等待元素被加载
from selenium import webdriver
# Two ways to wait for elements to be loaded:
# explicit waits: a per-element condition (see the note string below)
# implicit wait: a single line that applies to every element lookup
driver=webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')
'''
# 两种等待方式
# 显示等待(忽略掉)
wait=WebDriverWait(driver,10)
wait.until(EC.presence_of_element_located((By.ID,'content_left')))
contents=browser.find_element(By.CSS_SELECTOR,'#content_left')
# 隐式等待:
-driver.implicitly_wait(10)
-driver.find_element_by_css_selector()
-只需要写一句话,等待所有要获取的标签
'''
driver.implicitly_wait(10)
print(driver.page_source)
# Every later lookup now retries until the element appears, up to 10s.
driver.close()
元素操作
from selenium import webdriver
import time
driver=webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('https://www.baidu.com')
driver.implicitly_wait(10)
# Click / clear / type operations on input elements.
input_search=driver.find_element_by_id('kw')
input_search.send_keys('美女') # type text into the box
time.sleep(3)
input_search.clear() # clear the box
time.sleep(2)
input_search.send_keys('性感美女')
time.sleep(2)
btn=driver.find_element_by_id('su')
btn.click() # click the search button
time.sleep(10)
driver.close()
执行js
from selenium import webdriver
import time
driver=webdriver.Chrome(executable_path='chromedriver.exe')
driver.get('http://127.0.0.1:8000/')
driver.implicitly_wait(10)
driver.execute_script("name='egon';") # 这里面写js代码
driver.execute_script("alert(name)") # 这里面写js代码
time.sleep(5)
# driver.close()
切换选项卡
import time
from selenium import webdriver
browser=webdriver.Chrome()
browser.get('https://www.baidu.com')
browser.execute_script('window.open()')
print(browser.window_handles) #获取所有的选项卡
# browser.switch_to_window(browser.window_handles[1])
# browser.switch_to_window(browser.window_handles[1])
browser.switch_to.window(browser.window_handles[1])
browser.get('https://www.taobao.com')
time.sleep(5)
# browser.switch_to_window(browser.window_handles[0])
browser.switch_to.window(browser.window_handles[0])
browser.get('https://www.sina.com.cn')
browser.close()
模拟前进后退
import time
from selenium import webdriver
browser=webdriver.Chrome(executable_path='chromedriver.exe')
browser.get('https://www.baidu.com')
browser.get('https://www.taobao.com')
browser.get('http://www.sina.com.cn/')
browser.back()
time.sleep(3)
browser.forward()
browser.close()
异常处理
from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException

browser = webdriver.Chrome()
try:
    browser.get('http://www.baidu.com')
except Exception as e:
    # selenium raises the specific exception types imported above;
    # a broad catch keeps this demo simple.
    print(e)
finally:
    # Always release the browser process, even when the request failed.
    browser.close()
selenium登录cnblogs获取cookie
#selenium登录cnblogs获取cookie
from selenium import webdriver
from selenium.common.exceptions import TimeoutException,NoSuchElementException,NoSuchFrameException
import time
import json
browser = webdriver.Chrome(executable_path='chromedriver.exe')
browser.implicitly_wait(10)
#### 登录过程
# try:
# browser.get('http://www.cnblogs.com')
# submit_btn=browser.find_element_by_link_text('登录') # a标签的内容
# submit_btn.click()
#
# username=browser.find_element_by_id('mat-input-0')
# password=browser.find_element_by_id('mat-input-1')
# username.send_keys('616564099@qq.com')
# password.send_keys('1111')
# input('等会')
# sub_btn=browser.find_element_by_css_selector('body > app-root > mat-sidenav-container > mat-sidenav-content > div > div > app-sign-in > app-content-container > div > div > div > form > div > button > span.mat-button-wrapper')
# sub_btn.click()
#
# # 人工参与,滑动
# input('等会')
#
# # 获取到登录后的cookie
# print(browser.get_cookies())
#
# with open('cookie.json','w') as f:
# json.dump(browser.get_cookies(),f)
#
#
# except Exception as e:
# print(e)
# finally:
# browser.close()
### 不登录了,把cookie写入浏览器
# browser.get('http://www.cnblogs.com')
# with open('cookie.json','r') as f:
# cookie=json.load(f)
# time.sleep(5)
# for item in cookie: # 设置cookie必须用字典,cookie的json文件是列表,所以用循环往里放
# browser.add_cookie(item)
#
#
#
# browser.refresh() # 刷新页面
#
# time.sleep(5)
#
# browser.close()
抽屉半自动点赞
from selenium import webdriver
import json
import time
#### 登录过程
# bro=webdriver.Chrome(executable_path='chromedriver.exe')
# bro.implicitly_wait(10)
# bro.get('https://dig.chouti.com/')
# try:
# sub_btn=bro.find_element_by_id('login_btn')
# print(sub_btn)
#
# # sub_btn.click() # 报错
# bro.execute_script('arguments[0].click();',sub_btn)
#
# # username=bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-body > div.form-item.login-item.clearfix.phone-item.mt24 > div.input-item.input-item-short.left.clearfix > input')
# username=bro.find_element_by_css_selector('div.input-item>input.login-phone')
# username.send_keys('18953675221')
# # password=bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div.form-item.login-item.clearfix.mt24 > div')
# password = bro.find_element_by_css_selector('div.input-item>input.pwd-password-input')
# password.send_keys('lqz123')
#
# time.sleep(3)
# btn=bro.find_element_by_css_selector('body > div.login-dialog.dialog.animated2.scaleIn > div > div.login-footer > div:nth-child(4) > button')
#
# btn.click()
#
# input('等')
#
# with open('chouti.json','w') as f:
# json.dump(bro.get_cookies(),f)
#
#
#
#
# finally:
# bro.close()
import requests

bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.implicitly_wait(10)
bro.get('https://dig.chouti.com/')
# Scroll to the bottom of the page so lazily-loaded articles render.
bro.execute_script('window.scrollTo(0, document.body.scrollHeight);')
# bro.find_elements_by_css_selector('.link-item')

# Rebuild a requests-compatible cookie dict from the cookie list selenium
# saved to chouti.json during the (commented-out) login run above.
cookie = {}
with open('chouti.json', 'r') as f:
    res = json.load(f)
for item in res:
    cookie[item['name']] = item['value']
print(cookie)  # cookie dict usable by requests

div = bro.find_element_by_class_name('link-con')
time.sleep(2)
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
div_list = div.find_elements_by_class_name('link-item')
for div in div_list:
    article_id = div.get_attribute('data-id')
    print(article_id)
    # Vote via requests (faster than clicking), reusing the browser's login cookies.
    res = requests.post('https://dig.chouti.com/link/vote', data={'linkId': article_id}, cookies=cookie, headers=header)
    print(res.text)
bro.close()
打码平台使用
# 人工破解
# 图像识别模块---》数字,字母组合
# 验证码破解平台---》云打码,超级鹰
-给它一张图片---》结果返回 (收费的)
#!/usr/bin/env python
# coding:utf-8
import requests
from hashlib import md5
class Chaojiying_Client(object):
    """Minimal client for the Chaojiying captcha-solving HTTP API."""

    def __init__(self, username, password, soft_id):
        self.username = username
        # The API authenticates with the MD5 hex digest of the password,
        # never the plain text.
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        # Credential fields sent with every API call.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Submit a captcha image for solving and return the API's JSON reply.

        im: raw image bytes
        codetype: puzzle type id, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files,
                          headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly-solved captcha (for refund); im_id comes from PostPic's reply."""
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()
if __name__ == '__main__':
    # Replace with your own account and the software ID generated in the user center.
    chaojiying = Chaojiying_Client('306334678', 'lqz12345', '903641')
    im = open('a.jpg', 'rb').read()  # local image path; on Windows you may need //
    print(chaojiying.PostPic(im, 1902))  # 1902 = captcha type, see the site's price table
xpath使用
1 一门在html中查找数据的语言
2 记住的语法:
/ 取当前路径下的xx
// 取所有路径下的xx
. 当前路径
.. 上一层
@ 取属性
4 lxml解析模块提供的xpath
doc='''
<html>
<head>
<base href='http://example.com/' />
<title>Example website</title>
</head>
<body>
<div id='images'>
<a href='image1.html' name='sss'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
<a href='image2.html' name='lqz'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
<a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
<a href='image4.html' class='li'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
<a href='image5.html' class='li li-item' name='items'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
<a href='image6.html' name='items'><span><h5>test</h5></span>Name: My image 6 <br /><img src='image6_thumb.jpg' /></a>
</div>
</body>
</html>
'''
from lxml import etree
# 传入要解析的内容
html=etree.HTML(doc)
# res=html.xpath('//body')
# print(res)
# 1 所有节点
# a=html.xpath('//*')
# 2 指定节点(结果为列表)
# a=html.xpath('//head')
# 3 子节点,子孙节点
# a=html.xpath('//div/a')
# a=html.xpath('//body//a') #无数据
# a=html.xpath('//body//a')
# 4 父节点
# a=html.xpath('//body//a[@href="image1.html"]/..')
# a=html.xpath('//body//a')
# a=html.xpath('//body//a[@href="image1.html"]')
# a=html.xpath('//body//a[1]/..')
# 也可以这样
# a=html.xpath('//body//a[1]/parent::*')
# a=html.xpath('//body//a[1]/parent::p')
# 5 属性匹配
# a=html.xpath('//a[@href="image1.html"]')
# a=html.xpath('//a[@name="sss"]')
# 6 文本获取 text()
# a=html.xpath('//a[@href="image1.html"]/text()')
# a=html.xpath('//a/text()')
# 7 属性获取
# a=html.xpath('//a/@href')
# a=html.xpath('//a[1]/@name')
# # 注意从1 开始取(不是从0)
# a=html.xpath('//body//a[2]/@href')
# 8 属性多值匹配
# a 标签有多个class类,直接匹配就不可以了,需要用contains
# a=html.xpath('//a[@class="li"]')
# a=html.xpath('//a[contains(@class,"li")]')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 9 多属性匹配
# a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')
# a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')
# a=html.xpath('//body//a[contains(@class,"li")]/text()')
# 10 按序选择
# a=html.xpath('//a[2]/text()')
# a=html.xpath('//a[2]/@href')
# a=html.xpath('//a[2]/@name')
# 取最后一个
# a=html.xpath('//a[last()]/@href')
# 位置小于3的
# a=html.xpath('//a[position()<3]/@href')
# 倒数第二个
# a=html.xpath('//a[last()-2]/@href')
# 11 节点轴选择
# ancestor:祖先节点
# 使用了* 获取所有祖先节点
# a=html.xpath('//a/ancestor::*')
# # 获取祖先节点中的div
# a=html.xpath('//a/ancestor::div')
# attribute:属性值
# a=html.xpath('//a[1]/attribute::*')
# child:直接子节点
# a=html.xpath('//a[1]/child::*')
# a=html.xpath('//a[1]/child::img/@src')
# descendant:所有子孙节点
# a=html.xpath('//a[6]/descendant::*')
# following:当前节点之后所有节点
# a=html.xpath('//a[1]/following::*')
# a=html.xpath('//a[1]/following::*[1]/@href')
# following-sibling:当前节点之后同级节点
# a=html.xpath('//a[1]/following-sibling::*')
# a=html.xpath('//a[1]/following-sibling::a')
# a=html.xpath('//a[1]/following-sibling::*[2]/text()')
# a=html.xpath('//a[1]/following-sibling::*[2]/@href')
# Every example query above is commented out, so `a` would be undefined here
# and this line raised NameError. Bind a default demo query so it always works;
# uncomment any example above to try it instead.
a = html.xpath('//a/@href')
print(a)
自动登录12306
from selenium import webdriver
import base64
from PIL import Image
import time
from chaojiying import Chaojiying_Client
from selenium.webdriver import ActionChains
# 不让程序检测出是用驱动控制
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument("--disable-blink-features=AutomationControlled")
bro=webdriver.Chrome(executable_path='./chromedriver.exe',chrome_options=options)
bro.get('https://kyfw.12306.cn/otn/resources/login.html')
bro.implicitly_wait(10)
# 把窗口设置全屏
bro.maximize_window()
try:
username_login_btn=bro.find_element_by_css_selector('.login-hd-account>a')
username_login_btn.click()
username=bro.find_element_by_id('J-userName')
password=bro.find_element_by_id('J-password')
login_btn=bro.find_element_by_id('J-login')
username.send_keys('liuqingzheng')
password.send_keys('lqz12345')
img_code=bro.find_element_by_id('J-loginImg')
print(img_code.size)
print(img_code.location)
# 获取验证码图片的两种方案
# 方案一:整体截图,根据位置抠出验证码图片
# bro.save_screenshot('main.png') # 对整个页面进行截图,main.png
#
# location=img_code.location
# size=img_code.size
# print(location)
# print(size)
# #验证码的坐标
# img_tu = (int(location['x']), int(location['y']), int(location['x'] + size['width']), int(location['y'] + size['height']))
# #使用pillow打开截图
# img=Image.open('./main.png')
# #从截图中按照位置扣除验证码
# code_img=img.crop(img_tu)
# # 把扣出来的图,保存到本地
# code_img.save('./code2.png')
# 方案二:把图片的base64编码转成图片保存到本地
img_base64=img_code.get_attribute('src')
img_base64_real=img_base64.split(',')[-1]
img_1=base64.b64decode(img_base64_real)
with open('code.jpg','wb') as f:
f.write(img_1)
# 调用超级鹰,完成验证码破解
# 调用超级鹰识别
chaojiying = Chaojiying_Client('306334678', 'lqz12345', '903641') # 用户中心>>软件ID 生成一个替换 96001
im = open('code.jpg', 'rb').read() # 本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
res=chaojiying.PostPic(im, 9004) # 1902 验证码类型 官方网站>>价格体系 3.4+版 print 后要加()
# 123,155|42,135|11,77---->[[123,155],[42,135],[11,77]]
print(res)
result=res['pic_str']
# The solver returns coordinates as '123,155|42,135|11,77' (or a single 'x,y'
# pair); convert to [[123,155],[42,135],[11,77]] for the ActionChains clicks.
# split('|') on a pipe-free string yields the whole string as one element, so
# one loop covers both the multi-point and single-point cases.
all_list = []
for pair in result.split('|'):
    xy = pair.split(',')
    all_list.append([int(xy[0]), int(xy[1])])
print(all_list)
### 在页面中点击破解的图案
#点击 [[123,155],[42,135],[11,77]]
for item in all_list:
ActionChains(bro).move_to_element_with_offset(img_code,item[0],item[1]).click().perform()
time.sleep(1)
time.sleep(5)
login_btn.click()
time.sleep(1)
# 滑动滑块
span=bro.find_element_by_id('nc_1_n1z')
ActionChains(bro).drag_and_drop_by_offset(span, 300, 0).perform()
time.sleep(30)
print(bro.get_cookies())
except Exception as e:
print(e)
finally:
bro.close()
bro.quit() # 关闭整个浏览器
愿君前程似锦,归来仍是少年