爬虫从入门到入狱之入门

1 爬虫介绍

# 爬虫：spider，网络蜘蛛

# 本质原理：
    -现在所有的软件原理：大部分都是基于http请求发送和获取数据的
        -pc端的网页
        -移动端app
    -模拟发送http请求，从别人的服务端获取数据
    -绕过反爬：不同程序反爬措施不一样，比较复杂
    
    
# 爬虫原理
    -发送http请求【requests，selenium】----》第三方服务端----》服务端响应的数据解析出想要的数据【selenium,bs4】---》入库(文件，excel，mysql,redis,mongodb。。)
    -scrapy:专业的爬虫框架
    
    
# 爬虫是否合法
    -爬虫协议：每个网站根路径下都有robots.txt，这个文件规定了，该网站，哪些可以爬取，哪些不能爬
    
    
    
# 百度：大爬虫
    -百度搜索框中输入搜索内容，回车，返回的数据，是百度数据库中的数据
    -百度一刻不停的在互联网中爬取各个页面，链接地址--》爬完存到自己的数据库
    -当你点击，跳转到真正的地址上去了
    -核心：搜索，海量数据中搜索出想要的数据
    -seo：免费的搜索，排名靠前
    -sem：花钱买关键字

2 requests模块发送get请求

# 模拟发送http请求的模块：requests 不仅仅做爬虫用它，后期调用第三方接口，也是要用它的
# pip3 install requests    
    -本质是封装了第三方库urllib3（不是内置模块，安装requests时会自动一起安装）

    
import requests
# Fetch one article page with a plain GET request and dump its HTML.
article_url = 'https://www.cnblogs.com/liuqingzheng/p/16005866.html'
res = requests.get(article_url)
print(res.text)  # text content of the HTTP response body

3 get请求携带参数

# 2 发送get请求携带数据
# 2.1 地址栏中拼接
# res=requests.get('https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3')
# print(res.text)

# 2.2 使用params参数携带
# res=requests.get('https://www.baidu.com/s',params={
#     'wd':'美女',
#     'name':'lqz'
# })
# print(res.text)
# https://www.baidu.com/s?wd=美女&name=lqz


## url编码和解码
# 美女被url编码后--》
# %E7%BE%8E%E5%A5%B3
# %E7%BE%8E%E5%A5%B3
from urllib import parse

# The reverse direction would be parse.quote('美女'), which yields the
# percent-encoded form used below.
# Decode the percent-encoded keyword back into readable text.
encoded_keyword = '%E7%BE%8E%E5%A5%B3'
res = parse.unquote(encoded_keyword)
print(res)

4 携带请求头

# http 请求，有请求头，有的网站，通过某些请求头来做反爬


# 3 请求头中带数据---->爬取某个网站，不能正常返回，是因为模拟得不像浏览器
# 网站做了反爬，而请求头中没有携带客户端类型
# User-Agent：客户端类型：有浏览器，手机端浏览器，爬虫类型，程序，scrapy。。一般伪造成浏览器
# referer：上次访问的地址：Referer: https://www.lagou.com/gongsi/
    # 如果要登录，模拟向登录接口发请求，正常操作必须在登录页面上才能干这事，如果没有携带referer，它就认为你是恶意的，拒绝调
    # 图片防盗链
# cookie： 认证后的cookie，就相当于登录了
# header={
#     # 客户端类型
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
# }
# res=requests.get('https://dig.chouti.com/',headers=header)
# print(res.text)

5 携带cookie

# 4 请求中携带cookie#
## 方式一：直接带在请求头中
#模拟点赞
# data={
#     'linkId':'36996038'
# }
# header={
#     # 客户端类型
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
#     #携带cookie
#     'Cookie':'deviceId=web.eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJqaWQiOiI3MzAyZDQ5Yy1mMmUwLTRkZGItOTZlZi1hZGFmZTkwMDBhMTEiLCJleHBpcmUiOiIxNjYxNjU0MjYwNDk4In0.4Y4LLlAEWzBuPRK2_z7mBqz4Tw5h1WeqibvkBG6GM3I; __snaker__id=ozS67xizRqJGq819; YD00000980905869%3AWM_TID=M%2BzgJgGYDW5FVFVAVQbFGXQ654xCRHj8; _9755xjdesxxd_=32; Hm_lvt_03b2668f8e8699e91d479d62bc7630f1=1666756750,1669172745; gdxidpyhxdE=W7WrUDABQTf1nd8a6mtt5TQ1fz0brhRweB%5CEJfQeiU61%5C1WnXIUkZH%2FrE4GnKkGDX767Jhco%2B7xUMCiiSlj4h%2BRqcaNohAkeHsmj3GCp2%2Fcj4HmXsMVPPGClgf5AbhAiztHgnbAz1Xt%5CIW9DMZ6nLg9QSBQbbeJSBiUGK1RxzomMYSU5%3A1669174630494; YD00000980905869%3AWM_NI=OP403nvDkmWQPgvYedeJvYJTN18%2FWgzQ2wM3g3aA3Xov4UKwq1bx3njEg2pVCcbCfP9dl1RnAZm5b9KL2cYY9eA0DkeJo1zfCWViwVZUm303JyNdJVAEOJ1%2FH%2BJFZxYgMVI%3D; YD00000980905869%3AWM_NIKE=9ca17ae2e6ffcda170e2e6ee92bb45a398f8d1b34ab5a88bb7c54e839b8aacc1528bb8ad89d45cb48ae1aac22af0fea7c3b92a8d90fcd1b266b69ca58ed65b94b9babae870a796babac9608eeff8d0d66dba8ffe98d039a5edafa2b254adaafcb6ca7db3efae99b266aa9ba9d3f35e81bdaea4e55cfbbca4d2d1668386a3d6e1338994fe84dc53fbbb8fd1c761a796a1d2f96e81899a8af65e9a8ba3d4b3398aa78285c95e839b81abb4258cf586a7d9749bb983b7cc37e2a3; token=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJqaWQiOiJjZHVfNTMyMDcwNzg0NjAiLCJleHBpcmUiOiIxNjcxNzY1NzQ3NjczIn0.50e-ROweqV0uSd3-Og9L7eY5sAemPZOK_hRhmAzsQUk; Hm_lpvt_03b2668f8e8699e91d479d62bc7630f1=1669173865'
# }
# res=requests.post('https://dig.chouti.com/link/vote',data=data,headers=header)
# print(res.text)


## Approach 2: pass cookies through the dedicated `cookies` argument.
# Cookies are needed so often that requests exposes them as their own
# dict-typed parameter instead of making you pack them into a header.
vote_url = 'https://dig.chouti.com/link/vote'
data = {'linkId': '36996038'}
header = {
    # Client type
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
# NOTE(review): presumably a placeholder pair -- substitute real cookie values.
cookie_dict = {'key': 'value'}
res = requests.post(vote_url, data=data, headers=header, cookies=cookie_dict)
print(res.text)

6 发送post请求

###6 发送post请求
# data = {
#     'username': '616564099@qq.com',
#     'password': 'lqz123',
#     'captcha': 'cccc',
#     'remember': 1,
#     'ref': 'http://www.aa7a.cn/',
#     'act': 'act_login'
# }
# res = requests.post('http://www.aa7a.cn/user.php', data=data)
# print(res.text)
# print(res.cookies)  # 响应头中得cookie，如果正常登录，这个cookie 就是登录后的cookie  RequestsCookieJar：当成字典
#
# # 访问首页，携带cookie，
# # res2 = requests.get('http://www.aa7a.cn/', cookies=res.cookies)
# res2 = requests.get('http://www.aa7a.cn/')
# print('616564099@qq.com' in res2.text)


## 6.2 post请求携带数据 data={} ,json={}   drf后端，打印 request.data
# data=字典是使用默认编码格式：urlencoded
# json=字典是使用json 编码格式
# res = requests.post('http://www.aa7a.cn/user.php', json={})


## 6.4 requests.session的使用：当requests使用，但是它能自动维护cookie
# session=requests.session()
# data = {
#     'username': '616564099@qq.com',
#     'password': 'lqz123',
#     'captcha': 'cccc',
#     'remember': 1,
#     'ref': 'http://www.aa7a.cn/',
#     'act': 'act_login'
# }
# res = session.post('http://www.aa7a.cn/user.php', data=data)
# res2 = session.get('http://www.aa7a.cn/')
# print('616564099@qq.com' in res2.text)

7 响应Response

import requests

# Send one GET request with a browser-like User-Agent and query parameters,
# then walk through the attributes of the returned response object.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}
query = {'name': 'lqz', 'age': 19}
response = requests.get('https://www.jianshu.com', headers=header, params=query)

# Attributes of the response
print(response.text)  # response body as text
print(response.content)  # response body as raw bytes
print(response.status_code)  # HTTP status code
print(response.headers)  # response headers
print(response.cookies)  # cookies set by the response (a cookie jar object)
print(response.cookies.get_dict())  # the cookie jar converted to a plain dict
print(response.cookies.items())  # every cookie as (key, value) pairs
print(response.url)  # the address that was requested
print(response.history)  # if the request was redirected, the redirects it went through
print(response.encoding)  # encoding of the page

8 获取二进制数据

###8 获取二进制数据 ：图片，视频
#
# res = requests.get(
#     'https://upload.jianshu.io/admin_banners/web_images/5067/5c739c1fd87cbe1352a16f575d2df32a43bea438.jpg')
# with open('美女.jpg', 'wb') as f:
#     f.write(res.content)

# Write the video to disk piece by piece.
# stream=True keeps requests from loading the whole file into memory first,
# and an explicit chunk_size avoids iter_content()'s default of 1-byte chunks,
# which would mean one write call per byte.
video_url = 'https://vd3.bdstatic.com/mda-mk21ctb1n2ke6m6m/sc/cae_h264/1635901956459502309/mda-mk21ctb1n2ke6m6m.mp4'
with requests.get(video_url, stream=True) as res:
    with open('美女.mp4', 'wb') as f:
        for chunk in res.iter_content(chunk_size=64 * 1024):
            f.write(chunk)

9 解析json

# 前后分离后，后端给的数据，都是json格式，

# 解析json格式

# Call the Baidu Maps place-search API and compare the raw text body with the
# parsed JSON.
# NOTE(review): the `ak` API key is embedded in the URL.
res = requests.get(
    'https://api.map.baidu.com/place/v2/search?ak=6E823f587c95f0148c19993539b99295&region=%E4%B8%8A%E6%B5%B7&query=%E8%82%AF%E5%BE%B7%E5%9F%BA&output=json')
raw_body = res.text
print(raw_body)
print(type(raw_body))
# Parse once and reuse the resulting object for both prints.
payload = res.json()
print(payload['results'][0]['name'])
print(type(payload))

10 requests高级用法

10.1 ssl认证(了解)

# https 和http有什么区别
    -https=http+ssl/tls  证书
   
# 没有被认证过的机构，签发的证书，用的时候，浏览器会提示不安全


# 1 ssl认证
# 1.1 不认证证书了
# import requests
# respone = requests.get('https://www.12306.cn', verify=False)  # 不验证证书,报警告,返回200
# print(respone.status_code)
# 
# # 1.2 手动携带证书访问
# import requests
# respone=requests.get('https://www.12306.cn',cert=('/path/server.crt','/path/key'))
# print(respone.status_code)

10.2 使用代理（重要）

# 频率限制，封账号，通过ip或用户id限制，做爬虫，就要避免这些
    -封ip：代理
    -封账号：注册很多小号
    
    
# 代理是什么？
    -正向代理：代理客户端
    -反向代理：代理服务端，nginx是反向代理服务器
    
    
# 收费的，免费，基本都收费
    
    
    
# 发送http请求，使用代理发送
    
## 2 使用代理ip发送请求
import requests
# requests chooses the proxy by the scheme of the *target* URL, so a request
# to an https:// address needs an 'https' entry; with only an 'http' entry the
# request below would bypass the proxy entirely. The proxy address is written
# with an explicit scheme, as the requests documentation recommends.
proxy_url = 'http://192.168.10.102:9003'
proxies = {
    'http': proxy_url,
    'https': proxy_url,
}
respone = requests.get('https://www.baidu.com', proxies=proxies)

print(respone.text)

10.3 超时设置

# 3 超时设置
# respone=requests.get('https://www.baidu23.com',timeout=3)
# print(respone)

10.4 异常处理

# 4 异常处理
# import requests
# from requests.exceptions import * #可以查看requests.exceptions获取异常类型
# try:
#     r=requests.get('http://www.baidu.com',timeout=0.00001)
# except ReadTimeout:
#     print('===:')
# except ConnectionError: #网络不通
#     print('-----')
# except Timeout:
#     print('aaaaa')
#
# except RequestException:
#     print('Error')

10.5 上传文件

## 5 上传文件
# import requests
# files={'file':open('a.txt','rb')}
# respone=requests.post('http://httpbin.org/post',files=files)
# print(respone.text)

11 代理池搭建

# github开源的，代理池的代码，本地跑起来
    -爬虫技术：爬取免费的代理网站，获取免费代理，验证过后，存到本地
    -使用flask搭建一个web后端，访问某个接口就可以随机返回一个可用的代理地址
    -https://github.com/jhao104/proxy_pool
    
    
    
# 搭建步骤：
    1 git clone https://github.com/jhao104/proxy_pool.git
    2 创建虚拟环境，安装依赖：pip install -r requirements.txt
    3 修改配置文件setting.py   ---》redis服务启动
        # 配置API服务
        HOST = "0.0.0.0"               # IP
        PORT = 5000                    # 监听端口
        # 配置数据库

        DB_CONN = 'redis://127.0.0.1:8888/0'
        # 配置 ProxyFetcher
        PROXY_FETCHER = [
            "freeProxy01",   
            "freeProxy02",
        ]
    4 启动爬虫，启动web服务
        # 启动调度程序
        python proxyPool.py schedule
        # 启动webApi服务
        python proxyPool.py server
        
    5 随机获取ip
        http://127.0.0.1:5000/get/   # 端口以上面配置的PORT为准（下面的示例代码访问的是5010）

import requests

# Ask the local proxy pool (http://127.0.0.1:5010/get/) for one random proxy.
res = requests.get('http://127.0.0.1:5010/get/').json()
# The 'https' flag in the reply decides which scheme key the proxy goes under.
scheme = 'https' if res['https'] else 'http'
proxie = {scheme: res['proxy']}
print(proxie)
# Fetch a page through the selected proxy and report the status code.
res = requests.get('https://www.cnblogs.com/liuqingzheng/p/16005896.html', proxies=proxie)
print(res.status_code)

11.1 django后端获取客户端的ip

# A Django view that reports the caller's IP address back to them.
def ip_test(request):
    """Return an HttpResponse containing the client's IP address.

    The address is read from the ``REMOTE_ADDR`` entry of ``request.META``;
    if that entry is absent, ``None`` is formatted into the message.
    """
    client_ip = request.META.get('REMOTE_ADDR')
    message = '您的ip是：%s' % client_ip
    return HttpResponse(message)
#部署在云服务器

#本地使用requests+代理访问，查看是否返回代理的ip地址
import requests

# Take one random proxy from the local pool.
res = requests.get('http://127.0.0.1:5010/get/').json()
scheme = 'https' if res['https'] else 'http'
# Build '<scheme>://<ip:port>' and register it under the matching scheme key.
proxie = {scheme: '://'.join((scheme, res['proxy']))}
print(proxie)
# A server that only runs locally cannot be reached through the proxy; use
# intranet tunnelling or deploy it on a public server instead.
# res = requests.get('http://192.168.1.143:8000/ip/', proxies=proxie)
# res = requests.get('https://46b3k95600.zicp.fun/ip/', proxies=proxie)  # did not take effect
res = requests.get('http://101.133.225.166/ip/', proxies=proxie)
print(res.text)
# If the proxy is unusable, the proxy ends up not being used.

12 爬取某视频网站

# requests 爬取好多网站，但是咱们爬回来，没法解析，re 正则匹配

# requests+正则，整站爬取视频

# 以它为例：
    https://www.pearvideo.com/

import os
import re

import requests

# Example addresses (kept as comments; as bare lines they are syntax errors).
# The first looks like the srcUrl returned by videoStatus.jsp, the second is
# the same file name with its leading segment swapped for 'cont-<video id>':
# https://video.pearvideo.com/mp4/adshort/20200330/1669284875001-15051215_adpkg-ad_hd.mp4
# https://video.pearvideo.com/mp4/adshort/20200330/cont-1665251-15051215_adpkg-ad_hd.mp4

# The downloads are written into ./video, so make sure it exists first.
os.makedirs('./video', exist_ok=True)

res = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=1')

# Use a regex to pull every video link out of the listing page.
video_list = re.findall('<a href="(.*?)" class="vervideo-lilink actplay">', res.text)
for video in video_list:
    # The id is the part of the link after the last underscore.
    video_id = video.split('_')[-1]
    # Ask videoStatus.jsp for the video address, sending the video's own page
    # as Referer (presumably required by the site's anti-scraping check).
    header = {
        'Referer': 'https://www.pearvideo.com/%s' % video
    }
    info = requests.get('https://www.pearvideo.com/videoStatus.jsp?contId=%s&mrd=0.6761335369801458' % video_id,
                        headers=header).json()
    src_url = info['videoInfo']['videos']['srcUrl']

    # Swap the leading '-'-separated segment of the file name for
    # 'cont-<video id>'. Only the file name is touched, so the same digits
    # appearing elsewhere in the URL cannot be rewritten by accident.
    base_url, file_name = src_url.rsplit('/', 1)
    prefix = file_name.split('-')[0]
    real_mp4_url = base_url + '/' + file_name.replace(prefix, 'cont-%s' % video_id, 1)
    print(real_mp4_url)

    # Stream the file in sizeable chunks instead of buffering the whole body
    # and writing it one byte at a time (iter_content's default chunk size).
    with requests.get(real_mp4_url, stream=True) as video_res:
        with open('./video/%s.mp4' % video_id, 'wb') as f:
            for chunk in video_res.iter_content(chunk_size=64 * 1024):
                f.write(chunk)

13 爬取新闻

# requests+BeautifulSoup4(解析库：bs4，lxml...)
# https://www.autohome.com.cn/news/

import requests
from bs4 import BeautifulSoup

def _with_scheme(link):
    """Turn a protocol-relative link ('//host/path') into an https URL."""
    if link and link.startswith('//'):
        return 'https:' + link
    return link


res = requests.get('https://www.autohome.com.cn/news/1/#liststart')

soup = BeautifulSoup(res.text, 'html.parser')
# Every <ul class="article"> holds the news entries as <li> items.
for article_list in soup.find_all('ul', class_='article'):
    for li in article_list.find_all('li'):
        h3 = li.find('h3')
        if not h3:
            # Items without a headline are not news entries; skip them.
            continue

        title = h3.text
        # Tolerate entries that lack a summary, link or image instead of
        # crashing with AttributeError on a None lookup.
        summary_tag = li.find('p')
        link_tag = li.find('a')
        img_tag = li.find('img')
        desc = summary_tag.text if summary_tag else ''
        url = _with_scheme(link_tag.attrs.get('href')) if link_tag else None
        # The scheme prefix needs its colon: 'https:' + '//host/..', not 'https'.
        img = _with_scheme(img_tag.attrs.get('src')) if img_tag else None
        print(
            '''
                标题:%s,
                摘要:%s,
                地址:%s,
                图片:%s,
                '''%(title,desc,url,img)
        )

14 爬取视频

import os
import threading

num = 1  # last listing `start` value handed out; shared by all worker threads
_num_lock = threading.Lock()  # guards the read-modify-write of `num`


def text(run):
    """Download every video found on the next pearvideo listing page.

    Each call claims the next value of the module-level counter ``num`` under
    a lock and uses it as the ``start`` parameter of the listing request, so
    concurrent callers do not end up fetching the same page.

    Args:
        run: Label of the calling thread; only used in progress messages.

    Returns:
        None. Videos are written to ``./video/<link>.mp4``. Network, JSON,
        missing-key and file errors are reported on stdout rather than raised,
        and end the work on the current page.
    """
    global num
    with _num_lock:
        num += 1
        page = num

    try:
        os.makedirs('./video', exist_ok=True)
        res = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=%s' % page,
                           timeout=3)
        links = re.findall('<a href="(.*?)" class="vervideo-lilink actplay">', res.text)

        for link in links:
            video_id = link.split('_')[-1]
            # The status endpoint is called with the video page as Referer.
            headers = {'Referer': 'https://www.pearvideo.com/' + link}
            info = requests.get('https://www.pearvideo.com/videoStatus.jsp?contId=%s&mrd=0.6210306818459239' % video_id,
                                headers=headers, timeout=3).json()
            src_url = info['videoInfo']['videos']['srcUrl']

            # Replace the leading '-'-separated segment of the file name with
            # 'cont-<video id>' to get the downloadable address.
            prefix = src_url.rsplit('/', 1)[-1].split('-')[0]
            real_mp4_url = src_url.replace(prefix, 'cont-%s' % video_id)
            print('线程:%s正在执行%s' % (run, real_mp4_url))

            # Stream in sizeable chunks; iter_content() defaults to 1 byte.
            with requests.get(real_mp4_url, stream=True) as video_res:
                with open('./video/%s.mp4' % link, 'wb') as f:
                    for chunk in video_res.iter_content(chunk_size=64 * 1024):
                        f.write(chunk)
    except (requests.RequestException, KeyError, ValueError, OSError) as exc:
        # Stay best-effort like the original, but say what went wrong instead
        # of silently swallowing every exception.
        print('线程:%s 第%s页抓取失败: %r' % (run, page, exc))



if __name__ == '__main__':
    # `Thread` is not imported in the visible part of this file, so bring it
    # into scope here; without it the loop fails with NameError.
    from threading import Thread

    # Start 24 workers, each labelled 1..24, then wait for all of them.
    workers = [Thread(target=text, args=(i + 1,)) for i in range(24)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

15 BeautifulSoup4 介绍

# Beautiful Soup 是一个可以从HTML或XML文件中提取数据的Python库

# pip3 install BeautifulSoup4
# 解析库解释
    BeautifulSoup('要解析的内容：xml格式字符串', "html.parser") #内置解析库html.parser
    BeautifulSoup('要解析的内容：xml格式字符串',  "lxml")  # 速度快 必须要装lxml pip3 install lxml

16 bs4 遍历文档树

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id='id_p' name='lqz' xx='yy'>lqz is handsome <b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
# Parse the sample document with the 'lxml' parser (needs the lxml package).
soup = BeautifulSoup(html_doc, 'lxml')
# 1 美化html:了解
# print(soup.prettify())

# 2 遍历文档树
'''
#遍历文档树：即直接通过标签名字选择，特点是选择速度快，但如果存在多个相同的标签则只返回第一个
#1、用法
#2、获取标签的名称
#3、获取标签的属性
#4、获取标签的内容
#5、嵌套选择
#6、子节点、子孙节点
#7、父节点、祖先节点
#8、兄弟节点
'''
# 1 基本用法，直接  .标签名字
# res=soup.title
# print(res)
# res=soup.a
# print(res)
# 可以嵌套使用
# res=soup.head.title
# print(res)

# 2 获取标签的名称
# 拿到的所有标签都是一个对象，Tag对象  bs4.element.Tag
# res=soup.head.title
# res=soup.body
# print(res.name)

# 3 获取标签的属性
# res=soup.p
# print(res.attrs)  # 属性字典


# 4 获取标签的内容
# res = soup.p
# print(res.text) # 把该标签子子孙孙内容拿出来拼到一起 字符串
# print(res.string) # None 必须该标签没有子标签，才能拿出文本内容
# print(list(res.strings) )# generator 生成器，把子子孙孙的文本内容放到生成器中

# 5 嵌套选择

# res=soup.html.body.a
# print(res.text)


# 6、子节点、子孙节点
# print(soup.p.contents) #p下所有子节点
# print(soup.p.children) #得到一个迭代器,包含p下所有子节点

# 7、父节点、祖先节点
# print(soup.a.parent) #获取a标签的父节点,直接父节点
# print(list(soup.a.parents)) #找到a标签所有的祖先节点，父亲的父亲，父亲的父亲的父亲...


# 8、兄弟节点
# print(soup.a.next_sibling)  # 下一个兄弟
# print(soup.a.previous_sibling)  # 上一个兄弟

# next_siblings / previous_siblings are generators, so wrap them in list()
# to see the nodes after and before the first <a> tag.
following_nodes = list(soup.a.next_siblings)
print(following_nodes)
print('-----')
preceding_nodes = list(soup.a.previous_siblings)
print(preceding_nodes)

17 bs4搜索文档树

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my p" class="title">asdfasdf<b id="bbb" class="boldest">The Dormouse's story</b>
</p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Parse the sample document with the 'lxml' parser before running the searches.
soup = BeautifulSoup(html_doc, 'lxml')

# 搜索文档树  find:找一个     find_all：找所有

# 5 种搜索方式： 字符串、正则表达式、列表、True、方法

# 5.1 字符串:可以按照标签名，属性名查找
# res=soup.find(name='a',id='link2')
# res=soup.find(href='http://example.com/tillie')
# res=soup.find(class_='story')
# res=soup.body.find('p')
# res=soup.body.find(string='Elsie')
# res=soup.find(attrs={'class':'sister'})
# print(res) #


# 5.2 正则表达式  标签名，属性可以使用正则匹配
# import re
# # res=soup.find_all(name=re.compile('^b'))
# # res=soup.find_all(href=re.compile('^http'))
# # for item in res:
# #     url=item.attrs.get('href')
# #     print(url)
# # request-html    获取到页面中所有的链接地址
# res=soup.find(attrs={'href':re.compile('^a')})

# print(res)


# 5.3 列表  标签名，属性名  等于列表  或条件
# res=soup.find_all(class_=['story','sister'])  # 或条件
# res=soup.find_all(name=['a','p'])  # 或条件
# print(res)


## 5.4 True  标签名，属性名  等于布尔
# res = soup.find_all(name=True)  # 有标签名的所有标签
# print(res)

# 拿出页面中所有图片（即带有src属性的标签）
# res = soup.find_all(src=True)
# for item in res:
#     url = item.attrs.get('src')
#     print(url)



# 5.5 方法  标签名或属性名 = 方法
# def has_class_but_no_id(tag):
#     return tag.has_attr('class') and not tag.has_attr('id')
#
# print(soup.find_all(has_class_but_no_id))



'''
# 总结：
    1  find和find_all
    2  5 种搜索方法
    3  结合遍历文档树一起使用，提高查询速度
'''


#### 其他 find_all的其他属性   limit    recursive:False,只找一层
# res=soup.find_all(name='a',limit=2)   # find的本质是find_all + limit=1
#
# res=soup.body.find(name='p',id=False).find_all(name='a',recursive=False)
#
# print(res)


## 修改文档树：bbs，删除script标签

posted @ 2022-11-25 16:40 shangxin_bai 阅读(294) 评论(0) 收藏举报

刷新页面返回顶部

shangxin_bai

爬虫从入门到入狱之入门

1 爬虫介绍

2 requests模块发送get请求

3 get请求携带参数

4 携带请求头

5 携带cookie

6 发送post请求

7 响应Response

8 获取二进制数据

9 解析json

10 requests高级用法

10.1 ssl认证(了解)

10.2 使用代理（重要）

10.3 超时设置

10.4 异常处理

10.5 上传文件

11 代理池搭建

11.1 django后端获取客户端的ip

12 爬取某视频网站

13 爬取新闻

14 爬取视频

15 BeautifulSoup4 介绍

16 bs4 遍历文档树

17 bs4搜索文档树

公告