爬虫操作

import requests

# The requests module simulates the HTTP requests a browser would send.

# Basic usage: fetch a page and print its entire HTML body.
res = requests.get('https://www.cnblogs.com/moongodnnn/p/17219288.html')
print(res.text)



# Passing query-string parameters (these travel in the URL, not in the
# request headers). Option 1: append them directly to the URL.
res = requests.get('https://www.cnblogs.com/moongodnnn/?page=3')

# Option 2: pass a dict via params={key: value, ...}.
res = requests.get('https://www.cnblogs.com/moongodnnn/', params={'page': '3'})
print(res.url)
# Prints the final URL with the params merged in:
# https://www.cnblogs.com/moongodnnn/?page=3

url网址中文编码解码

# A non-ASCII value passed through params= is percent-encoded in the final URL.
res = requests.get('https://www.cnblogs.com/moongodnnn/', params={'name': '吴彦祖'})
print(res.url)
# Prints: https://www.cnblogs.com/moongodnnn/?name=%E5%90%B4%E5%BD%A6%E7%A5%96
# The Chinese text is carried in encoded form and must be decoded to read it back.


from urllib.parse import quote, unquote

# Decode: turn a percent-encoded URL fragment back into text (UTF-8).
name = unquote('%E5%90%B4%E5%BD%A6%E7%A5%96')
# name == '吴彦祖'

# Encode: turn text into its percent-encoded URL form.
name1 = quote('彭于晏')
# name1 == '%E5%BD%AD%E4%BA%8E%E6%99%8F'
print(name, name1)

发送请求携带请求头

真实浏览器访问接口也会自带一些请求头数据

爬虫也需要模仿这些所以要在请求头中加入数据

需要携带 user-agent 还有 cookie

# 反爬措施之一，就是检查请求头

# http请求中，请求头中有一个很重要的参数 User-Agent
	-表明了客户端类型是什么：Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36
    -如果没有带这个请求头，后端就禁止
    -requests 发送请求时默认的 User-Agent 是 python-requests/x.x，不是浏览器的 User-Agent，所以有的网站会直接拒绝访问
    
    
    

import requests
# Common HTTP request headers: User-Agent, Cookie, Connection.

# Differences between HTTP protocol versions:
# - Connection: keep-alive
# - HTTP versions: 1.1 (mainstream), 0.9, 2.x
# - HTTP runs on top of TCP: opening an HTTP connection creates a TCP
#   connection underneath.
# - 1.1 added keep-alive compared with earlier versions.
# - 2.x added multiplexing compared with 1.x.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}
# Send a browser-like User-Agent along with the request.
res = requests.get('https://dig.chouti.com/',headers=headers)
print(res.text)

发送post请求给接口

import requests

# Request headers: a browser-like User-Agent plus the Cookie of a logged-in
# session, so the server treats the request as coming from that user.
headers = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Mobile Safari/537.36',
    'Cookie': 'deviceId=web.eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJqaWQiOiIxMmEzMDM3ZC1kYTAxLTQ5OWYtOThjYS02NGQ2YjQ0ODIzYTYiLCJleHBpcmUiOiIxNjgxNDcxMzIxNDc5In0.wC9lWJTMmjd5B80VqQ_5IszO8fx__GS2izwseH7-82Y; __snaker__id=CzPxzJZTLtU5R6HD; gdxidpyhxdE=sA3CG62%2FwKQDDqRSP372dmqn%5ChUTAZX1xC5%5Czy2%5CB%2Fn5GoBQhCcvLXXT3vsKWY4epI2y9tAuEQ%2FO6I7gtt4La8T33wy7N%2BkeYNu7APU32xp'
}

# Form data sent to the endpoint.
data = {
    'linkId': '38068398'
}


# The endpoint rejects crawler requests unless both the headers and the data
# are supplied.
res = requests.post('https://dig.chouti.com/link/vote', headers=headers, data=data)
print(res.text)
# Sample response when the link was already voted for:
# {"msg":"你已经推荐过了","code":400,"errorType":0,"success":false}
# This is how an automatic "upvote" can be implemented.


扩展：如果（作为后端）需要防范爬虫，可以使用双 token 认证

模拟自动登录

先找到网页的登录接口，查看需要传递什么参数

#
import requests

# Form fields sent to the login endpoint.
data = {
    'username': '616564099@qq.com',
    'password': 'lqz123',
    'captcha': '3456',
    'remember': '1',
    'ref': 'http://www.aa7a.cn/',
    'act': 'act_login'
}


# Call the login endpoint with the form fields.
res = requests.post('http://www.aa7a.cn/user.php',data=data)
print(res.text)
# The cookies set by the login response are available as res.cookies.

# Pass those cookies on the next request so it is made as the logged-in user.
# NOTE(review): the login URL is reused here, presumably as a stand-in for
# whichever endpoint should be called next -- confirm.
res = requests.post('http://www.aa7a.cn/user.php',cookies=res.cookies)

Requests.session

为了保持cookie 自动维护cookies

import requests

# Form fields sent to the login endpoint.
data = {
    'username': '616564099@qq.com',
    'password': 'lqz123',
    'captcha': '3456',
    'remember': '1',
    'ref': 'http://www.aa7a.cn/',
    'act': 'act_login'
}

# A session object stores cookies it receives and sends them on later
# requests, so the logged-in state is maintained automatically.
session = requests.session()
res = session.post('http://www.aa7a.cn/user.php',data=data)
print(res.text)
print(res.cookies)
# The cookies returned by the login response.

# Requests made through the same session carry the stored cookies; there is
# no need to pass cookies= by hand.
res = session.post('http://www.aa7a.cn/user.php')


只需要获取一次cookies即可

post请求携带数据编码格式

大部分接口需要我们提交json格式数据

import json

import requests

# A dict passed as data= is form-encoded (application/x-www-form-urlencoded).
requests.post(url='xxxxxxxx', data={'xxx': 'yyy'})


# A dict passed as json= is serialized to JSON, and the Content-Type header is
# set to application/json automatically.
requests.post(url='xxxxxxxx', json={'xxx': 'yyy'})


# Manual alternative: serialize the body yourself and declare the content type.
# Setting the header alone is not enough -- a dict passed as data= would still
# be form-encoded -- so the body must already be a JSON string.
requests.post(url='',
              data=json.dumps({'': 1}),
              headers={
                  'content-type': 'application/json'
              })

返回的response中有哪些数据


# Fetch a page and inspect what the response object exposes.
respone=requests.get('http://www.jianshu.com',headers=headers)


# Attributes available on the response:
print(respone.text) # response body decoded to a string
print(respone.content) # response body as raw bytes

print(respone.status_code) # HTTP status code
print(respone.headers)   # response headers
print(respone.cookies)  # cookies arrive in the response headers; they are important enough to get their own attribute
print(respone.cookies.get_dict()) # CookieJar object converted to a plain dict
print(respone.cookies.items())  # cookie key/value pairs

print(respone.url)    # URL that was requested
print(respone.history) # can usually be ignored

print(respone.encoding)  # encoding used to decode the body into .text

编码问题

# 有的网站，打印
res.text
拿回来的数据本身是二进制数据（res.content），res.text 是按 res.encoding 指定的编码把它解码得到的字符串；如果网页的实际编码与之不一致（例如网页是 gbk），打印出来就是乱码

# Fix: tell requests which encoding to use before reading res.text.
res.encoding = 'gbk'
这样就指定了编码格式更换，
但是一般网站都是utf8就可以的

爬虫下载图片视频

无防盗链公开资源

import requests

# Small file: res.content holds the whole body as bytes, so write it in one go.
res = requests.get('http://pic.imeitou.com/uploads/allimg/230224/7-230224151210-50.jpg')

with open('美女头像.jpg', 'wb') as f:
    f.write(res.content)
    print('下载完成')

# Large file: stream=True defers downloading the body, so it can be written to
# disk chunk by chunk instead of being held in memory all at once.
src = "https://vd2.bdstatic.com/mda-pcdgg1sfdavt6g1e/720p/h264/1678795145924710807/mda-pcdgg1sfdavt6g1e.mp4"
res = requests.get(src, stream=True)

with open('汽车.mp4', 'wb') as f:
    # iter_content() defaults to 1-byte chunks; ask for 64 KiB at a time.
    for chunk in res.iter_content(chunk_size=64 * 1024):
        f.write(chunk)
    print('视频下载完成')

requests解析json

直接将爬回来的json数据转为python中的字典类型

# 发送http请求，返回的数据会有xml格式，也有json格式

import requests

# Form fields for the store-search endpoint.
data = {
    'cname': '',
    'pid': '',
    'keyword': '500',
    'pageIndex': 1,
    'pageSize': 10,
}
res = requests.post('http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword', data=data)

# res.text is the raw JSON string; res.json() parses it into Python objects.
res = res.json()

# The parsed result is used like any dict: res['key'] or res.get('key').
# Attribute access such as res.name does not work on a dict.
# NOTE(review): assumes the top-level JSON value is an object -- confirm
# against the actual response.
for key, value in res.items():
    print(key, value)

ssl认证，爬虫关闭https警告

http 协议和 https协议的区别

http协议传输是明文传输  默认端口是80端口 

https协议是密文传输 更加安全 数据传输中可以防止被窃取 和 篡改
对比http https加了处理加密信息的模块 ssl/tls 默认端口443

https其实就是网站加了证书
        
# When a request fails with a certificate error (ssl xxx), there are three options.

# Option 1: do not verify the certificate.
import requests
respone = requests.get('https://www.12306.cn', verify=False)  # skips verification; a warning is printed, status is 200
print(respone.status_code)

# Option 2: do not verify, and silence the warning as well.
import requests
from requests.packages import urllib3
urllib3.disable_warnings()  # turn the warning off
respone = requests.get('https://www.12306.cn', verify=False)
# The key argument is verify=False.
print(respone.status_code)

# Option 3: supply a certificate manually (for reference only).
import requests
respone = requests.get('https://www.12306.cn',
                       cert=('/path/server.crt',
                             '/path/key'))
print(respone.status_code)

爬虫使用代理

关键词 proxies={'http':'27.79.236.66:4001'}

# 如果爬虫使用自身ip地址访问，很有可能被封ip地址，以后就访问不了了
# 我们可以使用代理ip
# 代理：收费和免费(不稳定)


# Map the URL scheme (as a string key) to the proxy's "address:port".
proxies = {'https': '代理的地址端口'}

res = requests.post('https://www.cnblogs.com', proxies=proxies)

这样我们的请求就是用代理的地址访问数据，然后数据给到代理，代理再给我们

# 高匿代理和透明代理
	-高匿，服务端拿不到真实客户端的ip地址
  -透明：服务端能拿到真实客户端的ip地址
	
  
-后端如何拿到真实客户端ip地址
			从请求头中拿到 X-Forwarded-For 得到值 我们就可以查看到对方真实的ip 

  		-获得HTTP请求端真实的IP
    	-http请求头中有个：X-Forwarded-For: client1(真实ip), proxy1, proxy2, proxy3

爬虫任务超时处理/异常处理

import requests
# Import only the classes that are handled below. A star import from
# requests.exceptions would also pull in its ConnectionError and shadow the
# builtin of the same name. See requests.exceptions for the full list.
from requests.exceptions import ReadTimeout, RequestException

# timeout=10: abort the request if no response arrives within 10 seconds.
respone = requests.get('https://www.baidu.com', timeout=10)


try:
    # Deliberately tiny timeout so that the request fails.
    r = requests.get('http://www.baidu.com', timeout=0.00001)
except ReadTimeout:
    print('===:')
# More specific handlers can be added here before the catch-all, e.g.
# requests.exceptions.ConnectionError (network unreachable) or
# requests.exceptions.Timeout.
except RequestException:
    # Base class of the exceptions raised by requests.
    print('Error')

上传文件

工作中如果需要给其他项目（接口）上传文件，可以写脚本来完成

# 3. Uploading a file
import requests

# Open the file in binary mode inside a with-block so that the handle is
# closed once the upload has finished.
with open('美女.png', 'rb') as fp:
    files = {'file': fp}
    # Upload the file to the target URL as multipart form data.
    respone = requests.post('http://httpbin.org/post', files=files)

# Check whether the upload succeeded.
print(respone.status_code)

搭建本地代理池免费

# requests 发送请求使用代理
# 代理从哪来
	-公司花钱买
  -搭建免费的代理池：https://github.com/jhao104/proxy_pool
  # 著名的开源项目，直接下载到本地部署即可
        -python：爬虫+flask写的
        -架构：看下图
      
      
# 搭建步骤：
	  1 git clone https://github.com/jhao104/proxy_pool.git
    # 将开源代码克隆下来或者下载下来
    2 使用pycharm打开
    3 安装依赖：pip install -r requirements.txt
    4 修改配置文件（redis地址即可）
        HOST = "0.0.0.0"
        PORT = 5010
        DB_CONN = 'redis://127.0.0.1:6379/0'
        PROXY_FETCHER #爬取哪些免费代理网站
   	5 启动爬虫程序  
    python proxyPool.py schedule
	  6 启动服务端
    python proxyPool.py server
    
    7 使用随机一个免费代理
    地址栏中输入：http://127.0.0.1:5010/get/
    
    
    
# Using the proxy pool
import requests
from requests.packages import urllib3

urllib3.disable_warnings()  # silence the warning triggered by verify=False below

# Ask the locally running pool for one random proxy record.
res = requests.get('http://127.0.0.1:5010/get/').json()

# Build the proxies mapping, keyed by scheme depending on whether the record
# is flagged as an HTTPS proxy.
proxies = {}
if res['https']:
    proxies['https'] = res['proxy']
else:
    proxies['http'] = res['proxy']
print(proxies)

# Send the request through the chosen proxy.
res = requests.post('https://www.cnblogs.com', proxies=proxies, verify=False)

django后端如何获得访问者IP地址



def index(request):
    """Echo the caller's IP address back as the response body.

    The address is read from the ``REMOTE_ADDR`` entry of ``request.META``
    and is also printed to stdout.
    """
    remote_addr = request.META.get('REMOTE_ADDR')
    print('ip地址是', remote_addr)
    return HttpResponse(remote_addr)
  
  

# 使用多线程访问接口
from threading import Thread
import requests

# The unit of work each thread runs.
def task():
    """Fetch the test endpoint once and print the response body."""
    response = requests.get('http://101.43.19.239/')
    print(response.text)


# Start 100 threads, each running the task once.
for _ in range(100):
    worker = Thread(target=task)
    worker.start()

爬取梨视频

import requests
import re

from threading import Thread



# 设置一个任务

import os


def task(i, response=None):
    """Save one downloaded video as ``./video/<i>.mp4``.

    Args:
        i: Video id; used in the output file name and the completion message.
        response: Streamed ``requests`` response whose body is the video.
            When omitted, the module-level ``data1`` is used, which keeps the
            original one-argument call working.
    """
    if response is None:
        response = data1
    with open(f'./video/{i}.mp4', 'wb') as f:
        # A separate loop name keeps ``i`` intact for the message below, and
        # an explicit chunk size avoids iter_content()'s 1-byte default.
        for chunk in response.iter_content(chunk_size=64 * 1024):
            f.write(chunk)
    print(f'视频{i}下载完毕')


# The output directory must exist before the worker threads open files in it.
os.makedirs('./video', exist_ok=True)

start = 0
for s in range(2):
    # Ask the local proxy pool for a random proxy.
    # NOTE(review): the setup notes configure the pool on port 5010 -- confirm
    # that 5011 is intended here.
    res1 = requests.get('http://127.0.0.1:5011/get/').json()
    proxy = {}
    if res1['https']:
        proxy['https'] = res1.get('proxy')
    else:
        proxy['http'] = res1.get('proxy')
    res = requests.get('https://www.pearvideo.com/panorama_loading.jsp?start=%s' % start, proxies=proxy)
    print(res)
    start += 24

    # Pull every video link out of the listing HTML.
    url_id = re.findall('<a href="(.*?)" class="vervideo-lilink actplay">', res.text)
    for href in url_id:
        # The content id is the part after the last underscore of the link.
        cont_id = str(href).split('_')[-1]
        headers = {'Referer': 'https://www.pearvideo.com/video_' + cont_id}
        real_url = 'https://www.pearvideo.com/videoStatus.jsp?contId=' + cont_id
        print(real_url)
        data = requests.get(real_url, headers=headers, proxies=proxy).json()
        video_url = data['videoInfo']['videos']['srcUrl']
        # Swap the first dash-separated piece of the file name for "cont-<id>".
        video_url = video_url.replace(video_url.split('/')[-1].split('-')[0], 'cont-%s' % cont_id)
        # stream=True leaves the body unread so the worker thread downloads it.
        data1 = requests.get(video_url, proxies=proxy, stream=True)

        # Hand the response to the thread explicitly; reading the global from
        # the thread would race with the next loop iteration rebinding it.
        t = Thread(target=task, args=(cont_id, data1))
        t.start()

用bs4模块爬取新闻

'https://www.autohome.com.cn/all/2/#liststart'

import requests
from bs4 import BeautifulSoup
# BeautifulSoup parses a string of markup so that it can be filtered down to
# just the data of interest.


# Fetch the full HTML of the listing page.
res = requests.get('https://www.autohome.com.cn/all/2/#liststart')


# Build a soup object from the HTML, using the html.parser parser.
soup = BeautifulSoup(res.text, 'html.parser')

# How it works: create a BeautifulSoup object from the data to parse and a
# parser name. The object can then be searched by tag: name= is the tag name,
# class_= is the tag's class attribute.
# Example: find(name='a').attrs['href'] reads the href attribute of the first
# <a> tag.


# All <ul> tags whose class is "article"; there are several of them.
ul_list = soup.find_all(name='ul', class_='article')


# Walk every <li> under every matching <ul>. Within each <li>, the <h3> is the
# title, the <p> is the summary and the <a> carries the link.
for ul in ul_list:
    for li in ul.find_all(name='li'):
        title_tag = li.find(name='h3')
        brief_tag = li.find(name='p')
        link_tag = li.find(name='a')
        # find() returns None when the tag is absent; skip such entries
        # instead of failing on None.text.
        if title_tag is None or brief_tag is None or link_tag is None:
            continue
        name = title_tag.text
        brief = brief_tag.text
        url = link_tag.attrs['href']
        print('''
        标题：%s,
        简介: %s,
        链接: %s,
        '''%(name,brief,url)
         )

bs4模块遍历文档树

beautifulsoup4 是帮助我们从 HTML 或 XML 文件中提取、过滤出想要的数据的 Python 库

pip install beautifulsoup4
pip install lxml  
#一个解析库

用法：
soup=BeautifulSoup('要解析的内容str类型','html.parser或lxml')
生成一个对象 该对象就是要过滤的数据


respone = requests.get('https://www.baidu.com', timeout=10)

soup = BeautifulSoup(respone.text, 'lxml')

# Walking the document tree: the first <p> inside <body> inside <html>.
soup.html.body.p


import requests
from bs4 import BeautifulSoup
import lxml

res = requests.get('https://www.cnblogs.com/')

soup = BeautifulSoup(res.text, 'lxml')

# print(soup.prettify())
# Pretty-printed HTML, with indentation and line breaks.

# Descend one level at a time:
# print(soup.html.body.div)
# Everything inside the first <div> of <body> of <html>.

# Tag name:
# print(soup.html.body.div.name)


# print(soup.a.attrs['href'])
# The href attribute of the first <a> tag in the document.


# print(soup.a.attrs.get('name'))
# The name attribute of the first <a> tag.

# print(soup.p)
# The first <p> tag in the document.

# print(soup.p.text)
# The text content of the first <p> tag.


# 5. Nested selection
# print(soup.html.body)


# ---- Good to know
# 6. Children and descendants
# print(soup.body.contents) # direct children of <body>, one level only
# print(list(soup.p.children)) # iterator over the direct children of <p>, one level only
# print(list(soup.body.descendants) ) # generator over all descendants, at every depth
# 7. Parent and ancestors

# print(soup.a.parent) # direct parent of the <a> tag
# print(list(soup.a.parents) ) # all ancestors of the <a> tag: parent, grandparent, ...
# 8. Siblings
# print(soup.a.next_sibling) # next sibling
# print(soup.a.previous_sibling) # previous sibling

bs4模块搜索标签

import requests
from bs4 import BeautifulSoup

res = requests.get('https://www.cnblogs.com/')

soup = BeautifulSoup(res.text, 'lxml')
# All <p> tags in the document, together with what they wrap.
# Note that this rebinds res from the HTTP response to the list of tags.
res = soup.find_all(name='p')
# print(res)

# Re-parse the matched <p> tags so that they can be searched on their own.
soup1 = BeautifulSoup(str(res), 'lxml')
# print(soup1.find_all(class_='avatar'))
# Within those <p> tags, every tag whose class is "avatar".

# print(soup1.find_all(id='link1'))
# Every tag whose id is "link1".

# print(soup1.find_all(text='风筝')[0].parent)
# The parent tag of the first text node that is exactly "风筝".

# print(soup1.find_all(attrs={'class':'avatar','alt':'博主头像'}))
# Every tag with class="avatar" and alt="博主头像".

import re
# print(soup1.find_all(class_=re.compile('^a')))
# Regex match: every tag whose class starts with "a".
#
# Several values for one filter are OR-ed together:
# print(soup1.find_all(id=['link1','link2']))
# Every tag whose id is "link1" or "link2".

-----------------------拿所有图片标签的网址--------------------


# Passing True as an attribute filter matches tags that have that attribute.
res = soup1.find_all(src=True,limit=4)
# Up to 4 tags that carry a src attribute; without limit=4 all matches are returned.
print(res)
for i in res:
    print(i.attrs['src'])
    # The URL held in each matched tag's src attribute.
# Re-parse the matched tags into a new soup object.
soup2 = BeautifulSoup(str(res),'lxml')


# 拿到所有有href属性的标签



limit：限制返回的条数，是 find_all 的参数；find 本质上就是 find_all 加上 limit=1

通过css选择器拿标签

import requests
from bs4 import BeautifulSoup

res = requests.get('https://www.cnblogs.com/')

# Parse the page so that it can be queried with CSS selectors.
soup = BeautifulSoup(res.text, 'lxml')

# .select() takes a CSS selector and returns the matching tags.
print(soup.select('#post_list > article:nth-child(2) > section > div > a'))
# The selector can be copied from the browser's inspector ("Copy selector")
# to target a specific tag directly.

posted @ 2023-03-15 20:47 Python-moon 阅读(34) 评论(0) 编辑收藏举报

刷新页面返回顶部

登录后才能查看或发表评论，立即登录或者逛逛博客园首页

相关博文：

· python面试题

· 爬虫selenium模块

· 爬虫从入门到入狱之入门

· requests模块+代理+bs4

阅读排行：
· 全程不用写代码，我用AI程序员写了一个飞机大战
· MongoDB 8.0这个新功能碉堡了，比商业数据库还牛
· 记一次.NET内存居高不下排查解决与启示
· DeepSeek 开源周回顾「GitHub 热点速览」
· 白话解读 Dapr 1.15：你的「微服务管家」又秀新绝活了

公告

昵称： Python-moon
园龄： 2年5个月
粉丝： 2
关注： 5

+加关注

2025年3月

日

一

二

三

四

五

六

随笔分类

随笔档案

阅读排行榜

评论排行榜

1. JavaScript教程(1)

你好，少年

学习、赚钱，这是你一辈子都要做的事，永远不能停下。

爬虫操作

爬虫操作

url网址中文编码解码

发送请求携带请求头

发送post请求给接口

模拟自动登录

Requests.session

post请求携带数据编码格式

返回的response中有哪些数据

编码问题

爬虫下载图片视频

requests解析json

ssl认证，爬虫关闭https警告

爬虫使用代理

爬虫任务超时处理/异常处理

上传文件

搭建本地代理池免费

django后端如何获得访问者IP地址

爬取梨视频

用bs4模块爬取新闻

bs4模块遍历文档树

bs4模块搜索标签

通过css选择器拿标签

公告

搜索

常用链接

随笔分类

随笔档案

相册

阅读排行榜

评论排行榜

推荐排行榜

最新评论

你好，少年

学习、赚钱，这是你一辈子都要做的事，永远不能停下。

爬虫操作

爬虫操作

url网址中文编码解码

发送请求携带请求头

发送post请求给接口

模拟自动登录

Requests.session

post请求携带数据编码格式

返回的response中有哪些数据

编码问题

爬虫下载图片视频

requests解析json

ssl认证，爬虫关闭https警告

爬虫使用代理

爬虫任务超时处理/异常处理

上传文件

搭建本地代理池 免费

django后端如何获得访问者IP地址

爬取梨视频

用bs4模块爬取新闻

bs4模块遍历文档树

bs4模块搜索标签

通过css选择器拿标签

公告

搜索

常用链接

随笔分类

随笔档案

相册

阅读排行榜

评论排行榜

推荐排行榜

最新评论

搭建本地代理池免费