No dawdling, just crawl it: scraping a wallpaper site with bs4, an animation review site with re, China box-office data with XPath, Pear Video, movie box office with a ThreadPoolExecutor thread pool, Doutu with a Queue plus a thread pool, and an image site with asyncio coroutines and the async packages aiohttp and aiofiles.

1. Scraping a wallpaper site with bs4

import requests
from bs4 import BeautifulSoup  # import BeautifulSoup
from urllib.parse import urljoin  # dedicated to joining URL paths
import time

header = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36"
}
url = "https://desk.zol.com.cn/pc/"
resp = requests.get(url, headers=header)
resp.encoding = "gbk"  # set the encoding
main_page_source = resp.text
print(type(main_page_source))

# Parse the page source and grab the href of every <a> tag.
# Feed the page source straight into BeautifulSoup.
main_page = BeautifulSoup(main_page_source, "html.parser")
# Pull what you need out of the BeautifulSoup object.
a_list = main_page.find("ul", attrs={"class": "pic-list2"}).find_all("a")
for a in a_list:
    # What we need is the href of the <a> tag.
    # To read an attribute of a tag in bs4, use get(attribute).
    href = a.get("href")
    if href.endswith(".exe"):  # skip links that end with .exe
        continue
    text = a.find("em").text  # the link text
    # The href is incomplete; it has to be joined
    # with the URL this href was found on.
    # Remember urljoin -- it is very handy.
    href = urljoin(url, href)
    # Visit the detail page and get the image download address.
    child_resp = requests.get(href, headers=header)
    child_resp.encoding = "gbk"
    child_page_source = child_resp.text
    child_page = BeautifulSoup(child_page_source, "html.parser")
    # Risky: find() may return None -> AttributeError on NoneType
    src = child_page.find("img", attrs={"id": "bigImg"}).get("src")
    print(src)
    # Download the image.
    img_resp = requests.get(src)
    # 1. If the names in <a> are unique, the link text can be used as the file name.
    # 2. If they repeat, use the name from the URL path instead.
    file_name = src.split("/")[-1]
    with open(file_name, mode="wb") as f:
        f.write(img_resp.content)
    time.sleep(1)  # pause between detail pages to stay polite
    break  # for testing only: stop after the first image
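The comment above flags the risk that find() can return None (for example when a detail page is missing the bigImg element), which then fails with AttributeError: 'NoneType' object has no attribute 'get'. A minimal defensive variant of that step inside the loop, offered as a sketch rather than part of the original script:

    img_tag = child_page.find("img", attrs={"id": "bigImg"})
    if img_tag is None:  # skip detail pages that have no big image
        continue
    src = img_tag.get("src")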
2. Scraping an animation review site with re
# Goal: article title, source, author, publish time, body text
# http://www.animationcritics.com/chinese_aniamtion.html
# 1. Collect the 10 detail-page URLs from the index page,
#    then loop over each URL.
# 2. Request each detail page and get its content.
# 3. Extract the fields you need from the detail-page content.
# (Check first: is what you need actually in the page source? Here it is.)
import requests
import re
import time

url = "http://www.animationcritics.com/chinese_aniamtion.html"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.82 Safari/537.36"
}
resp = requests.get(url, headers=headers)  # send the request
main_page_source = resp.text  # the page source as a string
# re.S lets . also match newlines
main_obj = re.compile(r'<li style="margin-bottom:10px;">.*?href="(?P<url>.*?)" title="(?P<title>.*?)"', re.S)
# regex for the "source" field
laiyuan_obj = re.compile(r"来源:</span>(?P<laiyuan>.*?)</span>", re.S)
zuozhe_obj = re.compile(r"作者:</span>(?P<zuozhe>.*?)</span>", re.S)
pub_data_obj = re.compile(r"发布时间: </span>(?P<pub_date>.*?)</span>", re.S)
section_obj = re.compile(r"<section.*?>(?P<content>.*?)</section>", re.S)
p_obj = re.compile(r"<p data-track=.*?>(?P<content>.*?)</p>", re.S)
content_filter_obj = re.compile(r"<.*?>", re.S)

# Run the matches.  Note: a compiled pattern's finditer() takes
# (string, pos, endpos), so re.S must not be passed here -- the flag
# is already baked into the compiled pattern above.
result = main_obj.finditer(main_page_source)
for item in result:  # each iteration yields one match
    child_url = item.group("url")  # detail-page URL
    child_title = item.group("title")  # title
    # print(child_title, child_url)
    # Request the detail page.
    child_resp = requests.get(child_url, headers=headers)
    child_page_source = child_resp.text
    # print(child_page_source)
    # Extract data from the detail page.
    lyr = laiyuan_obj.search(child_page_source)  # search() returns a Match object
    if lyr:  # guard against no match
        laiyuan = lyr.group("laiyuan")  # group() pulls out the result
    else:
        laiyuan = ""
    # print(laiyuan)
    # author
    zz_r = zuozhe_obj.search(child_page_source)
    if zz_r:
        zuozhe = zz_r.group("zuozhe")
    else:
        zuozhe = ""
    pub_data_r = pub_data_obj.search(child_page_source)
    if pub_data_r:
        pub_data = pub_data_r.group("pub_date")
    else:
        pub_data = ""
    print(child_title, laiyuan, zuozhe, pub_data)
    # Now the body text: take the content of every <section>.
    sec_list = []
    section_results = section_obj.finditer(child_page_source)
    for section in section_results:
        content = section.group("content")
        sec_list.append(content)  # collect each section's content
    all_content = "".join(sec_list)  # join all the sections into one string
    if not all_content:  # if it is empty, fall back to the <p> pattern
        section_results = p_obj.finditer(child_page_source)
        for section in section_results:
            content = section.group("content")
            sec_list.append(content)  # collect each paragraph's content
        all_content = "".join(sec_list)  # join everything into one string
    # Strip the remaining tags with a regex replace, re.sub():
    # result = re.sub(pattern, replacement, whole_string)
    # all_content = re.sub(r"<.*?>", "", all_content)
    all_content = content_filter_obj.sub("", all_content)
    print(all_content)
    time.sleep(1)  # sleep a bit to stay polite
    # break  # for testing
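To see the named-group pattern in isolation, here is a tiny self-contained illustration; the sample HTML string is invented purely for demonstration:

    import re

    sample = '<li style="margin-bottom:10px;"><a href="/p/1.html" title="Demo title">Demo title</a></li>'
    obj = re.compile(r'<li style="margin-bottom:10px;">.*?href="(?P<url>.*?)" title="(?P<title>.*?)"', re.S)
    for m in obj.finditer(sample):  # no re.S here -- the flag lives in compile()
        print(m.group("url"), m.group("title"))  # /p/1.html Demo title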
3. Scraping China box-office data with XPath
import requests
from lxml import etree

# 1. Get the page source.
# 2. Extract the data with XPath: //table/tbody/tr
url = "http://www.boxofficecn.com/boxoffice2022"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36"
}
resp = requests.get(url, headers=headers)
page = etree.HTML(resp.text)
trs = page.xpath("//table/tbody/tr")[1:-1]  # drop the header row and the trailing row
# every remaining tr is a data row
for tr in trs:
    num = tr.xpath("./td[1]/text()")
    year = tr.xpath("./td[2]//text()")
    name = tr.xpath("./td[3]//text()")
    # basic Python: text() returns a list (possibly empty or split into pieces)
    name = "".join(name)  # joining it is the sensible approach
    money = tr.xpath("./td[4]/text()")
    print(num, year, name, money)

# an abnormal case: the title is split across several text nodes
movie1 = ['京北的我们(', '重映', ')']  # should read 京北的我们(重映)
len(movie1)  # 3 -- three separate strings
# a normal case: exactly one element
movie2 = ['不要忘记我爱你']
# and this one is empty
movie3 = []
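The three sample lists show why the text() result has to be joined: a title the page splits across several text nodes collapses back into one string, a normal title passes through untouched, and an empty list simply becomes an empty string. A quick check of that claim:

    print("".join(['京北的我们(', '重映', ')']))  # 京北的我们(重映)
    print("".join(['不要忘记我爱你']))            # 不要忘记我爱你
    print("".join([]))                            # prints an empty line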
4. Scraping Pear Video (梨视频)
import requests

# url = "https://www.pearvideo.com/videoStatus.jsp?contId=1756814&mrd=0.8773583037760648"
while 1:
    main_url = input("Enter the Pearvideo page URL to scrape: ")  # e.g. "https://www.pearvideo.com/video_1756814"
    contId = main_url.split("_")[-1]
    print(contId)
    url = f"https://www.pearvideo.com/videoStatus.jsp?contId={contId}"
    headers = {
        "Referer": main_url,  # handle the anti-hotlinking check
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36"
    }
    resp = requests.get(url, headers=headers)
    dic = resp.json()
    # print(dic)
    src_url = dic['videoInfo']['videos']['srcUrl']
    systemTime = dic["systemTime"]
    src_url = src_url.replace(systemTime, f"cont-{contId}")
    # print(src_url)
    # Download the video.
    print("Video URL found, downloading....")
    resp = requests.get(src_url, headers=headers)
    with open(f"{contId}.mp4", mode="wb") as f:
        f.write(resp.content)
    print("Download finished")

# Compare the two URLs:
# https://video.pearvideo.com/mp4/third/20220330/cont-1756814-15454898-100434-hd.mp4  # the working one
# https://video.pearvideo.com/mp4/third/20220330/1648910860599-15454898-100434-hd.mp4  # the broken one returned by the API
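For longer videos, holding the whole file in resp.content can use a lot of memory. A sketch of an alternative using requests' streaming download (stream=True plus iter_content); this is an addition, not part of the original script:

    resp = requests.get(src_url, headers=headers, stream=True)
    with open(f"{contId}.mp4", mode="wb") as f:
        for chunk in resp.iter_content(chunk_size=64 * 1024):  # write the video 64 KB at a time
            f.write(chunk)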
5. Scraping movie box office with a ThreadPoolExecutor thread pool
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
import time


def str_tools(lst):
    if lst:
        s = "".join(lst)
        return s.strip()
    else:
        return ""


def get_movie_info(year):
    # scrape the box-office table for one year (e.g. 1996)
    f = open(f"{year}.csv", mode="w", encoding="utf-8")
    url = f"http://www.boxofficecn.com/boxoffice{year}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
    }
    resp = requests.get(url, headers=headers)
    tree = etree.HTML(resp.text)
    trs = tree.xpath("//table/tbody/tr")[1:]
    for tr in trs:
        num = tr.xpath("./td[1]//text()")
        year = tr.xpath("./td[2]//text()")
        name = tr.xpath("./td[3]//text()")
        money = tr.xpath("./td[4]//text()")
        num = str_tools(num)  # each field comes back as a list -> join and strip it
        year = str_tools(year)
        name = str_tools(name)
        money = str_tools(money)
        f.write(f"{num},{year},{name},{money}\n")
    f.close()


if __name__ == '__main__':
    # s1 = time.time()  # timestamp before the sequential run
    # for y in range(1994, 2023):
    #     get_movie_info(y)
    # s2 = time.time()  # timestamp afterwards
    # print(s2 - s1)  # 16.23
    # Not ideal: some data gets lost.  This site does not tolerate being hammered
    # like this, so the results need validation, or stick to a single thread.
    s1 = time.time()
    with ThreadPoolExecutor(20) as t:
        for y in range(1994, 2023):
            t.submit(get_movie_info, y)  # hand the task to the pool
    s2 = time.time()
    print(s2 - s1)  # for downloading images, videos, etc. the speed-up is far bigger
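The data-loss issue mentioned above is easier to diagnose if the Future objects returned by submit() are kept and checked after the pool closes, since Future.result() re-raises any exception from the worker. A sketch of that pattern, reusing get_movie_info from above (an addition, not from the original post):

    futures = {}
    with ThreadPoolExecutor(20) as t:
        for y in range(1994, 2023):
            futures[y] = t.submit(get_movie_info, y)
    for y, fut in futures.items():
        try:
            fut.result()  # re-raises whatever get_movie_info raised for that year
        except Exception as e:
            print(f"{y} failed: {e}")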
6. Scraping Doutu (斗图网) with a Queue and a thread pool
import requests
from lxml import etree
import time
from multiprocessing import Queue  # queue
from multiprocessing import Process  # process
from concurrent.futures import ThreadPoolExecutor  # thread pool

# The images really are in the page source,
# but the address is not in src -- it sits in data-original.
# 1. Get the page source.
# 2. Extract data-original.
# 3. Download the images.
# Key point: processes cannot talk to each other directly (an OS-level rule),
# so a Queue is used to pass the URLs between them.


# First process: only extracts the image URLs (data-original).
def get_img_url(q):
    for page in range(1, 5):
        # work out how to scrape a single page first
        url = f"https://www.pkdoutu.com/photo/list/?page={page}"
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
        }
        resp = requests.get(url, headers=headers)
        tree = etree.HTML(resp.text)
        img_urls = tree.xpath("//li[@class='list-group-item']//img/@data-original")
        for img_url in img_urls:
            print(img_url)
            # push each img_url into the queue
            q.put(img_url)  # the standard way to hand data to another process
    q.put("no more")  # sentinel message: the stream is finished


# Second process: only downloads images.
def img_process(q):  # pull URLs out of the queue and download them
    with ThreadPoolExecutor(10) as t:
        while 1:  # we do not know how many there are, so keep pulling
            img_url = q.get()  # get() blocks until something is available
            if img_url == "no more":
                break
            # spawn threads inside this process
            t.submit(download_img, img_url)


def download_img(url):
    # how to download a single image
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"
    }
    resp = requests.get(url, headers=headers)
    # file name: the last part of the URL
    file_name = url.split("/")[-1]
    with open("./img/" + file_name, mode="wb") as f:  # the ./img folder must already exist
        f.write(resp.content)
    print("one image downloaded")


if __name__ == '__main__':
    # prepare the queue
    s1 = time.time()
    q = Queue()  # created in the main process, shared with both children
    p1 = Process(target=get_img_url, args=(q,))  # child process #1, its own memory space
    p2 = Process(target=img_process, args=(q,))  # child process #2, its own memory space
    p1.start()
    p2.start()
    p1.join()  # the main process waits for the children to finish
    p2.join()
    s2 = time.time()
    print(s2 - s1)
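download_img() writes into ./img/, which has to exist before the script runs; otherwise every open() raises FileNotFoundError. A one-line guard that could be added at the top of the __main__ block (an assumption, not in the original):

    import os
    os.makedirs("img", exist_ok=True)  # create the output folder if it is missing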
7. Scraping an image site with asyncio coroutines plus the async packages aiohttp and aiofiles
import asyncio
import aiohttp  # pip install aiohttp => the async counterpart of requests
import aiofiles  # pip install aiofiles => the async counterpart of open


async def download(url):
    print("starting download", url)
    file_name = url.split("/")[-1]
    # Send the request.  When the object after `with` comes from an async
    # package, the `with` almost always needs `async` in front of it.
    async with aiohttp.ClientSession() as session:  # think: session = requests.session()
        async with session.get(url) as resp:  # think: resp = session.get()
            # wait here for the server's response; pick the form you need:
            # page source:
            # page_source = await resp.text(encoding="utf-8")
            # json:
            # dic = await resp.json()
            # raw bytes:
            content = await resp.content.read()
            # Now write the result out.  Plain synchronous code also works
            # inside a coroutine, but a blocking open()/write() is slow:
            # with open(file_name, mode="wb") as f:
            #     f.write(content)
            async with aiofiles.open(file_name, mode="wb") as f:
                await f.write(content)
            print("one image downloaded!")


async def main():
    urls = [
        "https://www.xiurenji.vip/uploadfile/202110/20/1F214426892.jpg",
        "https://www.xiurenji.vip/uploadfile/202110/20/91214426753.jpg"
    ]
    tasks = []
    for url in urls:
        tasks.append(asyncio.ensure_future(download(url)))
    await asyncio.wait(tasks)


if __name__ == '__main__':
    # asyncio.run(main())
    event_loop = asyncio.get_event_loop()
    event_loop.run_until_complete(main())
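The commented-out asyncio.run(main()) is the more modern entry point; the get_event_loop()/run_until_complete() pair is used here because asyncio.run combined with aiohttp on Windows sometimes prints an "Event loop is closed" error during interpreter shutdown. One commonly used workaround, offered as a sketch rather than a guaranteed fix, is to switch Windows to the selector event loop policy before running:

    import sys

    if __name__ == '__main__':
        if sys.platform == "win32":
            # assumption: the selector loop gives aiohttp's connector time to
            # clean up before the loop closes, avoiding the shutdown error
            asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
        asyncio.run(main())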