Two examples of asynchronous requests

Downloading Baidu images:

import asyncio
import json
import random
import re
import httpx
import requests
import time
from urllib import parse
import os
from loguru import logger

logger.add("children.log", rotation="23:59", encoding="utf-8")


class BaiduPicture:

    def __init__(self, query_list):
        self.query_list = query_list
        # four fields to fill in: word, queryWord, pn, and a millisecond timestamp
        self.base_url = 'https://image.baidu.com/search/acjson?tn=resultjson_com&logid=10807585482968436429&ipn=rj&ct=201326592&is=&fp=result&fr=&word={}&queryWord={}&cl=2&lm=-1&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright=&s=&se=&tab=&width=&height=&face=0&istype=2&qc=&nc=1&expermode=&nojc=&isAsync=&pn={}&rn=30&gsm=5a&{}='
        self.search_headers = {
            "Accept": "text/plain, */*; q=0.01",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Referer": "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=detail&fr=&hs=0&xthttps=111110&sf=1&fmq=1652751245395_R&pv=&ic=0&nc=1&z=&se=&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%93%88%E5%93%88&oq=%E5%93%88%E5%93%88&rsp=-1",
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-origin",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest",
            "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"101\", \"Google Chrome\";v=\"101\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\""
        }
        self.search_cookies = {
            "BDqhfp": "%E5%93%88%E5%93%88%26%260-10-1undefined%26%26708%26%262",
            "BIDUPSID": "29D534423307903C62A41306DE256BDB",
            "PSTM": "1646999926",
            "BAIDUID": "44E4B3A47C120FD98AC4C1C1B43B1641:FG=1",
            "indexPageSugList": "%5B%22%E5%84%BF%E7%AB%A5%E8%B6%B4%E7%88%AC%22%2C%22%E5%84%BF%E7%AB%A5%E8%B6%B3%E7%90%83%22%2C%22%E5%84%BF%E7%AB%A5%E7%AF%AE%E7%90%83%22%2C%22%E5%84%BF%E7%AB%A5%E4%B9%92%E4%B9%93%E7%90%83%22%2C%22%E5%84%BF%E7%AB%A5%E6%8B%8D%E7%9A%AE%E7%90%83%22%2C%22%E5%84%BF%E7%AB%A5%E6%B8%B8%E6%B3%B3%22%2C%22%E5%84%BF%E7%AB%A5%E8%8D%A1%E7%A7%8B%E5%8D%83%22%2C%22%E5%84%BF%E7%AB%A5%E6%BB%91%E6%BB%91%E6%A2%AF%22%2C%22%E5%84%BF%E7%AB%A5%E5%90%83%E9%A5%AD%22%5D",
            "BDORZ": "B490B5EBF6F3CD402E515D22BCDA1598",
            "BAIDUID_BFESS": "44E4B3A47C120FD98AC4C1C1B43B1641:FG=1",
            "BA_HECTOR": "058h252g8l8h2k04n91h843p10r",
            "H_PS_PSSID": "31253_35911_36165_34584_35979_36055_36337_26350_36301_36311",
            "delPer": "0",
            "PSINO": "6",
            "BDRCVFR[X_XKQks0S63]": "mk3SLVN4HKm",
            "userFrom": "www.baidu.com",
            "firstShowTip": "1",
            "BDRCVFR[dG2JNJb_ajR]": "mk3SLVN4HKm",
            "ab_sr": "1.0.1_ZWE1OWY2NmRkNTUzYmRhMjFmYmNlNGQxMjQzOGEzNmQxNmYxYTgxZjgyNzNmOTYxMWI3MDczMWI3Nzc1ODk1OGM3YzU3Mjk5NTc5NzQwNWU2Nzg5OTc4MmIwNDg4MTZjMzI1ZGUxZTA4NmQwZGU4YzBhNWEzZmZiODgxYWUxMjhhMTU0YTljNmYzY2QyMTYxOWFmMzEwNTk3YTRhNzgzYg==",
            "BDRCVFR[-pGxjrCMryR]": "mk3SLVN4HKm"
        }
        self.img_headers = {
            "accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "no-cache",
            "pragma": "no-cache",
            "referer": "https://image.baidu.com/",
            "sec-ch-ua": "\" Not A;Brand\";v=\"99\", \"Chromium\";v=\"101\", \"Google Chrome\";v=\"101\"",
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": "\"Windows\"",
            "sec-fetch-dest": "image",
            "sec-fetch-mode": "no-cors",
            "sec-fetch-site": "same-site",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.67 Safari/537.36"
        }

    def get_search_data(self, current_query, pn):
        try:
            quote_query = parse.quote(current_query)
            temp_times = str(int(time.time() * 1000))
            url = self.base_url.format(quote_query, quote_query, pn, temp_times)
            response = requests.get(url=url, headers=self.search_headers, cookies=self.search_cookies)
            if response.status_code == 200:
                result = response.content.decode('utf-8')
                return result
            else:
                logger.error(
                    f"get_search status is {response.status_code}, query is {current_query}, page is {pn}")
                logger.info(f"response is {response.content.decode('utf-8')}")
        except Exception as search_err:
            # `response` may be unbound if the request itself raised, so only log the error
            logger.error(f"get_search error, query is {current_query}, page is {pn}, err is {search_err}")

    def get_img_content(self, urls, current_query, pn):

        for url in urls:
            logger.info(f"get img, url is {url}")
            try:
                content = requests.get(url=url, headers=self.img_headers).content
                if len(content) > 0:
                    with open('./{}/{}.jpg'.format(current_query, url[28:35]), 'wb') as f:
                        f.write(content)
                        f.flush()
                    print(f'{current_query}-{pn}: saved one image')
                else:
                    print("image download failed: empty response body")
            except Exception as err:
                logger.error(f"get image content err, err is {err}")

    async def get_content(self, client, url, current_query, pn):
        logger.info(f"get img, url is {url}")
        try:
            res = await client.get(url)
            with open('./{}/{}.jpg'.format(current_query, url[28:35]), 'wb') as f:
                f.write(res.content)
                f.flush()
            print(f'{current_query}-{pn}: saved one image')
        except Exception as err:
            logger.error(f"get image content err, err is {err}")

    async def get_img_content_async(self, urls, current_query, pn):
        async with httpx.AsyncClient(headers=self.img_headers) as client:
            task_list = []
            for url in urls:
                req = self.get_content(client, url, current_query, pn)
                task = asyncio.create_task(req)
                task_list.append(task)
            await asyncio.gather(*task_list)

    def parse_search(self, html):
        num_pattern = re.compile('"displayNum":(.*?),')
        num_list = num_pattern.findall(html)
        return num_list

    def parse_search_img_url(self, html):
        stop = False
        url_pattern = re.compile('"middleURL":"(.*?)"')
        url_list = url_pattern.findall(html)
        if not url_list:
            try:
                dic = json.loads(html)
                # the last page returns a "data" list whose only element is empty,
                # which signals there is nothing left to fetch
                data = dic.get("data") or []
                if not data or not data[0]:
                    stop = True
            except Exception as err:
                logger.error(f"parse json err, err is {err}")

        return stop, url_list

    # whether to fetch more than the first page of results
    def run(self, next_page=True):
        """
        :param next_page: if False, only the first page of each query is fetched
        :return:
        """

        for query in self.query_list:
            try:
                if not os.path.exists('./{}/'.format(query)):
                    os.mkdir('./{}'.format(query))
                response = self.get_search_data(query, 0)

                num_list = self.parse_search(response)
                if num_list:
                    num = num_list[0]
                    logger.info(f'{query} has {num} images in total')
                else:
                    logger.error(f'{query}: could not get the total result count')
                    continue
                # 30 results per page, so the page count is ceil(num / 30),
                # e.g. 95 images -> 4 pages (30 + 30 + 30 + 5)
                if int(num) % 30 == 0:
                    pages = int(num) // 30
                else:
                    pages = int(num) // 30 + 1

                if not next_page:
                    pages = 1

                for pn in range(pages):
                    try:
                        resp = self.get_search_data(query, pn * 30)
                        stop, urls = self.parse_search_img_url(resp)
                        logger.info(f"query is {query},page is {pn},urls is \n {urls}")
                        if stop:
                            break
                        if urls:
                            # synchronous version:
                            # self.get_img_content(urls, query, pn)
                            # async version:
                            asyncio.run(self.get_img_content_async(urls, query, pn))
                        else:
                            logger.error(f"cannot get picture urls, query is {query}, page is {pn}")
                            logger.info(f"parse img is none, resp is \n {resp}")
                    except Exception as for_num_err:
                        logger.error(f"for_num_err, query is {query}, err is {for_num_err}")
                    finally:
                        time.sleep(random.randint(3, 5))
            except Exception as for_query_err:
                logger.error(f"for_query_err, query is {query}, err is {for_query_err}")


if __name__ == '__main__':
    query = [
        # '少儿 乒乓 照片',
        '萌娃趴在床上', '婴儿趴在床上', '婴儿趴在地上', '萌娃趴在地上', '躺在床上小孩', '躺在地上小孩', '萌娃躺在床上', '萌娃躺在地上', '萌娃跑步', '小孩跑步', '儿童室内搭积木',
        '婴儿电动车', '小朋友滑滑梯照片', '儿童滑滑梯照片', '小朋友荡秋千照片', '小孩荡秋千照片', '儿童游泳照片', '儿童拍皮球照片', '儿童篮球照片', '儿童足球照片',
        '儿童舞台照片', '舞台上的小朋友', '儿童在卧室', '卧室里的小朋友', '儿童学步车', '看书的小朋友', '画画的小朋友', '小朋友写作业']

    print(len(query))
    bp = BaiduPicture(query_list=query)
    bp.run(next_page=False)
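
Note: get_img_content_async above creates one task per image URL with no concurrency cap, so a 30-image page fires 30 simultaneous requests and can easily get rate-limited. Below is a minimal sketch of bounding concurrency with asyncio.Semaphore; the limit of 5 and the helper names are illustrative assumptions, not part of the original class.

import asyncio
import httpx

async def fetch_with_limit(client, sem, url):
    # at most 5 requests hold the semaphore at once; the rest wait here
    async with sem:
        res = await client.get(url)
        return res.content

async def download_all(urls, headers):
    sem = asyncio.Semaphore(5)  # assumed cap, tune to taste
    async with httpx.AsyncClient(headers=headers) as client:
        tasks = [asyncio.create_task(fetch_with_limit(client, sem, u)) for u in urls]
        # return_exceptions=True keeps one failed download from cancelling the rest
        return await asyncio.gather(*tasks, return_exceptions=True)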

Asynchronously downloading a novel

# URL notes
# chapter catalog (chapter ids)
# http://dushu.baidu.com/api/pc/getCatalog?data={book_id:4306063500}

# chapter one
# http://dushu.baidu.com/api/pc/getChapterContent?data={book_id:4306063500,cid:4306063500|11348571,need_bookinfo:1}
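
Both endpoints take a JSON blob in the data query parameter. The script below interpolates json.dumps output straight into the URL, which happens to work here, but strictly the blob should be percent-encoded first. A small sketch (catalog_url is a hypothetical name):

import json
from urllib import parse

payload = json.dumps({"book_id": "4306063500"})
catalog_url = "http://dushu.baidu.com/api/pc/getCatalog?data=" + parse.quote(payload)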

import requests
import aiohttp
import asyncio
import json
import aiofiles

'''
Steps:
1. Synchronous: fetch the catalog with all chapter titles and ids.
2. Asynchronous: fetch each chapter's content.
'''


async def downwenzhang(cid, bid, title):
    # build the request URL: the API expects a JSON blob in the `data` query parameter
    data = {
        'book_id': bid,
        'cid': f"{bid}|{cid}",
        'need_bookinfo': 1
    }
    data = json.dumps(data)
    url = f'http://dushu.baidu.com/api/pc/getChapterContent?data={data}'
    # request the chapter content
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            dic = await resp.json()

            async with aiofiles.open(title + ".txt", mode='w', encoding='utf-8') as f:
                await f.write(dic['data']['novel']['content'])

async def getCatalog(url):
    # step 1 is synchronous: one request for the whole catalog
    resp = requests.get(url)
    txt = resp.json()
    tasks = []
    print(txt)
    for item in txt["data"]["novel"]["items"]:
        title = item["title"]
        cid = item["cid"]
        # wrap each coroutine in a task: asyncio.wait() stopped accepting
        # bare coroutines in Python 3.11
        tasks.append(asyncio.create_task(downwenzhang(cid, "4306063500", title)))
    await asyncio.wait(tasks)


if __name__ == '__main__':
    bid = "4306063500"
    url ='http://dushu.baidu.com/api/pc/getCatalog?data={"book_id":"4306063500"}'
    asyncio.run(getCatalog(url))
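
The same rate-limiting caveat as the image downloader applies: getCatalog schedules every chapter at once. With aiohttp, throttling can also live at the session level via its connector instead of a semaphore. A minimal sketch, assuming a cap of 5 and hypothetical helper names:

import asyncio
import aiohttp

async def fetch_json(session, url):
    async with session.get(url) as resp:
        return await resp.json()

async def fetch_all(urls):
    # TCPConnector(limit=5) caps simultaneous connections for this session;
    # extra requests queue until a slot frees up
    connector = aiohttp.TCPConnector(limit=5)
    async with aiohttp.ClientSession(connector=connector) as session:
        return await asyncio.gather(*(fetch_json(session, u) for u in urls))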
