爬取卡通图片

因为想找个网站练练手，发现这个网站不错，所以爬取了该网站的部分图片（主要是卡通图片），具体请直接查看下面的代码。
这还只是初版，后续会推出多线程的升级版。

# -*- coding: utf-8 -*-
# by wangcc
# mail:wangcc_sd@163.com

import requests
import sys
import io
import os
from bs4 import BeautifulSoup
import asyncio
import json

# Replace sys.stdout with a text wrapper around the same underlying byte
# buffer so that everything printed below is encoded as UTF-8.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')  # change the default encoding of standard output

# HTTP request headers (a desktop Chrome User-Agent string) passed as
# ``headers=headers`` to the HTML page requests made in this script.
headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}

async def get_url(queue, url):
    """Fetch one listing page and enqueue every album entry found on it.

    Args:
        queue: ``asyncio.Queue`` that receives the ``<p class="list_h">``
            elements of the page, one queue item per element.
        url: Address of the listing page to download.

    The page is downloaded with ``requests.get``, which is a blocking call:
    the event loop does not run anything else while the download is in
    progress. The coroutine only yields at the final one-second sleep.
    """
    print(url)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Search the document once instead of re-running find_all for each index.
    for entry in soup.find_all('p', class_='list_h'):
        # Enqueue each entry exactly once. Iterating over the element's
        # children and putting the element itself for every child would
        # queue the same album several times.
        if entry.a is None:
            # Without a link there is no href/title to follow, so skip it.
            continue
        await queue.put(entry)
    await asyncio.sleep(1)

def get_page(url):
    """Return the page number referenced by the album's "末页" (last page) link.

    Args:
        url: Address of the first page of an album.

    Returns:
        The page number as a string of digits taken from the link's href
        (the part between the last ``_`` and the extension of the file
        name), or the integer ``0`` when the page has no such link or the
        href does not carry a numeric page suffix.
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Look at the link element itself rather than the str() of a result list.
    last_link = soup.find('a', string="末页")
    if last_link is None:
        print('no last-page link found:', url)
        return 0
    href = last_link.get('href') or ''
    # Assumes hrefs look like ".../<album>_<page>.html" -- TODO confirm
    # against the live site.
    file_name = href.rsplit('/', 1)[-1]
    stem = file_name.split('.')[0]
    _, separator, page = stem.rpartition('_')
    if not separator or not page.isdigit():
        # Returning 0 keeps the caller's int() conversion from failing.
        print('unexpected last-page href:', href)
        return 0
    return page

def get_url_second(url):
    """Download the first image listed in the metadata of each album page.

    Args:
        url: Address of the first page of an album (ends in ``.html``).

    For every page of the album the ``application/ld+json`` script block is
    parsed as JSON; its ``title`` names the target directory under
    ``./date`` and the first entry of ``images`` is downloaded into it.
    Pages without such a script block are reported and skipped.

    Raises:
        json.JSONDecodeError: If a metadata block is not valid JSON.
        KeyError, IndexError: If the metadata lacks ``title`` or ``images``.
    """
    page = get_page(url)
    base = url.split('.html')[0]
    # get_page returns the number carried by the "last page" link, so the
    # range has to include it: follow-up pages are 2..page inclusive.
    url_list = ['{}_{}.html'.format(base, i) for i in range(2, int(page) + 1)]
    url_list.append(url)
    for page_url in url_list:
        response = requests.get(page_url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Read the script element's text directly instead of slicing the
        # repr of a result list with hard-coded offsets.
        script = soup.find('script', type="application/ld+json")
        script_info = script.string if script is not None else None
        if not script_info:
            print('no ld+json metadata found:', page_url)
            continue
        script_dict = json.loads(script_info)
        title = script_dict["title"]
        images = script_dict["images"][0]
        target_dir = "./date/" + title
        # The directory is named after the metadata title, so make sure it
        # exists here rather than relying on it having been created before.
        os.makedirs(target_dir, exist_ok=True)
        jpg_name = target_dir + '/' + str(images).split('/')[-1]
        jpg_index = requests.get(images)
        # The with-statement closes the file; no explicit close() needed.
        with open(jpg_name, 'wb') as jpg:
            jpg.write(jpg_index.content)



def dir_save(dir_name):
    """Ensure the directory ``./date/<dir_name>`` exists.

    Args:
        dir_name: Name of the sub-directory to create below ``./date``.

    Missing parent directories (including ``./date`` itself) are created as
    well, and an already existing directory is left untouched.
    """
    path = './date'
    # makedirs creates ./date on first use and, with exist_ok, avoids the
    # check-then-create race of os.path.exists followed by os.mkdir.
    os.makedirs(path + '/' + dir_name, exist_ok=True)

async def consumer(queue):
    """Take album entries off the queue and download each album.

    Args:
        queue: ``asyncio.Queue`` of parsed elements whose first ``<a>``
            carries the album's ``href`` and ``title`` attributes.

    Loops until the task is cancelled. Each item is marked with
    ``queue.task_done()`` once handled, whether it succeeded, was skipped or
    raised. The download itself is a blocking call: the coroutine only
    yields while waiting for the next queue item.
    """
    while True:
        print('qsize--->', queue.qsize())
        divObj = await queue.get()
        try:
            link = divObj.a
            href = link.get('href') if link is not None else None
            title = link.get('title') if link is not None else None
            if not href or not title:
                # Nothing to follow or no directory name; skip the entry.
                print('skipping entry without link/href/title')
                continue
            dir_save(title)
            get_url_second("https://www.uumtu.com" + href)
        finally:
            # Account for every item taken so queue.join() can complete.
            queue.task_done()



async def main():
    """Crawl listing pages 1-49 of the cartoon category and download albums.

    One producer task (``get_url``) and one consumer task (``consumer``) are
    started per listing page, all sharing a single queue. The coroutine then
    waits for the producers to finish and for the queue to drain before
    cancelling the consumers, which otherwise loop forever.
    """
    queue = asyncio.Queue()
    producers = []
    consumers = []
    for i in range(1, 50):
        # Other categories follow the same pattern, e.g.
        # https://www.uumtu.com/meinv/list_{}.html
        url = 'https://www.uumtu.com/katong/list_{}.html'.format(i)
        print(url)
        producers.append(asyncio.create_task(get_url(queue, url)))
        consumers.append(asyncio.create_task(consumer(queue)))

    # Wait for the producers instead of returning right away; returning
    # would make asyncio.run() cancel every task that is still pending.
    results = await asyncio.gather(*producers, return_exceptions=True)
    for result in results:
        if isinstance(result, Exception):
            print('producer failed:', repr(result))

    # Give the consumers time to empty the queue. Stop waiting if every
    # consumer has already terminated, since nobody would drain it then.
    while not queue.empty() and not all(task.done() for task in consumers):
        await asyncio.sleep(0.1)

    # Consumers never return on their own; cancel them and collect their
    # outcomes so that failures are reported rather than silently dropped.
    for task in consumers:
        task.cancel()
    outcomes = await asyncio.gather(*consumers, return_exceptions=True)
    for outcome in outcomes:
        if isinstance(outcome, BaseException) and not isinstance(
                outcome, asyncio.CancelledError):
            print('consumer failed:', repr(outcome))

# Script entry point: run the crawler on a fresh event loop.
# (A no-op counting loop that used to precede this call has been dropped.)
if __name__ == '__main__':
    asyncio.run(main())


posted @ 2020-09-10 22:39  Chuan_Chen  阅读(243)  评论(0)  编辑  收藏  举报
#waifu-toggle { background-color: #fa0; border-radius: 5px; bottom: 66px; color: #fff; cursor: pointer; font-size: 12px; right: 0; margin-right: -100px; padding: 5px 2px 5px 5px; position: fixed; transition: margin-right 1s; width: 60px; writing-mode: vertical-lr; } #waifu-toggle.waifu-toggle-active { margin-right: -40px; } #waifu-toggle.waifu-toggle-active:hover { margin-right: -30px; } #waifu { bottom: -1000px; right: 0; line-height: 0; margin-bottom: -10px; position: fixed; transform: translateY(3px); transition: transform .3s ease-in-out, bottom 3s ease-in-out; z-index: 1; } #waifu:hover { transform: translateY(0); } #waifu-tips { animation: shake 50s ease-in-out 5s infinite; background-color: rgba(236, 217, 188, .5); border: 1px solid rgba(224, 186, 140, .62); border-radius: 12px; box-shadow: 0 3px 15px 2px rgba(191, 158, 118, .2); font-size: 14px; line-height: 24px; margin: -30px 20px; min-height: 70px; opacity: 0; overflow: hidden; padding: 5px 10px; position: absolute; text-overflow: ellipsis; transition: opacity 1s; width: 250px; word-break: break-all; } #waifu-tips.waifu-tips-active { opacity: 1; transition: opacity .2s; } #waifu-tips span { color: #0099cc; } #waifu #live2d { cursor: grab; height: 300px; position: relative; width: 300px; } #waifu #live2d:active { cursor: grabbing; } #waifu-tool { color: #aaa; opacity: 0; position: absolute; left: -10px; top: 70px; transition: opacity 1s; } #waifu:hover #waifu-tool { opacity: 1; } #waifu-tool span { color: #7b8c9d; cursor: pointer; display: block; line-height: 30px; text-align: center; transition: color .3s; } #waifu-tool span:hover { color: #0684bd; /* #34495e */ } @keyframes shake { 2% { transform: translate(.5px, -1.5px) rotate(-.5deg); } 4% { transform: translate(.5px, 1.5px) rotate(1.5deg); } 6% { transform: translate(1.5px, 1.5px) rotate(1.5deg); } 8% { transform: translate(2.5px, 1.5px) rotate(.5deg); } 10% { transform: translate(.5px, 2.5px) rotate(.5deg); } 12% { transform: translate(1.5px, 1.5px) 
rotate(.5deg); } 14% { transform: translate(.5px, .5px) rotate(.5deg); } 16% { transform: translate(-1.5px, -.5px) rotate(1.5deg); } 18% { transform: translate(.5px, .5px) rotate(1.5deg); } 20% { transform: translate(2.5px, 2.5px) rotate(1.5deg); } 22% { transform: translate(.5px, -1.5px) rotate(1.5deg); } 24% { transform: translate(-1.5px, 1.5px) rotate(-.5deg); } 26% { transform: translate(1.5px, .5px) rotate(1.5deg); } 28% { transform: translate(-.5px, -.5px) rotate(-.5deg); } 30% { transform: translate(1.5px, -.5px) rotate(-.5deg); } 32% { transform: translate(2.5px, -1.5px) rotate(1.5deg); } 34% { transform: translate(2.5px, 2.5px) rotate(-.5deg); } 36% { transform: translate(.5px, -1.5px) rotate(.5deg); } 38% { transform: translate(2.5px, -.5px) rotate(-.5deg); } 40% { transform: translate(-.5px, 2.5px) rotate(.5deg); } 42% { transform: translate(-1.5px, 2.5px) rotate(.5deg); } 44% { transform: translate(-1.5px, 1.5px) rotate(.5deg); } 46% { transform: translate(1.5px, -.5px) rotate(-.5deg); } 48% { transform: translate(2.5px, -.5px) rotate(.5deg); } 50% { transform: translate(-1.5px, 1.5px) rotate(.5deg); } 52% { transform: translate(-.5px, 1.5px) rotate(.5deg); } 54% { transform: translate(-1.5px, 1.5px) rotate(.5deg); } 56% { transform: translate(.5px, 2.5px) rotate(1.5deg); } 58% { transform: translate(2.5px, 2.5px) rotate(.5deg); } 60% { transform: translate(2.5px, -1.5px) rotate(1.5deg); } 62% { transform: translate(-1.5px, .5px) rotate(1.5deg); } 64% { transform: translate(-1.5px, 1.5px) rotate(1.5deg); } 66% { transform: translate(.5px, 2.5px) rotate(1.5deg); } 68% { transform: translate(2.5px, -1.5px) rotate(1.5deg); } 70% { transform: translate(2.5px, 2.5px) rotate(.5deg); } 72% { transform: translate(-.5px, -1.5px) rotate(1.5deg); } 74% { transform: translate(-1.5px, 2.5px) rotate(1.5deg); } 76% { transform: translate(-1.5px, 2.5px) rotate(1.5deg); } 78% { transform: translate(-1.5px, 2.5px) rotate(.5deg); } 80% { transform: translate(-1.5px, .5px) 
rotate(-.5deg); } 82% { transform: translate(-1.5px, .5px) rotate(-.5deg); } 84% { transform: translate(-.5px, .5px) rotate(1.5deg); } 86% { transform: translate(2.5px, 1.5px) rotate(.5deg); } 88% { transform: translate(-1.5px, .5px) rotate(1.5deg); } 90% { transform: translate(-1.5px, -.5px) rotate(-.5deg); } 92% { transform: translate(-1.5px, -1.5px) rotate(1.5deg); } 94% { transform: translate(.5px, .5px) rotate(-.5deg); } 96% { transform: translate(2.5px, -.5px) rotate(-.5deg); } 98% { transform: translate(-1.5px, -1.5px) rotate(-.5deg); } 0%, 100% { transform: translate(0, 0) rotate(0); } } © 2022 GitHub, Inc. Terms Privacy Securi