Scraping Cartoon Images
I wanted a website to practice on, and this one turned out to be a good fit, so I scraped some of its images, mostly cartoon pictures. The code follows below.
This is still a first version; an upgraded multi-threaded version will come later.
# -*- coding: utf-8 -*-
# by wangcc
# mail:wangcc_sd@163.com
import requests
import sys
import io
import os
from bs4 import BeautifulSoup
import asyncio
import json

# Change stdout's default encoding to UTF-8 so Chinese titles print cleanly.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
async def get_url(queue, url):
    """Producer: fetch one list page and queue every gallery entry on it."""
    print(url)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Each <p class="list_h"> wraps one gallery link; queue each entry once.
    for div_obj in soup.find_all('p', class_='list_h'):
        await queue.put(div_obj)
    await asyncio.sleep(1)
def get_page(url):
    """Read the page count out of the 末页 (last page) link, e.g. xxx_12.html -> 12."""
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    soup_page = soup.find_all('a', text="末页")
    try:
        page = str(soup_page).split('_')[1].split('.')[0]
    except IndexError as e:
        print(e)
        return 0
    return page
def get_url_second(url):
    """Walk every page of one gallery and save the image found on each page."""
    page = get_page(url)
    # Page 1 is the bare URL; pages 2..N follow the pattern xxx_2.html .. xxx_N.html
    # (see the URL-pattern sketch after the script).
    url_list = [url.split('.html')[0] + '_{}.html'.format(i) for i in range(2, int(page) + 1)]
    url_list.append(url)
    for url in url_list:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        # The image metadata sits in an application/ld+json <script> block;
        # parse its text directly instead of slicing the stringified tag.
        one_jpg = soup.find('script', type="application/ld+json")
        script_dict = json.loads(one_jpg.string)
        title = script_dict["title"]
        images = script_dict["images"][0]
        jpg_name = './date/' + title + '/' + str(images).split('/')[-1]
        jpg_index = requests.get(images)
        with open(jpg_name, 'wb') as jpg:
            jpg.write(jpg_index.content)
def dir_save(dir_name):
    """Create ./date/<dir_name> (and ./date itself) if it does not exist yet."""
    path = './date'
    os.makedirs(path + '/' + dir_name, exist_ok=True)
async def consumer(queue):
    """Consumer: take gallery entries off the queue and download their images."""
    while True:
        print('qsize--->', queue.qsize())
        div_obj = await queue.get()
        href = div_obj.a.get('href')
        title = div_obj.a.get('title')
        dir_save(title)
        url = "https://www.uumtu.com" + href
        get_url_second(url)
        # Mark the entry as processed so queue.join() in main() can finish.
        queue.task_done()
async def main():
    queue = asyncio.Queue()
    producers = []
    for i in range(1, 50):
        # url = 'https://www.uumtu.com/meinv/list_{}.html'.format(i)
        url = 'https://www.uumtu.com/katong/list_{}.html'.format(i)
        print(url)
        producers.append(asyncio.create_task(get_url(queue, url)))
    consumer_task = asyncio.create_task(consumer(queue))
    # The tasks must be awaited: otherwise main() returns at once and
    # asyncio.run() cancels everything before any download happens.
    await asyncio.gather(*producers)
    await queue.join()
    consumer_task.cancel()
if __name__ == '__main__':
    asyncio.run(main())
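A few notes on the moving parts. The pagination URLs that get_url_second builds follow the site's _N.html suffix convention; a quick illustration with a made-up gallery URL:

# Made-up gallery URL, purely to show the page-URL pattern.
base = 'https://www.uumtu.com/katong/12345.html'
page = 3  # value normally returned by get_page()
urls = [base] + [base.split('.html')[0] + '_{}.html'.format(i) for i in range(2, page + 1)]
print(urls)
# ['https://www.uumtu.com/katong/12345.html',
#  'https://www.uumtu.com/katong/12345_2.html',
#  'https://www.uumtu.com/katong/12345_3.html']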
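For context, the application/ld+json block that get_url_second parses looks roughly like the sketch below. The field names match what the code reads; the values here are made up, since the exact payload is an assumption about the site's markup:

import json

# Hypothetical example of the metadata block embedded in each gallery page;
# the script only reads "title" and the first entry of "images".
sample = '''{
    "@id": "https://www.uumtu.com/katong/12345.html",
    "title": "example-gallery",
    "images": ["https://www.uumtu.com/uploads/12345/1.jpg"]
}'''

meta = json.loads(sample)
print(meta["title"])      # -> example-gallery
print(meta["images"][0])  # -> https://www.uumtu.com/uploads/12345/1.jpg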
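One caveat about the asyncio design: requests.get is a blocking call, so even inside coroutines the downloads run one after another and the event loop stalls on each request. A minimal sketch of one way around this, assuming Python 3.9+ for asyncio.to_thread (the URLs are the same list pages as above):

import asyncio
import requests

headers = {'User-agent': 'Mozilla/5.0'}  # shortened UA string for the sketch

async def fetch(url):
    # Run the blocking call in a worker thread so the event loop stays free.
    response = await asyncio.to_thread(requests.get, url, headers=headers)
    return response.text

async def demo():
    urls = ['https://www.uumtu.com/katong/list_{}.html'.format(i) for i in range(1, 4)]
    # gather() now genuinely overlaps the network waits.
    pages = await asyncio.gather(*(fetch(u) for u in urls))
    print([len(p) for p in pages])

asyncio.run(demo())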
This post is from 博客园 (cnblogs), by Chuan_Chen. Please credit the original link when reposting: https://www.cnblogs.com/wangcc7/p/13648900.html