爬取漫画DB上的《浪客行》
漫画链接:https://www.manhuadb.com/manhua/324
建议:早上爬,速度较快。
天下无双宫本武藏
代码
# https://www.manhuadb.com/manhua/324
import os
import re
import time
import requests
from requests import codes
from bs4 import BeautifulSoup
from requests import RequestException
def get_page(url):
    """Fetch ``url`` and return the response body as text.

    A browser-like User-Agent header is sent with the request.

    Args:
        url: Address of the page to download.

    Returns:
        The response text when the server answers with HTTP 200. Otherwise a
        diagnostic string: ``'status_code = <code>'`` for any other status,
        or ``'RequestException'`` when the request itself fails (including a
        timeout).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36'
                      '(KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
    }
    try:
        # Without a timeout a stalled connection would block the crawl forever;
        # a timeout surfaces as a RequestException subclass handled below.
        response = requests.get(url, headers=headers, timeout=30)
    except RequestException:
        return 'RequestException'
    if response.status_code == 200:
        return response.text
    # The status code lives on the response object; a bare ``status_code``
    # name does not exist and would raise NameError here.
    return 'status_code = ' + str(response.status_code)
def get_pagesNumber(text):
    """Return the total page count advertised in a chapter page.

    Args:
        text: HTML source of a chapter page.

    Returns:
        The value of the ``data-total`` attribute on the
        ``<div class="d-none vg-r-data">`` element, as stored in the markup
        (the caller converts it with ``int``).
    """
    counter_div = BeautifulSoup(text, 'lxml').find('div', class_="d-none vg-r-data")
    total = counter_div.attrs['data-total']
    return total
def parse_page(text):
    """Extract the image record from a single comic page.

    Args:
        text: HTML source of one comic page.

    Yields:
        Exactly one dict with the keys ``'url'`` (the ``src`` of the page
        image), ``'chapter'`` (text of the chapter heading) and ``'page'``
        (text of the page-navigation span).
    """
    document = BeautifulSoup(text, 'lxml')
    image_tag = document.find(name='img', class_="img-fluid show-pic")
    heading_tag = document.find(name='h2', class_="h4 text-center")
    page_tag = document.find(name='span', class_="c_nav_page")

    record = {}
    record['url'] = image_tag['src']
    record['chapter'] = heading_tag.get_text()
    record['page'] = page_tag.get_text()
    yield record
def save_image(item):
    """Download one comic page image into ``浪客行/<chapter>/``.

    The file is named ``<last two characters of chapter>-<page>.jpg``. If that
    file already exists the download is skipped entirely, so re-running the
    crawl does not fetch images it already has.

    Args:
        item: Mapping with the keys ``'url'``, ``'chapter'`` and ``'page'``.

    Returns:
        None. Progress and failures are reported with ``print``; errors while
        downloading or writing are printed rather than raised.
    """
    chapter = item.get('chapter')
    img_path = os.path.join('浪客行', chapter)
    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs(img_path, exist_ok=True)
    try:
        file_name = '{file_name}.{file_suffix}'.format(
            file_name=chapter[-2:] + '-' + item.get('page'), file_suffix='jpg')
        file_path = os.path.join(img_path, file_name)

        # Check the target first so an existing file costs no network request.
        if os.path.exists(file_path):
            print('Already Downloaded', file_path)
            return

        # Bound the wait so one stalled image cannot hang the whole crawl.
        resp = requests.get(item.get('url'), timeout=30)
        if resp.status_code != codes.ok:
            # Report the failure instead of silently skipping the page.
            print('Failed to download %s, status_code = %s'
                  % (item.get('url'), resp.status_code))
            return

        with open(file_path, 'wb') as f:
            f.write(resp.content)
        print('Downloaded image path is %s' % file_path)
    except Exception as e:
        # Deliberately best-effort: one bad page must not abort the crawl.
        print(e)
if __name__ == '__main__':
    # The series has 37 chapters whose ids run from 3678 through 3714.
    first_chapter_id, last_chapter_id = 3678, 3714
    for chapter_id in range(first_chapter_id, last_chapter_id + 1):
        chapter_prefix = 'https://www.manhuadb.com/manhua/324/320_{}'.format(chapter_id)
        # The chapter landing page carries the total number of pages.
        landing_html = get_page(chapter_prefix + '.html')
        total_pages = int(get_pagesNumber(landing_html))
        for page_no in range(1, total_pages + 1):
            page_html = get_page('{}_{}.html'.format(chapter_prefix, page_no))
            for item in parse_page(page_html):
                save_image(item)