Python爬虫实例(六)多进程下载金庸网小说
目标任务:使用多进程下载金庸网各个版本(旧版、修订版、新修版)的小说
代码如下:
# -*- coding: utf-8 -*-
"""Download the novels listed on jinyongwang.com using a pool of processes.

The home page is scraped for book links; each book is handled by ``main`` in
a worker process, which appends every chapter of that book to
``<folder>/<slug>.txt``.  The folder is ``old``, ``new`` or ``now`` depending
on the first letter of the book's URL slug.

Chapters are *appended* to the output file, so delete existing output before
re-running or the text will be duplicated.
"""
import io
import os
from multiprocessing import Pool

import requests
from lxml import etree

BASE_URL = 'http://www.jinyongwang.com'

# Seconds to wait on the server before a request is abandoned; without a
# timeout a stalled connection would block its worker process forever.
REQUEST_TIMEOUT = 30

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'}

# First letter of the book slug -> output folder.  Any other letter maps to
# DEFAULT_FOLDER.
FOLDER_BY_PREFIX = {'o': 'old', 'n': 'new'}
DEFAULT_FOLDER = 'now'


def _fetch_html(url):
    """Fetch *url* and return it parsed as an lxml HTML tree.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP 4xx/5xx status (so error pages are never saved as text).
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return etree.HTML(response.text)


def _ensure_folder(folder):
    """Create *folder* if needed, tolerating concurrent creation.

    Several worker processes may target the same folder at once, so a
    check-then-create sequence would race; instead attempt the creation and
    ignore the error only when the directory really exists afterwards.
    """
    try:
        os.makedirs(folder)
    except OSError:
        if not os.path.isdir(folder):
            raise


def download(title, url, filename):
    """Append one chapter to *filename*.

    Args:
        title: Chapter title, written as the first line of the chapter.
        url: Absolute URL of the chapter page.
        filename: Path of the text file to append to.

    Raises:
        requests.RequestException: if the chapter page cannot be fetched.
    """
    html = _fetch_html(url)
    # The first two text nodes are skipped, as in the original script
    # (presumably site boilerplate rather than chapter text -- verify
    # against the live page if the layout changes).
    paragraphs = html.xpath('//div//p/text()')[2:]

    # Build the chapter once and open the file once, instead of reopening
    # the file for every paragraph.
    text = u'\n'.join([title] + paragraphs) + u'\n'

    # An explicit encoding replaces the Python 2-only
    # sys.setdefaultencoding('utf-8') hack and makes the output independent
    # of the platform's default encoding.
    with io.open(filename, 'a', encoding='utf-8') as f:
        f.write(text)


def main(url):
    """Download every chapter of one book into a single text file.

    Args:
        url: Site-relative URL of the book's chapter index; it is appended
            to ``BASE_URL`` and its second-to-last ``/``-separated segment
            is used as the book slug.

    Raises:
        requests.RequestException: if the index or any chapter page cannot
            be fetched.
    """
    start_url = BASE_URL + url
    sname = start_url.split('/')[-2]

    folder = FOLDER_BY_PREFIX.get(sname[:1], DEFAULT_FOLDER)
    _ensure_folder(folder)
    filename = os.path.join(folder, sname + '.txt')

    html = _fetch_html(start_url)
    # Read the title and the href from the same <a> element so they can
    # never get out of step (two independent XPath queries paired by index
    # break as soon as an <li> holds an extra text node).
    for link in html.xpath('//ul[@class="mlist"]/li/a[@href]'):
        title = link.xpath('string(.)')
        download(title, BASE_URL + link.get('href'), filename)


if __name__ == '__main__':
    index_html = _fetch_html(BASE_URL + '/')
    urls = index_html.xpath('//li[@class="book_li"]/p[3]//a/@href')

    pool = Pool()
    try:
        pool.map(main, urls)
    finally:
        # Always release the worker processes, even if a download failed.
        pool.close()
        pool.join()
结果展示: