Scraping novels from 80小说 (www.txt80.com)
import os
import random
import re
import time
import lxml.etree
import requests
import faker

# Build a small pool of random User-Agent strings and pick one for all requests.
fake = faker.Faker()
uaList = []
for _ in range(10):
    uaList.append(fake.user_agent())
headers = {
    "User-Agent": random.choice(uaList)
}


def request_view(response):
    # Debug helper: save a response to tmp.html (with a <base> tag so relative
    # links resolve) and open it in the default browser.
    import webbrowser
    request_url = response.url
    base_url = '<head><base href="%s">' % request_url
    base_url = base_url.encode()
    content = response.content.replace(b"<head>", base_url)
    with open('tmp.html', 'wb') as tem_html:
        tem_html.write(content)
    webbrowser.open_new_tab('tmp.html')


class Crawl:
    host = ""
    headers = ""

    def __init__(self, host, headers):
        self.host = host
        self.headers = headers

    def get_content(self, url):
        requests.adapters.DEFAULT_RETRIES = 5  # increase the retry count
        s = requests.session()
        s.keep_alive = True  # reuse the underlying HTTP connection
        resp = s.get(url, headers=self.headers)
        if resp.status_code != 200:
            print("crawl url error " + url + " " + str(resp.status_code))
            content = None
        else:
            content = resp.content
        return content

    def get_novel_list(self, content, code):
        # Parse one letter index page and follow every novel detail link on it.
        html = lxml.etree.HTML(content)
        li_list = html.xpath('//div[@class="searchlist_l_box"]/ul//li')
        for li in li_list:
            hrefs = li.xpath("./a/@href")
            for href in hrefs:
                detail_url = self.join_url(href)
                self.get_download_url(detail_url, code)

    def join_url(self, url):
        return "http://" + self.host + url

    def get_download_url(self, detail_url, code):
        # Read the novel title and the link to its download page.
        content = self.get_content(detail_url)
        if content is None:
            return
        html = lxml.etree.HTML(content)
        title = html.xpath('//dd[@class="bt"]/h2/text()')
        download_url = html.xpath('//div[@class="downlinks"]//a/@href')
        if len(title) == 1 and len(download_url) >= 1:
            title = title[0]
            download_url = self.join_url(download_url[0])
            self.download_url(download_url, title, code)

    def download_url(self, url, title, code):
        # Strip characters that are illegal in Windows file names.
        title = re.sub(r'[?\\*|"“<>:/]', '', title)
        content = self.get_content(url)
        if content is None:
            return
        html = lxml.etree.HTML(content)
        txt_url = html.xpath('//div[@class="downlist"][1]/li/strong/a/@href')
        if len(txt_url) == 1:
            self.download_txt(txt_url[0], title, code)

    def download_txt(self, url, title, code):
        # Save the .txt file under E:\xiaoshuo\<letter code>\<title>.txt
        path = "E:\\xiaoshuo\\" + code
        if not os.path.exists(path):
            os.makedirs(path)
        file = path + "\\" + title + ".txt"
        if not os.path.exists(file):
            content = self.get_content(url)
            if content is None:
                return
            with open(file, "wb") as f:
                f.write(content)
            print("download success " + title)
        else:
            print(file + " exists")

    def start(self):
        # list_code = ['C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
        #              'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
        # for code in list_code:
        code = "K"
        url = "http://" + self.host + "/" + code + ".html"
        content = self.get_content(url)
        if content is not None:
            print("crawl url success:" + url)
            self.get_novel_list(content, code)


if __name__ == "__main__":
    host = "www.txt80.com"
    crawl = Crawl(host, headers)
    try:
        crawl.start()
    except Exception as e:
        print(str(e))
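The commented-out list_code block in start() suggests the site organizes novels into letter index pages, while the current script only crawls the hard-coded "K" page. Below is a minimal sketch of how the whole range could be walked by reusing the Crawl methods above. The function name crawl_all_codes and the one-second politeness delay are assumptions, not part of the original script; the letter list and the /<code>.html URL pattern are taken directly from the commented-out code.

import time

def crawl_all_codes(crawler, delay_seconds=1):
    # crawler is an instance of the Crawl class above.
    # Letter codes copied from the commented-out list in start().
    list_code = ['C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
                 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
    for code in list_code:
        url = "http://" + crawler.host + "/" + code + ".html"
        content = crawler.get_content(url)
        if content is not None:
            print("crawl url success:" + url)
            crawler.get_novel_list(content, code)
        time.sleep(delay_seconds)  # politeness pause between index pages (assumed value)

Called as crawl_all_codes(Crawl(host, headers)), this would visit every index page in turn instead of only the "K" page.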