My first working Python spider: scraping a novel with request headers and multithreading, lightly cleaning the data, and saving it to a local folder

It uses the BeautifulSoup and threading libraries, locating the URLs of the target data from patterns in the page source and scraping them.

import os
import re
import requests
import threading
from bs4 import BeautifulSoup


header = {"ttp-equiv":"mobile-agent",
          "content":"format=wml,"
        }

url = "https://book.qidian.com/info/1013066244#Catalog"
html = requests.get(url, header).content.decode()
soup = BeautifulSoup(html, "lxml")
html_list = soup.find_all(class_="volume")
# print(str(html_list))
# Find the book title
book_name = soup.find(class_="book-info")
book_name = book_name.find("em").string
print(book_name)
# findall with two capture groups returns a list of tuples; element [0] of each tuple is used as the chapter URL
url_list = re.findall(r"//(vipreader.*?)(data|target)", str(html_list))
print(url_list)

# Check whether the download directory exists; create it if it does not
if not os.path.exists("./" + book_name):
    os.mkdir("./" + book_name)


# Download one chapter and save it to a text file
def down_load(tuple_url):
    url = "https://" + tuple_url[0]
    qidian_html = requests.get(url, headers=header).content.decode()
    soup = BeautifulSoup(qidian_html, "lxml")

    # Chapter title and chapter body
    tittle = soup.find(class_="j_chapterName")
    txt_file_all = soup.find(class_="read-content j_readContent")
    # txt_file = txt_file_all.find_all("p")
    # print("Title: %s" % tittle.string)
    print(txt_file_all.text)

    # Turn the raw HTML into plain text: <p> tags become line breaks, the remaining tags are stripped
    txt = str(txt_file_all).replace("<p>", "\n")
    txt = txt.replace("<div class=\"read-content j_readContent\">", tittle.string)
    txt = txt.replace("</p>", "")
    txt = txt.replace("</div>", "")
    # Write the chapter to its own text file
    with open("./" + book_name + "/" + tittle.string + ".txt", "w", encoding="utf-8") as f:
        f.write(txt)
        print("Write finished")


# Start one download thread per chapter URL tuple
for i in url_list:
    t1 = threading.Thread(target=down_load, args=(i,))
    t1.start()
    print(i)
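
The loop above only starts the threads; nothing waits for them, so the script can reach its end while chapters are still being written. A minimal sketch of the same loop with an explicit join, assuming the down_load function and url_list defined above:

threads = []
for i in url_list:
    t = threading.Thread(target=down_load, args=(i,))
    t.start()
    threads.append(t)

# Block until every chapter thread has finished writing its file
for t in threads:
    t.join()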

When cleaning the data I originally wanted to work from txt_file_all.text, but the whitespace in it could not be turned into line breaks, so I ended up doing string replacements on the whole txt_file_all instead. A bit more cumbersome, but it gets the job done!
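
For comparison, the approach hinted at by the commented-out find_all("p") line also works: join the text of each <p> tag with newlines instead of replacing raw HTML tags. A minimal sketch, assuming the same tittle, txt_file_all and book_name as in down_load above:

paragraphs = txt_file_all.find_all("p")  # one <p> per paragraph of the chapter body
txt = tittle.string + "\n" + "\n".join(p.get_text() for p in paragraphs)
with open("./" + book_name + "/" + tittle.string + ".txt", "w", encoding="utf-8") as f:
    f.write(txt)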

posted @ 2019-07-12 21:11  哈哈哈滴滴