[Python]网络小说爬取、爬虫

1.源代码

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File  : HtmlParser.py
# @Author: 赵路仓
# @Date  : 2020/3/27
# @Desc  :
# @Contact : 398333404@qq.com 

import requests
from bs4 import BeautifulSoup

# Request headers: a desktop Chrome User-Agent sent with every HTTP request
# made by the functions in this script.
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
}
# Sample novel index URL. No function in this listing reads it; it appears
# only in a commented-out call under the __main__ guard.
url = 'https://www.xsbiquge.com/74_74627/'
# Number of chapters fetched so far; incremented by single_page().
count = 0


def menu(name):
    """Look up a novel by title and save its chapter URLs to ``test.txt``.

    The novel's index page is resolved with ``search(name)``, downloaded,
    and every link inside the first ``<dl>`` element is written to
    ``test.txt`` (UTF-8, one absolute URL per line, in page order).

    Args:
        name: Exact title of the novel to look up.

    Raises:
        ValueError: If ``search`` finds no matching title, or the index
            page contains no ``<dl>`` chapter list.
    """
    index_url = search(name)
    if not index_url:
        raise ValueError("no search result matches title: %r" % (name,))

    r = requests.get(index_url, headers=head)
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")
    chapter_list = soup.find("dl")
    if chapter_list is None:
        raise ValueError("no chapter list (<dl>) found at %s" % (index_url,))

    # The file is opened only after the page was fetched and parsed, so a
    # failed request no longer truncates an existing test.txt, and the
    # handle is always closed even if a write fails.
    with open("test.txt", "w", encoding="utf-8") as f:
        # find_all(..., href=True) yields only anchors that carry an href,
        # so text nodes and headings are skipped without relying on a
        # catch-all exception handler.
        for link in chapter_list.find_all("a", href=True):
            f.write("https://www.xsbiquge.com" + str(link["href"]) + "\n")


def single_page(url):
    """Download one chapter page and return it as plain text.

    The page is fetched with the shared ``head`` headers and decoded with
    the encoding detected from its body. The returned string is the text
    of the first ``<h1>``, a newline, the markup of ``<div id="content">``
    with its wrapper tags and ``<br/>`` runs replaced by whitespace, and a
    trailing newline.

    Each successful call increments the module-level ``count`` and prints
    a progress line.

    Args:
        url: Address of the chapter page.

    Returns:
        The chapter title and body as a single string.
    """
    global count
    response = requests.get(url, headers=head)
    response.encoding = response.apparent_encoding
    page = BeautifulSoup(response.text, "html.parser")
    heading = page.find("h1")
    body = page.find("div", {"id": "content"})

    title_text = str(heading.string)

    # Applied in order: the four-<br/> run must be collapsed before the
    # two-<br/> run, otherwise it would turn into two blank-line breaks.
    substitutions = (
        ("<div id=\"content\">", "    "),
        ("<br/><br/><br/><br/>", "\n\n"),
        ("<br/><br/>", "\n\n"),
        ("</div>", ""),
    )
    body_text = str(body)
    for old, new in substitutions:
        body_text = body_text.replace(old, new)

    chapter_text = title_text + "\n" + body_text + "\n"
    count += 1
    print("当前第%d章" % (count))
    return chapter_text


def content(path, name):
    """Download every chapter of the novel *name* into one text file.

    ``menu(name)`` is called first to write the chapter URLs to
    ``test.txt``. The output file ``path + name + ".txt"`` is then created
    (overwriting any existing file), starting with the title followed by a
    blank line, and each chapter returned by ``single_page`` is appended
    in the order the URLs are listed.

    Args:
        path: Prefix prepended verbatim to the output file name, e.g.
            ``"E:/"``; it is concatenated, not joined, so include any
            trailing separator.
        name: Exact title of the novel; also used as the file name stem.
    """
    menu(name)
    # A single `with` closes both files even when a chapter download
    # raises part-way through, so the chapters written so far are flushed
    # and no handle is leaked.
    with open(path + name + ".txt", "w", encoding="utf-8") as fw, \
            open("test.txt", "r", encoding="utf-8") as f:
        fw.write(name + "\n\n")
        for line in f:
            chapter_url = line.rstrip("\n")
            if not chapter_url:
                # A blank line is not a URL; skip it instead of requesting "".
                continue
            fw.write(single_page(chapter_url))


def search(name):
    """Search the site for a novel and return the URL of its index page.

    The title and link of every search result are printed as they are
    examined. The first result whose ``title`` attribute equals *name*
    exactly is returned.

    Args:
        name: Exact title of the novel to look for.

    Returns:
        The ``href`` of the matching result, or ``None`` when no result
        has exactly this title.
    """
    # Passing the keyword through `params` lets requests percent-encode it,
    # so titles containing characters such as '&', '#' or '+' no longer
    # corrupt the query string.
    r = requests.get(
        "https://www.xsbiquge.com/search.php",
        params={"keyword": name},
        headers=head,
    )
    soup = BeautifulSoup(r.text, "html.parser")
    for t in soup.find_all("a", {"cpos": "title"}):
        # .get() tolerates result anchors that lack a title or href
        # attribute instead of raising KeyError.
        title = t.attrs.get("title")
        href = t.attrs.get("href")
        print(title)
        print(href)
        if title == name and href:
            return href
    return None


if __name__ == "__main__":
    # The commented-out calls below exercise the individual helpers by hand.
    # print(single_page("https://www.xsbiquge.com/74_74627/3845841.html"))
    # NOTE: stale call -- menu() as defined in this file takes only a title.
    # menu("E:/test.txt",url)
    # content("E:/", "万古大帝")
    # search("斗破苍穹")
    # Download the novel titled "斗破苍穹" into E:/斗破苍穹.txt.
    content("E:/", "斗破苍穹")

爬取结果:

 

posted @ 2020-04-25 08:10  雾霾王者  阅读(663)  评论(0)  编辑  收藏  举报