
Scraping book information with a Python crawler

# encoding=utf-8

import csv  # writes the scraped records out as CSV
import time
from random import randint

import requests  # HTTP client, used to fetch the listing pages
from lxml import etree  # parses the returned HTML for the XPath queries below

# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')  # change stdout's default encoding (needs `import sys, io`)

# Global lists that accumulate the scraped fields across pages
booknamelist = []  # book titles
authorlist = []  # authors
typelist = []  # sub-categories
contentlist = []  # synopses
novel = []  # top-level category label written into the last CSV column


def get_data(url):
    """
    Fetch a page.
    :param url: URL to request
    :return: the page content, or '' on failure
    """
    # Request header that mimics a browser; without it the site answers 418
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36 Edg/125.0.0.0'}
    time.sleep(randint(1, 8))  # throttle the crawler; hitting the site too fast gets the IP banned after a couple of pages
    resp = requests.get(url=url, headers=header)  # send the request
    resp.encoding = 'utf-8'
    if resp.status_code == 200:
        # On success, return the page body
        return resp.text
    else:
        # Otherwise print the status code and return an empty string
        print('status code:', resp.status_code)
        return ''


# Write the collected data to a CSV file
def writeIntoCSVFile(fileName):
    '''
    :param fileName: path of the CSV file to save
    :return: None
    '''
    # newline='' prevents the csv writer from inserting blank lines between rows
    # Reference: https://blog.csdn.net/weixin_44064937/article/details/105745398
    f = open(fileName, 'w', newline='', encoding='utf-8')
    csv_writer = csv.writer(f)
    # Header row: author, title, category, synopsis, top-level category
    csv_writer.writerow(['作者', '书名', '类型', '简介', '小说'])
    for i in range(len(booknamelist)):
        csv_writer.writerow(
            [str(authorlist[i]).strip(), str(booknamelist[i]).strip(), str(typelist[i]).strip(),
             str(contentlist[i]).strip(), str(novel[i])])  # strip() removes the surrounding whitespace
    f.close()

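# Optional tweak, not part of the original script: Excel misreads BOM-less UTF-8
# CSVs that contain Chinese text; writing with encoding='utf-8-sig' adds a BOM
# so Excel detects the encoding correctly, e.g.
#     f = open(fileName, 'w', newline='', encoding='utf-8-sig')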

# Crawl listing pages 30–99; widen the range to collect more records
for i in range(30, 100):

    url = 'http://read.nlc.cn/yuewen/index?&pageNo=' + str(i) + '&categoryId=14500'  # pageNo and categoryId can be adjusted for other listings
    html = get_data(url=url)  # fetch the page
    root = etree.HTML(html)
    booknames = root.xpath('//li/a/span[@class="right"]/span[@class="tt"]/text()')  # book titles
    authors = root.xpath('//li/a/span[@class="right"]/span[@class="txt1"]/text()')  # author lines
    types = root.xpath('//li/a/span[@class="right"]/span[@class="txt1"]/i/text()')  # categories
    contents = root.xpath('//li/a/span[@class="right"]/span[@class="txt2"]/text()')  # synopses
    for bookname in booknames:
        booknamelist.append(bookname.strip())  # strip() returns a new string, so append its result
    for author in authors:
        author = author.strip()
        if author:  # the raw XPath result contains whitespace-only entries; keep only real names
            authorlist.append(author)
    for book_type in types:  # renamed from `type` to avoid shadowing the built-in
        typelist.append(book_type.strip())
    for content in contents:
        novel.append("传记")  # top-level category label; change it to match the category being crawled
        contentlist.append(content.strip())
    print("", i, "页爬取完成")
    # print("bookname:list:", booknamelist)
    # print("authorlist:", authorlist)
    writeIntoCSVFile(fileName='data/cultural2.txt')
    # print(html + "555555555555555555555")

print('done')
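To sanity-check the output, a minimal sketch along these lines (assuming the script above has already written data/cultural2.txt) reads the file back with the standard csv module and counts the records:

import csv

with open('data/cultural2.txt', newline='', encoding='utf-8') as f:
    rows = list(csv.reader(f))

header, records = rows[0], rows[1:]
print('columns:', header)  # 作者, 书名, 类型, 简介, 小说
print('record count:', len(records))
if records:
    print('first record:', records[0])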

 

posted on 2024-06-17 19:53  角落的蘑菇