python 爬取能源网

import requests
import re
import time
from bs4 import BeautifulSoup
import csv
import xlrd
from xlutils.copy import copy
import random
# Silence urllib3 warnings; the spider below sends HTTPS requests with
# verify=False, which would otherwise emit a warning on every request.
requests.packages.urllib3.disable_warnings()
class Spider():
    """Collect search-result titles from www.dlzb.com for one keyword.

    Result pages are fetched one at a time, the titles are extracted from
    the result list and appended, together with the keyword, to the first
    sheet of the workbook at ``self.path``.
    """

    # Captures the inner HTML of the <ul> that holds the search results.
    _LIST_RE = re.compile(r'<ul class=\"gclist_ul listnew\">(.*?)<\/ul>', re.S)
    # Captures the inner HTML of every result-title link inside that list.
    _TITLE_RE = re.compile(r'<a class=\"gccon_title\".*?>(.*?)<\/a>', re.S)

    def __init__(self,keyworks):
        """Store the keyword and derive the search URL and output paths.

        Args:
            keyworks: Search keyword; it is appended to the search URL as-is.
        """
        self.kw = keyworks
        # NOTE(review): the Cookie is a hard-coded session value captured
        # from a browser; it presumably expires and then needs refreshing.
        self.headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        "Cookie":"D3z_vi-ds=f1f6d61ffd02c29c1cd832a363888be3; __jsluid_s=0b360d705e0e333a682280ae3b03bf90; Hm_lvt_c909c1510b4aebf2db610b8d191cbe91=1655284406; Hm_lpvt_c909c1510b4aebf2db610b8d191cbe91=1655285546",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
        }
        self.csv_name = "123.csv"
        self.url = "https://www.dlzb.com/zb/search.php?kw="+str(self.kw)
        self.path = "123.xls"

    def getContent(self, nextUrl, timeout=30):
        """Fetch ``nextUrl`` with the spider's headers and return the response.

        Args:
            nextUrl: Absolute URL of the page to download.
            timeout: Seconds to wait for the server before requests raises
                a timeout error, so a stalled connection cannot hang the
                crawl indefinitely.

        Returns:
            The ``requests`` response object for the GET request.
        """
        # SECURITY: verify=False turns off TLS certificate verification, so
        # the server's identity is not checked. Kept as in the original.
        return requests.get(
            nextUrl, headers=self.headers, verify=False, timeout=timeout
        )

    def writeXlx(self,title,kw):
        """Append one ``[title, kw]`` row to the CSV file ``self.csv_name``.

        Despite its name this method writes CSV, not XLS.

        Args:
            title: Value for the first column.
            kw: Value for the second column.
        """
        # The context manager closes the file even if the write fails.
        with open(self.csv_name, 'a+', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow([title, kw])

    def getPage(self,p,total):
        """Crawl up to ``total`` result pages starting at page number ``p``.

        Each page's titles are stored via ``writeXLSAppend``; the crawl then
        sleeps a random 1-10 seconds before requesting the next page. It
        stops early at the first page on which the result list is not found.

        Args:
            p: Number of the first page to request.
            total: Maximum number of pages to request.
        """
        for num in range(total):
            page = num + p
            nextUrl = self.url+"&page="+str(page)
            print(nextUrl)

            html = self.getContent(nextUrl).text
            res = self._LIST_RE.findall(html)
            if not res:
                # No result list on this page: dump what came back to help
                # diagnose (end of results, block page, ...) and stop.
                print(self.kw,"end*************")
                print(html)
                print(res)
                break

            # NOTE(review): a result list that contains no title links does
            # not stop the crawl; paging continues until ``total`` is hit.
            data = []
            for title in self._TITLE_RE.findall(res[0]):
                # Strip any nested HTML tags from the link's inner HTML.
                text = BeautifulSoup(title, 'html.parser').get_text()
                print(str(self.kw) + "-第" + str(page), text)
                data.append([text, self.kw])
            self.writeXLSAppend(data)

            # Random pause between pages to avoid hammering the site.
            time.sleep(random.uniform(1, 10))

    def writeXLSAppend(self,value):
        """Append rows to the first sheet of the workbook at ``self.path``.

        The workbook must already exist: it is opened with xlrd, copied to a
        writable workbook with ``xlutils.copy`` and saved back to the same
        path. Nothing is read or written when ``value`` is empty.

        Args:
            value: Sequence of rows, each row a sequence of cell values.
        """
        if not value:
            # Nothing to append; skip reopening and rewriting the workbook.
            return

        workbook = xlrd.open_workbook(self.path)
        # Number of rows already present in the first sheet.
        rows_old = workbook.sheet_by_index(0).nrows
        # xlrd workbooks are read-only; convert to a writable copy.
        new_workbook = copy(workbook)
        new_worksheet = new_workbook.get_sheet(0)
        for i, row in enumerate(value):
            for j, cell in enumerate(row):
                # New rows start right after the existing ones.
                new_worksheet.write(i + rows_old, j, cell)
        new_workbook.save(self.path)

        print("xls格式表格【追加】写入数据成功!")
if __name__ == '__main__':
    # Example search URL:
    # https://www.dlzb.com/zb/search.php?kw=%E6%99%BA%E8%83%BD%E5%AE%A2%E6%9C%8D
    # Named ``keywords`` rather than ``list`` so the builtin is not shadowed.
    keywords = ['人工智能','知识图谱','计算机视觉','图像识别','文本挖掘','文本分析','知识问答','神经网络']

    for keyword in keywords:

        print(keyword)
        # Start at page 1 and request at most 1000 pages per keyword.
        # getPage returns nothing, so its result is not stored.
        Spider(keyword).getPage(1,1000)

 

posted @ 2022-06-16 14:03  freedom/cn  阅读(54)  评论(0)  编辑  收藏  举报