[Web Scraping with Selenium] Batch News Crawling

1. Results

(Screenshot of the crawl results.)

2. Approach

  • step 1: get the news list

    Whatever you are crawling, news or anything else, the first thing you see after opening a site is a thumbnail list of its content. On this page we first collect each news item's basic information: the detail-page URL (the most important field! The front page shows very little per item, so you must capture this URL in order to crawl the full story afterwards), the cover image, the title, and the author.

  • step 2: get the details of each news item

    Fetch the article body for each item (excluding video, which is heavily protected and cannot be retrieved for now). A minimal sketch of this two-step, list-then-detail pattern follows below.
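
These two steps are just the generic list-then-detail crawl pattern. Here is a minimal sketch of the idea, assuming a hypothetical site and selector (the real Tencent News selectors appear in section 3.3):

import requests
from bs4 import BeautifulSoup

LIST_URL = "https://example.com/news"  # placeholder, not the real target

list_html = requests.get(LIST_URL, timeout=10).text
soup = BeautifulSoup(list_html, "html.parser")

for a in soup.select("a.news-item"):  # step 1: collect each item's detail URL
    detail_url = a["href"]
    detail_html = requests.get(detail_url, timeout=10).text  # step 2: fetch the full story
    # ...parse detail_html for body text, date, author, etc.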

3. Implementation

3.1 Imports & Global Variables

import hashlib
import traceback
import time
import urllib.request  # needed by download_pics below
from urllib.parse import urlparse
from selenium import webdriver
from common.models.news import News  # my own model
import os
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from application import app, db
from common.libs.FLHelper.DateHelper import getCurrentTime  # my own helper

date = getCurrentTime(frm="%Y%m%d")
source = "puffpost"
urls = ["https://new.qq.com/ch/antip/", "https://new.qq.com/ch/ent/", "https://new.qq.com/ch/milite/",
        "https://new.qq.com/ch/world/", "https://new.qq.com/ch/tech/", "https://new.qq.com/ch/finance/"]
a_urls = ["https://new.qq.com/ch/antip/", "https://new.qq.com/ch/ent/", "https://new.qq.com/ch/tech/", "https://new.qq.com/ch/finance/"]
b_urls = ["https://new.qq.com/ch/milite/", "https://new.qq.com/ch/world/"]
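
Note that HTTPAdapter and Retry are imported above but never used in the code shown here; presumably they back a retrying requests session elsewhere in the project. A typical wiring (my assumption, not the original code, using the imports above) looks like this:

def make_session():
    # Retry transient server errors with exponential backoff.
    retry = Retry(total=3, backoff_factor=0.5, status_forcelist=[500, 502, 503, 504])
    session = requests.Session()
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session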

3.2 Fetching the News List

def getList():
    """
    Fetch the list info for every channel
    :return: None
    """
    print("Fetching the news list...")
    for url in urls:  # loop over channel pages
        genre = "a"
        if url in b_urls:
            genre = "b"
        print("get list: " + url)
        content = getHttpContent(url, flag="list")  # the VPN must be off when fetching content from the live site
        items_data = parseList(content, url, genre)  # parse the page into [{title, detail URL, ...}, ...]
        for item in items_data:  # loop over individual news items
            tmp_content = getHttpContent(item["link"])  # content of one detail page
            parseInfo(tmp_content, item)

def getHttpContent(url, flag=""):
    """
    Fetch a page's content from the live site
    :param flag: "list" triggers scrolling so lazily loaded items render
    :param url: page URL
    :return: page source, or None on failure
    """
    print("Fetching page content...")
    driver = None
    try:
        driver = webdriver.Chrome(executable_path="C:/Program Files/Google/Chrome/Application/chromedriver.exe")  # path to chromedriver
        driver.get(url=url)
        if flag == "list":
            for i in range(1, 200):  # scroll down in 200px steps to trigger lazy loading
                time.sleep(0.5)
                driver.execute_script("window.scrollTo(window.scrollX, %d);" % (i * 200))
        return driver.page_source
    except Exception:
        traceback.print_exc()
        return None
    finally:
        if driver:
            driver.quit()  # always release the browser, even on failure
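
getHttpContent opens a visible browser window for every page. If you do not need to watch the crawl, Selenium's headless mode is usually faster and works on machines without a display; here is a sketch of the variant driver setup (same assumed chromedriver path as above):

from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")     # no visible window
options.add_argument("--disable-gpu")
driver = webdriver.Chrome(executable_path="C:/Program Files/Google/Chrome/Application/chromedriver.exe",
                          options=options)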

3.3 Parsing the List Page for Summary Info

def parseList(content, url, genres):
    """
    Parse the content of a news list page
    :param content: page content to parse
    :param url: the list page's URL (used to rebuild relative links)
    :param genres: "a" for regular channels, "b" for the hot-news channels
    :return: a list of dicts with each item's title, detail URL, image, author, etc.
    """
    print("Parsing the news list page...")
    data = []
    url_info = urlparse(url=url)
    url_domain = url_info[0] + "://" + url_info[1]  # scheme + "://" + host
    tmp_soup = BeautifulSoup(str(content), "html.parser")
    if genres == "a":
        tmp_list = tmp_soup.select("div#List div.channel_mod ul#dataFull.list li.item.cf.itme-ls")
    else:
        tmp_list = tmp_soup.select("div#List div.hotnews ul#hot_scroll.list li.item.cf.itme-ls")
    for item in tmp_list:
        try:
            tmp_genre = url.split("/")[-2]
            tmp_target = item.select("a.picture")
            tmp_href = tmp_target[0]["href"]
            tmp_target = item.select("a.picture img")
            tmp_name = tmp_target[0]["alt"]
            tmp_pic = tmp_target[0]["src"]
            tmp_authors = item.select("div.detail div.binfo.cf div.fl a.source")[0].getText()
            if len(tmp_authors) == 0:
                tmp_authors = item.select("div.detail div.binfo.cf div.fl span.source")[0].getText()
            print(tmp_name)
            if "http:" not in tmp_href and "https:" not in tmp_href:
                tmp_href = url_domain + tmp_href  # turn a relative href into an absolute one
            tmp_data = {
                "title": tmp_name,
                "link": tmp_href,
                "photo": tmp_pic,
                "hash": hashlib.md5(tmp_href.encode("utf-8")).hexdigest(),  # dedup key
                "authors": tmp_authors,
                "genres": tmp_genre
            }
            data.append(tmp_data)
        except Exception:
            traceback.print_exc()
    print(data)
    return data
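
Two details in parseList are easy to miss. urlparse splits a URL into components (index 0 is the scheme, index 1 is the host), which is how relative hrefs are rebuilt into absolute ones, and the MD5 of the detail URL serves as a stable dedup key across runs. A quick demonstration with the imports from section 3.1 (the article URL below is made up):

info = urlparse(url="https://new.qq.com/ch/tech/")
print(info[0], info[1])           # https new.qq.com
print(info[0] + "://" + info[1])  # https://new.qq.com

# The same link always hashes to the same key, so re-runs can skip stored news.
print(hashlib.md5("https://new.qq.com/omn/FAKE123".encode("utf-8")).hexdigest())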

3.4 Parsing the Detail Page for Full Info

def parseInfo(content, item):
    """
    Parse the content of a detail page and store the news item
    :param content: page content to parse
    :param item: the dict built by parseList, filled in further here
    :return: True on success, False if the item is skipped
    """
    print("Parsing the detail page...")
    soup = BeautifulSoup(str(content), "html.parser")
    try:
        tmp_text = soup.select("div.content-article p.one-p")
        item["text"] = ""
        for text in tmp_text:
            if text.select("img"):  # skip paragraphs that only wrap an image
                continue
            item["text"] += text.getText()
        if item["text"] == "":
            print("Page has no text content, skipping...")
            return False
        download_pics(item)
        app.logger.warning(item["text"])
        tmp_year = soup.select("div.year.through span")[0].getText()
        tmp_month_day = soup.select("div.md")[0].getText().replace("/", "-")
        tmp_time = soup.select("div.time")[0].getText() + ":00"
        tmp_date = tmp_year + "-" + tmp_month_day + " " + tmp_time
        item["date"] = tmp_date
        tmp_news_info = News.query.filter_by(hash=item["hash"]).first()
        item["view_counter"] = 0
        if tmp_news_info:  # already stored on an earlier run: skip
            return False
        tmp_model_info = News(**item)

        db.session.add(tmp_model_info)
        db.session.commit()
    except Exception:
        traceback.print_exc()
        return False
    return True
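
The publication timestamp is stitched together from three separate DOM nodes. With illustrative values, the pieces combine like this:

tmp_year = "2022"            # from div.year.through span
tmp_month_day = "05-30"      # div.md gives "05/30"; "/" is replaced with "-"
tmp_time = "00:06" + ":00"   # div.time gives "00:06"; seconds are appended
print(tmp_year + "-" + tmp_month_day + " " + tmp_time)  # 2022-05-30 00:06:00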
        
def download_pics(item):
    """Save the item's cover image under its hash name."""
    with open('E:/dnr-bisher/static/images/news/' + str(item["hash"]) + ".jpg", 'wb') as f:
        f.write(urllib.request.urlopen(item["photo"]).read())
    print("Image downloaded: " + item["photo"])
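
download_pics calls urllib with no timeout or status check, so one dead image URL can stall the whole crawl. A more defensive variant (my sketch, same assumed local path) using the already-imported requests:

def download_pics_safe(item):
    resp = requests.get(item["photo"], timeout=10, stream=True)
    resp.raise_for_status()  # fail fast on 4xx/5xx instead of writing an error page to disk
    path = 'E:/dnr-bisher/static/images/news/' + str(item["hash"]) + ".jpg"
    with open(path, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)
    print("Image downloaded: " + item["photo"])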

3.5 Running the Code

if __name__ == "__main__":
    getList()