[Web Scraping with Selenium] Batch News Crawling
1. Results Preview
2. Approach
Step 1: Get the news list information
Whether you are scraping news or anything else, the first thing you see when you open a site is a thumbnail list of its content. On this page we first collect the basic information for each news item: the detail-page URL (the most important field, because the front page shows very little about any single item, and without this URL we cannot scrape the full details later), the cover image, the title, and the author.
Step 2: Get the details of each news item
Extract each article's body text (video is excluded, because it is heavily protected and cannot be retrieved for now).
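Put together, the whole crawl is two nested loops: fetch every channel list page, pull out each item's detail URL, then fetch and parse each detail page. A bare-bones sketch of that flow (the function names simply mirror the ones implemented in section 3 below):

for list_url in urls:                                  # step 1: every channel list page
    list_html = getHttpContent(list_url, flag="list")
    for item in parseList(list_html, list_url, "a"):   # basic info incl. the detail URL
        detail_html = getHttpContent(item["link"])     # step 2: each item's detail page
        parseInfo(detail_html, item)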
3. Implementation
3.1 Imports & Global Variables
import hashlib
import traceback
import time
import urllib.request
from urllib.parse import urlparse
from selenium import webdriver
from common.models.news import News  # my own model class
import os
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from application import app, db
from common.libs.FLHelper.DateHelper import getCurrentTime  # my own helper

date = getCurrentTime(frm="%Y%m%d")
source = "puffpost"
urls = ["https://new.qq.com/ch/antip/", "https://new.qq.com/ch/ent/", "https://new.qq.com/ch/milite/",
        "https://new.qq.com/ch/world/", "https://new.qq.com/ch/tech/", "https://new.qq.com/ch/finance/"]
a_urls = ["https://new.qq.com/ch/antip/", "https://new.qq.com/ch/ent/", "https://new.qq.com/ch/tech/", "https://new.qq.com/ch/finance/"]
b_urls = ["https://new.qq.com/ch/milite/", "https://new.qq.com/ch/world/"]
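The HTTPAdapter and Retry imports are never used in the code shown in this post; they are normally attached to a requests.Session so that transient HTTP failures are retried automatically. A minimal sketch of that pattern, with retry counts and back-off values that are illustrative only:

# Hypothetical helper (not part of the original script): a requests.Session
# that retries transient failures such as 502/503 responses.
def make_retry_session(retries=3, backoff=0.5):
    session = requests.Session()
    retry = Retry(total=retries, backoff_factor=backoff,
                  status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session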
3.2 Fetch the News List
def getList():
    """
    Fetch the list information for every channel page.
    :return: None
    """
    print("Fetching the news list...")
    for url in urls:  # loop over the channel pages
        genre = "a"
        if url in b_urls:
            genre = "b"
        print("get list: " + url)
        content = getHttpContent(url, flag="list")  # the VPN must be off when fetching content online
        items_data = parseList(content, url, genre)  # parse the page into [{title, detail URL, ...}, ...]
        for item in items_data:  # loop over the individual news items
            tmp_content = getHttpContent(item["link"])  # content of one news detail page
            parseInfo(tmp_content, item)
def getHttpContent(url, flag=""):
    """
    Fetch a page's content online.
    :param flag: "list" makes the function scroll the page so lazy-loaded items render
    :param url: page URL
    :return: page source, or None on failure
    """
    print("Fetching page content online...")
    driver = None
    try:
        # path to chromedriver
        driver = webdriver.Chrome(executable_path="C:/Program Files/Google/Chrome/Application/chromedriver.exe")
        driver.get(url=url)
        if flag == "list":
            # scroll down in small steps so the infinite-scroll list keeps loading
            for i in range(1, 200):
                time.sleep(0.5)
                driver.execute_script("window.scrollTo(window.scrollX, %d);" % (i * 200))
        return driver.page_source
    except Exception as e:
        traceback.print_exc()
        return None
    finally:
        if driver:
            driver.quit()  # always release the browser, otherwise every call leaks a Chrome instance
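Opening a visible Chrome window for every single page is slow. Selenium can also drive Chrome headless, and the fixed 200-step scroll can be replaced by scrolling until the page height stops growing. The sketch below combines both ideas; the option names are standard Selenium 3 / Chrome flags, but the loop is my own variant, not the original author's code:

from selenium.webdriver.chrome.options import Options

def getHttpContentHeadless(url, flag=""):
    options = Options()
    options.add_argument("--headless")  # run Chrome without a visible window
    driver = webdriver.Chrome(executable_path="C:/Program Files/Google/Chrome/Application/chromedriver.exe",
                              chrome_options=options)
    try:
        driver.get(url)
        if flag == "list":
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(0.5)
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:  # nothing new was lazy-loaded, stop scrolling
                    break
                last_height = new_height
        return driver.page_source
    finally:
        driver.quit()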
3.3 Parse the News List for Summary Information
def parseList(content, url, genres):
    """
    Parse the content of a news list page.
    :param url: the list page URL (used to complete relative links)
    :param content: the page content to parse
    :return: a list of dicts with each item's title and detail-page URL
    """
    print("Parsing the news list page content...")
    data = []
    url_info = urlparse(url=url)
    url_domain = url_info[0] + "://" + url_info[1]  # scheme + netloc, e.g. "https://new.qq.com"
    tmp_soup = BeautifulSoup(str(content), "html.parser")
    if genres == "a":
        tmp_list = tmp_soup.select("div#List div.channel_mod ul#dataFull.list li.item.cf.itme-ls")
    else:
        tmp_list = tmp_soup.select("div#List div.hotnews ul#hot_scroll.list li.item.cf.itme-ls")
    for item in tmp_list:
        try:
            tmp_genre = url.split("/")[-2]  # channel name, e.g. "tech"
            tmp_target = item.select("a.picture")
            tmp_href = tmp_target[0]["href"]
            tmp_target = item.select("a.picture img")
            tmp_name = tmp_target[0]["alt"]
            tmp_pic = tmp_target[0]["src"]
            tmp_Authors = item.select("div.detail div.binfo.cf div.fl a.source")[0].getText()
            if len(tmp_Authors) == 0:
                tmp_Authors = item.select("div.detail div.binfo.cf div.fl span.source")[0].getText()
            print(tmp_name)
            if "http:" not in tmp_href and "https:" not in tmp_href:
                tmp_href = url_domain + tmp_href  # turn a relative link into an absolute one
            tmp_data = {
                "title": tmp_name,
                "link": tmp_href,
                "photo": tmp_pic,
                "hash": hashlib.md5(tmp_href.encode("utf-8")).hexdigest(),
                "authors": tmp_Authors,
                "genres": tmp_genre
            }
            data.append(tmp_data)
        except Exception as e:
            traceback.print_exc()
    print(data)
    return data
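For reference, each element of the returned list has roughly the following shape; every value below is an invented example, not real scraped data:

# Illustrative shape of one parseList() entry (values are made up).
{
    "title": "Some headline",
    "link": "https://new.qq.com/omn/20240101/20240101A00001.html",
    "photo": "https://inews.gtimg.com/newsapp_ls/0/123456/0",
    "hash": "5f4dcc3b5aa765d61d8327deb882cf99",  # md5 of the link, used later for de-duplication
    "authors": "Some Source",
    "genres": "tech"
}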
3.4 Parse the Detail Page for Full Information
def parseInfo(content, item):
    """
    Parse the content of a detail page and save the news item.
    :return: True on success, False if the item is skipped
    """
    print("Parsing the detail page content...")
    soup = BeautifulSoup(str(content), "html.parser")
    try:
        tmp_text = soup.select("div.content-article p.one-p")
        item["text"] = ""
        for text in tmp_text:
            if text.select("img"):  # skip paragraphs that only wrap an image
                continue
            item["text"] += text.getText()
        if item["text"] == "":
            print("The page has no text content, skipping...")
            return False
        download_pics(item)
        app.logger.warning(item["text"])
        # assemble the publication date from its separate year / month-day / time nodes
        tmp_year = soup.select("div.year.through span")[0].getText()
        tmp_month_day = soup.select("div.md")[0].getText().replace("/", "-")
        tmp_time = soup.select("div.time")[0].getText() + ":00"
        tmp_date = tmp_year + "-" + tmp_month_day + " " + tmp_time
        item["date"] = tmp_date
        tmp_news_info = News.query.filter_by(hash=item["hash"]).first()
        item["view_counter"] = 0
        if tmp_news_info:  # already stored, skip the duplicate
            return False
        tmp_model_info = News(**item)
        db.session.add(tmp_model_info)
        db.session.commit()
    except Exception as e:
        traceback.print_exc()
    return True
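One thing the exception handler above does not do is reset the database session: after a failed commit, SQLAlchemy leaves the session in an aborted state and later inserts will also fail. A small sketch of the usual fix, assuming the same Flask-SQLAlchemy db object (this is my suggestion, not part of the original post):

# Sketch: roll back a failed transaction so the next item can still be saved.
try:
    db.session.add(News(**item))
    db.session.commit()
except Exception:
    db.session.rollback()  # discard the aborted transaction
    traceback.print_exc()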
def download_pics(item):
    # save the cover image under the item's hash so the file name is unique
    with open('E:/dnr-bisher/static/images/news/' + str(item["hash"]) + ".jpg", 'wb') as f:
        f.write(urllib.request.urlopen(item["photo"]).read())
    print("Downloaded image: " + item["photo"])
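urllib.request.urlopen is used here without a timeout, so a dead image link will hang the whole crawl. Since requests is already imported, the download could also go through a session with a timeout; the sketch below is an alternative I am suggesting (the 10-second timeout and chunk size are illustrative values), not the original implementation:

# Alternative download using requests with a timeout and streaming writes.
def download_pics_requests(item, session=None):
    session = session or requests.Session()
    resp = session.get(item["photo"], timeout=10, stream=True)
    resp.raise_for_status()
    path = 'E:/dnr-bisher/static/images/news/' + str(item["hash"]) + ".jpg"
    with open(path, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)
    print("Downloaded image: " + item["photo"])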
3.5 Run the Code
getList()
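If this file is ever imported by another module (for example a scheduler), it is common to guard the call so the crawl only starts when the script is run directly:

if __name__ == "__main__":  # only crawl when executed as a script, not when imported
    getList()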