团队项目——爬取搜狐新闻

按照标签名字爬取新闻

from bs4 import BeautifulSoup        #网页解析,获悉数据.231
import re                            #正则表达式
import urllib.request,urllib.error   #制定URL,获取网页数据
import pymysql
import traceback
import time
import requests
import json
#获取指定URL的网页内容
def askUrl(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address handed to ``urllib.request.Request``.

    Returns:
        The decoded page text, or ``""`` when ``urlopen`` raises
        ``URLError`` (its ``code`` and/or ``reason`` are printed instead).
        Other exceptions, e.g. a ``UnicodeDecodeError`` from the decode
        step, are not caught here.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    }
    # No ``__name__`` check here: the function must also work when this
    # module is imported instead of being run as a script.
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError (a URLError subclass) carries ``code``; both carry ``reason``.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
#连接数据库
def get_conn():
    """Open a PyMySQL connection to the ``news`` database on localhost.

    Returns:
        A ``(connection, cursor)`` tuple; the cursor is created from the
        returned connection.
    """
    # NOTE(review): the credentials are hard-coded in source; consider
    # loading them from configuration instead.
    settings = {
        "host": "localhost",
        "user": "root",
        "passwd": "qwer1234",
        "db": "news",
        "charset": "utf8mb4",
    }
    connection = pymysql.connect(**settings)
    return connection, connection.cursor()
#关闭数据库
def close_conn(conn, cursor):
    """Close the cursor first, then the connection, skipping falsy handles.

    Args:
        conn: Connection object, or ``None`` if it was never opened.
        cursor: Cursor object, or ``None`` if it was never created.
    """
    for handle in (cursor, conn):
        if handle:
            handle.close()
#更新新闻数据
def update_news(allinfo):
    """Insert news rows into the ``new`` table and commit them together.

    Args:
        allinfo: Iterable of rows; each row supplies the three parameters
            for the ``title``, ``article`` and ``fenlei`` columns.

    Any ``Exception`` raised while connecting, inserting or committing is
    reported with a traceback and not re-raised. The cursor and connection
    are closed in every case.
    """
    cursor = None
    conn = None
    sql = "insert into new(title, article, fenlei) values(%s,%s,%s)"
    try:
        conn, cursor = get_conn()
        print(f"{time.asctime()}开始更新最新数据")
        for item in allinfo:
            cursor.execute(sql, item)
        # Single commit after all rows: nothing is committed if any insert fails.
        conn.commit()
        print(f"{time.asctime()}更新最新数据完毕")
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit still propagate (after the ``finally`` cleanup).
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)

#插入新闻数据
# def insert_news():
#     cursor = None
#     conn = None
#     try:
#         dic = getdata()[0]  # 0是历史数据字典,1是最新详细数据列表
#         print(f"{time.asctime()}开始插入历史数据")
#         conn, cursor = get_conn()
#         sql = "insert into history values(%s,%s,%s,%s,%s,%s,%s,%s,%s)"
#         for k, v in dic.items():
#             cursor.execute(sql, [k, v.get("confirm"), v.get("confirm_add"),
#                                  v.get("suspect"), v.get("suspect_add"),
#                                  v.get("heal"), v.get("heal_add"),
#                                  v.get("dead"), v.get("dead_add"),
#                                  v.get("confirm"),
#                                  ])
#             conn.commit()
#             print(f"{time.asctime()}插入历史数据完毕")
#     except:
#         traceback.print_exc()
#     finally:
#         close_conn(conn, cursor)
#爬取网页信息
def get_info(baseurl):
    """Download *baseurl* via ``askUrl`` and parse it with ``html.parser``.

    Returns:
        The resulting ``BeautifulSoup`` document.
    """
    return BeautifulSoup(askUrl(baseurl), "html.parser")
#soup处理并转换成字符串
def transport(bs, info):
    """Find every element whose class matches *info* and stringify the result.

    Args:
        bs: Parsed document exposing ``find_all``.
        info: Value passed as the ``class_`` filter.

    Returns:
        ``(matches, text)`` where ``text`` is ``str(matches)``.
    """
    matches = bs.find_all(class_=info)
    return matches, str(matches)
if __name__ == "__main__":
    # Download and parse the Sohu news front page.
    baseurl = "https://news.sohu.com/"
    html = askUrl(baseurl)
    bs = BeautifulSoup(html, "html.parser")
    # Stringify the navigation bar so its hrefs can be pulled out by regex.
    ex_info = bs.find_all(class_="head-nav left")
    info = str(ex_info)
    findLink = re.compile(r'<a href="(.*?)">')
    link = findLink.findall(info)
    # Drop the first two navigation entries, then the entry at index 9 of
    # what remains; the rest are treated as the category pages.
    del link[:2]
    del link[9]
    #新闻分类总个数
    # for item in link:
    #     print(item)
#*************************************************************************************
'''
******************
    体育类新闻
******************
'''
# Sports channel page: the first remaining navigation link.
bs = get_info(link[0])

# Upper part of the first panel: anchors that carry the hover class.
ex_info, info = transport(bs, "theme__color__hover")
print("************************************************************")
findinfo = re.compile(r'<a class="theme__color__hover" href="(.*?)" target="_blank">')
link0 = findinfo.findall(info)
print("************************************************************")

# Every other section uses the same plain-anchor pattern; only the
# container class differs.
findinfo = re.compile(r'<a href="(.*?)" target="_blank">')
section_links = []
for section_class in (
    "s-one_center",
    "z-c-block-list-item",
    "z-c-block-list-item z-c-block-list-item-first",
    "z-c-block-list clear",
    "z-c-block",
    "z-head-news_item",
):
    ex_info, info = transport(bs, section_class)
    section_links.append(findinfo.findall(info))
link1, link2, link3, link4, link5, link6 = section_links

# Concatenate all harvested links, in section order.
linkall = link0 + link1 + link2 + link3 + link4 + link5 + link6
# Keep only links at least 73 characters long; shorter ones are treated as
# non-article links. A new list is built instead of deleting while
# iterating, which skipped the element after every deletion.
linkall = [url for url in linkall if len(url) >= 73]
# Remove duplicate links, keeping the first occurrence and the original
# order (dicts preserve insertion order).
linkall = list(dict.fromkeys(linkall))
for item in linkall:
    print(item)
print(len(linkall))
# print("************************************************************")
allTitle = []
allArticle = []
allImg = []  # image extraction is not implemented; this list stays empty
# Fetch every candidate page once. Pages without an <h1> are treated as
# empty and dropped. The surviving links are collected in a new list rather
# than deleted from ``linkall`` during iteration, which skipped entries and
# let titles/articles drift out of step with the links.
kept_links = []
for index, value in enumerate(linkall):
    bs = get_info(value)
    title = bs.select("h1")
    print(index)
    print(value)
    if not title:
        continue
    headline = title[0].get_text().strip().replace("原创", "")
    print(headline)
    allTitle.append(headline.replace("\n", ""))
    # The first <p> is skipped, as in the original loop starting at index 1
    # (presumably not body text -- TODO confirm).
    article = bs.select("article > p")
    body = "".join(paragraph.get_text() for paragraph in article[1:])
    allArticle.append(body.replace("返回搜狐,查看更多", "").replace("责任编辑:", ""))
    kept_links.append(value)
linkall = kept_links
# Sanity check: the three counts should match.
print(len(linkall))
print(len(allTitle))
print(len(allArticle))
# Build one [title, article, category] row per scraped sports story and
# write them to MySQL.
allinfo = [
    [headline, body, '体育']
    for headline, body in zip(allTitle, allArticle)
]
update_news(allinfo)
'''

爬取旅游类新闻

'''
# Request headers for the travel feed API.
head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
}
# URLs of all travel news articles
linkall = []
# URLs of the JSONP feed pages that list the articles
linkJQ = []
# Article ids taken from the feed
Linkid = []
# Author ids taken from the feed (as strings)
LinkAid = []
# Scraped titles
allTitle = []
# Scraped article bodies
allArticle = []
# Image links (never filled in the code below)
allImg = []
# Rows to be written to MySQL
allinfo = []
# Build each feed-page URL (pages 1..9) and collect ids from its payload.
# f-strings are used instead of ``str(...)`` so this section does not depend
# on the builtin name ``str`` still being intact at this point of the script.
for i in range(1, 10):
    feed_url = (
        'https://cis.sohu.com/cis/feeds?callback=jQuery112404940224114573859_1619226100800'
        '&clientType=3&suv=2011032041009993&pvId=1619226100991dZepSty'
        f'&sceneParam=%5B%7B%22page%22%3A{i}'
        '%2C%22size%22%3A24%2C%22spm%22%3A%22smpc.travel-home.feed%22%7D%5D'
        '&refererSpm=smpc.travel-home.feed&refererPath=%2F'
    )
    linkJQ.append(feed_url)
    res = requests.get(feed_url, headers=head)
    # The response is JSONP: ``callback( <json> )``. Take the text between
    # the first "(" and the last ")" so a trailing ";" or newline does not
    # break JSON parsing.
    payload = res.text
    response_data = json.loads(payload[payload.index('(') + 1:payload.rindex(')')])
    # Record the id and author id of every entry whose id exceeds 1000000.
    for entry in response_data['smpc.travel-home.feed']['data']:
        resource = entry['resourceData']
        if int(resource['id']) > 1000000:
            Linkid.append(resource['id'])
            LinkAid.append(f"{resource['contentData']['authorId']}")

# Build the article URL for every (id, author id) pair.
for article_id, author_id in zip(Linkid, LinkAid):
    linkall.append(f'https://www.sohu.com/a/{article_id}_{author_id}?scm=1004.768163804164063232.0.0.4162&spm=smpc.travel-home.feed.5.1619267001122I92VC4c')
# The last link is an advertisement; drop it (guarded so an empty feed does
# not raise IndexError).
if linkall:
    linkall.pop()
# Scrape title and body for every travel link. Links that are dropped are
# left out of a new list instead of being deleted from ``linkall`` during
# iteration, which skipped the entry following each deletion.
kept_links = []
for index, value in enumerate(linkall):
    bs = get_info(value)
    title = bs.select("h1")
    article = bs.select("article > p")
    if not (title and article):
        print(index)
        print(value)
        continue
    # The first <p> is skipped, as in the original loop starting at index 1
    # (presumably not body text -- TODO confirm).
    body = "".join(paragraph.get_text() for paragraph in article[1:])
    # Size guard kept from the original: 4 * character count must not exceed
    # 16000 (presumably a column byte limit under utf8mb4 -- TODO confirm).
    if len(body) * 4 > 16000:
        print("超出可储存长度")
        continue
    allArticle.append(body.replace("返回搜狐,查看更多", "").replace("责任编辑:", ""))
    headline = title[0].get_text().strip().replace("原创", "")
    allTitle.append(headline.replace("\n", ""))
    print(index)
    print(value)
    print(headline)
    kept_links.append(value)
linkall = kept_links
# for item in linkall:
#     allinfo.append([item])
# Build one [title, article, category] row per scraped travel story, echo
# the rows, then write them to MySQL.
allinfo.extend(
    [headline, body, '旅游']
    for headline, body in zip(allTitle, allArticle)
)
for row in allinfo:
    print(row)
update_news(allinfo)

 

posted @ 2021-05-13 08:04  帅超007  阅读(159)  评论(0编辑  收藏  举报