团队项目——爬取搜狐新闻
按照标签名字爬取新闻
import json
import re
import time
import traceback
import urllib.error
import urllib.request

import pymysql
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with every HTTP request made by this script.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57"
    )
}

# Seconds to wait on any single network operation before giving up.
REQUEST_TIMEOUT = 15

# Links shorter than this are discarded before scraping.
# NOTE(review): the threshold is empirical; presumably it separates article
# URLs from channel/video/gallery links -- confirm against the live site.
MIN_ARTICLE_URL_LEN = 73

# Travel articles are skipped when ``len(text) * 4`` exceeds this value.
# NOTE(review): presumably 4 bytes per utf8mb4 character against the size of
# the ``article`` column -- confirm against the table schema.
MAX_ARTICLE_BYTES = 16000

FRONT_PAGE_URL = "https://news.sohu.com/"
NAV_LINK_RE = re.compile(r'<a href="(.*?)">')
BLANK_TARGET_LINK_RE = re.compile(r'<a href="(.*?)" target="_blank">')

# (CSS class, link regex) pairs harvested from the sports channel page.
SPORTS_LINK_SOURCES = (
    (
        "theme__color__hover",
        re.compile(
            r'<a class="theme__color__hover" href="(.*?)" target="_blank">'
        ),
    ),
    ("s-one_center", BLANK_TARGET_LINK_RE),
    ("z-c-block-list-item", BLANK_TARGET_LINK_RE),
    ("z-c-block-list-item z-c-block-list-item-first", BLANK_TARGET_LINK_RE),
    ("z-c-block-list clear", BLANK_TARGET_LINK_RE),
    ("z-c-block", BLANK_TARGET_LINK_RE),
    ("z-head-news_item", BLANK_TARGET_LINK_RE),
)

# JSONP feed listing travel articles; ``{page}`` is the 1-based page number.
TRAVEL_FEED_URL = (
    "https://cis.sohu.com/cis/feeds"
    "?callback=jQuery112404940224114573859_1619226100800"
    "&clientType=3&suv=2011032041009993&pvId=1619226100991dZepSty"
    "&sceneParam=%5B%7B%22page%22%3A{page}%2C%22size%22%3A24%2C%22spm%22%3A"
    "%22smpc.travel-home.feed%22%7D%5D"
    "&refererSpm=smpc.travel-home.feed&refererPath=%2F"
)
TRAVEL_FEED_KEY = "smpc.travel-home.feed"
TRAVEL_ARTICLE_URL = (
    "https://www.sohu.com/a/{article_id}_{author_id}"
    "?scm=1004.768163804164063232.0.0.4162"
    "&spm=smpc.travel-home.feed.5.1619267001122I92VC4c"
)


def askUrl(url, timeout=REQUEST_TIMEOUT):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded page text, or ``""`` when the request fails. Failures
        are reported on stdout rather than raised, so one bad link does not
        abort a whole crawl.
    """
    request = urllib.request.Request(url, headers=HEADERS)
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.read().decode("utf-8")
    except OSError as e:
        # URLError/HTTPError are OSError subclasses; catching OSError also
        # covers a socket timeout raised while reading the body.
        code = getattr(e, "code", None)
        reason = getattr(e, "reason", None)
        if code is not None:
            print(code)
        if reason is not None:
            print(reason)
        if code is None and reason is None:
            print(e)
    return ""


def get_conn():
    """Open a connection to the local ``news`` database.

    Returns:
        A ``(connection, cursor)`` tuple; release both with ``close_conn``.
    """
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration or environment variables.
    conn = pymysql.connect(
        host="localhost",
        user="root",
        passwd="qwer1234",
        db="news",
        charset="utf8mb4",
    )
    cursor = conn.cursor()
    return conn, cursor


def close_conn(conn, cursor):
    """Close *cursor* and then *conn*; either may be ``None``."""
    if cursor:
        cursor.close()
    if conn:
        conn.close()


def update_news(allinfo):
    """Insert scraped articles into the ``new`` table.

    Args:
        allinfo: Iterable of ``[title, article, category]`` rows.

    Database errors are printed with a traceback instead of being raised,
    and the connection is always closed.
    """
    cursor = None
    conn = None
    try:
        conn, cursor = get_conn()
        sql = "insert into new(title, article, fenlei) values(%s,%s,%s)"
        print(f"{time.asctime()}开始更新最新数据")
        for item in allinfo:
            cursor.execute(sql, item)
        conn.commit()
        print(f"{time.asctime()}更新最新数据完毕")
    except Exception:
        # Best-effort boundary: report the failure and let the script go on.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)


def get_info(baseurl):
    """Fetch *baseurl* and return it parsed as a ``BeautifulSoup`` tree."""
    return BeautifulSoup(askUrl(baseurl), "html.parser")


def transport(bs, info):
    """Find elements by CSS class and also return them rendered as text.

    Args:
        bs: Parsed ``BeautifulSoup`` document.
        info: Value passed to ``find_all(class_=...)``.

    Returns:
        ``(elements, text)`` where *text* is ``str(elements)``, suitable for
        regex matching.
    """
    ex_info = bs.find_all(class_=info)
    return ex_info, str(ex_info)


def _clean_links(links, min_len=MIN_ARTICLE_URL_LEN):
    """Drop links shorter than *min_len* and duplicates, keeping order."""
    return list(dict.fromkeys(u for u in links if len(u) >= min_len))


def _strip_boilerplate(text):
    """Remove Sohu's trailing footer phrases from an article body."""
    return text.replace("返回搜狐,查看更多", "").replace("责任编辑:", "")


def _collect_articles(urls, category, *, require_body=False, max_bytes=None):
    """Scrape every URL once and return ``[title, article, category]`` rows.

    Pages without an ``<h1>`` (and, when *require_body* is true, without
    ``article > p`` paragraphs) are reported and skipped. When *max_bytes*
    is given, bodies with ``len(text) * 4`` above it are skipped too.
    """
    rows = []
    for index, url in enumerate(urls):
        soup = get_info(url)
        headings = soup.select("h1")
        paragraphs = soup.select("article > p")
        if not headings or (require_body and not paragraphs):
            print(index)
            print(url)
            continue

        # The first paragraph is deliberately left out of the stored body.
        # NOTE(review): presumably it is a lead-in/banner -- confirm.
        body = "".join(p.get_text() for p in paragraphs[1:])
        if max_bytes is not None and len(body) * 4 > max_bytes:
            print("超出可储存长度")
            continue

        title = headings[0].get_text().strip().replace("原创", "")
        print(index)
        print(url)
        print(title)
        rows.append(
            [title.replace("\n", ""), _strip_boilerplate(body), category]
        )
    return rows


def _sports_article_urls(channel_url):
    """Return the de-duplicated article links found on the sports channel."""
    soup = get_info(channel_url)
    links = []
    for css_class, pattern in SPORTS_LINK_SOURCES:
        _, markup = transport(soup, css_class)
        links.extend(pattern.findall(markup))
    return _clean_links(links)


def _parse_jsonp(text):
    """Decode the JSON payload of a ``callback(...)`` JSONP response.

    Raises:
        ValueError: If the wrapper parentheses are missing or the payload
            is not valid JSON.
    """
    start = text.index("(") + 1
    end = text.rindex(")")
    return json.loads(text[start:end])


def _travel_article_urls(pages=range(1, 10)):
    """Build travel article URLs from the paginated JSONP feed."""
    # NOTE(review): the source's indentation was lost; entries are parsed
    # for every page here (not only the last one) -- confirm the intent.
    urls = []
    for page in pages:
        feed_url = TRAVEL_FEED_URL.format(page=page)
        try:
            res = requests.get(
                feed_url, headers=HEADERS, timeout=REQUEST_TIMEOUT
            )
            entries = _parse_jsonp(res.text)[TRAVEL_FEED_KEY]["data"]
        except (requests.RequestException, ValueError, KeyError) as exc:
            print(f"Skipping travel feed page {page}: {exc!r}")
            continue

        for entry in entries:
            resource = entry["resourceData"]
            # Only entries with an id above one million are kept.
            if int(resource["id"]) > 1000000:
                urls.append(
                    TRAVEL_ARTICLE_URL.format(
                        article_id=resource["id"],
                        author_id=resource["contentData"]["authorId"],
                    )
                )

    # The last collected link is an advertisement, so it is dropped.
    if urls:
        urls.pop()
    return urls


def main():
    """Scrape sports and travel news from Sohu and store them in MySQL."""
    # ---- Sports -----------------------------------------------------------
    _, nav_markup = transport(get_info(FRONT_PAGE_URL), "head-nav left")
    category_links = NAV_LINK_RE.findall(nav_markup)
    # The first two navigation anchors and the one at index 9 of the
    # remainder are removed before the list is used.
    # NOTE(review): presumably these are not news categories -- confirm.
    del category_links[:2]
    if len(category_links) > 9:
        del category_links[9]

    if category_links:
        sports_urls = _sports_article_urls(category_links[0])
        for url in sports_urls:
            print(url)
        print(len(sports_urls))

        sports_rows = _collect_articles(sports_urls, "体育")
        print(len(sports_rows))
        update_news(sports_rows)
    else:
        print("No category links found on the front page; skipping sports.")

    # ---- Travel -----------------------------------------------------------
    travel_rows = _collect_articles(
        _travel_article_urls(),
        "旅游",
        require_body=True,
        max_bytes=MAX_ARTICLE_BYTES,
    )
    for row in travel_rows:
        print(row)
    update_news(travel_rows)


if __name__ == "__main__":
    main()