Python3:爬取新浪、网易、今日头条、UC四大网站新闻标题及内容
Python3:爬取新浪、网易、今日头条、UC四大网站新闻标题及内容
以爬取相应网站的社会新闻内容为例:
一、新浪:
新浪网的新闻比较好爬取,我是用BeautifulSoup直接解析的,它并没有使用JS异步加载,直接爬取就行了。
"""Scrape society news from Sina News (http://news.sina.com.cn/society/).

Date: 20180920
Author: lizm
Description: download Sina news articles together with their images.
"""
import os
import re
import sys
from urllib import request
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with the image requests.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/68.0.3440.106 Safari/537.36'
    )
}

# Characters removed from a headline before it is used as a file name.
TITLE_JUNK = re.compile(r"""[\s+\.\!\/_,$%^*(+"']+|[+<>?、~*()]+""")


def _node_text(node):
    """Return the text of a parsed node, or '' when the node is missing."""
    return node.get_text() if node is not None else ''


def getNews(title, url, m):
    """Save one article (text plus images) below ``<script dir>/news``.

    The text goes to ``news/<title>.txt``; every image found in a
    ``div.img_wrapper`` goes to ``news/<title>/<title><index>.jpg``.

    Args:
        title: headline from the index page; cleaned and used as file name.
        url: address of the article page.
        m: running index of the article, only used in the progress message.

    Returns:
        Always 0. Pages without a ``div.article`` element are skipped.
    """
    with request.urlopen(request.Request(url)) as response:
        # Undecodable bytes are dropped so non-UTF-8 pages do not abort the run.
        html = response.read().decode('utf-8', 'ignore')
    soup = BeautifulSoup(html, 'lxml')
    article = soup.find('div', class_='article')
    if article is None:
        return 0

    # Publication time and publishing site; fall back to '' when absent.
    source = soup.find('div', 'date-source')
    fb_date = _node_text(source.span if source is not None else None)
    fb_www = _node_text(source.a if source is not None else None)

    title = TITLE_JUNK.sub('', title or '')
    news_dir = os.path.join(sys.path[0], 'news')
    # The output directory may not exist on a first run.
    os.makedirs(news_dir, exist_ok=True)
    text_path = os.path.join(news_dir, title + '.txt')
    with open(text_path, 'w', encoding='utf8') as file_object:
        file_object.write(fb_date + " " + fb_www)
        file_object.write("\n")
        file_object.write("网址:" + url)
        file_object.write("\n")
        file_object.write(title)
        file_object.write(article.get_text())

    image_dir = os.path.join(news_dir, title)
    for i, wrapper in enumerate(article.find_all('div', 'img_wrapper')):
        src = wrapper.img.get('src') if wrapper.img is not None else None
        if not src:
            continue
        os.makedirs(image_dir, exist_ok=True)
        # urljoin resolves protocol-relative ("//host/path") and relative
        # sources against the article URL.
        picture = requests.get(urljoin(url, src), headers=HEADERS)
        # Images are binary, so the raw response bytes are written.
        image_path = os.path.join(image_dir, title + str(i) + '.jpg')
        with open(image_path, 'wb') as image_file:
            image_file.write(picture.content)

    print('成功爬取第', m, '个新闻', title)
    return 0


def getTitle(url):
    """Download every article linked from the society index page.

    The original author's note says the page listed the latest 162 items.

    Args:
        url: address of the index page.
    """
    with request.urlopen(request.Request(url)) as response:
        html = response.read().decode('utf8')
    soup = BeautifulSoup(html, 'lxml')
    listing = soup.find('ul', class_='seo_data_list')
    if listing is None:
        return

    y = 0
    for item in listing.find_all('li'):
        link = item.a
        if link is None or not link.get('href'):
            continue
        print(y, link.string, link.get('href'))
        # ``.string`` is None when the anchor wraps nested tags.
        getNews(link.string or link.get_text(), link.get('href'), y)
        y += 1


if __name__ == '__main__':
    url = 'http://news.sina.com.cn/society/'
    getTitle(url)
二、网易:
网易新闻的标题及内容是使用js异步加载的,单纯的下载网页源代码是没有标题及内容的,我们可以在Network的js中找到我们需要的内容,这里我使用了正则表达式来获取我们需要的标题及其链接,并使用了BeautifulSoup来获取相应标题的内容。
"""Scrape society news from NetEase (163.com)."""
import re
from urllib import request

from bs4 import BeautifulSoup

# Characters stripped from a headline before it becomes a file name
# (the set Windows rejects in file names).
FORBIDDEN_CHARS = str.maketrans('', '', ':"|/\\*<>?')

# Directory prefix the article text files are written to.
OUTPUT_PREFIX = r'D:\code\python\spider_news\NetEase_news\sociaty\\'

# Patterns that pull headlines and article links out of the feed script.
TITLE_RE = re.compile(r'"title":"(.*?)",')
LINK_RE = re.compile(r'"tlink":"(.*?)",')


def download(title, url):
    """Fetch one article page and save its body text as ``<title>.txt``.

    Args:
        title: headline of the article; used (after cleaning) as file name.
        url: address of the article page.

    Pages without a ``div.post_text`` element are skipped.
    """
    with request.urlopen(url) as response:
        page = response.read()
    soup = BeautifulSoup(page, 'lxml')
    body = soup.find('div', class_='post_text')
    if body is None:
        return

    file_name = OUTPUT_PREFIX + title.translate(FORBIDDEN_CHARS) + '.txt'
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(body.get_text())


if __name__ == '__main__':
    urls = [
        'http://temp.163.com/special/00804KVA/cm_shehui.js?callback=data_callback',
        'http://temp.163.com/special/00804KVA/cm_shehui_02.js?callback=data_callback',
        'http://temp.163.com/special/00804KVA/cm_shehui_03.js?callback=data_callback',
    ]
    for url in urls:
        with request.urlopen(url) as response:
            feed = response.read().decode('gbk')
        news_title = TITLE_RE.findall(feed)
        news_url = LINK_RE.findall(feed)
        # zip pairs headlines with links and stops at the shorter list, so a
        # mismatch between the two cannot raise IndexError.
        for i, (headline, link) in enumerate(zip(news_title, news_url)):
            download(headline, link)
            print('正在爬取第' + str(i) + '个新闻', headline)
三、头条:
头条的新闻跟前两个也都不一样,它的标题和链接是封装到json文件中的,但是它json文件的url参数是通过一个js随机算法变化的,所以我们需要模拟json文件的参数,否则就找不到json文件的具体url。我是通过 http://www.jianshu.com/p/5a93673ce1c0 这篇博客才了解到url获取方法的,同时也解决了总是下载重复新闻的问题。该网站自带反爬机制,需要添加cookie。关于新闻的内容,我用正则表达式提取了其中的中文。
"""Scrape society news from Toutiao through its feed API."""
from urllib import request
import requests
import json
import time
import math
import hashlib
import re
from bs4 import BeautifulSoup

# Characters stripped from a headline before it becomes a file name.
FORBIDDEN_CHARS = str.maketrans('', '', ':"|/\\*<>?')

# Directory prefix the article text files are written to.
OUTPUT_PREFIX = r'D:\code\python\spider_news\Toutiao_news\society\\'

# ``content:`` fragments of the article page, and runs of Chinese characters.
CONTENT_RE = re.compile(r'content:(.*?),')
CHINESE_RE = re.compile('[\u4e00-\u9fa5]+')


def get_url(max_behot_time, AS, CP):
    """Build the feed URL for the society category.

    Args:
        max_behot_time: paging cursor (0 for the first page).
        AS: value for the ``as`` query parameter.
        CP: value for the ``cp`` query parameter.

    Returns:
        The complete feed URL as a string.
    """
    url = (
        'https://www.toutiao.com/api/pc/feed/?category=news_society&utm_source=toutiao&widen=1'
        '&max_behot_time={0}'
        '&max_behot_time_tmp={0}'
        '&tadrequire=true'
        '&as={1}'
        '&cp={2}'
    ).format(max_behot_time, AS, CP)
    return url


def get_ASCP():
    """Derive the ``as`` and ``cp`` request parameters from the current time.

    Both values interleave characters of the upper-case hex timestamp with
    characters of the upper-case MD5 digest of the decimal timestamp.

    Returns:
        A ``(AS, CP)`` tuple of strings. Fixed fallback values are returned
        when the hex timestamp is not exactly 8 characters long.
    """
    t = int(math.floor(time.time()))
    e = hex(t).upper()[2:]
    digest = hashlib.md5(str(t).encode(encoding='utf-8')).hexdigest().upper()
    if len(e) != 8:
        return '479BB4B7254C150', '7E0AC8874BB0985'

    head = digest[0:5]
    tail = digest[-5:]
    # First five digest characters interleaved with the first five hex digits.
    s = ''.join(d + h for d, h in zip(head, e))
    # Hex digits 3..7 interleaved with the last five digest characters.
    r = ''.join(h + d for h, d in zip(e[3:], tail))
    AS = 'AL' + s + e[-3:]
    CP = e[0:3] + r + 'E1'
    return AS, CP


def download(title, news_url):
    """Fetch one article and save the Chinese text found in its content.

    Args:
        title: headline of the article; used (after cleaning) as file name.
        news_url: address of the article page.

    Returns:
        0 when the page is not served with status 200 or contains no
        ``content:`` fragment; otherwise None after the file is written.
    """
    with request.urlopen(news_url) as response:
        if response.getcode() != 200:
            return 0
        page = response.read().decode('utf-8')

    fragments = CONTENT_RE.findall(page)
    if not fragments:
        return 0
    print(fragments)

    # dict.fromkeys drops repeated phrases while keeping first-seen order.
    phrases = list(dict.fromkeys(CHINESE_RE.findall(str(fragments))))

    title = title.translate(FORBIDDEN_CHARS)
    # Explicit utf-8 so the Chinese text does not depend on the platform's
    # default encoding.
    with open(OUTPUT_PREFIX + title + '.txt', 'w', encoding='utf-8') as file_object:
        file_object.write('\t\t\t\t')
        file_object.write(title)
        file_object.write('\n')
        file_object.write('该新闻地址:')
        file_object.write(news_url)
        file_object.write('\n')
        for phrase in phrases:
            file_object.write(phrase)
            file_object.write('\n')


def get_item(url):
    """Download every non-advert article of one feed page.

    Args:
        url: feed URL produced by ``get_url``.

    Returns:
        The ``max_behot_time`` cursor for the next feed page.
    """
    cookies = {'tt_webid': '6478612551432734221'}
    wbdata = requests.get(url, cookies=cookies)
    wbdata2 = json.loads(wbdata.text)
    for news in wbdata2['data']:
        title = news['title']
        news_url = 'https://www.toutiao.com' + news['source_url']
        print(title, news_url)
        # Entries carrying an ad label are adverts, not articles.
        if 'ad_label' in news:
            print(news['ad_label'])
            continue
        download(title, news_url)
    return wbdata2['next']['max_behot_time']


if __name__ == '__main__':
    refresh = 50
    # The first request uses cursor 0; each response supplies the next one.
    max_behot_time = 0
    for x in range(0, refresh + 1):
        print('第{0}次:'.format(x))
        AS, CP = get_ASCP()
        url = get_url(max_behot_time, AS, CP)
        max_behot_time = get_item(url)
四、UC
UC和新浪差不多,没有太复杂的反爬虫,直接解析爬取就好。
"""Scrape society news from UC News (https://news.uc.cn/c_shehui/)."""
from bs4 import BeautifulSoup
from urllib import request

# Characters stripped from a headline before it becomes a file name.
FORBIDDEN_CHARS = str.maketrans('', '', ':"|/\\*<>?')

# Directory prefix the article text files are written to.
OUTPUT_PREFIX = r'D:\code\python\spider_news\UC_news\society\\'


def download(title, url):
    """Fetch one article page and save its text as ``<title>.txt``.

    Args:
        title: headline of the article; used (after cleaning) as file name.
        url: address of the article page.

    Returns:
        0 when the page has no ``div.sm-article-content`` element;
        otherwise None after the file is written.
    """
    with request.urlopen(request.Request(url)) as response:
        page = response.read().decode('utf-8')
    soup = BeautifulSoup(page, 'lxml')
    content = soup.find('div', class_='sm-article-content')
    if content is None:
        return 0

    title = title.translate(FORBIDDEN_CHARS)
    with open(OUTPUT_PREFIX + title + '.txt', 'w', encoding='utf-8') as file_object:
        file_object.write('\t\t\t\t')
        file_object.write(title)
        file_object.write('\n')
        file_object.write('该新闻地址:')
        file_object.write(url)
        file_object.write('\n')
        file_object.write(content.get_text())


if __name__ == '__main__':
    # NOTE(review): the same URL is requested on every pass; presumably the
    # page serves different items on each load - confirm.
    for _ in range(0, 7):
        url = 'https://news.uc.cn/c_shehui/'
        with request.urlopen(url) as res:
            page = res.read().decode('utf-8')
        soup = BeautifulSoup(page, 'lxml')
        for block in soup.find_all('div', class_='txt-area-title'):
            link = block.a
            # Skip title blocks without a usable link instead of crashing.
            if link is None or not link.get('href'):
                continue
            news_url = 'https://news.uc.cn' + link.get('href')
            print(link.string, news_url)
            # ``.string`` is None when the anchor wraps nested tags.
            download(link.string or link.get_text(), news_url)