Python crawler: scrape Sina News articles (from the current time back over some earlier period) and segment them with jieba, to build training data for your own word-segmentation model
Sina News loads its content dynamically via Ajax. Capturing the network traffic reveals the following pattern:
each time the next page is requested, a new URL appears under the JS tab:
"http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1"
"||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&"
"format=json&page=1&callback=newsloadercallback"
"http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1"
"||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&"
"format=json&page=3&callback=newsloadercallback"
The only difference between the two is the page=? parameter.
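One detail worth noting before the code: the response body is JSONP, i.e. the JSON payload wrapped in a call to the function named by the callback parameter. The script below peels that wrapper off with a fixed slice (jsdata[21:-2]); as a sketch of a more tolerant alternative (the sample body here is illustrative, not captured from a live response), a regex can strip any callback name:

import json
import re

def strip_jsonp(body):
    # Capture everything between the "(" after the callback name and the final ")".
    match = re.match(r'^\s*\w+\((.*)\)\s*;?\s*$', body, re.S)
    return json.loads(match.group(1)) if match else None

# Illustrative sample only, shaped like the API's result/data/url structure
sample = 'newsloadercallback({"result": {"data": [{"url": "http://news.sina.com.cn/..."}]}});'
print(strip_jsonp(sample)["result"]["data"][0]["url"])

The regex form keeps working if the callback name or surrounding whitespace changes, at the cost of assuming the payload is the outermost parenthesized group.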
Python implementation:
#-*- coding:utf-8 -*-
__author__ = 'Administrator'

import re
from bs4 import BeautifulSoup
import urllib.request
import jieba
import string
import urllib.parse
from urllib.error import HTTPError, URLError
import json

def get_page(num):
    return ("http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1"
            "||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&"
            "format=json&page={}&callback=newsloadercallback").format(str(num))

def get_url(page_url):
    # Percent-encode any Chinese or special characters in the request URL
    page_url = urllib.parse.quote(page_url, safe=string.printable)
    url_list = []
    try:
        res = urllib.request.urlopen(page_url)
    except HTTPError as e:
        print("The server couldn't fulfill the request.")
        print('Error code: ', e.code)
        return url_list
    except URLError as e:
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
        return url_list
    else:
        if res.getcode() == 200:
            jsdata = res.read().decode("utf-8")
            # Method 1: extract the URLs with a regex
            # result = re.findall(r'"url":"http.*?\.s?html"', jsdata)  # ? after .* makes it non-greedy
            # for url in result:
            #     url = url.split(":", maxsplit=1)[1]
            #     url = url.replace('\\', "")
            #     url_list.append(url)

            # Method 2: strip the JSONP wrapper, then parse the payload as JSON
            data = jsdata[21:-2]             # drop "newsloadercallback(" and the trailing ");"
            data = re.sub('\'', '\"', data)
            data = re.sub(r"\\u", "", data)  # remove literal \u sequences that would break json.loads
            jsondata = json.loads(data)
            for dat in jsondata["result"]["data"]:
                url_list.append(dat["url"])
        return url_list

def get_context(new_url):
    # Percent-encode any Chinese or special characters in the request URL
    httpurl = urllib.parse.quote(new_url, safe=string.printable)
    try:
        html = urllib.request.urlopen(httpurl)
    except HTTPError as e:
        print("The server couldn't fulfill the request.")
        print('Error code: ', e.code)
    except URLError as e:
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
    else:
        if html.getcode() == 200:
            res = html.read().decode("utf-8")
            soup = BeautifulSoup(res, 'html.parser')
            result = {}
            # Join the article body paragraphs, dropping the last <p> (usually the editor byline)
            result["article"] = ''.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])
            context = result['article']
            # Split on Chinese and ASCII punctuation. Note: the original pattern ended with an
            # empty alternative "|", which makes re.split break on every character in Python 3.7+,
            # so it is dropped here.
            pattern = r',|。|“|”|?|!|:|《|》|、|;|·|——| |‘|’|,|\?|\.|\!|`|~|\@|\#|\$|%|\^|\&|\*|(|)|\(|\)|-|\_|\+|=|\[|\]|\{|\}|"|\'|\<|\>|\|'
            li = re.split(pattern, context)
            with open(r".\traindata.txt", 'a', encoding='utf-8') as file:
                for l in li:
                    if l != "":
                        # One jieba-segmented clause per line, tokens separated by spaces
                        sentence = " ".join(jieba.cut(l))
                        file.write(sentence + '\n')

if __name__ == "__main__":
    for i in range(1, 1001):
        print("Page %d" % i)
        page_url = get_page(i)
        url_list = get_url(page_url)
        # print(url_list)  # ['"http://news.sina.com.cn/c/nd/2017-06-11/doc-ifyfzhac1171724.shtml"', ...]
        #                  # note the double-quoted URL is itself wrapped in single quotes (method 1 output)
        if url_list:
            for url in url_list:
                # get_context(eval(url))  # for URLs extracted by method 1
                get_context(url)          # for URLs extracted by method 2
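Each run appends to traindata.txt with one jieba-segmented clause per line, tokens separated by single spaces. The post stops at producing this corpus; purely as one illustration of consuming it (gensim's Word2Vec is my substitution here, the original names no trainer, and the hyperparameters are placeholders following the gensim 4.x API):

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# LineSentence treats each line as one pre-tokenized, space-separated sentence,
# which matches the format traindata.txt is written in.
model = Word2Vec(LineSentence('traindata.txt'), vector_size=100, window=5, min_count=5)
model.save('news_word2vec.model')

Note this trains word embeddings rather than a segmenter proper; training an actual segmentation model (e.g. an HMM or CRF) would need the same corpus plus labels.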