Crawling All Campus News
Assignment requirements: https://edu.cnblogs.com/campus/gzcc/GZCC-16SE1/homework/3002
0. Get the click count from a news URL, and organize it into a function
- newsUrl
- newsId(re.search())
- clickUrl(str.format())
- requests.get(clickUrl)
- re.search()/.split()
- str.lstrip(),str.rstrip()
- int
- organize the above into a function
- also organize getting the news publish time (with the type conversion) into a function
def click(url):
    # pull the numeric news id out of a URL like .../11086.html
    newsId = re.search('/(\d*).html', url).group(1)
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    resClick = requests.get(clickUrl)
    # the counter response contains ...html('123'); grab the number in the parentheses
    newsClick = int(re.search("hits'[)].html[(]'(\d+)'[)]", resClick.text).group(1))
    return newsClick

def newsdt(showinfo):
    # showinfo starts with '发布时间:YYYY-MM-DD HH:MM:SS ...'
    newsDate = showinfo.split()[0].split(':')[1]
    newsTime = showinfo.split()[1]
    newsDT = newsDate + ' ' + newsTime
    dt = datetime.strptime(newsDT, '%Y-%m-%d %H:%M:%S')
    return dt
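A quick usage sketch of the two helpers; the article URL and the show-info string below are placeholders made up to illustrate the expected input, not values from an actual run:

# placeholder article URL following the site's .../<newsId>.html pattern
newsUrl = 'http://news.gzcc.cn/html/2019/xiaoyuanxinwen_0404/11086.html'
print(click(newsUrl))    # -> an int: the article's click count

# a typical .show-info string begins with the publish time
showinfo = '发布时间:2019-04-01 11:57:00   作者:   审核:   来源:   点击:'
print(newsdt(showinfo))  # -> datetime(2019, 4, 1, 11, 57)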
1. Get the news details from a news URL: a dictionary, anews
def anews(url):
    newsDetail = {}
    res = requests.get(url)
    res.encoding = 'utf-8'
    s = BeautifulSoup(res.text, 'html.parser')
    newsDetail['newsTitle'] = s.select('.show-title')[0].text
    showinfo = s.select('.show-info')[0].text
    newsDetail['newsDT'] = newsdt(showinfo)
    newsDetail['newsClick'] = click(url)
    return newsDetail
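Called on an article URL, the function returns one dictionary per article, roughly like this (field values are illustrative):

detail = anews(newsUrl)   # newsUrl: an article URL, e.g. one taken from a list page
# detail -> {'newsTitle': '...', 'newsDT': datetime(...), 'newsClick': 123}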
2. Get news URLs from a list-page URL: list.append(dictionary), alist
def alist(url):
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsList = []
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            newsUrl = news.select('a')[0]['href']
            newsDest = news.select('.news-list-description')[0].text
            newsDict = anews(newsUrl)
            newsDict['newsUrl'] = newsUrl
            newsDict['description'] = newsDest
            newsList.append(newsDict)
    return newsList

listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
alist(listUrl)
3. Generate the URLs of all list pages and fetch all the news: list.extend(list), allnews
*Each student crawls the 10 list pages starting at the page number given by the trailing digits of their student ID.
allnews = []
for i in range(16, 26):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    allnews.extend(alist(listUrl))
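The loop above only covers the ten assigned pages. To really generate the URLs of all list pages, the total page count can be read from the pager, as the complete code in section 6 does; a sketch, assuming the first list page is the bare index URL and the remaining pages follow the {}.html pattern used above:

res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
# the pager text ends with the last page number followed by 下一页; take that number
pages = int(soup.select('#pages')[0].text.split('..')[1].rstrip(' 下一页 '))

allnews = alist('http://news.gzcc.cn/html/xiaoyuanxinwen/')   # page 1
for i in range(2, pages + 1):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    allnews.extend(alist(listUrl))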
4. Set a reasonable crawl interval
import time
import random
time.sleep(random.random()*3)
for i in range(5):
    time.sleep(random.random() * 3)
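To actually space the requests out, the random delay can sit inside the page loop from step 3 rather than run after the whole crawl; a minimal rearrangement:

allnews = []
for i in range(16, 26):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    allnews.extend(alist(listUrl))
    time.sleep(random.random() * 3)   # wait 0-3 seconds before the next list page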
5. Use pandas for simple data processing and save the result
Save to a csv or excel file, for example:
newsdf.to_csv(r'F:\duym\爬虫\gzccnews.csv')
pd.Series(allnews)
newsdf = pd.DataFrame(allnews)
newsdf.to_csv(r'C:\Users\Czc\PycharmProjects\news.csv')
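Beyond just saving, a small example of the "simple processing" this step asks for; sorting by click count is only one possible choice, and the xlsx path merely mirrors the csv path above (pandas' to_excel also needs the openpyxl package installed):

newsdf = pd.DataFrame(allnews)
# most-clicked articles first
top = newsdf.sort_values(by='newsClick', ascending=False)
print(top.head(10))
# write an Excel copy alongside the csv
newsdf.to_excel(r'C:\Users\Czc\PycharmProjects\news.xlsx')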
Run result:
6. Complete code:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import pandas as pd
import time
import random

def click(url):
    # extract the numeric news id from the article URL and query the click-count API
    newsId = re.search('/(\d*).html', url).group(1)
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    resClick = requests.get(clickUrl)
    newsClick = int(resClick.text.split('.html')[-1].lstrip("('").rstrip("');"))
    return newsClick

def newsdt(showinfo):
    # showinfo starts with '发布时间:YYYY-MM-DD HH:MM:SS ...'
    newsDate = showinfo.split()[0].split(':')[1]
    newsTime = showinfo.split()[1]
    newsDT = newsDate + ' ' + newsTime
    dt = datetime.strptime(newsDT, '%Y-%m-%d %H:%M:%S')
    return dt

def anews(url):
    # fetch one article page and return its title, publish time and click count
    newsDetail = {}
    res = requests.get(url)
    res.encoding = 'utf-8'
    s = BeautifulSoup(res.text, 'html.parser')
    newsDetail['newsTitle'] = s.select('.show-title')[0].text
    showinfo = s.select('.show-info')[0].text
    newsDetail['newsDT'] = newsdt(showinfo)
    newsDetail['newsClick'] = click(url)
    return newsDetail

def alist(url):
    # fetch one list page and return a list of detail dictionaries
    res = requests.get(url)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    newsList = []
    for news in soup.select('li'):
        if len(news.select('.news-list-title')) > 0:
            newsUrl = news.select('a')[0]['href']
            newsDest = news.select('.news-list-description')[0].text
            newsDict = anews(newsUrl)
            newsDict['newsUrl'] = newsUrl
            newsDict['description'] = newsDest
            newsList.append(newsDict)
    return newsList

listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/'
alist(listUrl)

# grab the article links on the front list page
res = requests.get(listUrl)
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
for news in soup.select('li'):
    if len(news.select('.news-list-title')) > 0:
        newsUrl = news.select('a')[0]['href']

# total number of list pages, read from the pager at the bottom of the page
i = int(soup.select('#pages')[0].text.split('..')[1].rstrip(' 下一页 '))

# crawl the ten assigned list pages
allnews = []
for i in range(16, 26):
    listUrl = 'http://news.gzcc.cn/html/xiaoyuanxinwen/{}.html'.format(i)
    allnews.extend(alist(listUrl))

res = requests.get('http://news.gzcc.cn/html/xiaoyuanxinwen/')
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
for news in soup.select('li'):
    if len(news.select('.news-list-title')) > 0:
        newsUrl = news.select('a')[0]['href']

pd.Series(allnews)
newsdf = pd.DataFrame(allnews)
newsdf.to_csv(r'C:\Users\Czc\PycharmProjects\news.csv')

for i in range(5):
    time.sleep(random.random() * 3)
print(newsdf)