爬取全部的校园新闻
作业来自于: https://edu.cnblogs.com/campus/gzcc/GZCC-16SE1/homework/3002
0.从新闻url获取点击次数,并整理成函数
- newsUrl
- newsId(re.search())
- clickUrl(str.format())
- requests.get(clickUrl)
- re.search()/.split()
- str.lstrip(),str.rstrip()
- int
- 整理成函数
- 获取新闻发布时间及类型转换也整理成函数
1.从新闻url获取新闻详情: 字典,anews
2.从列表页的url获取新闻url:列表append(字典) alist
3.生成所有列表页的url并获取全部新闻 :列表extend(列表) allnews
*每个同学爬学号尾数开始的10个列表页
4.设置合理的爬取间隔
import time
import random
time.sleep(random.random()*3)
5.用pandas做简单的数据处理并保存
保存到csv或excel文件
newsdf.to_csv(r'F:\duym\爬虫\gzccnews.csv')
代码:
import random
import re
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas import DataFrame

# Seconds to wait for a single HTTP response, so that one stalled connection
# cannot hang the whole crawl.
REQUEST_TIMEOUT = 10

# Captures the numeric id that precedes ".html" at the end of an article URL.
NEWS_ID_RE = re.compile(r'.*/(\d+)\.html')

# Captures the digits in text of the form ('#hits').html('<digits>').
CLICK_COUNT_RE = re.compile(r"\('#hits'\)\.html\('(\d+)'\)")

# Captures, in order: publish time, author, reviewer, source.
RELEASE_INFO_RE = re.compile(
    r'发布时间:([0-9\-\s\:]+)作者:(.*)审核:(.*)来源:(.*)点击:'
)


def _get(page_url):
    """Fetch *page_url* and return the response.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def _get_soup(page_url):
    """Fetch *page_url*, decode it as UTF-8 and return the parsed document."""
    response = _get(page_url)
    response.encoding = 'utf-8'
    return BeautifulSoup(response.text, 'html.parser')


def _first_text(soup, selector, page_url):
    """Return the text of the first element matching *selector*.

    Raises:
        ValueError: if no element matches, naming the page so the failure
            can be traced.
    """
    matches = soup.select(selector)
    if not matches:
        raise ValueError(
            'no element matching {!r} in {}'.format(selector, page_url)
        )
    return matches[0].text


def get_new_list(new_page):
    """Return the href of every link inside ``.news-list`` on a list page.

    Args:
        new_page: URL of one news list page.

    Returns:
        A list of href strings, in document order.

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    soup = _get_soup(new_page)
    # The [href] selector skips anchors without an href instead of raising
    # KeyError on them.
    return [link['href'] for link in soup.select('.news-list a[href]')]


def get_clicknum(new_url):
    """Return the click count of the article at *new_url* as an int.

    The article id is taken from the URL and sent to the counter endpoint;
    the count is then extracted from the endpoint's response text.

    Raises:
        ValueError: if the URL carries no numeric id or the response does
            not contain a click count.
        requests.RequestException: if the counter request fails.
    """
    id_match = NEWS_ID_RE.match(new_url)
    if id_match is None:
        raise ValueError('no news id found in url: {}'.format(new_url))
    click_url = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(
        id_match.group(1)
    )
    count_match = CLICK_COUNT_RE.search(_get(click_url).text)
    if count_match is None:
        raise ValueError('no click count in response of {}'.format(click_url))
    return int(count_match.group(1))


def str_to_date(date):
    """Parse a ``YYYY-MM-DD HH:MM:SS`` string into a ``datetime``.

    Surrounding whitespace is ignored.

    Raises:
        ValueError: if the stripped text does not match the format.
    """
    return datetime.strptime(date.strip(), '%Y-%m-%d %H:%M:%S')


def get_anew_msg(new_url):
    """Scrape one article, append its record to ``all_new_msg``, return it.

    The record holds the title, URL, publish time, author, reviewer,
    source, click count and content of the article.

    Raises:
        ValueError: if the page lacks an expected element or its release
            line does not have the expected layout.
        requests.RequestException: if a request fails.
    """
    soup = _get_soup(new_url)
    if soup.title is None:
        raise ValueError('no <title> in {}'.format(new_url))

    release_msg = _first_text(soup, '.show-info', new_url)
    result = RELEASE_INFO_RE.match(release_msg)
    if result is None:
        raise ValueError('unrecognised release info in {}'.format(new_url))

    # Parse everything local first so a malformed page fails before the
    # extra click-count request is made.
    content = _first_text(soup, '#content', new_url).strip()
    date = str_to_date(result.group(1))
    clicknum = get_clicknum(new_url)

    # Key order is kept stable because it becomes the CSV column order.
    anew_msg = {
        'title': soup.title.text,
        'new_url': new_url,
        'date': date,
        'author': result.group(2).strip(),
        'shenhe': result.group(3).strip(),
        'source': result.group(4).strip(),
        'clicknum': clicknum,
        'content': content,
    }
    all_new_msg.append(anew_msg)
    return anew_msg


url = r'http://news.gzcc.cn/html/xiaoyuanxinwen/'
all_new_msg = []
for i in range(38, 48):
    new_page = url + str(i) + r'.html'
    try:
        new_list = get_new_list(new_page)
    except requests.RequestException as err:
        # Skip an unreachable list page rather than lose the whole crawl.
        print('skipping list page {}: {}'.format(new_page, err))
        continue
    for new_url in new_list:
        try:
            get_anew_msg(new_url)
        except (requests.RequestException, ValueError) as err:
            # One broken article must not discard the records collected so far.
            print('skipping article {}: {}'.format(new_url, err))
        # Random pause of up to three seconds between article requests.
        time.sleep(random.random() * 3)
newsdf = DataFrame(all_new_msg)
newsdf.to_csv(r'C:\Users\Administrator\Desktop\newsmsg.csv')
运行结果: