Structuring and Saving the Data
1. Save the body text of the news articles to a text file.
# Imports used throughout this script.
import locale
import re

import pandas
import requests
from bs4 import BeautifulSoup
from datetime import datetime

def writeToDocument(filename, content):
    # Append the article body to a local text file.
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(content)
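For example, once getNewsDetail in step 2 has built a news dictionary, each body can be appended in turn; gzccNews.txt is the filename used by the original call inside getNewsDetail:

# `news` is a dict produced by getNewsDetail in step 2 below.
writeToDocument('gzccNews.txt', news['contentdetail'])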
2. Structure the news data as a list of dictionaries:
(1) The details of a single article --> the dictionary news
def getNewsDetail(newsUrl):
    resdet = requests.get(newsUrl)
    resdet.encoding = 'utf-8'
    soupdet = BeautifulSoup(resdet.text, 'html.parser')
    news = {}
    news['title'] = soupdet.select('.show-title')[0].text
    if soupdet.select('.show-info'):
        showinfo = soupdet.select('.show-info')[0].text
        # The info line starts with "发布时间:YYYY-mm-dd HH:MM:SS"; take the 19-char timestamp.
        date = showinfo.lstrip('发布时间:')[:19]
        news['dateTime'] = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
        if showinfo.find('作者') > 0:
            news['author'] = re.search(r'作者:((.{2,4}\s|.{2,4}、|.{2,4},|\w*\s){1,5})', showinfo).group(1)
        else:
            news['author'] = 'none'
        if showinfo.find('审核') > 0:
            news['checker'] = re.search(r'审核:((.{2,4}\s|.{2,4}、|.{2,4},|\w*\s){1,5})', showinfo).group(1)
        else:
            news['checker'] = 'none'
        if showinfo.find('来源') > 0:
            news['source'] = re.search(r'来源:(.*)\s*摄|点', showinfo).group(1)
        else:
            news['source'] = 'none'
        if showinfo.find('摄影') > 0:
            news['photographer'] = re.search(r'摄影:((.{2,4}\s|.{2,4}、|.{2,4},|\w*\s){1,5})', showinfo).group(1)
        else:
            news['photographer'] = 'none'
        news['clicktimes'] = getClickCount(newsUrl)
    else:
        return None  # not a regular article page
    if soupdet.select('.show-content'):
        news['contentdetail'] = soupdet.select('#content')[0].text
    else:
        return None  # no article body
    news['newsUrl'] = newsUrl
    return news
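getNewsDetail relies on a getClickCount helper defined elsewhere in the assignment. Below is a minimal sketch, assuming the click count is served by a separate count API; the api.php endpoint, the modelid value, and the response format are all assumptions, not confirmed by this section:

def getClickCount(newsUrl):
    # Assumption: the article id is the last numeric path segment of the URL.
    newsId = re.search(r'/(\d+)\.html', newsUrl).group(1)
    # Assumption: the site exposes a count API shaped like this.
    clickUrl = 'http://oa.gzcc.cn/api.php?op=count&id={}&modelid=80'.format(newsId)
    resc = requests.get(clickUrl)
    # Assumption: the response embeds the count as html('NNN').
    return int(re.search(r"html\('(\d+)'\)", resc.text).group(1))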
(2) All the single articles on one list page, gathered into a list --> newsls.append(news)
def getListDetail(ListPageUrl):
    resl = requests.get(ListPageUrl)
    resl.encoding = 'utf-8'
    soupl = BeautifulSoup(resl.text, 'html.parser')
    gzccNewslist = {}
    newsls = []
    for news in soupl.select('li'):
        # Only <li> elements that carry a news title are article entries.
        if len(news.select('.news-list-title')) > 0:
            gzccNewslist['title'] = news.select('.news-list-title')[0].text
            gzccNewslist['description'] = news.select('.news-list-description')[0].text
            gzccNewslist['info'] = news.select('.news-list-info')[0].text
            gzccNewslist['address'] = news.select('a')[0]['href']
            detail = getNewsDetail(gzccNewslist['address'])
            if detail:  # getNewsDetail returns None for pages it cannot parse
                newsls.append(detail)
    return newsls
(3) All the news from every list page, merged into one list --> newstotal.extend(newsls)
locale.setlocale(locale.LC_CTYPE, 'chinese')
newstotal = []
Listurl = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
pagecount = getPageNum(Listurl)
for i in range(1, pagecount + 1):
    # The first list page has no page number in its URL.
    if i == 1:
        ListPageUrl = Listurl
    else:
        ListPageUrl = Listurl + '{}.html'.format(i)
    newstotal.extend(getListDetail(ListPageUrl))
    break  # crawl only the first page while testing; remove this line to crawl all pages
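getPageNum is likewise defined outside this section. A minimal sketch, assuming the pager on the list page shows the total article count as "NNN条" and that each page holds ten articles (the .a1 selector and the page size are assumptions):

def getPageNum(Listurl):
    res = requests.get(Listurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # Assumption: the pager's .a1 element reads like "325条".
    total = int(soup.select('.a1')[0].text.rstrip('条'))
    # Assumption: 10 articles per list page.
    return total // 10 + 1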
3. Install pandas and create a DataFrame object df with pandas.DataFrame(newstotal):
df = pandas.DataFrame(newstotal)
4. Use df to save the extracted data to a CSV or Excel file.
df.to_excel('gzccnews.xlsx')
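A CSV export works the same way; the utf-8-sig encoding adds a BOM so that Excel displays the Chinese text correctly (to_excel itself requires an Excel writer engine such as openpyxl to be installed):

df.to_csv('gzccnews.csv', encoding='utf-8-sig')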
5. Use the functions and methods provided by pandas to analyze the data:
(1) Extract the first 6 rows of the click-count, title, and source columns.
print(df[['clicktimes', 'title', 'source']].head(6))
(2) Extract the news published by '学校综合办' whose click count exceeds 3000.
print(df[(df['clicktimes'] > 3000) & (df['source'] == '学校综合办')])
(3) Extract the news published by '国际学院' and '学生工作处'.
soulist = ['国际学院', '学生工作处']
print(df[df['source'].isin(soulist)])