1.选一个自己感兴趣的主题。
2.用 Python 编写爬虫程序,从网络上爬取相关主题的数据。
3.对爬取到的数据进行文本分析,生成词云。
4.对文本分析结果进行解释说明。
5.写一篇完整的博客,描述上述实现过程、遇到的问题及解决办法、数据分析思想及结论。
6.最后提交爬取的全部数据、爬虫及数据分析源代码。
import re
from collections import Counter
from datetime import datetime
from urllib.parse import urljoin

import jieba
import requests
from bs4 import BeautifulSoup
# 获取新闻细节
def getNewsDetail(newsUrl):
    """Fetch one news page, store its body text and print a summary.

    The page is downloaded and decoded as GB2312. The article body
    (``.artText``), the date line (``.artDate``) and the first ``<h1>`` are
    extracted from it. The body is passed to ``writeNews`` and to
    ``getKeyWords``; the title, publication date, keywords and body are then
    printed.

    Args:
        newsUrl: Absolute URL of the article page.

    Raises:
        IndexError: If the page lacks one of the expected elements.
        ValueError: If the date line contains no ``YYYY-MM-DD`` date.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    resd = requests.get(newsUrl, timeout=10)
    resd.encoding = 'gb2312'
    soupd = BeautifulSoup(resd.text, 'html.parser')
    content = soupd.select('.artText')[0].text
    info = soupd.select('.artDate')[0].text
    title = soupd.select('h1')[0].text

    # The pattern uses the same '-' separator that strptime requires below,
    # so anything the regex accepts can also be parsed.
    dateMatch = re.search(r'\d{4}-\d{2}-\d{2}', info)
    if dateMatch is None:
        raise ValueError('no YYYY-MM-DD date found in {!r}'.format(info))
    dateTime = datetime.strptime(dateMatch.group(0), '%Y-%m-%d')

    # The body is saved before keyword extraction, so it is kept even when
    # extraction fails.
    writeNews(content)
    keyWords = getKeyWords(content)
    print('标题:{}'.format(title))
    print('发布时间:{0}'.format(dateTime))
    # Joining instead of indexing tolerates fewer than three keywords.
    print('关键词:{}'.format('、'.join(keyWords)))
    print(content)
# 通过jieba分词,获取新闻关键词
def getKeyWords(content):
    """Return up to three of the most frequent words in *content*.

    Only characters in the range U+4E00..U+9FA5 are kept, which drops
    punctuation, digits and Latin text. The remaining text is segmented with
    jieba; single-character tokens and a fixed set of excluded words are
    ignored, and the rest are ranked by how often they occur.

    Args:
        content: Raw article text.

    Returns:
        A list of at most three words, most frequent first. Words with equal
        counts keep the order in which they first appear in the text. The
        list is shorter than three (possibly empty) when the text does not
        contain enough candidate words.
    """
    content = ''.join(re.findall('[\u4e00-\u9fa5]', content))
    # Words excluded from the ranking regardless of frequency.
    excludedWords = {'率土之滨', '率土', '网易', '全新'}
    # Count tokens rather than substring matches, so a word that is merely
    # part of a longer word does not get extra occurrences.
    wordCounts = Counter(
        word
        for word in jieba.lcut(content)
        if len(word) >= 2 and word not in excludedWords
    )
    return [word for word, _ in wordCounts.most_common(3)]
# 获取一页的新闻
def getListPage(listUrl, maxNews=5):
    """Process the first news entries linked from one list page.

    The list page is downloaded and decoded as GBK. For each of the first
    *maxNews* ``.item-inner`` elements, the ``href`` of its first ``<a>`` is
    resolved against *listUrl* and handed to ``getNewsDetail``.

    Args:
        listUrl: URL of the news list page.
        maxNews: Maximum number of list entries to process. Defaults to 5.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.get(listUrl, timeout=10)
    res.encoding = 'gbk'
    soup = BeautifulSoup(res.text, 'html.parser')
    # Select once and slice: a page with fewer entries than maxNews is
    # processed as far as it goes instead of raising IndexError.
    for item in soup.select('.item-inner')[:maxNews]:
        links = item.select('a')
        if not links:
            continue  # entry without a link, nothing to fetch
        # urljoin resolves protocol-relative ('//host/path'), relative and
        # already-absolute hrefs alike.
        getNewsDetail(urljoin(listUrl, links[0]['href']))
# 将新闻内容写入到文件
def writeNews(content):
    """Append *content* to ``news.txt`` in the current working directory.

    Args:
        content: Text to append. It is written as UTF-8 exactly as given; no
            separator is added between successive calls.
    """
    # The context manager closes the file even if the write raises.
    with open('news.txt', 'a', encoding='utf-8') as newsFile:
        newsFile.write(content)
# Crawl list pages index_2.html through index_11.html, one page at a time.
PAGE_URL_TEMPLATE = 'http://stzb.163.com/news/index_{}.html'
for pageNumber in range(2, 12):
    getListPage(PAGE_URL_TEMPLATE.format(pageNumber))