爬虫大作业
1.选一个自己感兴趣的主题。
2.用 Python 编写爬虫程序,从网络上爬取相关主题的数据。
3.对爬取的数据进行文本分析,生成词云。
4.对文本分析结果进行解释说明。
5.写一篇完整的博客,描述上述实现过程、遇到的问题及解决办法、数据分析思想及结论。
6.最后提交爬取的全部数据、爬虫及数据分析源代码。
import re
from collections import Counter
from datetime import datetime

import jieba
import requests
from bs4 import BeautifulSoup

# Seconds to wait for a server response before giving up on a page; without
# a timeout a single stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 10

# Number of highest-frequency words reported for each article.
KEYWORD_COUNT = 3

# Timestamp layout expected in the article info line, e.g. "2017-06-28 10:30".
# The separators are spelled out so that anything this pattern accepts has the
# same layout as the strptime() format used in _parseDate().
DATE_PATTERN = re.compile(r'(\d{4}-\d{2}-\d{2}\s\d{2}:\d{2})')

# A single character in the CJK range U+4E00..U+9FA5.
CHINESE_CHAR_PATTERN = re.compile('[\u4e00-\u9fa5]')


def _fetchSoup(url, encoding):
    """Download ``url`` and parse it as HTML.

    Args:
        url: Address of the page to download.
        encoding: Text encoding used to decode the response body.

    Returns:
        The parsed ``BeautifulSoup`` document, or ``None`` when the request
        fails (a message is printed so the skipped page is visible).
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print('请求失败,已跳过:{0} ({1})'.format(url, err))
        return None
    response.encoding = encoding
    return BeautifulSoup(response.text, 'html.parser')


def _firstText(soup, selector):
    """Return the text of the first element matching ``selector``, or None."""
    nodes = soup.select(selector)
    return nodes[0].text if nodes else None


def _parseDate(info):
    """Extract the publication time from ``info``.

    Returns:
        A ``datetime`` when a ``YYYY-MM-DD HH:MM`` timestamp is present and
        valid, otherwise ``None``.
    """
    found = DATE_PATTERN.search(info)
    if found is None:
        return None
    try:
        return datetime.strptime(found.group(1), '%Y-%m-%d %H:%M')
    except ValueError:
        # Digits in the right layout but not a real date (e.g. month 13).
        return None


def getNewsDetail(newsUrl):
    """Fetch one article, append its text to the corpus and print a summary.

    The article body is appended to ``news.txt`` via ``writeNews``; the
    publication time, source, keywords and body are then printed. A page that
    cannot be downloaded, or that has no article body element, is skipped with
    a message instead of aborting the crawl.

    Args:
        newsUrl: URL of the article page.
    """
    soupd = _fetchSoup(newsUrl, 'gb2312')
    if soupd is None:
        return

    content = _firstText(soupd, '#Cnt-Main-Article-QQ')
    if content is None:
        print('未找到正文,已跳过:{0}'.format(newsUrl))
        return

    # Publication time and source are only displayed, so a page lacking them
    # is still processed and the missing field is shown as unknown.
    info = _firstText(soupd, '.a_Info')
    dateTime = _parseDate(info) if info is not None else None
    if dateTime is None:
        dateTime = '未知'
    sources = _firstText(soupd, '.a_source')
    if sources is None:
        sources = '未知'

    writeNews(content)
    keyWords = getKeyWords(content)
    print('发布时间:{0}\n来源:{1}'.format(dateTime, sources))
    # Joined rather than indexed: short articles may yield fewer keywords.
    print('关键词:' + '、'.join(keyWords))
    print(content)


def writeNews(content):
    """Append ``content`` to ``news.txt`` (UTF-8) in the working directory."""
    with open('news.txt', 'a', encoding='utf-8') as f:
        f.write(content)


def getKeyWords(content):
    """Return the most frequent multi-character Chinese words in ``content``.

    All characters outside U+4E00..U+9FA5 are dropped, the remainder is
    segmented with jieba, and words shorter than two characters are ignored.
    Frequencies are counted per segmented token; ties keep the order in which
    the words first appear.

    Args:
        content: Raw article text.

    Returns:
        A list of at most ``KEYWORD_COUNT`` words, most frequent first. The
        list is shorter (possibly empty) when the text has fewer candidates.
    """
    chineseOnly = ''.join(CHINESE_CHAR_PATTERN.findall(content))
    wordCounts = Counter(
        word for word in jieba.lcut(chineseOnly) if len(word) >= 2
    )
    return [word for word, _ in wordCounts.most_common(KEYWORD_COUNT)]


def getListPage(listUrl):
    """Process every article linked from one list page.

    For each ``.Q-tpList`` item, the first link's URL is printed and passed to
    ``getNewsDetail``. Items without a usable link are ignored.

    Args:
        listUrl: URL of the list page.
    """
    soup = _fetchSoup(listUrl, 'gbk')
    if soup is None:
        return
    for new in soup.select('.Q-tpList'):
        anchors = new.select('a')
        if not anchors or not anchors[0].get('href'):
            continue
        newsUrl = anchors[0]['href']
        print(newsUrl)
        getNewsDetail(newsUrl)


# Crawl the first list page, then list pages 02..19.
listUrl = 'http://tech.qq.com/ydhl.htm'
getListPage(listUrl)
for i in range(2, 20):
    # The URL must start with its scheme; a leading '/' makes requests reject
    # it as having no scheme.
    listUrl = 'http://tech.qq.com/a/20170628/it_2016_%02d/' % i
    getListPage(listUrl)