代码改变世界

Python 爬虫

2017-08-29 18:14  zxbob  阅读(346)  评论(0编辑  收藏  举报

     最近看了一些电影,也读了很多影评,想了解大多数人对相关电影的评价。正好在学习 Python,就利用其强大的爬虫能力来抓取影评,这里使用的是 Python 3.6.1。

下面是相关代码:

  1 #coding:utf-8
  2 __author__ = 'hang'
  3 
  4 import warnings
  5 warnings.filterwarnings("ignore")
  6 import jieba    #分词包
  7 import numpy    #numpy计算包
  8 import codecs   #codecs提供的open方法来指定打开的文件的语言编码,它会在读取的时候自动转换为内部unicode
  9 import re
 10 import pandas as pd
 11 import matplotlib.pyplot as plt
 12 from urllib import request
 13 from bs4 import BeautifulSoup as bs
 14 # %matplotlib inline  (ipython中应用)
 15 # from skimage import data
 16 import matplotlib
 17 matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)
 18 from wordcloud import WordCloud#词云包
 19 
 20 class KetWord:
 21     def __init__(self,name,count):
 22         self.name =name
 23         self.count = count
 24 
 25     def __cmp__(self, other):
 26 
 27         if isinstance(KetWord,other):
 28             if self.count > other.count:
 29                 return 1
 30             elif self.count < other.count:
 31                 return -1
 32             else:
 33                 return 0
 34 
 35     def __str__(self):
 36         return '[name='+ self.name +':count='+ str(self.count) +']'
 37 #分析网页函数
 38 def getNowPlayingMovie_list():
 39     resp = request.urlopen('https://movie.douban.com/nowplaying/hangzhou/')
 40     html_data = resp.read().decode('utf-8')
 41     soup = bs(html_data, 'html.parser')
 42     nowplaying_movie = soup.find_all('div', id='nowplaying')
 43     nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item')
 44     nowplaying_list = []
 45     for item in nowplaying_movie_list:
 46         nowplaying_dict = {}
 47         nowplaying_dict['id'] = item['data-subject']
 48         for tag_img_item in item.find_all('img'):
 49             nowplaying_dict['name'] = tag_img_item['alt']
 50             nowplaying_list.append(nowplaying_dict)
 51     return nowplaying_list
 52 
 53 #爬取评论函数
 54 def getCommentsById(movieId, pageNum):
 55     eachCommentList = [];
 56     if pageNum>0:
 57          start = (pageNum-1) * 20
 58     else:
 59         return False
 60     requrl = 'https://movie.douban.com/subject/' + movieId + '/comments' +'?' +'start=' + str(start) + '&limit=20'
 61     print(requrl)
 62     resp = request.urlopen(requrl)
 63     html_data = resp.read().decode('utf-8')
 64     soup = bs(html_data, 'html.parser')
 65     comment_div_lits = soup.find_all('div', class_='comment')
 66     for item in comment_div_lits:
 67         if item.find_all('p')[0].string is not None:
 68             eachCommentList.append(item.find_all('p')[0].string)
 69     return eachCommentList
 70 
 71 def main():
 72     #循环获取第一个电影的前10页评论
 73     commentList = []
 74     NowPlayingMovie_list = getNowPlayingMovie_list()
 75     print('common=',NowPlayingMovie_list)
 76     #获取id电影[{'id': '11502973', 'name': '星际特工:千星之城'}, {'id': '25933890', 'name': '极盗车神'}, {'id': '25849480', 'name': '赛车总动员3:极速挑战'},
 77     # {'id': '26607693', 'name': '敦刻尔克'}, {'id': '26363254', 'name': '战狼2'}, {'id': '26826398', 'name': '杀破狼·贪狼'}, {'id': '26816086', 'name': '银魂 真人版'},
 78     #  {'id': '26430107', 'name': '二十二'}, {'id': '26759539', 'name': '十万个冷笑话2'}, {'id': '26752106', 'name': '黑白迷宫'}, {'id': '26647876', 'name': '地球:神奇的一天'},
 79     #  {'id': '26969037', 'name': '赛尔号大电影6:圣者无敌'}, {'id': '25980443', 'name': '海边的曼彻斯特'}, {'id': '26760160', 'name': '破·局'},
 80     #  {'id': '27040349', 'name': '二次初恋'}, {'id': '22232939', 'name': '大耳朵图图之美食狂想曲'}, {'id': '25857966', 'name': '鲛珠传'}, {'id': '26698000', 'name': '心理罪'},
 81     # {'id': '26692823', 'name': '建军大业'}, {'id': '25823277', 'name': '三生三世十里桃花'}, {'id': '2999500', 'name': '七天'}, {'id': '27107261', 'name': '一路向爱'},
 82     # {'id': '25858758', 'name': '侠盗联盟'}, {'id': '26790961', 'name': '闪光少女'}, {'id': '26991769', 'name': '恐怖毕业照2'}, {'id': '25812712', 'name': '神偷奶爸3'},
 83     #  {'id': '27107265', 'name': '杜丽娘'}]
 84     for i in range(10):
 85         num = i + 1
 86         commentList_temp = getCommentsById(NowPlayingMovie_list[4]['id'], num)
 87         commentList.append(commentList_temp)
 88 
 89     #将列表中的数据转换为字符串
 90     comments = ''
 91     for k in range(len(commentList)):
 92         comments = comments + (str(commentList[k])).strip()
 93 
 94     #使用正则表达式去除标点符号
 95     pattern = re.compile(r'[\u4e00-\u9fa5]+')
 96     filterdata = re.findall(pattern, comments)
 97     cleaned_comments = ''.join(filterdata)
 98 
 99     #使用结巴分词进行中文分词
100     segment = jieba.lcut(cleaned_comments)
101     words_df=pd.DataFrame({'segment':segment})
102 
103     #去掉停用词
104     stopwords=pd.read_csv("stopwords.txt",index_col=False,quoting=3,sep="\t",names=['stopword'], encoding='utf-8')#quoting=3全不引用
105     words_df=words_df[~words_df.segment.isin(stopwords.stopword)]
106 
107     #统计词频
108     words_stat=words_df.groupby(by=['segment'])['segment'].agg({"计数":numpy.size})
109     words_stat=words_stat.reset_index().sort_values(by=["计数"],ascending=False)
110 
111     #用词云进行显示
112     wordcloud=WordCloud(font_path="simhei.ttf",background_color="white",max_font_size=80)
113     word_frequence = {x[0]:x[1] for x in words_stat.head(1000).values}
114 
115     #利用字典存放
116     word_frequence_list = {}
117     x_val = []
118     y_val = []
119     for key in word_frequence:
120         word_frequence_list[str(key)] = word_frequence[key]
121 
122     wordcloud=wordcloud.generate_from_frequencies(word_frequence_list)
123     print(word_frequence_list)
124 
125     # print('x=',x_val)
126     # print('y=',y_val)
127     # map = dict()
128     # for i in range(len(y_val)):
129     #     # key_word = KetWord(x_val[i],y_val[i])
130     #     map[i] = KetWord(x_val[i],y_val[i])
131     # for key in map:
132     #     print('word=',map[key])
133     # plt.plot(x_val,y_val)
134     # plt.show()
135     plt.imshow(wordcloud)
136     #既然是IPython的内置magic函数,那么在Pycharm中是不会支持的。但是我们可以在matplotlib中的pyplot身上下功夫,pyplot不会不提供展示图像的功能。
137     plt.colorbar()
138     plt.show()
139 
# Entry point — guarded so importing this module does not trigger scraping.
if __name__ == '__main__':
    main()