机器学习实战2--时光网评论分析

# coding: utf-8

# In[74]:

import sys  
reload(sys)  
sys.setdefaultencoding('utf8')
import graphlab
import datetime


# In[75]:

# Set the GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS runtime option to 4
# (presumably the number of Python lambda worker processes -- confirm
# against the GraphLab Create runtime-config documentation).
graphlab.set_runtime_config(
    'GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS',
    4,
)


# In[76]:

# Load the scraped review data from CSV into an SFrame.
products = graphlab.SFrame.read_csv(
    'data/commit.csv',
)


# In[77]:

products.head()


# In[78]:

# Text-analytics helper: build per-review word counts from the 'commits'
# column and store them in a new 'word_count' column.
review_text = products['commits']
products['word_count'] = graphlab.text_analytics.count_words(review_text)


# In[79]:

products.head()


# In[80]:

# Send GraphLab Canvas visualizations to the 'browser' target.
graphlab.canvas.set_target(
    'browser',
)


# In[81]:

# Visualize the column of movie titles.
movie_titles = products['movie_name_zh']
movie_titles.show()


# In[82]:

# Select the reviews of the movie '战狼2'.
is_target_movie = movie_titles == '战狼2'
giraffe_reviews = products[is_target_movie]


# In[83]:

# Number of reviews for that movie.
len(giraffe_reviews)


# In[84]:

# Show the score distribution using the categorical view.
# Author's observation from the original run: 7 was the most common score.
target_movie_scores = giraffe_reviews['userscore']
target_movie_scores.show(view='Categorical')


# In[88]:

# Clean the scores in three steps:
#   1. drop reviews whose score is the string 'None' (no score given),
#   2. convert the remaining scores to float,
#   3. drop the 5-point reviews so they are excluded from labelling.
# (A stray bare `products['userscore']` expression that used to sit between
# steps 1 and 2 had no effect and has been removed.)
products = products[products['userscore'] != 'None']
products['userscore'] = products['userscore'].astype(float)
products = products[products['userscore'] != 5]


# In[89]:

# Label a review as positive when its score is at least 6.
products['sentiment'] = products['userscore'] >= 6.0


# In[90]:

products.head()


# In[91]:

# Split into training and test sets (split fraction 0.8) with a fixed seed.
train_data, test_data = products.random_split(0.8, seed=0)


# In[92]:

# Create a logistic-regression classifier that predicts 'sentiment' from
# the 'word_count' feature; the test set is passed as the validation set.
classifier_options = {
    'target': 'sentiment',
    'features': ['word_count'],
    'validation_set': test_data,
}
sentiment_model = graphlab.logistic_classifier.create(
    train_data,
    **classifier_options
)


# In[93]:

# Evaluate the sentiment model with the ROC (receiver operating
# characteristic) curve metric.
sentiment_model.evaluate(
    test_data,
    metric='roc_curve',
)


# In[94]:

# Visualize the evaluation results.
sentiment_model.show(
    view='Evaluation',
)


# In[95]:

# Add a new 'predicted_sentiment' column holding the model's prediction
# for each review, requested as a probability.
predicted_probability = sentiment_model.predict(
    giraffe_reviews,
    output_type='probability',
)
giraffe_reviews['predicted_sentiment'] = predicted_probability


# In[96]:

giraffe_reviews.head()


# In[97]:

# Sort by predicted sentiment; `ascending` selects ascending vs. descending
# order (descending here, so the highest predictions come first).
giraffe_reviews = giraffe_reviews.sort(
    'predicted_sentiment',
    ascending=False,
)


# In[98]:

giraffe_reviews.head()


# In[110]:

# Inspect rows from the top of the sorted reviews...
giraffe_reviews[0]


# In[112]:

giraffe_reviews[2]


# In[113]:

# ...and rows from the bottom.
giraffe_reviews[-1]


# In[114]:

giraffe_reviews[-2]


# In[117]:

# Keep only the reviews whose 'userscore' is the string 'None'.
has_no_score = giraffe_reviews['userscore'] == 'None'
giraffe_reviews = giraffe_reviews[has_no_score]


# In[119]:

giraffe_reviews.tail()


# In[ ]:

代码地址(附作业答案): https://github.com/RedheatWei/aiproject/tree/master/Machine%20Learning%20Specialization/week3

爬虫地址: https://github.com/RedheatWei/mtime_commit

posted @ 2018-07-04 16:34  Redheat  阅读(198)  评论(0)  编辑  收藏  举报