机器学习实战3--豆瓣读书简介
graphlab对中文的支持简直无解,怎么办?
# coding: utf-8
#
# Douban book introductions: word counts, TF-IDF and nearest-neighbour lookup
# with GraphLab Create (Python 2 notebook export).
#
# Open question from the author: GraphLab's support for Chinese text is
# hopeless -- what can be done? Suggestions welcome.
# NOTE(review): presumably count_words does not segment Chinese text into
# words, so the "words" below may be whole phrases -- confirm against the
# GraphLab documentation.

# In[34]:

import sys

# Python 2-only workaround: reload(sys) re-exposes setdefaultencoding so that
# implicit str/unicode conversions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf8')

import graphlab
import datetime
import re


def _version_tuple(version):
    """Return every run of digits in *version* as a tuple of ints.

    Comparing these tuples orders versions numerically, so '1.10' is
    correctly newer than '1.6.1' (a plain string comparison says otherwise).
    """
    return tuple(int(part) for part in re.findall(r'\d+', version))


# In[35]:

# Limit number of worker processes. This preserves system memory, which
# prevents hosted notebooks from crashing.
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 4)

# In[36]:

douban = graphlab.SFrame.read_json('data/douban.json')

# In[37]:

douban.head()

# In[38]:

len(douban)

# In[41]:

# Select the row(s) for the book named '围城'.
weicheng = douban[douban['name'] == '围城']

# In[42]:

weicheng

# In[43]:

weicheng['intro']

# In[44]:

weicheng['word_count'] = graphlab.text_analytics.count_words(weicheng['intro'])

# In[46]:

weicheng['word_count']

# In[47]:

# Build a new table; stack turns the key/value dict column into two columns.
weicheng_word_count_table = weicheng[['word_count']].stack(
    'word_count', new_column_name=['word', 'count'])

# In[48]:

weicheng_word_count_table.head()

# In[49]:

# Sort by count, descending (result is only displayed, not stored).
weicheng_word_count_table.sort('count', ascending=False)

# In[50]:

# TF-IDF depends on the whole corpus, so count words for every book.
douban['word_count'] = graphlab.text_analytics.count_words(douban['intro'])
douban.head()

# In[51]:

# Compute TF-IDF.
tfidf = graphlab.text_analytics.tf_idf(douban['word_count'])

# Earlier versions of GraphLab Create returned an SFrame rather than a single SArray
# This notebook was created using Graphlab Create version 1.7.1
# Compare numerically: a string comparison would treat e.g. '1.10' as older
# than '1.6.1' and wrongly index into an SArray.
if _version_tuple(graphlab.version) <= (1, 6, 1):
    tfidf = tfidf['docs']

tfidf

# In[52]:

douban['tfidf'] = tfidf

# In[53]:

# Re-select the book so the row now carries the new 'tfidf' column.
weicheng = douban[douban['name'] == '围城']

# In[54]:

# Build a word/tfidf table for the book and sort it, highest score first.
weicheng[['tfidf']].stack('tfidf', new_column_name=['word', 'tfidf']).sort(
    'tfidf', ascending=False)

# In[55]:

# Create a nearest-neighbours model over the TF-IDF vectors, labelled by name.
knn_model = graphlab.nearest_neighbors.create(douban, features=['tfidf'], label='name')

# In[56]:

knn_model.query(weicheng)

# In[ ]:
代码地址(附作业答案): https://github.com/RedheatWei/aiproject/tree/master/Machine%20Learning%20Specialization/week4
本博客内容都是博主试验通过的方案与方法.
如需交流,请发邮件至qjyyn#qq.com