机器学习实战3--豆瓣读书简介

GraphLab 对中文的支持很不理想,该怎么办?

# coding: utf-8

# # GraphLab's support for Chinese text is practically hopeless -- what can be done? Looking for a solution.

# In[34]:

# Python 2 only: reload(sys) brings back sys.setdefaultencoding (which is
# removed from the module at interpreter startup), and the call then switches
# the process-wide default string encoding to UTF-8. The order of these three
# statements matters. Presumably this is here so the non-ASCII (Chinese)
# strings used later in the script do not raise encoding errors.
import sys  
reload(sys)  
sys.setdefaultencoding('utf8')
import graphlab
import datetime


# In[35]:

# Cap the number of pylambda worker processes. Fewer workers use less system
# memory, which keeps hosted notebooks from crashing.
pylambda_worker_limit = 4
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', pylambda_worker_limit)


# In[36]:

# Read the Douban records from the JSON file into an SFrame.
douban_json_path = 'data/douban.json'
douban = graphlab.SFrame.read_json(douban_json_path)


# In[37]:

# Preview the first rows of the table.
douban.head()


# In[38]:

# Number of rows that were loaded.
len(douban)


# In[41]:

# Keep only the rows whose 'name' equals '围城' (the novel "Fortress Besieged").
is_weicheng = douban['name'] == '围城'
weicheng = douban[is_weicheng]


# In[42]:

weicheng


# In[43]:

weicheng['intro']


# In[44]:

# Count the words of the intro text and store the result as a new column.
# NOTE(review): count_words appears to split on whitespace by default, which
# would leave unsegmented Chinese text as long multi-word tokens -- confirm
# against the GraphLab docs and consider segmenting the text first.
intro_word_counts = graphlab.text_analytics.count_words(weicheng['intro'])
weicheng['word_count'] = intro_word_counts


# In[46]:

weicheng['word_count']


# In[47]:

# Build a new table: stack() turns the key/value pairs of 'word_count'
# into two columns, 'word' and 'count'.
word_count_column = weicheng[['word_count']]
weicheng_word_count_table = word_count_column.stack(
    'word_count',
    new_column_name=['word', 'count'],
)


# In[48]:

weicheng_word_count_table.head()


# In[49]:

# Sort by count, highest first.
weicheng_word_count_table.sort('count', ascending=False)


# In[50]:

# TF-IDF depends on the whole corpus, so count words for every row first.
douban['word_count'] = graphlab.text_analytics.count_words(douban['intro'])
douban.head()


# In[51]:

# Compute TF-IDF from the per-row word counts.
tfidf = graphlab.text_analytics.tf_idf(douban['word_count'])

# Earlier versions of GraphLab Create returned an SFrame rather than a single SArray
# This notebook was created using Graphlab Create version 1.7.1
#
# The version is compared numerically, component by component. A plain string
# comparison is lexicographic, so e.g. '1.10.0' <= '1.6.1' would be True and a
# newer release would wrongly be treated as an old one.
graphlab_version = []
for component in graphlab.version.split('.'):
    if not component.isdigit():
        # Stop at the first non-numeric component (e.g. a pre-release tag).
        break
    graphlab_version.append(int(component))

# An unparseable version (empty list) is treated as a modern release.
if graphlab_version and tuple(graphlab_version) <= (1, 6, 1):
    tfidf = tfidf['docs']

tfidf


# In[52]:

# Store the TF-IDF values as a column of the full table.
douban['tfidf'] = tfidf


# In[53]:

# Re-select the '围城' rows now that douban carries the 'tfidf' column.
is_weicheng = douban['name'] == '围城'
weicheng = douban[is_weicheng]


# In[54]:

# Stack the 'tfidf' column of the '围城' rows into 'word' / 'tfidf' columns,
# then sort by tfidf, highest first.
weicheng_tfidf_table = weicheng[['tfidf']].stack(
    'tfidf',
    new_column_name=['word', 'tfidf'],
)
weicheng_tfidf_table.sort('tfidf', ascending=False)


# In[55]:

# Build a nearest-neighbors model over the 'tfidf' feature, labelled by 'name'.
knn_model = graphlab.nearest_neighbors.create(
    douban,
    features=['tfidf'],
    label='name',
)


# In[56]:

# Query the model with the '围城' rows.
knn_model.query(weicheng)

# In[ ]:

代码地址(附作业答案): https://github.com/RedheatWei/aiproject/tree/master/Machine%20Learning%20Specialization/week4

爬虫地址: https://github.com/RedheatWei/douban_book_intro