In [1]:
import graphlab
In [2]:
# Load the SFrame stored under 'people_wiki.gl/'. Later cells read its
# 'name' and 'text' columns.
people = graphlab.SFrame('people_wiki.gl/')
In [3]:
# Add a 'word_count' column by running count_words over each row's 'text'.
people['word_count'] = graphlab.text_analytics.count_words(people['text'])
In [6]:
# Keep only the rows whose 'name' is exactly 'Elton John'.
elton = people[people['name'] == 'Elton John']
In [8]:
# Compute word counts for the Elton John selection from its 'text' column.
# NOTE(review): `elton` was filtered from `people` after people['word_count']
# had been assigned, so this looks like a recomputation of a column the
# selection already carries - confirm and drop if so.
elton['word_count'] = graphlab.text_analytics.count_words(elton['text'])
In [12]:
# Reshape the Elton John word counts into a long table: each entry of
# 'word_count' becomes its own row, split into a 'word' and a 'count' column.
elton_word_count_table = elton[['word_count']].stack(
    'word_count',
    new_column_name=['word', 'count'])
In [14]:
# Display the word table ordered by 'count', largest first. The sorted result
# is only shown, not stored.
elton_word_count_table.sort('count', ascending=False)
Out[14]:
In [16]:
# Run tf_idf over the 'word_count' column of the whole `people` SFrame.
tfidf = graphlab.text_analytics.tf_idf(people['word_count'])
In [17]:
# The return type of tf_idf depends on the GraphLab Create release: older
# releases hand back an SFrame whose 'docs' column holds the weights, newer
# ones return the SArray of weights directly (indexing that with 'docs' fails).
# Accept either so this cell runs on both.
people['tfidf'] = tfidf['docs'] if isinstance(tfidf, graphlab.SFrame) else tfidf
In [19]:
# Re-select the 'Elton John' rows now that `people` has a 'tfidf' column;
# the earlier `elton` selection was taken before that column was added.
elton2 = people[people['name'] == 'Elton John']
In [22]:
elton_tfidf_talbe = elton2[['tfidf']].stack('tfidf', new_column_name=['word', 'tfidf'])
In [23]:
elton_tfidf_talbe.sort('tfidf', ascending=False)
Out[23]:
In [26]:
# Keep only the rows whose 'name' is exactly 'Paul McCartney'.
paul = people[people['name'] == 'Paul McCartney']
In [25]:
# Keep only the rows whose 'name' is exactly 'Victoria Beckham'.
victoria = people[people['name'] == 'Victoria Beckham']
In [30]:
# Cosine distance between the 'tfidf' value of the first row in the
# Elton John selection and that of the first row in the Paul McCartney
# selection (each selection is presumably a single row).
graphlab.distances.cosine(
    elton2['tfidf'][0],
    paul['tfidf'][0])
Out[30]:
In [31]:
# Cosine distance between the 'tfidf' value of the first row in the
# Elton John selection and that of the first row in the Victoria Beckham
# selection (each selection is presumably a single row).
graphlab.distances.cosine(
    elton2['tfidf'][0],
    victoria['tfidf'][0])
Out[31]:
In [39]:
# Nearest-neighbour model over all of `people`, comparing rows by the cosine
# distance of their 'word_count' column and labelling results with 'name'.
word_count_knn_model = graphlab.nearest_neighbors.create(
    people,
    label='name',
    features=['word_count'],
    distance='cosine')
In [41]:
# Nearest-neighbour model over all of `people`, comparing rows by the cosine
# distance of their 'tfidf' column and labelling results with 'name'.
tfidf_knn_model = graphlab.nearest_neighbors.create(
    people,
    label='name',
    features=['tfidf'],
    distance='cosine')
In [42]:
# Query the word-count model with the Elton John selection and display the
# neighbours it returns.
word_count_knn_model.query(elton2)
Out[42]:
In [44]:
# Query the TF-IDF model with the Elton John selection and display the
# neighbours it returns.
tfidf_knn_model.query(elton2)
Out[44]:
In [45]:
# Query the word-count model with the Victoria Beckham selection and display
# the neighbours it returns.
word_count_knn_model.query(victoria)
Out[45]:
In [46]:
# Query the TF-IDF model with the Victoria Beckham selection and display the
# neighbours it returns.
tfidf_knn_model.query(victoria)
Out[46]: