In [2]:
import graphlab
In [3]:
# Set GraphLab's GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS runtime option to 8.
# NOTE(review): presumably this caps the worker processes used to evaluate
# Python lambdas; 8 is hard-coded, so confirm it suits the host machine.
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 8)
In [4]:
# Point GraphLab Canvas at the 'ipynb' target so that its output opens inline
# in this notebook.
graphlab.canvas.set_target('ipynb')
In [5]:
# Load the SFrame stored at the relative path 'people_wiki.gl/'.
# Later cells read its 'name' and 'text' columns.
people = graphlab.SFrame('people_wiki.gl/')
In [6]:
# Preview the first rows of the freshly loaded data.
people.head()
Out[6]:
In [7]:
# Run GraphLab's count_words over the 'text' column and store the result in a
# new 'word_count' column (later stacked into 'word' / 'count' pairs).
people['word_count'] = graphlab.text_analytics.count_words(people['text'])
In [8]:
# Compute TF-IDF values from the 'word_count' column.
tfidf = graphlab.text_analytics.tf_idf(people['word_count'])
In [9]:
# Attach the TF-IDF values to the frame as a 'tfidf' column so they can be used
# as a feature alongside 'word_count'.
people['tfidf'] = tfidf
In [10]:
# Preview the frame again, now including the 'word_count' and 'tfidf' columns.
people.head()
Out[10]:
In [11]:
# Keep only the rows whose 'name' is exactly 'Elton John'.
# NOTE(review): later cells index row 0 of this result, so they assume the
# name matches at least one article.
elton = people[people['name'] == 'Elton John']
In [12]:
# Inspect the row(s) selected for Elton John.
elton.head()
Out[12]:
In [13]:
# Expand Elton John's 'tfidf' entry into one row per word ('word', 'tfidf')
# and list the rows from highest to lowest TF-IDF value.
(
    elton[['tfidf']]
    .stack('tfidf', new_column_name=['word', 'tfidf'])
    .sort('tfidf', ascending=False)
)
Out[13]:
In [14]:
# Expand Elton John's 'word_count' entry into one row per word
# ('word', 'count') and list the rows from highest to lowest count.
(
    elton[['word_count']]
    .stack('word_count', new_column_name=['word', 'count'])
    .sort('count', ascending=False)
)
Out[14]:
What’s the cosine distance between the articles on ‘Elton John’ and ‘Victoria Beckham’? What’s the cosine distance between the articles on ‘Elton John’ and ‘Paul McCartney’? Which one of the two is closest to Elton John? Does this result make sense to you? Save these results to answer the quiz at the end.
In [15]:
# Keep only the rows whose 'name' is exactly 'Victoria Beckham'.
victoria = people[people['name'] == 'Victoria Beckham']
In [16]:
# Cosine distance between the TF-IDF values of the first 'Elton John' row
# and the first 'Victoria Beckham' row.
graphlab.distances.cosine(
    elton['tfidf'][0],
    victoria['tfidf'][0]
)
Out[16]:
In [17]:
# Keep only the rows whose 'name' is exactly 'Paul McCartney'.
paul = people[people['name'] == 'Paul McCartney']
In [18]:
# Cosine distance between the TF-IDF values of the first 'Elton John' row
# and the first 'Paul McCartney' row.
graphlab.distances.cosine(
    elton['tfidf'][0],
    paul['tfidf'][0]
)
Out[18]:
In [19]:
# Nearest-neighbours model over every article, comparing the 'tfidf' feature
# with cosine distance and labelling results by 'name'.
cosine_tfidf_model = graphlab.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
    distance='cosine'
)
In [20]:
# Nearest-neighbours model over every article, comparing the raw 'word_count'
# feature with cosine distance and labelling results by 'name'.
cosine_word_count_model = graphlab.nearest_neighbors.create(
    people,
    features=['word_count'],
    label='name',
    distance='cosine'
)
In [21]:
# Query the word-count model with the Elton John row(s) to list the articles
# nearest to it by cosine distance on 'word_count'.
cosine_word_count_model.query(elton)
Out[21]:
What’s the most similar article, other than itself, to the one on ‘Elton John’ using TF-IDF features?
In [22]:
# Query the TF-IDF model with the Elton John row(s) to list the articles
# nearest to it by cosine distance on 'tfidf'.
cosine_tfidf_model.query(elton)
Out[22]:
What’s the most similar article, other than itself, to the one on ‘Victoria Beckham’ using word count features?
In [23]:
# Query the word-count model with the Victoria Beckham row(s) to list the
# articles nearest to it by cosine distance on 'word_count'.
cosine_word_count_model.query(victoria)
Out[23]:
What’s the most similar article, other than itself, to the one on ‘Victoria Beckham’ using TF-IDF features?
In [24]:
# Query the TF-IDF model with the Victoria Beckham row(s) to list the
# articles nearest to it by cosine distance on 'tfidf'.
cosine_tfidf_model.query(victoria)
Out[24]:
In [ ]: