In [1]:
import graphlab
In [2]:
people = graphlab.SFrame('people_wiki.gl/')
The data contains, for each person: a link to the Wikipedia article, the person's name, and the text of the article.
In [3]:
people.head()
Out[3]:
In [4]:
len(people)
Out[4]:
In [5]:
# Keep only the rows of `people` whose 'name' is exactly 'Barack Obama'.
obama = people[people['name'] == 'Barack Obama']
In [6]:
obama
Out[6]:
In [7]:
obama['text']
Out[7]:
In [8]:
# Same lookup for George Clooney, then display the article text.
clooney = people[people['name'] == 'George Clooney']
clooney['text']
Out[8]:
In [9]:
# Count the words in the Obama article text and store the result in a new
# 'word_count' column on the filtered frame.
obama['word_count'] = graphlab.text_analytics.count_words(obama['text'])
In [10]:
# Show the word-count column for the Obama article. The parenthesised call
# form prints the same thing under Python 2 and is also valid Python 3 syntax.
print(obama['word_count'])
In [11]:
# Unpack the 'word_count' column into a two-column table:
# one row per entry, labelled 'word' and 'count'.
obama_word_count_table = obama[['word_count']].stack(
    'word_count',
    new_column_name=['word', 'count']
)
In [12]:
obama_word_count_table.head()
Out[12]:
In [13]:
obama_word_count_table.sort('count',ascending=False)
Out[13]:
The most common words are uninformative ones such as "the", "in", and "and".
In [17]:
# Compute word counts for every article in the corpus and attach them to
# `people` as a 'word_count' column (this adds a column to `people` in place).
people['word_count'] = graphlab.text_analytics.count_words(people['text'])
# Preview the first rows to confirm the new column is present.
people.head()
Out[17]:
In [20]:
# Compute TF-IDF from the per-article word counts for the whole corpus.
tfidf = graphlab.text_analytics.tf_idf(people['word_count'])
# Inspect the first entry of the result.
tfidf[0]
Out[20]:
In [21]:
# Attach the TF-IDF values to `people` as a 'tfidf' column.
# NOTE(review): indexing with ['docs'] assumes tf_idf() returned a table with a
# 'docs' column; some GraphLab Create releases reportedly return the column
# directly, in which case this should be `tfidf` — confirm against the
# installed version.
people['tfidf'] = tfidf['docs']
In [22]:
# Re-select Obama: the earlier `obama` was filtered before the 'word_count'
# and 'tfidf' columns were added to `people`, so it is selected again here
# from the updated frame before the 'tfidf' column is used below.
obama = people[people['name'] == 'Barack Obama']
In [23]:
# Unpack Obama's 'tfidf' column into ('word', 'tfidf') rows and list them
# from the highest TF-IDF value down.
obama[['tfidf']].stack(
    'tfidf',
    new_column_name=['word', 'tfidf']
).sort('tfidf', ascending=False)
Out[23]:
Words with highest TF-IDF are much more informative.
In [24]:
clinton = people[people['name'] == 'Bill Clinton']
In [25]:
beckham = people[people['name'] == 'David Beckham']
In [26]:
# Cosine distance between the TF-IDF entries of the Obama and Clinton
# articles; [0] takes the first row of each filtered frame.
graphlab.distances.cosine(obama['tfidf'][0],clinton['tfidf'][0])
Out[26]:
In [27]:
graphlab.distances.cosine(obama['tfidf'][0],beckham['tfidf'][0])
Out[27]:
In [28]:
# Build a nearest-neighbours model over the 'tfidf' feature, labelling each
# article by the person's name. No `distance` is passed here, so the library
# default applies (the models created later in this notebook request
# 'cosine' explicitly).
knn_model = graphlab.nearest_neighbors.create(people,features=['tfidf'],label='name')
In [29]:
knn_model.query(obama)
Out[29]:
As we can see, President Obama's article is closest to the one about his vice president, Biden, and to those of other politicians.
In [28]:
swift = people[people['name'] == 'Taylor Swift']
In [29]:
knn_model.query(swift)
Out[29]:
In [30]:
jolie = people[people['name'] == 'Angelina Jolie']
In [31]:
knn_model.query(jolie)
Out[31]:
In [32]:
arnold = people[people['name'] == 'Arnold Schwarzenegger']
In [33]:
knn_model.query(arnold)
Out[33]:
In [36]:
elton_john = people[people['name'] == 'Elton John']
In [37]:
elton_john
Out[37]:
In [39]:
# Unpack Elton John's 'word_count' column into ('word', 'count') rows,
# the same way as for the Obama table earlier, and display the result.
elton_john_wc_table = elton_john[['word_count']].stack('word_count',new_column_name = ['word','count'])
elton_john_wc_table
Out[39]:
In [40]:
elton_john_wc_table.sort('count',ascending=False)
Out[40]:
In [42]:
# Unpack Elton John's 'tfidf' column into one row per word and rank the rows
# by TF-IDF weight, highest first.
# The columns are named 'word' / 'tfidf', matching the Obama TF-IDF table
# earlier in the notebook: stack() emits the key column first and the value
# column second, so the first column holds the word and the second holds the
# TF-IDF score (it is not a count).
elton_john_tfidf_table = elton_john[['tfidf']].stack('tfidf', new_column_name=['word', 'tfidf'])
elton_john_tfidf_table.sort('tfidf', ascending=False)
Out[42]:
In [43]:
vic_beckham = people[people['name'] == 'Victoria Beckham']
In [44]:
graphlab.distances.cosine(elton_john['tfidf'][0],vic_beckham['tfidf'][0])
Out[44]:
In [46]:
paul_m = people[people['name']== 'Paul McCartney']
In [47]:
graphlab.distances.cosine(elton_john['tfidf'][0],paul_m['tfidf'][0])
Out[47]:
In [48]:
# Nearest-neighbours model on the 'tfidf' feature, with cosine distance
# requested explicitly and results labelled by name.
knn_model_tfidf = graphlab.nearest_neighbors.create(people,features=['tfidf'],label='name',distance='cosine')
In [49]:
# Companion model using the 'word_count' feature with the same cosine
# distance, so its query results can be compared with the TF-IDF model's.
knn_model_wc= graphlab.nearest_neighbors.create(people,features=['word_count'],label = 'name',distance='cosine')
In [50]:
knn_model_wc.query(elton_john)
Out[50]:
In [51]:
knn_model_tfidf.query(elton_john)
Out[51]:
In [52]:
knn_model_wc.query(vic_beckham)
Out[52]:
In [53]:
knn_model_tfidf.query(vic_beckham)
Out[53]:
In [ ]: