In [2]:
import graphlab
In [7]:
# Load the people dataset into an SFrame from the relative path 'people_wiki.gl/'.
people = graphlab.SFrame('people_wiki.gl/')
The data contains, for each person: a link to the Wikipedia article, the person's name, and the text of the article.
In [8]:
# Preview the first 5 rows of the dataset.
people.head(5)
Out[8]:
In [9]:
# Count the rows in the dataset.
people.num_rows()
Out[9]:
In [10]:
# Keep only the row(s) whose 'name' equals 'Barack Obama'.
obama = people[people['name'] == 'Barack Obama']
In [11]:
# Display the selected Obama row(s).
obama
Out[11]:
In [14]:
# Show the 'text' column (the article text) of the Obama selection.
obama['text']
In [15]:
# Same name lookup for George Clooney, then show the 'text' column of that selection.
clooney = people[people['name'] == 'George Clooney']
clooney['text']
In [16]:
# Add a 'word_count' column computed from the article text by count_words.
obama['word_count'] = graphlab.text_analytics.count_words(obama['text'])
In [17]:
# Print the word counts. The parenthesized call form produces the same output on
# Python 2 and is also valid syntax on Python 3, unlike the bare print statement.
print(obama['word_count'])
In [18]:
# Unpack the 'word_count' column into a table with one row per entry,
# using 'word' and 'count' as the two resulting column names.
obama_word_count_table = obama.select_columns(['word_count']).stack(
    'word_count',
    new_column_name=['word', 'count']
)
In [22]:
# Preview the first 5 rows of the word/count table.
obama_word_count_table.head(5)
Out[22]:
In [23]:
# Show the table sorted by 'count' in descending order (most frequent words first).
# The sorted result is only displayed, not assigned back.
obama_word_count_table.sort('count',ascending=False)
Out[23]:
The most common words include uninformative words like "the", "in", "and", ... These words don't carry any meaningful information about the article, so we can remove them; such words are commonly called stop words.
To give more weight to informative words, we weigh them by their TF-IDF scores. TF-IDF scores the importance of a word in a document based on how often it appears in that document and how rarely it appears across the other documents. Because these statistics depend on the entire corpus, we compute TF-IDF over the whole dataset rather than for a single article.
In [28]:
# Compute word counts for every article in the dataset and store them in a new column.
people['word_count'] = graphlab.text_analytics.count_words(people['text'])
# Preview the first 5 rows, now including 'word_count'.
people.head(5)
Out[28]:
In [30]:
# Compute TF-IDF from the per-article word counts of the whole dataset.
tfidf = graphlab.text_analytics.tf_idf(people['word_count'])
# Display the result.
tfidf
Out[30]:
In [31]:
# Store the 'docs' field of the TF-IDF result as the 'tfidf' column of people.
# NOTE(review): indexing by 'docs' assumes tf_idf returns a frame with that column;
# confirm against the installed GraphLab version.
people['tfidf'] = tfidf['docs']
In [32]:
# Re-select Obama now that people has the 'tfidf' column; the earlier selection
# was made before that column was added.
obama = people[people['name'] == 'Barack Obama']
In [33]:
# Expand Obama's 'tfidf' column into (word, tfidf) rows and show them
# ordered from the highest score to the lowest.
(
    obama.select_columns(['tfidf'])
    .stack('tfidf', new_column_name=['word', 'tfidf'])
    .sort('tfidf', ascending=False)
)
Out[33]:
In [34]:
# Select the row(s) whose 'name' equals 'Bill Clinton'.
clinton = people[people['name'] == 'Bill Clinton']
In [35]:
# Select the row(s) whose 'name' equals 'David Beckham'.
beckham = people[people['name'] == 'David Beckham']
We will use cosine distance, which is given by
(1 - cosine_similarity), to compute the distance between two documents,
and find that the article about President Obama is closer to the one about former President Clinton than to that of footballer David Beckham.
In [36]:
# Cosine distance between the Obama and Clinton TF-IDF entries
# ([0] takes the first element of each selection's 'tfidf' column).
graphlab.distances.cosine(obama['tfidf'][0],clinton['tfidf'][0])
Out[36]:
In [38]:
# Cosine distance between the Obama and Beckham TF-IDF entries, for comparison
# with the Obama/Clinton distance.
graphlab.distances.cosine(obama['tfidf'][0],beckham['tfidf'][0])
Out[38]:
In [40]:
# Build a nearest-neighbors model over all people, using the 'tfidf' column as the
# feature and 'name' as the label attached to each row.
# NOTE(review): no `distance` argument is passed, so the library picks its own default
# for this feature type; confirm that it matches the cosine distance used above.
knn_model = graphlab.nearest_neighbors.create(people,features=['tfidf'],label='name')
In [41]:
# Query the model for the entries nearest to the Obama row.
knn_model.query(obama)
Out[41]:
As we can see, President Obama's article is closest to the one about his vice president, Biden, and to those of other politicians.
In [42]:
# Select the row(s) whose 'name' equals 'Taylor Swift'.
swift = people[people['name'] == 'Taylor Swift']
In [44]:
# Query the model for the entries nearest to the Taylor Swift row.
knn_model.query(swift)
Out[44]:
In [30]:
# Select the row(s) whose 'name' equals 'Angelina Jolie'.
jolie = people[people['name'] == 'Angelina Jolie']
In [31]:
# Query the model for the entries nearest to the Angelina Jolie row.
knn_model.query(jolie)
Out[31]:
In [45]:
# Select the row(s) whose 'name' equals 'Arnold Schwarzenegger'.
arnold = people[people['name'] == 'Arnold Schwarzenegger']
In [46]:
# Query the model for the entries nearest to the Arnold Schwarzenegger row.
knn_model.query(arnold)
Out[46]:
In [ ]: