In [1]:
import graphlab
In [2]:
# Load the people dataset from its on-disk SFrame directory.
people_path = 'people_wiki.gl/'
people = graphlab.SFrame(people_path)
The data contains, for each person: a link to their Wikipedia article, the person's name, and the text of the article.
In [4]:
# Preview the first few rows of the dataset.
people.head()
Out[4]:
In [3]:
# Number of rows in the dataset.
len(people)
Out[3]:
In [4]:
# Build one boolean mask per person, then slice `people` with it.
is_obama = people['name'] == 'Barack Obama'
is_john = people['name'] == 'Elton John'

obama = people[is_obama]
john = people[is_john]
In [5]:
# Show the row(s) selected for Elton John.
john
Out[5]:
In [8]:
# Show the 'text' column of the Barack Obama selection.
obama['text']
Out[8]:
In [9]:
# Select the George Clooney row(s) and show the 'text' column.
is_clooney = people['name'] == 'George Clooney'
clooney = people[is_clooney]
clooney['text']
Out[9]:
In [6]:
# Add a 'word_count' column to each selection, computed from its 'text'.
count_words = graphlab.text_analytics.count_words
obama['word_count'] = count_words(obama['text'])
john['word_count'] = count_words(john['text'])
In [7]:
# Show Elton John's word counts.
# The parenthesised single-argument form prints the same text under
# Python 2 and also parses under Python 3 (the bare print statement does not).
print(john['word_count'])
In [8]:
# Stack each 'word_count' column into a two-column table named
# 'word' and 'count'.
stacked_names = ['word', 'count']

obama_word_count_table = obama[['word_count']].stack(
    'word_count', new_column_name=stacked_names)
john_word_count_table = john[['word_count']].stack(
    'word_count', new_column_name=stacked_names)
In [9]:
# Preview both word-count tables.
# A notebook cell only renders its final expression, so Obama's table has to
# be printed explicitly or it is silently discarded. John's table stays the
# final expression and is rendered as the cell output, as before.
print(obama_word_count_table.head())
john_word_count_table.head()
Out[9]:
In [10]:
# Show each table ordered by 'count', largest first.
# sort() results are not assigned, so the first one would be thrown away
# unless printed: a cell only renders its final expression.
print(obama_word_count_table.sort('count', ascending=False))
john_word_count_table.sort('count', ascending=False)
Out[10]:
The most common words include uninformative ones such as "the", "in", "and", etc.
In [11]:
# Compute word counts for every article and store them on `people`.
all_word_counts = graphlab.text_analytics.count_words(people['text'])
people['word_count'] = all_word_counts

# Preview the table with the new column.
people.head()
Out[11]:
In [12]:
# Compute TF-IDF from the per-article word counts, then show the result.
article_word_counts = people['word_count']
tfidf = graphlab.text_analytics.tf_idf(article_word_counts)
tfidf
Out[12]:
In [13]:
# Attach each article's TF-IDF vector to `people`.
# Depending on the GraphLab Create release, tf_idf() returns either an SFrame
# whose 'docs' column holds the vectors or the vector column itself; accept
# both so this cell keeps working across versions.
if isinstance(tfidf, graphlab.SFrame):
    people['tfidf'] = tfidf['docs']
else:
    people['tfidf'] = tfidf
In [14]:
# Re-select both people so the selections include the columns added to
# `people` since they were first created.
names = people['name']
obama = people[names == 'Barack Obama']
john = people[names == 'Elton John']
In [15]:
# Stack each 'tfidf' column into ('word', 'tfidf') rows, highest weight first.
# Only the final expression of a cell is rendered, so Obama's table is printed
# explicitly; John's stays the final expression and is rendered as before.
print(obama[['tfidf']].stack('tfidf', new_column_name=['word', 'tfidf']).sort('tfidf', ascending=False))
john[['tfidf']].stack('tfidf', new_column_name=['word', 'tfidf']).sort('tfidf', ascending=False)
Out[15]:
The words with the highest TF-IDF scores are much more informative.
In [19]:
# Select the rows for Bill Clinton and Paul McCartney.
is_clinton = people['name'] == 'Bill Clinton'
is_paul = people['name'] == 'Paul McCartney'

clinton = people[is_clinton]
paul = people[is_paul]
In [28]:
# Select the row(s) for Victoria Beckham.
is_beckham = people['name'] == 'Victoria Beckham'
beckham = people[is_beckham]
In [37]:
# Cosine distance from Obama's TF-IDF vector to Clinton's and to Beckham's.
# [0] takes the first (expected only) row of each selection.
# print(...) with one argument behaves the same on Python 2 and parses on
# Python 3, unlike the bare print statement.
print(graphlab.distances.cosine(obama['tfidf'][0], clinton['tfidf'][0]))
print(graphlab.distances.cosine(obama['tfidf'][0], beckham['tfidf'][0]))
In [36]:
# Cosine distances on TF-IDF vectors: Obama/Beckham, John/Beckham, John/Paul.
# [0] takes the first (expected only) row of each selection.
# print(...) with one argument behaves the same on Python 2 and parses on
# Python 3, unlike the bare print statement.
print(graphlab.distances.cosine(obama['tfidf'][0], beckham['tfidf'][0]))
print(graphlab.distances.cosine(john['tfidf'][0], beckham['tfidf'][0]))
print(graphlab.distances.cosine(john['tfidf'][0], paul['tfidf'][0]))
In [34]:
# Build two nearest-neighbour models over `people`, both labelled by 'name'
# and using cosine distance: one on TF-IDF vectors, one on raw word counts.
shared_options = dict(label='name', distance='cosine')

knn_model = graphlab.nearest_neighbors.create(
    people, features=['tfidf'], **shared_options)
wc_model = graphlab.nearest_neighbors.create(
    people, features=['word_count'], **shared_options)
In [32]:
# Query the TF-IDF model with the Victoria Beckham selection.
knn_model.query(beckham)
Out[32]:
In [35]:
# Compare the neighbours of Elton John's article under the word-count model
# and under the TF-IDF model.
# print(...) with one argument behaves the same on Python 2 and parses on
# Python 3, unlike the bare print statement.
print(wc_model.query(john))
print(knn_model.query(john))
Running the same query for Barack Obama (`knn_model.query(obama)`) shows that President Obama's article is closest to the one about his vice president, Biden, and to those of other politicians.
In [28]:
# Select the row(s) for Taylor Swift.
is_swift = people['name'] == 'Taylor Swift'
swift = people[is_swift]
In [29]:
# Query the TF-IDF model with the Taylor Swift selection.
knn_model.query(swift)
Out[29]:
In [30]:
# Select the row(s) for Angelina Jolie.
is_jolie = people['name'] == 'Angelina Jolie'
jolie = people[is_jolie]
In [31]:
# Query the TF-IDF model with the Angelina Jolie selection.
knn_model.query(jolie)
Out[31]:
In [32]:
# Select the row(s) for Arnold Schwarzenegger.
is_arnold = people['name'] == 'Arnold Schwarzenegger'
arnold = people[is_arnold]
In [33]:
# Query the TF-IDF model with the Arnold Schwarzenegger selection.
knn_model.query(arnold)
Out[33]:
In [ ]: