In [2]:
import graphlab
In [3]:
# Load the dataset stored at the relative path 'people_wiki.gl/' into an SFrame.
people = graphlab.SFrame('people_wiki.gl/')
The data contains, for each person: a link to the Wikipedia article, the person's name, and the text of the article.
In [4]:
# Preview the first rows to check which columns were loaded.
people.head()
Out[4]:
In [5]:
# Size of the dataset (length of the SFrame).
len(people)
Out[5]:
In [6]:
# Build a boolean mask over the 'name' column, then keep only the matching row(s).
is_obama = people['name'] == 'Barack Obama'
obama = people[is_obama]
In [7]:
# Display the selected Barack Obama row(s).
obama
Out[7]:
In [8]:
# Show only the article text column of the Obama selection.
obama['text']
Out[8]:
In [9]:
# Same lookup by name for George Clooney; the cell then displays his article text.
is_clooney = people['name'] == 'George Clooney'
clooney = people[is_clooney]
clooney['text']
Out[9]:
In [10]:
# Add a 'word_count' column to the Obama selection, computed by count_words
# from the article text.
obama['word_count'] = graphlab.text_analytics.count_words(obama['text'])
In [11]:
# Use the call form of print: with a single argument it prints the same text
# under Python 2 and is also valid Python 3 syntax, unlike the bare statement.
print(obama['word_count'])
In [12]:
# Stack the 'word_count' column into a table whose two columns are named
# 'word' and 'count'.
obama_word_count_table = obama[['word_count']].stack(
    'word_count',
    new_column_name=['word', 'count'],
)
In [13]:
# Preview the stacked word/count table.
obama_word_count_table.head()
Out[13]:
In [14]:
# Display the table ordered by descending 'count'. The sorted result is only
# shown here; it is not assigned back to obama_word_count_table.
obama_word_count_table.sort('count',ascending=False)
Out[14]:
The most common words include uninformative ones such as "the", "in", "and", and so on.
In [14]:
# Compute word counts for every article and store them in a new 'word_count'
# column on `people`.
people['word_count'] = graphlab.text_analytics.count_words(people['text'])
# Preview the frame to confirm the new column is present.
people.head()
Out[14]:
In [16]:
# Compute TF-IDF from the per-article word counts.
tfidf = graphlab.text_analytics.tf_idf(people['word_count'])
# Display the result to inspect its structure.
tfidf
Out[16]:
In [17]:
# tf_idf's return type depends on the installed GraphLab Create release: older
# releases return an SFrame with a single 'docs' column, newer ones return the
# TF-IDF SArray directly (check the tf_idf API docs for your version). Accept
# both so this cell does not fail after a library upgrade.
people['tfidf'] = tfidf['docs'] if isinstance(tfidf, graphlab.SFrame) else tfidf
In [18]:
# Re-select Obama from `people`: the earlier `obama` was taken before the
# 'word_count' and 'tfidf' columns were added to `people`.
obama = people[people['name'] == 'Barack Obama']
In [19]:
# Stack Obama's 'tfidf' column into a table with columns 'word' and 'tfidf',
# then display it with the highest scores first.
obama_tfidf_table = obama[['tfidf']].stack('tfidf', new_column_name=['word', 'tfidf'])
obama_tfidf_table.sort('tfidf', ascending=False)
Out[19]:
The words with the highest TF-IDF scores are much more informative.
In [20]:
# Select the Bill Clinton row(s) by name.
is_clinton = people['name'] == 'Bill Clinton'
clinton = people[is_clinton]
In [ ]:
# Select the David Beckham row(s) by name. `beckham` is used by a
# cosine-distance comparison further down, so this cell must be run before it.
is_beckham = people['name'] == 'David Beckham'
beckham = people[is_beckham]
In [24]:
# Cosine distance between the TF-IDF entries of the first row of each selection.
obama_tfidf = obama['tfidf'][0]
clinton_tfidf = clinton['tfidf'][0]
graphlab.distances.cosine(obama_tfidf, clinton_tfidf)
Out[24]:
In [25]:
# Same comparison, this time between Obama and Beckham.
obama_tfidf = obama['tfidf'][0]
beckham_tfidf = beckham['tfidf'][0]
graphlab.distances.cosine(obama_tfidf, beckham_tfidf)
Out[25]:
In [26]:
# Build a nearest-neighbours model over all articles, using the 'tfidf' column
# as the feature and the 'name' column as each row's label.
knn_model = graphlab.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
)
In [27]:
# Query the model with the Obama row to list its nearest neighbours.
knn_model.query(obama)
Out[27]:
As we can see, President Obama's article is closest to the one about his vice president, Biden, and to those about other politicians.
In [28]:
# Select the Taylor Swift row(s) by name.
is_swift = people['name'] == 'Taylor Swift'
swift = people[is_swift]
In [29]:
# Nearest neighbours of the Taylor Swift article.
knn_model.query(swift)
Out[29]:
In [30]:
# Select the Angelina Jolie row(s) by name.
is_jolie = people['name'] == 'Angelina Jolie'
jolie = people[is_jolie]
In [31]:
# Nearest neighbours of the Angelina Jolie article.
knn_model.query(jolie)
Out[31]:
In [32]:
# Select the Arnold Schwarzenegger row(s) by name.
is_arnold = people['name'] == 'Arnold Schwarzenegger'
arnold = people[is_arnold]
In [33]:
# Nearest neighbours of the Arnold Schwarzenegger article.
knn_model.query(arnold)
Out[33]:
In [ ]: