In [1]:
import graphlab
In [2]:
# Load the people dataset from the local GraphLab directory.
people_wiki_path = './people_wiki.gl/'
people = graphlab.SFrame(people_wiki_path)
In [5]:
# Number of rows in the `people` SFrame.
len(people)
Out[5]:
In [6]:
# Keep only the row(s) whose name is exactly 'Barack Obama'.
is_obama = people['name'] == 'Barack Obama'
obama = people[is_obama]
In [7]:
# Display the filtered Barack Obama row(s).
obama
Out[7]:
In [8]:
# Preview the first rows of the full dataset.
people.head()
Out[8]:
In [9]:
# Keep only the row(s) whose name is exactly 'George Clooney'.
is_clooney = people['name'] == 'George Clooney'
clooney = people[is_clooney]
In [10]:
# Display the filtered George Clooney row(s).
clooney
Out[10]:
In [11]:
# Show the 'text' column of the George Clooney selection.
clooney['text']
Out[11]:
In [12]:
# Run count_words over Obama's article text and store the result
# as a new 'wordCount' column on the `obama` selection.
obama_text = obama['text']
obama['wordCount'] = graphlab.text_analytics.count_words(obama_text)
In [13]:
# Display `obama` again, now including the 'wordCount' column.
obama
Out[13]:
In [14]:
# Unpack the 'wordCount' column into one row per entry, with the
# resulting columns named 'word' and 'count'.
obama_counts_only = obama[['wordCount']]
obama_wordCountTable = obama_counts_only.stack(
    'wordCount',
    new_column_name=['word', 'count'],
)
In [15]:
# Preview the first rows of the (word, count) table.
obama_wordCountTable.head()
Out[15]:
In [16]:
# Display the table ordered by 'count', largest first.
# The sorted result is only shown, not assigned back to obama_wordCountTable.
obama_wordCountTable.sort('count', ascending=False)
Out[16]:
In [17]:
# Run count_words over every article's text, attach the result to
# `people` as 'wordCount', then preview the updated frame.
all_text = people['text']
people['wordCount'] = graphlab.text_analytics.count_words(all_text)
people.head()
Out[17]:
In [18]:
# Compute TF-IDF from the per-article word counts and display the result.
all_word_counts = people['wordCount']
tfidf = graphlab.text_analytics.tf_idf(all_word_counts)
tfidf
Out[18]:
In [19]:
# Store each article's TF-IDF weights as the 'tfidf' column on `people`.
# Older GraphLab Create releases return an SFrame with a 'docs' column from
# tf_idf(), while newer releases return the SArray directly; accept either
# shape so this cell runs on both.
if isinstance(tfidf, graphlab.SFrame):
    people['tfidf'] = tfidf['docs']
else:
    people['tfidf'] = tfidf
In [20]:
# Re-select Barack Obama so the selection picks up the columns
# added to `people` since the first selection.
is_obama = people['name'] == 'Barack Obama'
obama = people[is_obama]
In [22]:
# Unpack Obama's 'tfidf' column into ('word', 'tfidf') rows,
# then order them by TF-IDF weight, highest first.
obama_tfidf_long = obama[['tfidf']].stack(
    'tfidf',
    new_column_name=['word', 'tfidf'],
)
obama_tfidf_table = obama_tfidf_long.sort('tfidf', ascending=False)
In [23]:
# Preview the top rows of Obama's TF-IDF table (already sorted descending).
obama_tfidf_table.head()
Out[23]:
In [24]:
# Preview `people` again; it now carries the 'wordCount' and 'tfidf' columns.
people.head()
Out[24]:
In [25]:
# Keep only the row(s) whose name is exactly 'Bill Clinton'.
is_clinton = people['name'] == 'Bill Clinton'
clinton = people[is_clinton]
In [26]:
# Keep only the row(s) whose name is exactly 'David Beckham'.
is_beckham = people['name'] == 'David Beckham'
beckham = people[is_beckham]
In [28]:
# There are several ways to measure how similar two documents are;
# here we use cosine distance between their TF-IDF vectors.
# A larger distance means the articles are less alike,
# a smaller distance means they are more alike.
obama_vec = obama['tfidf'][0]
clinton_vec = clinton['tfidf'][0]
graphlab.distances.cosine(obama_vec, clinton_vec)
Out[28]:
In [29]:
# Cosine distance between the Obama and Beckham TF-IDF vectors.
obama_vec = obama['tfidf'][0]
beckham_vec = beckham['tfidf'][0]
graphlab.distances.cosine(obama_vec, beckham_vec)
Out[29]:
In [32]:
# Build a nearest-neighbours model over the TF-IDF column, labelling each
# row by the person's name. No distance is passed, so the library default applies.
knnModel = graphlab.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
)
In [33]:
# Query the model with the Obama selection and show its nearest neighbours.
knnModel.query(obama)
Out[33]:
In [34]:
# Display the model object's representation.
knnModel
Out[34]:
In [35]:
# Keep only the row(s) whose name is exactly 'Taylor Swift'.
is_swift = people['name'] == 'Taylor Swift'
swift = people[is_swift]
In [36]:
# Query the model with the Taylor Swift selection.
knnModel.query(swift)
Out[36]:
In [37]:
# Keep only the row(s) whose name is exactly 'Angelina Jolie'.
is_jolie = people['name'] == 'Angelina Jolie'
jolie = people[is_jolie]
In [38]:
# Query the model with the Angelina Jolie selection.
knnModel.query(jolie)
Out[38]:
In [39]:
# Keep only the row(s) whose name is exactly 'Arnold Schwarzenegger'.
is_arnold = people['name'] == 'Arnold Schwarzenegger'
arnold = people[is_arnold]
In [40]:
# Query the model with the Arnold Schwarzenegger selection.
knnModel.query(arnold)
Out[40]:
In [44]:
# Keep only the row(s) whose name is exactly 'Elton John'.
is_elton = people['name'] == 'Elton John'
elton = people[is_elton]
In [45]:
# Unpack Elton John's 'wordCount' column into ('word', 'count') rows,
# then order them by count, largest first.
elton_counts_long = elton[['wordCount']].stack(
    'wordCount',
    new_column_name=['word', 'count'],
)
elton_wordCountTable = elton_counts_long.sort('count', ascending=False)
In [46]:
# Display Elton John's word-count table (sorted by count, descending).
elton_wordCountTable
Out[46]:
In [47]:
# Unpack Elton John's 'tfidf' column into ('word', 'tfidf') rows,
# then order them by TF-IDF weight, highest first.
elton_tfidf_long = elton[['tfidf']].stack(
    'tfidf',
    new_column_name=['word', 'tfidf'],
)
elton_tfidf_table = elton_tfidf_long.sort('tfidf', ascending=False)
In [48]:
# Display Elton John's TF-IDF table (sorted by weight, descending).
elton_tfidf_table
Out[48]:
In [55]:
# Nearest-neighbours model over the TF-IDF column, with cosine distance
# requested explicitly.
knnCosineModel_tfidf = graphlab.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
    distance='cosine',
)
In [56]:
# Nearest-neighbours model over the raw word-count column, with cosine
# distance requested explicitly.
knnCosineModel_words = graphlab.nearest_neighbors.create(
    people,
    features=['wordCount'],
    label='name',
    distance='cosine',
)
In [51]:
# Keep only the row(s) whose name is exactly 'Victoria Beckham'.
is_victoria = people['name'] == 'Victoria Beckham'
victoria = people[is_victoria]
In [52]:
# Keep only the row(s) whose name is exactly 'Paul McCartney'.
is_paul = people['name'] == 'Paul McCartney'
paul = people[is_paul]
In [53]:
# Cosine distance between the Elton John and Victoria Beckham TF-IDF vectors.
elton_vec = elton['tfidf'][0]
victoria_vec = victoria['tfidf'][0]
graphlab.distances.cosine(elton_vec, victoria_vec)
Out[53]:
In [54]:
# Cosine distance between the Elton John and Paul McCartney TF-IDF vectors.
elton_vec = elton['tfidf'][0]
paul_vec = paul['tfidf'][0]
graphlab.distances.cosine(elton_vec, paul_vec)
Out[54]:
In [57]:
# Nearest neighbours of Elton John under the word-count cosine model.
knnCosineModel_words.query(elton)
Out[57]:
In [58]:
# Nearest neighbours of Elton John under the TF-IDF cosine model.
knnCosineModel_tfidf.query(elton)
Out[58]:
In [59]:
# Nearest neighbours of Victoria Beckham under the word-count cosine model.
knnCosineModel_words.query(victoria)
Out[59]:
In [60]:
# Nearest neighbours of Victoria Beckham under the TF-IDF cosine model.
knnCosineModel_tfidf.query(victoria)
Out[60]:
In [ ]: