In [3]:
import graphlab
In [4]:
# Load the corpus of Wikipedia biography articles into an SFrame
# (one row per person; columns include at least 'name' and 'text').
people = graphlab.SFrame('people_wiki.gl/')
Los datos contienen artículos de Wikipedia sobre diferentes personas.
In [5]:
# Preview the first few rows of the dataset.
people.head()
Out[5]:
In [6]:
# Number of articles (rows) in the corpus.
len(people)
Out[6]:
In [11]:
# Select the single row whose 'name' is exactly 'Barack Obama'.
obama = people[people['name'] == 'Barack Obama']
In [12]:
# Rich display of the selected row.
obama
Out[12]:
In [13]:
# Show the raw article text for the selected row.
obama['text']
Out[13]:
In [14]:
# Add a bag-of-words column: count_words maps each document's text to
# a dict of word -> occurrence count.
obama['word_count'] = graphlab.text_analytics.count_words(obama['text'])
In [15]:
# Print the word-count dictionary for the Obama article.
# Use the print() function form, which is valid in both Python 2 and
# Python 3, instead of the Python 2-only print statement.
print(obama['word_count'])
In [16]:
# Unpack the word_count dict column into a long-format table with one
# row per (word, count) pair.
obama_word_count_table = obama[['word_count']].stack('word_count', new_column_name = ['word','count'])
In [17]:
# Preview the stacked word/count table.
obama_word_count_table.head()
Out[17]:
In [18]:
# Most frequent words first — these will mostly be stop words
# ('the', 'in', ...), which is why raw counts are uninformative.
obama_word_count_table.sort('count',ascending=False)
Out[18]:
Las palabras más comunes no nos aportan información.
In [19]:
# Compute the bag-of-words representation for EVERY article in the
# corpus, then preview the result.
people['word_count'] = graphlab.text_analytics.count_words(people['text'])
people.head()
Out[19]:
In [20]:
# Weight each word by TF-IDF across the whole corpus so that corpus-wide
# common words are down-weighted relative to document-specific ones.
people['tfidf'] = graphlab.text_analytics.tf_idf(people['word_count'])
In [21]:
# Re-select the Obama row so it now carries the new 'tfidf' column
# (the earlier `obama` slice predates that column).
obama = people[people['name'] == 'Barack Obama']
In [22]:
# Highest-TF-IDF words for the Obama article — these are far more
# distinctive than the raw-count ranking above.
obama[['tfidf']].stack('tfidf',new_column_name=['word','tfidf']).sort('tfidf',ascending=False)
Out[22]:
El algoritmo TF-IDF nos aporta más información.
In [23]:
# Build a nearest-neighbors retrieval model over the 'tfidf' vectors,
# labeling each row by the person's name.
# NOTE(review): distance metric not specified — presumably the library
# default; confirm against graphlab.nearest_neighbors.create docs.
knn_model = graphlab.nearest_neighbors.create(people,features=['tfidf'],label='name')
In [18]:
# Retrieve the articles closest to Obama's in TF-IDF space.
knn_model.query(obama)
Out[18]:
In [19]:
# Select the Taylor Swift article for a second retrieval example.
swift = people[people['name'] == 'Taylor Swift']
In [20]:
# Nearest neighbors of Taylor Swift's article.
knn_model.query(swift)
Out[20]:
In [21]:
# Select the Angelina Jolie article.
jolie = people[people['name'] == 'Angelina Jolie']
In [22]:
# Nearest neighbors of Angelina Jolie's article.
knn_model.query(jolie)
Out[22]:
In [23]:
# Select the Arnold Schwarzenegger article.
arnold = people[people['name'] == 'Arnold Schwarzenegger']
In [24]:
# Nearest neighbors of Arnold Schwarzenegger's article.
knn_model.query(arnold)
Out[24]:
In [ ]:
In [ ]: