In [1]:
# Load GraphLab Create (a Python 2 library by Turi/Dato) and render Canvas
# visualizations inline in the notebook instead of opening a browser tab.
import graphlab
graphlab.canvas.set_target("ipynb")
In [2]:
# Read the raw corpus; header=False so the single text column is auto-named 'X1'.
# NOTE(review): hardcoded absolute local path -- replace with a configurable
# data directory so the notebook runs on other machines.
sf = graphlab.SFrame.read_csv("/Users/chengjun/bigdata/w15", header=False)
In [3]:
# Display the SFrame (rich table rendered in Out[3]).
sf
Out[3]:
In [4]:
# List the SArray API available on the text column.
dir(sf['X1'])
Out[4]:
In [10]:
# Tokenize each document into a bag-of-words dict {word: count}.
# NOTE(review): execution counts jump (In[4] -> In[10]) -- restart the kernel
# and Run All so the notebook is reproducible top-to-bottom.
bow = sf['X1']._count_words()
In [11]:
type(sf['X1'])
Out[11]:
In [12]:
type(bow)
Out[12]:
In [13]:
# Per document: does its word-count dict contain the key 'limited'?
bow.dict_has_any_keys(['limited'])
Out[13]:
In [14]:
# First 20 word counts of the first document.
# NOTE: slicing .dict_values()[0][:20] like this relies on Python 2 list
# semantics; under Python 3, dict views would need list(...) first.
bow.dict_values()[0][:20]
Out[14]:
In [15]:
# Attach the bag-of-words dicts as a new column.
sf['bow'] = bow
In [16]:
type(sf['bow'])
Out[16]:
In [17]:
len(sf['bow'])
Out[17]:
In [18]:
# First five (word, count) pairs of document 0 (Python 2: .items() is a list).
sf['bow'][0].items()[:5]
Out[18]:
In [5]:
# Compute TF-IDF weights directly from the raw text column.
sf['tfidf'] = graphlab.text_analytics.tf_idf(sf['X1'])
In [7]:
# First five (word, tfidf) pairs of document 0.
sf['tfidf'][0].items()[:5]
Out[7]:
In [19]:
# Canvas visualization of the SFrame.
sf.show()
In [20]:
sf
Out[20]:
In [21]:
# Drop rare words: keep only entries whose per-document count is >= 2.
docs = sf['bow'].dict_trim_by_values(2)
In [23]:
# Remove English stopwords (exclude=True drops the listed keys).
docs = docs.dict_trim_by_keys(graphlab.text_analytics.stopwords(), exclude=True)
In [24]:
# Fit an LDA topic model on the trimmed bag-of-words with default settings.
m = graphlab.topic_model.create(docs)
In [25]:
m
Out[25]:
In [26]:
# Top words per topic with their scores.
m.get_topics()
Out[26]:
In [27]:
topics = m.get_topics().unstack(['word','score'], new_column_name='topic_words')['topic_words'].apply(lambda x: x.keys())
for topic in topics:
print topic
In [28]:
# Hard assignment: the most likely topic id for each document.
pred = m.predict(docs)
In [29]:
pred.show()
In [30]:
# Soft assignment: full topic-probability vector per document.
# NOTE(review): this rebinds `pred`, discarding the hard assignments above.
pred = m.predict(docs, output_type='probabilities')
In [31]:
# Learned vocabulary of the model.
m['vocabulary']
Out[31]:
In [32]:
# Word-by-topic score matrix.
m['topics']
Out[32]:
In [ ]:
def print_topics(m):
topics = m.get_topics(num_words=5)
topics = topics.unstack(['word','score'], new_column_name='topic_words')['topic_words']
topics = topics.apply(lambda x: x.keys())
for topic in topics:
print topic
print_topics(m)
In [ ]:
# Warm-start a 20-topic model from the topics learned by `m`.
m2 = graphlab.topic_model.create(docs,
num_topics=20,
initial_topics=m['topics'])
In [ ]:
# Seed word-to-topic associations: pin 'recognition' to topic 0.
associations = graphlab.SFrame()
associations['word'] = ['recognition']
associations['topic'] = [0]
In [ ]:
# NOTE(review): this rebinds `m2`, silently discarding the warm-started
# model created two cells above -- rename one of them or drop a cell.
m2 = graphlab.topic_model.create(docs,
num_topics=20,
num_iterations=50,
associations=associations,
verbose=False)
In [ ]:
# Top 10 words per topic of the association-constrained model.
m2.get_topics(num_words=10)
In [ ]:
print_topics(m2)
In [ ]: