In [1]:
# Load GraphLab Create and render its Canvas visualizations inline in the notebook.
import graphlab
graphlab.canvas.set_target("ipynb")

Using SFrames to work with text data


In [2]:
# Read the corpus (one document per line, no header row) into an SFrame;
# the single inferred column is named X1 (see parser log below).
# NOTE(review): hardcoded absolute local path — only works on this machine;
# consider a configurable data directory.
sf = graphlab.SFrame.read_csv("/Users/chengjun/bigdata/w15", header=False)


This non-commercial license of GraphLab Create is assigned to wangchengjun@nju.edu.cn and will expire on July 31, 2016. For commercial licensing options, visit https://dato.com/buy/.
2016-04-14 01:12:14,140 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.8.5 started. Logging: /tmp/graphlab_server_1460567529.log
Finished parsing file /Users/chengjun/bigdata/w15
Parsing completed. Parsed 100 lines in 0.546547 secs.
------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
Read 12278 lines. Lines per second: 12121.5
Finished parsing file /Users/chengjun/bigdata/w15
Parsing completed. Parsed 72269 lines in 2.23078 secs.

In [3]:
sf


Out[3]:
X1
aynrand born and educated
in russia rand migrated ...
asphalt in american
english asphalt or ...
actinopterygii the
actinopterygii consti ...
altaiclanguages these
language families share ...
argon the name argon is
derived from the greek ...
augustderleth a 1938
guggenheim fellow der ...
amateur amateurism can be
seen in both a negative ...
assemblyline an assembly
line is a manufacturing ...
astronomicalunit an
astronomical unit ...
abbess an abbess latin
abbatissa feminine form ...
[72269 rows x 1 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

Transformations


In [4]:
dir(sf['X1'])


Out[4]:
['_SArray__check_min_observations',
 '_SArray__construct_ctr',
 '__abs__',
 '__add__',
 '__and__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__div__',
 '__doc__',
 '__eq__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__get_content_identifier__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__has_size__',
 '__hash__',
 '__init__',
 '__is_materialized__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__materialize__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__proxy__',
 '__radd__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rmod__',
 '__rmul__',
 '__rpow__',
 '__rsub__',
 '__rtruediv__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__truediv__',
 '_count_ngrams',
 '_count_words',
 '_getitem_cache',
 '_save_as_text',
 'all',
 'any',
 'append',
 'apply',
 'argmax',
 'argmin',
 'astype',
 'clip',
 'clip_lower',
 'clip_upper',
 'contains',
 'cumulative_max',
 'cumulative_mean',
 'cumulative_min',
 'cumulative_std',
 'cumulative_sum',
 'cumulative_var',
 'date_range',
 'datetime_to_str',
 'dict_has_all_keys',
 'dict_has_any_keys',
 'dict_keys',
 'dict_trim_by_keys',
 'dict_trim_by_values',
 'dict_values',
 'dropna',
 'dtype',
 'fillna',
 'filter',
 'from_avro',
 'from_const',
 'from_sequence',
 'head',
 'item_length',
 'max',
 'mean',
 'min',
 'nnz',
 'num_missing',
 'pixel_array_to_image',
 'rolling_count',
 'rolling_max',
 'rolling_mean',
 'rolling_min',
 'rolling_stdv',
 'rolling_sum',
 'rolling_var',
 'sample',
 'save',
 'show',
 'size',
 'sketch_summary',
 'sort',
 'split_datetime',
 'std',
 'str_to_datetime',
 'subslice',
 'sum',
 'tail',
 'to_numpy',
 'topk_index',
 'unique',
 'unpack',
 'var',
 'vector_slice']

In [10]:
# Convert each document into a bag-of-words dict (word -> count).
# Use the public graphlab.text_analytics.count_words API instead of the
# private SArray._count_words helper; the result is the same SArray of dicts.
bow = graphlab.text_analytics.count_words(sf['X1'])

In [11]:
type(sf['X1'])


Out[11]:
graphlab.data_structures.sarray.SArray

In [12]:
type(bow)


Out[12]:
graphlab.data_structures.sarray.SArray

In [13]:
# For each document: 1 if its bag-of-words contains the word 'limited', else 0.
bow.dict_has_any_keys(['limited'])


Out[13]:
dtype: int
Rows: 72269
[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ... ]

In [14]:
# Word counts (dict values) of the first document; show the first 20.
bow.dict_values()[0][:20]


Out[14]:
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1]

In [15]:
# Attach the bag-of-words representation to the SFrame as a new column.
sf['bow'] = bow

In [16]:
type(sf['bow'])


Out[16]:
graphlab.data_structures.sarray.SArray

In [17]:
len(sf['bow'])


Out[17]:
72269

In [18]:
# First five (word, count) pairs of the first document's bag-of-words
# (Python 2: dict.items() returns a list, so slicing works directly).
sf['bow'][0].items()[:5]


Out[18]:
[('limited', 3),
 ('writings', 2),
 ('personally', 1),
 ('four', 1),
 ('controversial', 1)]

In [5]:
# Compute TF-IDF weights for every document and store them as a new column.
sf['tfidf'] = graphlab.text_analytics.tf_idf(sf['X1'])

In [7]:
# First five (word, tf-idf score) pairs of the first document.
sf['tfidf'][0].items()[:5]


Out[7]:
[('limited', 10.04705669672047),
 ('writings', 9.76010421134325),
 ('personally', 5.001941923280662),
 ('four', 2.1272386886969024),
 ('controversial', 4.375805453003677)]

In [19]:
sf.show()



In [20]:
sf


Out[20]:
X1 tfidf bow
aynrand born and educated
in russia rand migrated ...
{'limited':
10.04705669672047, ...
{'limited': 3,
'writings': 2, ...
asphalt in american
english asphalt or ...
{'all':
1.3891905239989626, ...
{'all': 1, 'accadian': 1,
'similarity': 1, ...
actinopterygii the
actinopterygii consti ...
{'andreolepis':
11.188150547181156, ...
{'andreolepis': 1, 'all':
1, 'evolutionary': 2, ...
altaiclanguages these
language families share ...
{'sergei':
20.031873121992916, ...
{'sergei': 3, 'all': 6,
'todays': 1, 'chinese': ...
argon the name argon is
derived from the greek ...
{'limited':
3.3490188989068232, ...
{'limited': 1,
'embolism': 1, ...
augustderleth a 1938
guggenheim fellow der ...
{'evelyn':
6.7937013925087175, ...
{'evelyn': 1,
'detective': 4, ...
amateur amateurism can be
seen in both a negative ...
{'since':
1.8775124538896095, ...
{'since': 1, 'subpar': 1,
'lack': 2, 'valuable' ...
assemblyline an assembly
line is a manufacturing ...
{'all':
4.167571571996888, ...
{'all': 3, 'concept': 6,
'consider': 1, 'chine ...
astronomicalunit an
astronomical unit ...
{'precise':
5.491057060675752, 'a ...
{'precise': 1, 'all': 2,
'chinese': 1, 'suns': 1, ...
abbess an abbess latin
abbatissa feminine form ...
{'kildares':
11.188150547181156, ...
{'kildares': 1, 'they':
4, 'founder': 1, ...
[72269 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

Text cleaning


In [21]:
# Drop rare words: keep only words occurring at least 2 times in a document.
docs = sf['bow'].dict_trim_by_values(2)

In [23]:
# Remove English stopwords (exclude=True drops the listed keys from each dict).
docs = docs.dict_trim_by_keys(graphlab.text_analytics.stopwords(), exclude=True)

Topic modeling


In [24]:
# Fit a topic model with default settings (10 topics, 10 iterations — see the
# model summary below); trained via collapsed Gibbs sampling per the log.
m = graphlab.topic_model.create(docs)


Learning a topic model
       Number of documents     72269
           Vocabulary size    171005
   Running collapsed Gibbs sampling
+-----------+---------------+----------------+-----------------+
| Iteration | Elapsed Time  | Tokens/Second  | Est. Perplexity |
+-----------+---------------+----------------+-----------------+
| 10        | 2.48s         | 8.92734e+06    | 0               |
+-----------+---------------+----------------+-----------------+

In [25]:
m


Out[25]:
Class                         : TopicModel

Schema
------
Vocabulary Size               : 171005

Settings
--------
Number of Topics              : 10
alpha                         : 5.0
beta                          : 0.1
Iterations                    : 10
Training time                 : 3.4936
Verbose                       : False

Accessible fields             : 
m['topics']                   : An SFrame containing the topics.
m['vocabulary']               : An SArray containing the words in the vocabulary.
Useful methods                : 
m.get_topics()                : Get the most probable words per topic.
m.predict(new_docs)           : Make predictions for new documents.

In [26]:
# Most probable words per topic (default yields 5 words x 10 topics = 50 rows).
m.get_topics()


Out[26]:
topic word score
0 series 0.018582602707
0 time 0.0160412461512
0 played 0.0142993990545
0 back 0.00951875933204
0 game 0.00839911774869
1 war 0.0176185833315
1 film 0.0159278169528
1 group 0.0140632734063
1 party 0.0103356107163
1 year 0.0102957274319
[50 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [27]:
topics = m.get_topics().unstack(['word','score'], new_column_name='topic_words')['topic_words'].apply(lambda x: x.keys())
for topic in topics:
    print topic


['series', 'game', 'time', 'back', 'played']
['club', 'city', 'work', 'season', 'league']
['party', 'group', 'war', 'film', 'year']
['years', 'university', 'law', 'year', 'time']
['town', 'age', 'system', 'south', 'church']
['school', 'de', 'river', 'family', 'century']
['world', 'national', 'including', 'number', 'team']
['states', 'city', 'state', 'population', 'government']
['album', 'band', 'song', 'music', 'released']
['land', 'game', 'army', 'local', 'area']

In [28]:
# Predict a topic assignment for each document.
pred = m.predict(docs)

In [29]:
pred.show()



In [30]:
# Per-document topic probability vectors instead of hard assignments.
pred = m.predict(docs, output_type='probabilities')

In [31]:
m['vocabulary']


Out[31]:
dtype: str
Rows: 171005
['duke', 'studies', 'journal', 'chris', 'research', 'matthew', 'crisis', 'financial', 'paul', '1987', 'reagan', 'traditional', 'rightwing', 'nominee', 'libertarianism', 'cato', 'chief', 'smith', 'line', 'south', 'nick', '1999', 'documentary', 'animated', 'shows', 'references', 'commentator', 'powerful', 'ethics', 'rush', 'neil', 'lives', 'cited', 'produced', 'night', 'originality', 'interest', '2007', 'individual', 'authors', 'admirer', 'married', 'club', 'library', 'essays', 'recent', '2009', 'burns', 'inspiration', 'artist', 'women', 'early', 'barbara', 'organized', 'gave', 'referred', 'company', 'personalist', 'criticism', 'john', 'reviewers', 'language', 'understanding', 'writes', 'fewer', 'attention', 'positive', 'masterful', 'review', 'times', 'critic', 'praise', 'theory', 'randian', 'importance', 'calling', 'nonfiction', 'academics', 'kant', 'philosophers', 'italian', 'remarked', 'wife', 'house', 'subject', 'scholarly', 'edward', 'system', 'influence', 'acknowledged', '100', 'branden', 'criticized', 'sacrificing', 'exist', 'selfinterest', 'rational', 'communism', 'journals', 'copies', ... ]

In [32]:
m['topics']


Out[32]:
topic_probabilities vocabulary
[1.6417032014e-07,
1.42440301489e-07, ...
duke
[1.6417032014e-07,
1.42440301489e-07, ...
studies
[1.6417032014e-07,
1.42440301489e-07, ...
journal
[1.6417032014e-07,
1.42440301489e-07, ...
chris
[1.6417032014e-07,
1.42440301489e-07, ...
research
[0.000305520965781,
1.42440301489e-07, ...
matthew
[3.44757672294e-06,
1.42440301489e-07, ...
crisis
[1.6417032014e-07,
4.41564934616e-06, ...
financial
[1.6417032014e-07,
1.42440301489e-07, ...
paul
[1.6417032014e-07,
0.00033772595483, ...
1987
[171005 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [ ]:
def print_topics(m):
    topics = m.get_topics(num_words=5)
    topics = topics.unstack(['word','score'], new_column_name='topic_words')['topic_words']
    topics = topics.apply(lambda x: x.keys())
    for topic in topics:
        print topic
print_topics(m)

Initializing from other models


In [ ]:
# Warm-start a 20-topic model from the previous model's topic-word table.
m2 = graphlab.topic_model.create(docs,
                                 num_topics=20,
                                 initial_topics=m['topics'])

Seeding the model with prior knowledge


In [ ]:
# Build a word->topic association table pinning 'recognition' to topic 0.
associations = graphlab.SFrame()
associations['word'] = ['recognition']
associations['topic'] = [0]

In [ ]:
# Train a fresh 20-topic model seeded with the word-topic associations above.
m2 = graphlab.topic_model.create(docs,
                                 num_topics=20,
                                 num_iterations=50,
                                 associations=associations, 
                                 verbose=False)

In [ ]:
m2.get_topics(num_words=10)

In [ ]:
print_topics(m2)

In [ ]: