In [1]:
# Load GraphLab Create and render its Canvas visualizations inline in the notebook.
import graphlab
graphlab.canvas.set_target("ipynb")

Using SFrames to work with text data


In [2]:
# Read the corpus (one document per line, no header row) into an SFrame;
# the single inferred column is named X1 (see parser log below).
# NOTE(review): hardcoded absolute local path — only works on this machine;
# consider a configurable data directory.
sf = graphlab.SFrame.read_csv("/Users/chengjun/bigdata/w15", header=False)


This non-commercial license of GraphLab Create is assigned to wangchengjun@nju.edu.cn and will expire on July 31, 2016. For commercial licensing options, visit https://dato.com/buy/.
2016-04-14 01:12:14,140 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.8.5 started. Logging: /tmp/graphlab_server_1460567529.log
Finished parsing file /Users/chengjun/bigdata/w15
Parsing completed. Parsed 100 lines in 0.546547 secs.
------------------------------------------------------
Inferred types from first line of file as 
column_type_hints=[str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------
Read 12278 lines. Lines per second: 12121.5
Finished parsing file /Users/chengjun/bigdata/w15
Parsing completed. Parsed 72269 lines in 2.23078 secs.

In [3]:
sf


Out[3]:
X1
aynrand born and educated
in russia rand migrated ...
asphalt in american
english asphalt or ...
actinopterygii the
actinopterygii consti ...
altaiclanguages these
language families share ...
argon the name argon is
derived from the greek ...
augustderleth a 1938
guggenheim fellow der ...
amateur amateurism can be
seen in both a negative ...
assemblyline an assembly
line is a manufacturing ...
astronomicalunit an
astronomical unit ...
abbess an abbess latin
abbatissa feminine form ...
[72269 rows x 1 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

Transformations


In [4]:
dir(sf['X1'])


Out[4]:
['_SArray__check_min_observations',
 '_SArray__construct_ctr',
 '__abs__',
 '__add__',
 '__and__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__div__',
 '__doc__',
 '__eq__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__get_content_identifier__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__has_size__',
 '__hash__',
 '__init__',
 '__is_materialized__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__materialize__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__proxy__',
 '__radd__',
 '__rdiv__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rmod__',
 '__rmul__',
 '__rpow__',
 '__rsub__',
 '__rtruediv__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__truediv__',
 '_count_ngrams',
 '_count_words',
 '_getitem_cache',
 '_save_as_text',
 'all',
 'any',
 'append',
 'apply',
 'argmax',
 'argmin',
 'astype',
 'clip',
 'clip_lower',
 'clip_upper',
 'contains',
 'cumulative_max',
 'cumulative_mean',
 'cumulative_min',
 'cumulative_std',
 'cumulative_sum',
 'cumulative_var',
 'date_range',
 'datetime_to_str',
 'dict_has_all_keys',
 'dict_has_any_keys',
 'dict_keys',
 'dict_trim_by_keys',
 'dict_trim_by_values',
 'dict_values',
 'dropna',
 'dtype',
 'fillna',
 'filter',
 'from_avro',
 'from_const',
 'from_sequence',
 'head',
 'item_length',
 'max',
 'mean',
 'min',
 'nnz',
 'num_missing',
 'pixel_array_to_image',
 'rolling_count',
 'rolling_max',
 'rolling_mean',
 'rolling_min',
 'rolling_stdv',
 'rolling_sum',
 'rolling_var',
 'sample',
 'save',
 'show',
 'size',
 'sketch_summary',
 'sort',
 'split_datetime',
 'std',
 'str_to_datetime',
 'subslice',
 'sum',
 'tail',
 'to_numpy',
 'topk_index',
 'unique',
 'unpack',
 'var',
 'vector_slice']

In [10]:
# Convert each document into a bag-of-words dict (word -> count).
# Use the public graphlab.text_analytics.count_words API instead of the
# private SArray._count_words helper; the result is the same SArray of dicts.
bow = graphlab.text_analytics.count_words(sf['X1'])

In [11]:
type(sf['X1'])


Out[11]:
graphlab.data_structures.sarray.SArray

In [12]:
type(bow)


Out[12]:
graphlab.data_structures.sarray.SArray

In [13]:
# For each document: 1 if its bag-of-words contains the word 'limited', else 0.
bow.dict_has_any_keys(['limited'])


Out[13]:
dtype: int
Rows: 72269
[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ... ]

In [14]:
# Word counts (dict values) of the first document; show the first 20.
bow.dict_values()[0][:20]


Out[14]:
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1]

In [15]:
# Attach the bag-of-words representation to the SFrame as a new column.
sf['bow'] = bow

In [16]:
type(sf['bow'])


Out[16]:
graphlab.data_structures.sarray.SArray

In [17]:
len(sf['bow'])


Out[17]:
72269

In [18]:
# First five (word, count) pairs of the first document's bag-of-words
# (Python 2: dict.items() returns a list, so slicing works directly).
sf['bow'][0].items()[:5]


Out[18]:
[('limited', 3),
 ('writings', 2),
 ('personally', 1),
 ('four', 1),
 ('controversial', 1)]

In [5]:
# Compute TF-IDF weights for every document and store them as a new column.
sf['tfidf'] = graphlab.text_analytics.tf_idf(sf['X1'])

In [7]:
# First five (word, tf-idf score) pairs of the first document.
sf['tfidf'][0].items()[:5]


Out[7]:
[('limited', 10.04705669672047),
 ('writings', 9.76010421134325),
 ('personally', 5.001941923280662),
 ('four', 2.1272386886969024),
 ('controversial', 4.375805453003677)]

In [19]:
sf.show()



In [20]:
sf


Out[20]:
X1 tfidf bow
aynrand born and educated
in russia rand migrated ...
{'limited':
10.04705669672047, ...
{'limited': 3,
'writings': 2, ...
asphalt in american
english asphalt or ...
{'all':
1.3891905239989626, ...
{'all': 1, 'accadian': 1,
'similarity': 1, ...
actinopterygii the
actinopterygii consti ...
{'andreolepis':
11.188150547181156, ...
{'andreolepis': 1, 'all':
1, 'evolutionary': 2, ...
altaiclanguages these
language families share ...
{'sergei':
20.031873121992916, ...
{'sergei': 3, 'all': 6,
'todays': 1, 'chinese': ...
argon the name argon is
derived from the greek ...
{'limited':
3.3490188989068232, ...
{'limited': 1,
'embolism': 1, ...
augustderleth a 1938
guggenheim fellow der ...
{'evelyn':
6.7937013925087175, ...
{'evelyn': 1,
'detective': 4, ...
amateur amateurism can be
seen in both a negative ...
{'since':
1.8775124538896095, ...
{'since': 1, 'subpar': 1,
'lack': 2, 'valuable' ...
assemblyline an assembly
line is a manufacturing ...
{'all':
4.167571571996888, ...
{'all': 3, 'concept': 6,
'consider': 1, 'chine ...
astronomicalunit an
astronomical unit ...
{'precise':
5.491057060675752, 'a ...
{'precise': 1, 'all': 2,
'chinese': 1, 'suns': 1, ...
abbess an abbess latin
abbatissa feminine form ...
{'kildares':
11.188150547181156, ...
{'kildares': 1, 'they':
4, 'founder': 1, ...
[72269 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

Text cleaning


In [21]:
# Drop rare words: keep only words occurring at least 2 times in a document.
docs = sf['bow'].dict_trim_by_values(2)

In [23]:
# Remove English stopwords (exclude=True drops the listed keys from each dict).
docs = docs.dict_trim_by_keys(graphlab.text_analytics.stopwords(), exclude=True)

Topic modeling


In [24]:
# Fit a topic model with default settings (10 topics, 10 iterations — see the
# model summary below); trained via collapsed Gibbs sampling per the log.
m = graphlab.topic_model.create(docs)


Learning a topic model
       Number of documents     72269
           Vocabulary size    171005
   Running collapsed Gibbs sampling
+-----------+---------------+----------------+-----------------+
| Iteration | Elapsed Time  | Tokens/Second  | Est. Perplexity |
+-----------+---------------+----------------+-----------------+
| 10        | 2.48s         | 8.92734e+06    | 0               |
+-----------+---------------+----------------+-----------------+

In [25]:
m


Out[25]:
Class                         : TopicModel

Schema
------
Vocabulary Size               : 171005

Settings
--------
Number of Topics              : 10
alpha                         : 5.0
beta                          : 0.1
Iterations                    : 10
Training time                 : 3.4936
Verbose                       : False

Accessible fields             : 
m['topics']                   : An SFrame containing the topics.
m['vocabulary']               : An SArray containing the words in the vocabulary.
Useful methods                : 
m.get_topics()                : Get the most probable words per topic.
m.predict(new_docs)           : Make predictions for new documents.

In [26]:
# Most probable words per topic (default yields 5 words x 10 topics = 50 rows).
m.get_topics()


Out[26]:
topic word score
0 series 0.018582602707
0 time 0.0160412461512
0 played 0.0142993990545
0 back 0.00951875933204
0 game 0.00839911774869
1 war 0.0176185833315
1 film 0.0159278169528
1 group 0.0140632734063
1 party 0.0103356107163
1 year 0.0102957274319
[50 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [27]:
topics = m.get_topics().unstack(['word','score'], new_column_name='topic_words')['topic_words'].apply(lambda x: x.keys())
for topic in topics:
    print topic


['series', 'game', 'time', 'back', 'played']
['club', 'city', 'work', 'season', 'league']
['party', 'group', 'war', 'film', 'year']
['years', 'university', 'law', 'year', 'time']
['town', 'age', 'system', 'south', 'church']
['school', 'de', 'river', 'family', 'century']
['world', 'national', 'including', 'number', 'team']
['states', 'city', 'state', 'population', 'government']
['album', 'band', 'song', 'music', 'released']
['land', 'game', 'army', 'local', 'area']

In [28]:
# Predict a topic assignment for each document.
pred = m.predict(docs)

In [29]:
pred.show()



In [30]:
# Per-document topic probability vectors instead of hard assignments.
pred = m.predict(docs, output_type='probabilities')

In [31]:
m['vocabulary']


Out[31]:
dtype: str
Rows: 171005
['duke', 'studies', 'journal', 'chris', 'research', 'matthew', 'crisis', 'financial', 'paul', '1987', 'reagan', 'traditional', 'rightwing', 'nominee', 'libertarianism', 'cato', 'chief', 'smith', 'line', 'south', 'nick', '1999', 'documentary', 'animated', 'shows', 'references', 'commentator', 'powerful', 'ethics', 'rush', 'neil', 'lives', 'cited', 'produced', 'night', 'originality', 'interest', '2007', 'individual', 'authors', 'admirer', 'married', 'club', 'library', 'essays', 'recent', '2009', 'burns', 'inspiration', 'artist', 'women', 'early', 'barbara', 'organized', 'gave', 'referred', 'company', 'personalist', 'criticism', 'john', 'reviewers', 'language', 'understanding', 'writes', 'fewer', 'attention', 'positive', 'masterful', 'review', 'times', 'critic', 'praise', 'theory', 'randian', 'importance', 'calling', 'nonfiction', 'academics', 'kant', 'philosophers', 'italian', 'remarked', 'wife', 'house', 'subject', 'scholarly', 'edward', 'system', 'influence', 'acknowledged', '100', 'branden', 'criticized', 'sacrificing', 'exist', 'selfinterest', 'rational', 'communism', 'journals', 'copies', ... ]

In [32]:
m['topics']


Out[32]:
topic_probabilities vocabulary
[1.6417032014e-07,
1.42440301489e-07, ...
duke
[1.6417032014e-07,
1.42440301489e-07, ...
studies
[1.6417032014e-07,
1.42440301489e-07, ...
journal
[1.6417032014e-07,
1.42440301489e-07, ...
chris
[1.6417032014e-07,
1.42440301489e-07, ...
research
[0.000305520965781,
1.42440301489e-07, ...
matthew
[3.44757672294e-06,
1.42440301489e-07, ...
crisis
[1.6417032014e-07,
4.41564934616e-06, ...
financial
[1.6417032014e-07,
1.42440301489e-07, ...
paul
[1.6417032014e-07,
0.00033772595483, ...
1987
[171005 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


In [ ]:
def print_topics(m):
    topics = m.get_topics(num_words=5)
    topics = topics.unstack(['word','score'], new_column_name='topic_words')['topic_words']
    topics = topics.apply(lambda x: x.keys())
    for topic in topics:
        print topic
print_topics(m)

Initializing from other models


In [ ]:
# Warm-start a 20-topic model from the previous model's topic-word table.
m2 = graphlab.topic_model.create(docs,
                                 num_topics=20,
                                 initial_topics=m['topics'])

Seeding the model with prior knowledge


In [ ]:
# Build a word->topic association table pinning 'recognition' to topic 0.
associations = graphlab.SFrame()
associations['word'] = ['recognition']
associations['topic'] = [0]

In [ ]:
# Train a fresh 20-topic model seeded with the word-topic associations above.
m2 = graphlab.topic_model.create(docs,
                                 num_topics=20,
                                 num_iterations=50,
                                 associations=associations, 
                                 verbose=False)

In [ ]:
m2.get_topics(num_words=10)

In [ ]:
print_topics(m2)

In [ ]: