In [4]:
from sframe import SFrame, aggregate # Note: sframe only supports Python 2.7+ and 3.4.

# Switch for persisting derived tables: when True, the `if save_munging:` cell
# further down writes entities, frequent_paths and path_counts to disk
# (binary SFrame directories plus CSV exports).
save_munging = True

In [20]:
# Load the triplets from the binary SFrame cache when it is available;
# otherwise fall back to parsing the raw tab-separated file (the recorded run
# parsed 256957055 lines in about 950 seconds, so the cache matters).
try:
    sf = SFrame.load_sframe('wiki-triplets-24G_sframe/')
except Exception:
    # Deliberately not a bare `except:` so that KeyboardInterrupt / SystemExit
    # propagate instead of silently kicking off the long TSV parse.
    # All three columns are read as strings; there is no header row.
    # NOTE(review): quote_char='\0' presumably disables quote handling so that
    # literal '"' characters in the text are kept as data - confirm.
    sf = SFrame.read_csv('wiki-triplets-24G.tsv', column_type_hints=[str, str, str],
                         delimiter='\t', header=False, quote_char='\0')


Read 543905 lines. Lines per second: 159452
Read 2158490 lines. Lines per second: 254775
Read 4262498 lines. Lines per second: 290401
Read 5813276 lines. Lines per second: 285590
Read 8003538 lines. Lines per second: 303614
Read 9664686 lines. Lines per second: 305934
Read 11850019 lines. Lines per second: 311951
Read 14016226 lines. Lines per second: 305002
Read 15582589 lines. Lines per second: 292772
Read 17135349 lines. Lines per second: 289599
Read 18699672 lines. Lines per second: 290100
Read 20301670 lines. Lines per second: 290644
Read 21912693 lines. Lines per second: 286081
Read 23540208 lines. Lines per second: 285350
Read 25145459 lines. Lines per second: 284484
Read 26722498 lines. Lines per second: 285465
Read 28336739 lines. Lines per second: 283897
Read 29947899 lines. Lines per second: 285353
Read 32114071 lines. Lines per second: 286868
Read 34259056 lines. Lines per second: 288758
Read 35889912 lines. Lines per second: 288288
Read 37515353 lines. Lines per second: 289103
Read 39124082 lines. Lines per second: 289865
Read 40699231 lines. Lines per second: 289972
Read 42810855 lines. Lines per second: 291117
Read 44391810 lines. Lines per second: 290810
Read 46563762 lines. Lines per second: 292836
Read 48678399 lines. Lines per second: 294190
Read 50268107 lines. Lines per second: 294830
Read 52410794 lines. Lines per second: 296577
Read 54670153 lines. Lines per second: 298323
Read 56287351 lines. Lines per second: 298805
Read 57931295 lines. Lines per second: 299049
Read 59571012 lines. Lines per second: 298029
Read 61693386 lines. Lines per second: 298794
Read 63300021 lines. Lines per second: 299155
Read 63845441 lines. Lines per second: 293854
Read 64931670 lines. Lines per second: 291469
Read 67081723 lines. Lines per second: 292691
Read 69150905 lines. Lines per second: 292575
Read 70797542 lines. Lines per second: 293341
Read 73089065 lines. Lines per second: 295026
Read 74806089 lines. Lines per second: 295487
Read 76539674 lines. Lines per second: 293923
Read 78133435 lines. Lines per second: 290687
Read 79191384 lines. Lines per second: 289302
Read 80806351 lines. Lines per second: 288402
Read 82404885 lines. Lines per second: 287262
Read 83960694 lines. Lines per second: 286082
Read 85570003 lines. Lines per second: 285520
Read 87182382 lines. Lines per second: 285643
Read 88763255 lines. Lines per second: 285920
Read 90925722 lines. Lines per second: 287283
Read 93040555 lines. Lines per second: 288139
Read 94651487 lines. Lines per second: 287503
Read 96837860 lines. Lines per second: 288281
Read 98462260 lines. Lines per second: 288329
Read 100215918 lines. Lines per second: 287195
Read 102000768 lines. Lines per second: 288049
Read 103785202 lines. Lines per second: 288586
Read 104974708 lines. Lines per second: 286601
Read 106759667 lines. Lines per second: 286927
Read 108313230 lines. Lines per second: 286748
Read 109957999 lines. Lines per second: 286577
Read 111953749 lines. Lines per second: 287128
Read 113521899 lines. Lines per second: 287388
Read 115096137 lines. Lines per second: 287725
Read 116706461 lines. Lines per second: 287697
Read 118291367 lines. Lines per second: 287936
Read 119851944 lines. Lines per second: 287639
Read 121392802 lines. Lines per second: 287548
Read 122931656 lines. Lines per second: 287359
Read 124468270 lines. Lines per second: 287584
Read 126056702 lines. Lines per second: 286798
Read 127643998 lines. Lines per second: 286543
Read 129238554 lines. Lines per second: 286450
Read 130884976 lines. Lines per second: 286956
Read 133004473 lines. Lines per second: 287352
Read 135102962 lines. Lines per second: 287653
Read 136649512 lines. Lines per second: 287806
Read 138686028 lines. Lines per second: 288339
Read 140774488 lines. Lines per second: 289060
Read 142383257 lines. Lines per second: 288584
Read 143456126 lines. Lines per second: 287566
Read 145011663 lines. Lines per second: 287034
Read 146602017 lines. Lines per second: 286941
Read 148221747 lines. Lines per second: 287095
Read 149801225 lines. Lines per second: 286903
Read 151372960 lines. Lines per second: 287016
Read 152968240 lines. Lines per second: 287008
Read 155123918 lines. Lines per second: 287638
Read 156800643 lines. Lines per second: 287993
Read 158587942 lines. Lines per second: 288156
Read 160303908 lines. Lines per second: 288649
Read 162449140 lines. Lines per second: 289275
Read 164038847 lines. Lines per second: 289318
Read 166162681 lines. Lines per second: 289341
Read 168247699 lines. Lines per second: 289956
Read 169812734 lines. Lines per second: 289745
Read 171423058 lines. Lines per second: 289870
Read 173085073 lines. Lines per second: 287318
Read 174685153 lines. Lines per second: 286823
Read 176744330 lines. Lines per second: 287075
Read 177784427 lines. Lines per second: 284012
Read 178875716 lines. Lines per second: 282208
Read 179969295 lines. Lines per second: 281408
Read 182078153 lines. Lines per second: 282124
Read 183655738 lines. Lines per second: 281672
Read 184712981 lines. Lines per second: 280754
Read 186304642 lines. Lines per second: 280850
Read 186830693 lines. Lines per second: 278686
Read 187887430 lines. Lines per second: 276869
Read 188959071 lines. Lines per second: 275845
Read 191117229 lines. Lines per second: 276555
Read 193242749 lines. Lines per second: 276952
Read 194895543 lines. Lines per second: 277236
Read 197118777 lines. Lines per second: 277937
Read 198787186 lines. Lines per second: 277717
Read 199850957 lines. Lines per second: 277192
Read 201475365 lines. Lines per second: 277301
Read 203625197 lines. Lines per second: 277701
Read 205235599 lines. Lines per second: 277966
Read 206840312 lines. Lines per second: 278192
Read 209027996 lines. Lines per second: 278678
Read 211166825 lines. Lines per second: 279483
Read 213322587 lines. Lines per second: 280109
Read 214864675 lines. Lines per second: 280099
Read 216989175 lines. Lines per second: 280307
Read 219142888 lines. Lines per second: 280829
Read 220767486 lines. Lines per second: 281111
Read 222894127 lines. Lines per second: 281528
Read 224963143 lines. Lines per second: 282134
Read 226545304 lines. Lines per second: 280053
Read 228131425 lines. Lines per second: 279884
Read 229205206 lines. Lines per second: 266708
Read 229741122 lines. Lines per second: 263060
Read 230807858 lines. Lines per second: 262399
Read 232381002 lines. Lines per second: 262692
Read 234531280 lines. Lines per second: 263537
Read 236160130 lines. Lines per second: 263237
Read 238285553 lines. Lines per second: 264032
Read 240441489 lines. Lines per second: 264757
Read 242511189 lines. Lines per second: 265554
Read 244612122 lines. Lines per second: 266323
Read 246747160 lines. Lines per second: 267011
Read 248352594 lines. Lines per second: 266952
Read 250475937 lines. Lines per second: 267733
Read 252616309 lines. Lines per second: 268509
Read 254755583 lines. Lines per second: 269182
Finished parsing file /home/alvas/git/hypopotamus/wiki-triplets-24G.tsv
Parsing completed. Parsed 256957055 lines in 950.849 secs.

In [ ]:
# Give the auto-named CSV columns (X1, X2, X3) meaningful names.
# A frame loaded from the 'wiki-triplets-24G_sframe/' cache was saved after this
# rename and therefore already has x / y / path; renaming a column that does
# not exist fails, so only rename the columns that are actually present.
# This keeps the cell safe under Restart & Run All and when re-run.
column_renames = {'X1': 'x', 'X2': 'y', 'X3': 'path'}
present_columns = set(sf.column_names())
pending_renames = dict((old, new) for old, new in column_renames.items()
                       if old in present_columns)
if pending_renames:
    sf.rename(pending_renames)

In [24]:
# Persist the triplets as a binary SFrame directory; the load cell above tries
# this same path first on the next run.
sf.save(filename='wiki-triplets-24G_sframe/')

In [28]:
# Collect every distinct entity string that occurs in either the x or y column.
# De-duplicating each column first keeps the appended array small; the final
# unique() is required because an entity that appears in both columns would
# otherwise be listed twice in what is later saved as the "unique" entity list.
entities = sf['x'].unique().append(sf['y'].unique()).unique()

In [29]:
# Peek at the first ten entity strings.
entities.head(n=10)


Out[29]:
dtype: str
Rows: 10
['annual turnover', 'parasitic genetic elements', 'retail tenants', "brad pitt 's plan b production company", 'labor luminaries', 'maisonneuve', 'two paintings', 'birefringence', '" the sun - herald', 'former blue thunder driver']

In [30]:
# One output row per distinct 'path' value, with a 'count' column holding the
# number of triplet rows that share that path.
path_counts = sf.groupby(
    'path',
    {'count': aggregate.COUNT()},
)

In [31]:
# Peek at the first ten (path, count) rows.
path_counts.head(n=10)


Out[31]:
path count
invisible/ADJ/amod<_X/NOU
N/ROOT>_swallow/VERB/ ...
1
X/NOUN/ROOT>_be/VERB/ROOT
_<establish/VERB/conj ...
2
X/PROPN/compound>_battey/
PROPN/nsubj>_win/VERB ...
2
X/NOUN/ROOT>_release/VERB
/ROOT_<notice/VERB/ad ...
2
X/NOUN/dep>_be/VERB/ROOT_
<part/NOUN/attr_<of/A ...
4
large/ADJ/amod<_X/NOUN/RO
OT>_grow/VERB/ROOT_<f ...
1
X/PROPN/compound>_order/P
ROPN/ROOT_<Y/PROPN/conj ...
8
X/NOUN/nsubj>_comprise/VE
RB/ROOT_<johnson/PROP ...
1
X/PROPN/ROOT>_create/VERB
/ROOT_<use/VERB/conj_ ...
1
other/ADJ/amod<_X/NOUN/co
nj>_bowl/PROPN/ROOT_< ...
1
[10 rows x 2 columns]


In [32]:
# Keep only the paths that occur at least 5 times.
occurs_at_least_5 = path_counts['count'] >= 5
frequent_paths = path_counts[occurs_at_least_5]

In [33]:
# Peek at the first ten frequent (path, count) rows.
frequent_paths.head(n=10)


Out[33]:
path count
X/PROPN/compound>_order/P
ROPN/ROOT_<Y/PROPN/conj ...
8
X/PROPN/ROOT>_be/VERB/ROO
T_<end/VERB/conj_<in/ ...
8
X/NOUN/ROOT>_have/VERB/RO
OT_<be/VERB/conj_<one ...
5
X/PROPN/ROOT>_rout/VERB/R
OOT_<onto/ADP/prep_<Y ...
8
X/PROPN/ROOT>_replace/VER
B/ROOT_<to/ADP/prep_< ...
6
X/NOUN/attr>_be/VERB/ROOT
_<clump/NOUN/conj_<of ...
8
X/NOUN/dobj>_start/VERB/R
OOT_<confrontation/NO ...
130
X/NOUN/pobj>_from/ADP/ROO
T_<presidency/NOUN/po ...
48
X/NOUN/attr>_be/VERB/ROOT
_<guest/NOUN/attr_<on ...
27
X/NOUN/ROOT>_remove/VERB/
ROOT_<from/ADP/prep_< ...
7
[10 rows x 2 columns]


In [34]:
# Row counts at each stage. The single-argument print(...) form produces the
# same output as a Python 2 print statement and is also valid on Python 3.
print(len(sf))              # No. of total paths (non-unique)
print(len(path_counts))     # No. of unique paths.
print(len(frequent_paths))  # No. of frequent paths (i.e. >=5)


256957055
10855414
2519717

In [35]:
if save_munging:
    # (object, destination, format) in the order they are written.
    # format=None is the save() default (binary for these directory paths);
    # 'csv' requests a text export.
    munging_outputs = [
        (entities, 'wiki-entities-unique-24GB_sframe/', None),
        (frequent_paths, 'wiki-freqpaths-unique-24GB_sframe/', None),
        (path_counts, 'wiki-paths-unique-24GB_sframe/', None),
        # Only the path strings (without counts) go into the frequent-paths CSV.
        (frequent_paths['path'], 'frequent-paths-24GB.csv', 'csv'),
        (path_counts, 'path-counts-24GB.csv', 'csv'),
        (entities, 'entities-unique-24GB.csv', 'csv'),
    ]
    for table, destination, file_format in munging_outputs:
        table.save(destination, format=file_format)

In [38]:


In [37]:


In [ ]: