In [4]:
from sframe import SFrame, aggregate # Note: sframe only supports Python 2.7+ and 3.4.
# Switch read by the final persistence cell (`if save_munging:`) before it
# writes the derived entity / path tables to disk.
save_munging = True
In [20]:
# Try the binary SFrame directory first; if that load fails for any ordinary
# reason, fall back to parsing the raw tab-separated dump.
try:
    sf = SFrame.load_sframe('wiki-triplets-24G_sframe/')
except Exception:
    # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
    # SystemExit still propagate rather than silently triggering the TSV parse.
    # Three string columns, no header row, tab-delimited.
    # NOTE(review): quote_char='\0' presumably disables quote handling so that
    # literal quote characters in the text are kept as data -- confirm.
    sf = SFrame.read_csv('wiki-triplets-24G.tsv', column_type_hints=[str, str, str],
                         delimiter='\t', header=False, quote_char='\0')
In [ ]:
# Give the auto-generated CSV column names (X1, X2, X3) meaningful names.
# The cached SFrame directory is written *after* this rename, so a frame that
# was loaded from the cache no longer has X1..X3; only rename the columns that
# are actually present so this cell is safe to re-run in either case.
default_to_named = {'X1': 'x', 'X2': 'y', 'X3': 'path'}
present_columns = set(sf.column_names())
pending_renames = {old: new for old, new in default_to_named.items()
                   if old in present_columns}
if pending_renames:
    # Called for its effect on `sf`, exactly as the original unconditional call was.
    sf.rename(pending_renames)
In [24]:
# Write the triplet SFrame to the same directory that the load cell tries first.
# NOTE(review): this runs unconditionally, i.e. also when `sf` was just loaded
# from this very directory -- confirm that re-saving is intended.
sf.save('wiki-triplets-24G_sframe/')
In [28]:
# Collect every distinct entity that occurs in either the 'x' or the 'y' column.
# Each column is de-duplicated first; the trailing .unique() then removes
# entities that appear in *both* columns, which a plain append would keep twice
# even though the result is later saved under "entities-unique" names.
entities = sf['x'].unique().append(sf['y'].unique()).unique()
In [29]:
# Preview the first few values of `entities`.
entities.head()
Out[29]:
In [30]:
# Group the triplets by their 'path' value and record, in a 'count' column,
# how many rows share each path.
path_counts = sf.groupby(
    key_columns='path',
    operations={'count': aggregate.COUNT()},
)
In [31]:
# Preview the first few rows of the per-path count table.
path_counts.head()
Out[31]:
In [32]:
# Minimum number of occurrences for a path to be kept as "frequent".
MIN_PATH_COUNT = 5

# Boolean mask over the count table, then the rows that pass it.
is_frequent = path_counts['count'] >= MIN_PATH_COUNT
frequent_paths = path_counts[is_frequent]
In [33]:
# Preview the first few rows of the filtered (count >= 5) path table.
frequent_paths.head()
Out[33]:
In [34]:
# Report the size of each table. Single-argument print(...) is valid both as a
# Python 2 print statement and as a Python 3 function call, and prints the same
# text in either, so the cell no longer fails to parse on Python 3.
print(len(sf))              # No. of total paths (non-unique)
print(len(path_counts))     # No. of unique paths.
print(len(frequent_paths))  # No. of frequent paths (i.e. >=5)
In [35]:
# Persist the derived tables, first in the library's default format and then
# as CSV exports, in the same order as before.
# NOTE(review): the export lost this cell's indentation; all six saves are
# assumed to be gated by `save_munging` -- confirm against the original notebook.
if save_munging:
    default_format_targets = [
        (entities, 'wiki-entities-unique-24GB_sframe/'),
        (frequent_paths, 'wiki-freqpaths-unique-24GB_sframe/'),
        (path_counts, 'wiki-paths-unique-24GB_sframe/'),
    ]
    for dataset, target_dir in default_format_targets:
        dataset.save(target_dir)

    csv_targets = [
        (frequent_paths['path'], 'frequent-paths-24GB.csv'),  # path column only
        (path_counts, 'path-counts-24GB.csv'),
        (entities, 'entities-unique-24GB.csv'),
    ]
    for dataset, csv_file in csv_targets:
        dataset.save(csv_file, format='csv')
In [38]:
In [37]:
In [ ]: