notebook.community

Edit and run



In [2]:

    
import pandas as pd
import numpy as np
import tables
import frogress
# Report the version of every library this notebook depends on, one per line,
# so the environment that produced the outputs below is recorded.
for label, module in (('pandas', pd), ('numpy', np),
                      ('tables', tables), ('frogress', frogress)):
    print('{} {}'.format(label, module.__version__))









    



pandas 0.16.1
numpy 1.9.2
tables 3.2.0
frogress 0.9.1



In [9]:

    
# Shell escape: print the first three lines (header row plus two records)
# of the test search-stream TSV to inspect its columns.
!head -n 3 data/testSearchStream.tsv









    



ID	SearchID	AdID	Position	ObjectType	HistCTR
1	1	10915336	1	3	0.004999
2	1	12258424	6	1



In [22]:

    
# Shell escape: print the first three lines (header row plus two records)
# of the training search-stream TSV to inspect its columns.
!head -n 3 data/trainSearchStream.tsv









    



SearchID	AdID	Position	ObjectType	HistCTR	IsClick
2	11441863	1	3	0.001804	0
2	22968355	7	3	0.004723	0



In [3]:

    
# Convert the training TSV into an appendable HDF5 table, streaming it in
# fixed-size chunks so the whole file never has to fit in memory.
filename = 'data/trainSearchStream.tsv'
n = 4000  # rows per chunk read from the TSV and appended to the store

# Swap only the file extension; str.replace('tsv', 'hd5') would also rewrite
# any other "tsv" occurring earlier in the path.
hdf_filename = filename.rsplit('.', 1)[0] + '.hd5'

# Count the data rows up front so the progress bar knows its total.
# (`! wc -l filename` cannot be embedded in a Python expression: the shell
# escape is only valid at the start of a line or as an assignment's
# right-hand side, and its result is a list of strings, not an int.)
with open(filename, 'rb') as tsv:
    n_rows = max(sum(1 for _ in tsv) - 1, 0)  # minus the header line
n_steps = -(-n_rows // n)  # ceiling division: the last chunk may be partial

# mode='w' starts from an empty store, so re-running the cell does not
# append the same rows a second time.
with pd.HDFStore(hdf_filename, mode='w') as store:
    chunks = pd.read_csv(filename, chunksize=n, sep='\t')
    for chunk in frogress.Bar(chunks, steps=n_steps):
        store.append('df', chunk)









    



[##########] | Progress: 98090 | Time: 45min12s | ETA: --



In [4]:

    
# Re-open the converted store for inspection only. mode='r' (instead of the
# default append mode) guarantees the following cells cannot modify the
# file by accident. Call s.close() once done with the store.
s = pd.HDFStore('data/trainSearchStream.hd5', mode='r')
s









    Out[4]:





<class 'pandas.io.pytables.HDFStore'>
File path: data/trainSearchStream.hd5
/df            frame_table  (typ->appendable,nrows->392356948,ncols->6,indexers->[index])



In [12]:

    
# Look up the storer object behind the 'df' key and display the attributes
# recorded on it.
df_storer = s.get_storer('df')
df_storer.attrs









    Out[12]:





/df._v_attrs (AttributeSet), 15 attributes:
   [CLASS := 'GROUP',
    TITLE := '',
    VERSION := '1.0',
    data_columns := [],
    encoding := 'UTF-8',
    index_cols := [(0, 'index')],
    info := {1: {'names': [None], 'type': 'Index'}, 'index': {}},
    levels := 1,
    metadata := [],
    nan_rep := 'nan',
    non_index_axes := [(1, ['SearchID', 'AdID', 'Position', 'ObjectType', 'HistCTR', 'IsClick'])],
    pandas_type := 'frame_table',
    pandas_version := '0.15.2',
    table_type := 'appendable_frame',
    values_cols := ['values_block_0', 'values_block_1']]



In [ ]: