In [2]:
import pandas as pd
import numpy as np
import tables
import frogress
# Record the library versions this notebook was run against, one per line.
for module in (pd, np, tables, frogress):
    print('{} {}'.format(module.__name__, module.__version__))


pandas 0.16.1
numpy 1.9.2
tables 3.2.0
frogress 0.9.1

In [9]:
# Peek at the first three lines (header + two rows) of the test-set TSV.
!head -n 3 data/testSearchStream.tsv


ID	SearchID	AdID	Position	ObjectType	HistCTR
1	1	10915336	1	3	0.004999
2	1	12258424	6	1	

In [22]:
# Peek at the first three lines (header + two rows) of the training-set TSV.
!head -n 3 data/trainSearchStream.tsv


SearchID	AdID	Position	ObjectType	HistCTR	IsClick
2	11441863	1	3	0.001804	0
2	22968355	7	3	0.004723	0

In [3]:
# Convert the training TSV into an appendable HDF5 table, reading it in
# chunks so the whole file never has to be held in memory at once.
filename = 'data/trainSearchStream.tsv'
n = 4000  # rows per chunk handed to store.append

# Count the data rows up front so the progress bar knows its total number of
# steps. Done in plain Python (binary mode, no decoding) rather than through a
# `wc` shell escape so the cell runs on a fresh kernel on any platform.
with open(filename, 'rb') as tsv:
    n_rows = max(sum(1 for _ in tsv) - 1, 0)  # minus the header line
n_chunks = (n_rows + n - 1) // n  # ceiling division: the last chunk may be short

# Swap only the extension, so a 'tsv' elsewhere in the path is left alone.
hdf_path = filename.rsplit('.', 1)[0] + '.hd5'

# mode='w' starts from an empty file, so re-running the cell never appends
# duplicate rows to an existing store.
with pd.HDFStore(hdf_path, mode='w') as store:
    chunks = pd.read_csv(filename, chunksize=n, sep='\t')
    for chunk in frogress.Bar(chunks, steps=n_chunks):
        store.append('df', chunk)


[##########] | Progress: 98090 | Time: 45min12s | ETA: --

In [4]:
# Re-open the converted store for inspection. Read-only mode is requested
# explicitly because HDFStore defaults to append mode ('a'), which would allow
# the file to be modified by accident.
s = pd.HDFStore('data/trainSearchStream.hd5', mode='r')
s


Out[4]:
<class 'pandas.io.pytables.HDFStore'>
File path: data/trainSearchStream.hd5
/df            frame_table  (typ->appendable,nrows->392356948,ncols->6,indexers->[index])

In [12]:
# Show the attributes stored on the 'df' node of the store
# (column names, table type, encoding, pandas version, ...).
s.get_storer('df').attrs


Out[12]:
/df._v_attrs (AttributeSet), 15 attributes:
   [CLASS := 'GROUP',
    TITLE := '',
    VERSION := '1.0',
    data_columns := [],
    encoding := 'UTF-8',
    index_cols := [(0, 'index')],
    info := {1: {'names': [None], 'type': 'Index'}, 'index': {}},
    levels := 1,
    metadata := [],
    nan_rep := 'nan',
    non_index_axes := [(1, ['SearchID', 'AdID', 'Position', 'ObjectType', 'HistCTR', 'IsClick'])],
    pandas_type := 'frame_table',
    pandas_version := '0.15.2',
    table_type := 'appendable_frame',
    values_cols := ['values_block_0', 'values_block_1']]

In [ ]: