In [1]:
import numpy as np
import pandas
from sqlalchemy import create_engine # database connection
import datetime as dt
from IPython.display import display

In [2]:
display(pandas.read_csv('../data/data_train.txt', sep=' ', header=None, nrows=2).head())
display(pandas.read_csv('../data/data_train.txt', sep=' ', header=None, nrows=2).tail())


0 1 2 3 4 5 6 7 8 9 ... 890 891 892 893 894 895 896 897 898 899
0 1 0 0 1 0 0 0 1 0 0 ... 0 0 1 1 0 1 0 0 5 2
1 0 0 1 1 0 0 0 0 1 2 ... 0 1 0 0 0 2 0 1 0 0

2 rows × 900 columns

0 1 2 3 4 5 6 7 8 9 ... 890 891 892 893 894 895 896 897 898 899
0 1 0 0 1 0 0 0 1 0 0 ... 0 0 1 1 0 1 0 0 5 2
1 0 0 1 1 0 0 0 0 1 2 ... 0 1 0 0 0 2 0 1 0 0

2 rows × 900 columns


In [3]:
y = pandas.read_table('../data/label_train.txt', sep=' ', header=None, dtype='int')

y.head(2)


Out[3]:
0
0 161
1 163

In [4]:
## Size of the datafile:

!f=`readlink -f '../data/data_train.txt'`; du -h $f


1.7G	/home/vahid/Downloads/data/ml/data_train.txt

Initialize Database


In [5]:
disk_engine = create_engine('sqlite:///../data/datatrain.db')

In [7]:
import sys

start = dt.datetime.now()
chunksize = 10000
j = 0
index_start = 0

for df in pandas.read_table('../data/data_train.txt', chunksize=chunksize, iterator=True, sep=' ', header=None, dtype='int'):
    
    df.index += index_start
    index_end = df.index[-1] 
    
    j+=1
    sys.stderr.write('{} seconds: completed {} rows: index {} to {} \n'. \
        format((dt.datetime.now() - start).seconds, j*chunksize, index_start, index_end))

    action = 'append'
    if index_start == 0:
        action = 'replace'
        
    df.to_sql('data', disk_engine, if_exists=action)
    #y.iloc[index_start:index_end, 0].to_sql('label', disk_engine, if_exists=action)
    
    index_start = df.index[-1] + 1


1 seconds: completed 10000 rows: index 0 to 9999 
22 seconds: completed 20000 rows: index 10000 to 19999 
37 seconds: completed 30000 rows: index 20000 to 29999 
52 seconds: completed 40000 rows: index 30000 to 39999 
67 seconds: completed 50000 rows: index 40000 to 49999 
82 seconds: completed 60000 rows: index 50000 to 59999 
97 seconds: completed 70000 rows: index 60000 to 69999 
112 seconds: completed 80000 rows: index 70000 to 79999 
127 seconds: completed 90000 rows: index 80000 to 89999 
141 seconds: completed 100000 rows: index 90000 to 99999 
156 seconds: completed 110000 rows: index 100000 to 109999 
170 seconds: completed 120000 rows: index 110000 to 119999 
185 seconds: completed 130000 rows: index 120000 to 129999 
200 seconds: completed 140000 rows: index 130000 to 139999 
215 seconds: completed 150000 rows: index 140000 to 149999 
229 seconds: completed 160000 rows: index 150000 to 159999 
244 seconds: completed 170000 rows: index 160000 to 169999 
259 seconds: completed 180000 rows: index 170000 to 179999 
273 seconds: completed 190000 rows: index 180000 to 189999 
288 seconds: completed 200000 rows: index 190000 to 199999 
303 seconds: completed 210000 rows: index 200000 to 209999 
317 seconds: completed 220000 rows: index 210000 to 219999 
332 seconds: completed 230000 rows: index 220000 to 229999 
347 seconds: completed 240000 rows: index 230000 to 239999 
361 seconds: completed 250000 rows: index 240000 to 249999 
376 seconds: completed 260000 rows: index 250000 to 259999 
391 seconds: completed 270000 rows: index 260000 to 269999 
406 seconds: completed 280000 rows: index 270000 to 279999 
421 seconds: completed 290000 rows: index 280000 to 289999 
436 seconds: completed 300000 rows: index 290000 to 299999 
450 seconds: completed 310000 rows: index 300000 to 309999 
465 seconds: completed 320000 rows: index 310000 to 319999 
479 seconds: completed 330000 rows: index 320000 to 329999 
494 seconds: completed 340000 rows: index 330000 to 339999 
509 seconds: completed 350000 rows: index 340000 to 349999 
523 seconds: completed 360000 rows: index 350000 to 359999 
538 seconds: completed 370000 rows: index 360000 to 369999 
553 seconds: completed 380000 rows: index 370000 to 379999 
568 seconds: completed 390000 rows: index 380000 to 389999 
582 seconds: completed 400000 rows: index 390000 to 399999 
597 seconds: completed 410000 rows: index 400000 to 409999 
612 seconds: completed 420000 rows: index 410000 to 419999 
626 seconds: completed 430000 rows: index 420000 to 429999 
641 seconds: completed 440000 rows: index 430000 to 439999 
656 seconds: completed 450000 rows: index 440000 to 449999 
670 seconds: completed 460000 rows: index 450000 to 459999 
685 seconds: completed 470000 rows: index 460000 to 469999 
699 seconds: completed 480000 rows: index 470000 to 479999 
714 seconds: completed 490000 rows: index 480000 to 489999 
729 seconds: completed 500000 rows: index 490000 to 499999 
744 seconds: completed 510000 rows: index 500000 to 509999 
759 seconds: completed 520000 rows: index 510000 to 519999 
773 seconds: completed 530000 rows: index 520000 to 529999 
788 seconds: completed 540000 rows: index 530000 to 539999 
803 seconds: completed 550000 rows: index 540000 to 549999 
817 seconds: completed 560000 rows: index 550000 to 559999 
832 seconds: completed 570000 rows: index 560000 to 569999 
847 seconds: completed 580000 rows: index 570000 to 579999 
861 seconds: completed 590000 rows: index 580000 to 589999 
876 seconds: completed 600000 rows: index 590000 to 599999 
890 seconds: completed 610000 rows: index 600000 to 609999 
905 seconds: completed 620000 rows: index 610000 to 619999 
920 seconds: completed 630000 rows: index 620000 to 629999 
934 seconds: completed 640000 rows: index 630000 to 639999 
949 seconds: completed 650000 rows: index 640000 to 649999 
963 seconds: completed 660000 rows: index 650000 to 659999 
978 seconds: completed 670000 rows: index 660000 to 669999 
992 seconds: completed 680000 rows: index 670000 to 679999 
1007 seconds: completed 690000 rows: index 680000 to 689999 
1022 seconds: completed 700000 rows: index 690000 to 699999 
1036 seconds: completed 710000 rows: index 700000 to 709999 
1051 seconds: completed 720000 rows: index 710000 to 719999 
1065 seconds: completed 730000 rows: index 720000 to 729999 
1080 seconds: completed 740000 rows: index 730000 to 739999 
1095 seconds: completed 750000 rows: index 740000 to 749999 
1109 seconds: completed 760000 rows: index 750000 to 759999 
1124 seconds: completed 770000 rows: index 760000 to 769999 
1138 seconds: completed 780000 rows: index 770000 to 779999 
1153 seconds: completed 790000 rows: index 780000 to 789999 
1167 seconds: completed 800000 rows: index 790000 to 799999 
1182 seconds: completed 810000 rows: index 800000 to 809999 
1196 seconds: completed 820000 rows: index 810000 to 819999 
1210 seconds: completed 830000 rows: index 820000 to 829999 
1225 seconds: completed 840000 rows: index 830000 to 839999 
1239 seconds: completed 850000 rows: index 840000 to 849999 
1254 seconds: completed 860000 rows: index 850000 to 859999 
1268 seconds: completed 870000 rows: index 860000 to 869999 
1282 seconds: completed 880000 rows: index 870000 to 879999 
1297 seconds: completed 890000 rows: index 880000 to 889999 
1311 seconds: completed 900000 rows: index 890000 to 899999 
1325 seconds: completed 910000 rows: index 900000 to 909999 
1339 seconds: completed 920000 rows: index 910000 to 919999 
1354 seconds: completed 930000 rows: index 920000 to 929999 
1368 seconds: completed 940000 rows: index 930000 to 939999 
1383 seconds: completed 950000 rows: index 940000 to 949999 
1397 seconds: completed 960000 rows: index 950000 to 959999 
1411 seconds: completed 970000 rows: index 960000 to 969999 
1425 seconds: completed 980000 rows: index 970000 to 979999 
1439 seconds: completed 990000 rows: index 980000 to 989999 
1454 seconds: completed 1000000 rows: index 990000 to 999999 

In [8]:
y[0].to_sql('label', disk_engine, if_exists='replace')

In [ ]: