Randomly sample lines from a data file


In [1]:
import numpy as np

In [7]:
# Seed NumPy's global legacy RNG so the shuffle below picks the same sample on every run.
np.random.seed(seed=50)

In [8]:
# Candidate line indices 0 .. 40428966 (range() starts at 0 by default).
# NOTE(review): 40428967 is presumably the number of lines in the input files - confirm.
indexlist = range(40428967)

In [9]:
# np.random.shuffle permutes a mutable sequence in place. On Python 3,
# range() returns an immutable range object and the shuffle would raise
# TypeError, so materialise the indices as a list first. On Python 2
# indexlist is already a list and is shuffled as before.
if not isinstance(indexlist, list):
    indexlist = list(indexlist)
np.random.shuffle(indexlist)

In [10]:
# Keep the first 20214483 entries of the shuffled index list
# (20214483 == 40428967 // 2) in a set, so the per-line membership
# checks in the copy loops are hash lookups rather than list scans.
indexset = set()
indexset.update(indexlist[:20214483])

In [11]:
%%time
# Copy every line of train.txt whose zero-based line number is in indexset
# into train1.txt.
# Both files are opened in binary mode so lines are copied byte-for-byte and
# the data read matches what the 'wb' handle accepts on Python 3 as well.
# The input is opened first and managed by the `with`, so it is always closed
# and a missing input file does not leave an empty train1.txt behind.
with open("train.txt", "rb") as infile, open('train1.txt', 'wb') as outfile:
    for line_no, line in enumerate(infile):
        if line_no in indexset:
            outfile.write(line)


CPU times: user 33 s, sys: 10.9 s, total: 43.9 s
Wall time: 3min 40s

In [25]:
%%time
# Copy every line of train.fm whose zero-based line number is in indexset
# into train1.fm (the same index set that is applied to train.txt).
# Both files are opened in binary mode so lines are copied byte-for-byte and
# the data read matches what the 'wb' handle accepts on Python 3 as well.
# The input is opened first and managed by the `with`, so it is always closed
# and a missing input file does not leave an empty train1.fm behind.
with open("train.fm", "rb") as infile, open('train1.fm', 'wb') as outfile:
    for line_no, line in enumerate(infile):
        if line_no in indexset:
            outfile.write(line)


CPU times: user 27.6 s, sys: 5.2 s, total: 32.8 s
Wall time: 2min 14s

In [ ]: