In [ ]:
import os
import time
import pickle
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, vstack
import const
%matplotlib inline
In [ ]:
# Read the complete categorical training file and report how long the read took.
# dtype='object' makes pandas keep every cell as text instead of inferring types.
# The names `a` and `b` are kept because later cells refer to them.
a = time.time()
b = pd.read_csv(os.path.join(c.BASE_PATH, 'train_categorical.csv'), dtype='object')
# Function-call form prints the same text on Python 2 and also parses on Python 3.
print(time.time() - a)
In [ ]:
# Strip the leading 'T' characters from every cell of `b`, one batch of columns
# at a time. The batch bounds are derived from the frame's actual width, so the
# loop no longer depends on the file having exactly 2141 columns.
batch_size = 500
n_columns = b.shape[1]
for s in range(0, n_columns, batch_size):
    e = min(s + batch_size, n_columns)
    print('{} {}'.format(s, e))
    # str.lstrip removes *all* leading 'T' characters of a value, not just one.
    b.iloc[:, s:e] = b.iloc[:, s:e].apply(lambda x: x.str.lstrip('T'), axis=0)
In [ ]:
# Persist the stripped frame. `b` was loaded from train_categorical.csv, so it is
# written under the matching train_* name; the previous test_* name would have
# stored training rows as the test set.
# NOTE(review): if the test file is meant to be converted here, change the input
# file name of the read cell instead.
b.to_csv(os.path.join(c.BASE_PATH, 'train_categorical_to_num.csv'), index=False)
In [ ]:
# Drop the name bound to the categorical DataFrame; none of the cells below use it.
del b
In [ ]:
class _ByteCounter(object):
    ''' Write-only file-like sink that counts the bytes it is given and stores nothing '''

    def __init__(self):
        self.count = 0

    def write(self, data):
        try:
            n_bytes = len(data)
        except TypeError:
            # Pickle protocol 5 may hand over buffer objects (for example
            # pickle.PickleBuffer) that support the buffer protocol but not len().
            n_bytes = memoryview(data).nbytes
        self.count += n_bytes
        return n_bytes


def memory_usage(var):
    ''' Returns estimate of memory usage (in MB) using the pickle trick.

    The object is pickled with the highest protocol and the size of the
    resulting byte stream is reported. The stream is only counted, never
    written to disk, so no temporary file can be left behind or clash with
    another run.

    Args:
        var: any picklable object.

    Returns:
        Size of the pickled representation in megabytes, as a float (small
        objects therefore no longer round down to 0 under integer division).

    Raises:
        Whatever pickle.dump raises for an object that cannot be pickled.
    '''
    sink = _ByteCounter()
    pickle.dump(var, sink, pickle.HIGHEST_PROTOCOL)
    return sink.count / (1024.0 * 1024.0)
def read_csv_as_sparse(csv_file, chuncksize=10000, verbose=False, replace_zeros=None):
    ''' Read a numeric csv file chunk by chunk and return it as a sparse matrix.

    Missing cells (NaN) are filled with 0 before each chunk is converted, so
    they end up as implicit zeros of the sparse matrix. A 'Response' column,
    when present, is not loaded.

    Args:
        csv_file: path of the csv file. Its first column is used as the row
            index of each chunk and is therefore not part of the matrix.
        chuncksize: number of rows parsed per chunk (spelling kept for
            existing callers).
        verbose: print a status line every 10 chunks and mention when the
            'Response' column is skipped.
        replace_zeros: whether literal 0 values are rewritten to 1e-5 before
            NaN is filled with 0, which keeps them distinguishable from
            missing cells. None (the default) keeps the historical behaviour
            in which this happened only when `verbose` was true; pass True or
            False to make the result independent of `verbose`.

    Returns:
        Tuple (csr, ids, columns_names_used):
        csr is the stacked scipy CSR matrix, ids is a DataFrame holding the
        first column of the file, and columns_names_used is the list of loaded
        column names. That list still starts with the name of the index
        column, so it has one more entry than csr has columns.
    '''
    print('Reading {} with size {} MB'.format(csv_file, os.path.getsize(csv_file)/1024/1024))

    # Parse a single row only to learn the column names.
    quick_scan = pd.read_csv(csv_file, nrows=1, dtype=np.float32)
    columns_names_used = list(quick_scan.columns)
    if 'Response' in columns_names_used:
        if verbose:
            print('Excluding Response column')
        columns_names_used.remove('Response')

    if replace_zeros is None:
        # Legacy default: only the verbose code path used to replace zeros.
        replace_zeros = verbose

    # index_col is the first column of usecols, not the first column in the csv file
    reader = pd.read_csv(csv_file,
                         chunksize=chuncksize,
                         dtype=np.float32,
                         index_col=0,
                         usecols=columns_names_used)
    ids = pd.read_csv(csv_file, usecols=[0])

    # Collect the chunks and stack them once at the end; re-stacking the
    # accumulated matrix for every chunk copies all earlier rows each time.
    blocks = []
    n_stored = 0
    for i, ch in enumerate(reader):
        if replace_zeros:
            ch = ch.replace(0, 1e-5)
        block = csr_matrix(ch.fillna(0))
        blocks.append(block)
        n_stored += block.getnnz()
        if verbose and not i % 10:
            print('Doing chunck: {} | status: {} elements'.format(i, n_stored))

    if blocks:
        csr = vstack(blocks, format='csr')
    else:
        # No data rows: return an empty matrix with the expected column count.
        csr = csr_matrix((0, max(len(columns_names_used) - 1, 0)), dtype=np.float32)

    print('Sparse matrix has {} elements and {} MB memory usage'.format(csr.getnnz(),
                                                                        memory_usage(csr)))
    return csr, ids, columns_names_used
def read_csv_as_sparse_bin(csv_file, chuncksize=10000, verbose=False):
    ''' Read a csv file chunk by chunk and return a sparse presence mask.

    Every cell that holds a value becomes True in the result and every
    missing cell (NaN) is left out, so the matrix records where data exists
    rather than the data itself. A 'Response' column, when present, is not
    loaded.

    Args:
        csv_file: path of the csv file. Its first column is used as the row
            index of each chunk and is therefore not part of the matrix.
        chuncksize: number of rows parsed per chunk (spelling kept for
            existing callers).
        verbose: print a status line for every chunk and mention when the
            'Response' column is skipped.

    Returns:
        Tuple (csr, ids, columns_names_used):
        csr is the stacked scipy CSR mask, ids is a DataFrame holding the
        first column of the file, and columns_names_used is the list of loaded
        column names. That list still starts with the name of the index
        column, so it has one more entry than csr has columns.
    '''
    print('Reading {} with size {} MB'.format(csv_file, os.path.getsize(csv_file)/1024/1024))

    # Parse a single row only to learn the column names.
    quick_scan = pd.read_csv(csv_file, nrows=1, dtype=np.float32)
    columns_names_used = list(quick_scan.columns)
    if 'Response' in columns_names_used:
        if verbose:
            print('Excluding Response column')
        columns_names_used.remove('Response')

    # index_col is the first column of usecols, not the first column in the csv file
    reader = pd.read_csv(csv_file,
                         chunksize=chuncksize,
                         dtype=np.float32,
                         index_col=0,
                         usecols=columns_names_used)
    ids = pd.read_csv(csv_file, usecols=[0])

    # Collect the chunks and stack them once at the end; re-stacking the
    # accumulated matrix for every chunk copies all earlier rows each time.
    blocks = []
    n_stored = 0
    for i, ch in enumerate(reader):
        block = csr_matrix(~ch.isnull())
        blocks.append(block)
        n_stored += block.getnnz()
        if verbose:
            print('Doing chunck: {} | status: {} elements'.format(i, n_stored))

    if blocks:
        csr = vstack(blocks, format='csr')
    else:
        # No data rows: return an empty mask with the expected column count.
        csr = csr_matrix((0, max(len(columns_names_used) - 1, 0)), dtype=bool)

    print('Sparse matrix has {} elements and {} MB memory usage'.format(csr.getnnz(),
                                                                        memory_usage(csr)))
    return csr, ids, columns_names_used
def read_last_column(csv_file):
    ''' Load the last column of a csv file, indexed by the file's first column.

    csv_file is joined onto c.BASE_PATH before it is opened.
    '''
    full_path = os.path.join(c.BASE_PATH, csv_file)
    # One parsed row is enough to find out how many columns the file has.
    n_columns = pd.read_csv(full_path, nrows=1).shape[1]
    last_position = n_columns - 1
    return pd.read_csv(full_path, usecols=[0, last_position], index_col=0)
def convert_data_file_to_pickle(filename, verbose=False):
    ''' Reads full csv file, converts to sparse and stores using pickle with metadata.

    Args:
        filename: base name (without '.csv') of a file inside c.BASE_PATH.
            The pickle is written next to it as filename + '.pkl'.
        verbose: forwarded to read_csv_as_sparse.

    The pickled dict holds the ids, y, sparse features and feature names under
    'data', plus creation metadata.
    '''
    # Time this call on its own instead of relying on a start time left behind
    # in a global by another cell.
    started = time.time()
    csr, ids, f_names = read_csv_as_sparse(os.path.join(c.BASE_PATH, filename + '.csv'),
                                           chuncksize=100000,
                                           verbose=verbose)
    print('Reading data as sparse took {}s'.format(time.time() - started))

    # read_last_column joins c.BASE_PATH itself, so it gets the bare file name;
    # passing an already joined path doubles the prefix when BASE_PATH is relative.
    # NOTE(review): y always comes from train_numeric.csv, also when `filename`
    # is one of the test_* files - confirm that this is intended.
    y = read_last_column('train_numeric.csv')

    output = {'data': {'ids': ids, 'y': y, 'features': csr, 'feature_names': f_names},
              'creation_date': datetime.datetime.now(),
              'created_by': 'joostgp',
              'script': 'convert_data_to_sparse'}

    save_path = os.path.join(c.BASE_PATH, filename + '.pkl')
    with open(save_path, 'wb') as f:
        pickle.dump(output, f, pickle.HIGHEST_PROTOCOL)
    print('Results stored in {}'.format(save_path))
In [ ]:
# Base names (without extension) of the csv files to convert: the three train
# files first, then the three test files.
data_files = ['{}_{}'.format(split, kind)
              for split in ('train', 'test')
              for kind in ('categorical_to_num', 'numeric', 'date')]
In [ ]:
#convert_data_file_to_pickle(data_files[1], verbose=True)
#convert_data_file_to_pickle(data_files[4], verbose=True)
In [ ]:
# Convert every listed csv file to its pickled sparse form, without progress output.
for data_file in data_files:
    convert_data_file_to_pickle(data_file, verbose=False)