In [ ]:
import os
import time
import pickle
import datetime


import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix, vstack

import const as c

%matplotlib inline

Convert categorical data to numbers

The categorical values are strings with a leading `T`; stripping that prefix lets them be stored as plain numbers (saves approx. 25% memory usage).
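
For illustration, a minimal sketch of the conversion on a toy Series (the `T123` value format is assumed here, consistent with the `lstrip('T')` used below):

In [ ]:
# Illustrative example: stripping the 'T' prefix turns the strings into parseable numbers
demo = pd.Series(['T123', 'T4', None], dtype='object')
demo.str.lstrip('T')  # -> '123', '4', NaN; these can later be read with a numeric dtype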


In [ ]:
# Read all categorical data as strings
a = time.time()
b = pd.read_csv(os.path.join(c.BASE_PATH, 'train_categorical.csv'), dtype='object')
print('Reading took {:.1f}s'.format(time.time() - a))

In [ ]:
# Strip the 'T' prefix in batches of 500 columns to limit peak memory usage
i_s = [0, 500, 1000, 1500, 2000]
i_e = [500, 1000, 1500, 2000, 2141]

for s, e in zip(i_s, i_e):
    print(s, e)
    b.iloc[:, s:e] = b.iloc[:, s:e].apply(lambda x: x.str.lstrip('T'), axis=0)
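
As a quick sanity check (illustrative; assumes column 0 is the Id column), the stripped values should now parse as numbers:

In [ ]:
# Sanity check (illustrative): every non-missing stripped value should be numeric
col = b.columns[1]
assert pd.to_numeric(b[col], errors='coerce').notnull().sum() == b[col].notnull().sum()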

In [ ]:
# Note: rerun this section with 'test_categorical.csv' to produce the test file
b.to_csv(os.path.join(c.BASE_PATH, 'train_categorical_to_num.csv'), index=False)

In [ ]:
del b

Convert to sparse and check memory


In [ ]:
def memory_usage(var):
    ''' Returns estimate of memory usage using pickle trick '''
    tmp_file_path = os.path.join(c.BASE_PATH, 'tmp_dump_file.pkl')
    
    with open(tmp_file_path,'wb') as f:
        pickle.dump(var, f, pickle.HIGHEST_PROTOCOL)
        
    file_size = os.path.getsize(tmp_file_path)/1024/1024
    os.remove(tmp_file_path)
    
    return file_size


def read_csv_as_sparse(csv_file, chunksize=10000, verbose=False):
    ''' Read csv in chunks and return a CSR sparse matrix of its values '''
    
    print('Reading {} with size {} MB'.format(csv_file, os.path.getsize(csv_file)/1024/1024))
    
    # Check which columns to read
    quick_scan = pd.read_csv(csv_file,
                             nrows=1,
                             dtype=np.float32)
    
    columns_names_used = list(quick_scan.columns)
    
    if 'Response' in quick_scan.columns:
        if verbose:
            print('Excluding Response column')
        columns_names_used.remove('Response')
    
    # index_col is the first column of usecols, not the first column in the csv file
    reader = pd.read_csv(csv_file,
                         chunksize=chunksize,
                         dtype=np.float32,
                         index_col=0,
                         usecols=columns_names_used)
    
    ids = pd.read_csv(csv_file, usecols=[0])
    
    if verbose:
        for i, ch in enumerate(reader):
            # Replace exact zeros with a tiny value so they survive as explicit
            # entries in the sparse matrix (NaN becomes the implicit zero)
            ch = ch.replace(0, 1e-5)
            
            if i == 0:
                csr = csr_matrix(ch.fillna(0))
            else:
                csr = vstack([csr, csr_matrix(ch.fillna(0))], format='csr')

            if not i % 10:
                print('Doing chunk: {} | status: {} elements'.format(i, csr.getnnz()))
    else:
        csr = vstack([csr_matrix(ch.replace(0, 1e-5).fillna(0)) for ch in reader], format='csr')
    
    print('Sparse matrix has {} elements and {} MB memory usage'.format(csr.getnnz(),
                                                                        memory_usage(csr)))
    
    return csr, ids, columns_names_used

def read_csv_as_sparse_bin(csv_file, chunksize=10000, verbose=False):
    ''' Read csv in chunks and return a binary CSR sparse matrix marking non-missing entries '''
    
    print('Reading {} with size {} MB'.format(csv_file, os.path.getsize(csv_file)/1024/1024))
    
    # Check which columns to read
    quick_scan = pd.read_csv(csv_file,
                             nrows=1,
                             dtype=np.float32)
    
    columns_names_used = list(quick_scan.columns)
    
    if 'Response' in quick_scan.columns:
        if verbose:
            print('Excluding Response column')
        columns_names_used.remove('Response')
    
    # index_col is the first column of usecols, not the first column in the csv file
    reader = pd.read_csv(csv_file,
                         chunksize=chunksize,
                         dtype=np.float32,
                         index_col=0,
                         usecols=columns_names_used)
    
    ids = pd.read_csv(csv_file, usecols=[0])
    
    if verbose:
        for i, ch in enumerate(reader):
            if i == 0:
                csr = csr_matrix(~ch.isnull())
            else:
                csr = vstack([csr, csr_matrix(~ch.isnull())], format='csr')

            if not i % 10:
                print('Doing chunk: {} | status: {} elements'.format(i, csr.getnnz()))
    else:
        csr = vstack([csr_matrix(~ch.isnull()) for ch in reader], format='csr')
    
    print('Sparse matrix has {} elements and {} MB memory usage'.format(csr.getnnz(),
                                                                        memory_usage(csr)))
    
    return csr, ids, columns_names_used

def read_last_column(csv_file):
    ''' Reads last column in csv file'''
    sample = pd.read_csv(os.path.join(c.BASE_PATH,csv_file), nrows=1)
    
    return pd.read_csv(os.path.join(c.BASE_PATH,csv_file), usecols=[0,sample.shape[1]-1], index_col=0)

def convert_data_file_to_pickle(filename, verbose=False):
    ''' Reads full csv file, converts to sparse and stores using pickle with metadata'''
    
    a = time.time()
    
    csr, ids, f_names = read_csv_as_sparse(os.path.join(c.BASE_PATH, filename + '.csv'),
                                           chunksize=100000,
                                           verbose=verbose)
    
    print('Reading data as sparse took {}s'.format(time.time() - a))
    
    # The Response column only exists for the training data
    if filename.startswith('train'):
        y = read_last_column('train_numeric.csv')
    else:
        y = None
    
    output = {'data': {'ids': ids, 'y': y, 'features': csr, 'feature_names': f_names},
              'creation_date': datetime.datetime.now(),
              'created_by': 'joostgp',
              'script': 'convert_data_to_sparse'}
    
    save_path = os.path.join(c.BASE_PATH, filename + '.pkl')
    
    with open(save_path, 'wb') as f:
        pickle.dump(output, f, pickle.HIGHEST_PROTOCOL)
        
    print('Results stored in {}'.format(save_path))
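
To verify the pickle-trick estimator behaves sensibly, it can be exercised on a toy array first (a sketch; the sizes are illustrative):

In [ ]:
# Sketch: compare the pickled size of a dense array vs. its sparse counterpart
dense = np.zeros((1000, 1000), dtype=np.float32)
dense[::100, ::100] = 1.0
print(memory_usage(dense))              # ~4 MB for the dense float32 array
print(memory_usage(csr_matrix(dense)))  # far smaller, since only 100 entries are non-zero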

In [ ]:
data_files = ['train_categorical_to_num',
              'train_numeric',
              'train_date',
              'test_categorical_to_num',
              'test_numeric',
              'test_date']

In [ ]:
#convert_data_file_to_pickle(data_files[1], verbose=True)
#convert_data_file_to_pickle(data_files[4], verbose=True)

In [ ]:
for f in data_files:
    convert_data_file_to_pickle(f, verbose=False)
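
Downstream scripts can load the stored pickles back as follows (a sketch that mirrors the dictionary layout written above; 'train_numeric.pkl' is used as an example):

In [ ]:
# Sketch: load one stored pickle and unpack the sparse features
with open(os.path.join(c.BASE_PATH, 'train_numeric.pkl'), 'rb') as f:
    stored = pickle.load(f)

X = stored['data']['features']   # scipy CSR matrix
ids = stored['data']['ids']      # Id column
y = stored['data']['y']          # Response labels (None for test files)
print(X.shape, stored['creation_date'])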