In [1]:
import pandas as pd
import numpy as np
import random
from scipy import sparse
from sklearn import preprocessing
from sklearn import utils
from sklearn import linear_model
from sklearn.model_selection import cross_val_predict
import luigi

In [5]:
# Project root; the reader functions below load raw CSVs from path + "/data/raw/...".
path = "~/bimbo_kaggle_competition"

In [22]:
def readin_train(path, nrows=None):
    """Read the model-relevant columns of the raw training CSV.

    Loads the six identifier columns plus the target column from
    path + "/data/raw/train.csv", parsing each of them as int32.

    Args :
        path : project path (the file is read from path + "/data/raw/train.csv")
        nrows : number of rows to read in; None (default) reads the whole file

    Returns:
        dataset : pandas DataFrame holding the selected columns

    """
    # Keys must match the CSV header exactly. A dtype entry whose name is not
    # a loaded column is ignored without an error, so the former 'Cliente-ID'
    # typo left Cliente_ID at the parser's default integer width.
    # Entries for columns outside `columns` are harmless; they are kept so the
    # compact dtypes still apply if the column selection is widened later.
    dtypes = {'Semana': 'int32',
              'Agencia_ID': 'int32',
              'Canal_ID': 'int32',
              'Ruta_SAK': 'int32',
              'Cliente_ID': 'int32',
              'Producto_ID': 'int32',
              'Venta_hoy': 'float32',
              'Venta_uni_hoy': 'int32',
              'Dev_uni_proxima': 'int32',
              'Dev_proxima': 'float32',
              'Demanda_uni_equil': 'int32'}

    columns = ['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
               'Cliente_ID', 'Producto_ID', 'Demanda_uni_equil']

    train_dataset = pd.read_csv(path + "/data/raw/train.csv",
                                usecols=columns,
                                dtype=dtypes,
                                nrows=nrows)

    return train_dataset

In [23]:
def readin_test(path, nrows=None):
    """Read the identifier columns of the raw test CSV.

    Loads six columns from path + "/data/raw/test.csv", parsing each of
    them as int32.

    Args :
        path : project path (the file is read from path + "/data/raw/test.csv")
        nrows : number of rows to read in; None (default) reads the whole file

    Returns:
        dataset : pandas DataFrame holding the selected columns

    """
    columns = ['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
               'Cliente_ID', 'Producto_ID']

    # Only the loaded columns need a dtype. Building the map from `columns`
    # keeps the names in sync and avoids the former 'Cliente-ID' typo, which
    # left Cliente_ID at the parser's default integer width.
    dtypes = dict((name, 'int32') for name in columns)

    test_dataset = pd.read_csv(path + "/data/raw/test.csv",
                               usecols=columns,
                               dtype=dtypes,
                               nrows=nrows)
    return test_dataset

In [24]:
# Load the selected columns of the full raw training file into memory.
train_dataset = readin_train(path)

In [8]:
# Display (rows, columns) of the loaded training frame.
train_dataset.shape


Out[8]:
(74180464, 7)

In [25]:
# Keep a 100,000-row random sample of the training frame. random_state pins
# the draw so that re-running the notebook reproduces the same sample.
train_dataset = train_dataset.sample(100000, random_state=42)

In [26]:
# Persist the sample under the project's interim data folder. The location is
# derived from `path` so it follows the project root instead of repeating it.
train_dataset.to_csv(path + "/data/interim/train_sample.csv", index=False)

In [27]:
# Load the selected columns of the raw test file into memory.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, and
# the cells that follow carry lower execution counts, so they appear to have
# used a test_dataset from an earlier run. Re-run top to bottom to confirm.
test_dataset = readin_test(path)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-27-40dba066dd0e> in <module>()
----> 1 test_dataset = readin_test(path)

<ipython-input-23-749f138a7358> in readin_test(path)
     25     test_dataset = pd.read_csv(path + "/data/raw/test.csv",
     26                                 usecols =['Semana','Agencia_ID','Canal_ID','Ruta_SAK','Cliente_ID','Producto_ID'],
---> 27                                 dtype  = dtypes)
     28     return test_dataset

/Users/boyazhou/py_env/py27/lib/python2.7/site-packages/pandas/io/parsers.pyc in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    643                     skip_blank_lines=skip_blank_lines)
    644 
--> 645         return _read(filepath_or_buffer, kwds)
    646 
    647     parser_f.__name__ = name

/Users/boyazhou/py_env/py27/lib/python2.7/site-packages/pandas/io/parsers.pyc in _read(filepath_or_buffer, kwds)
    398         return parser
    399 
--> 400     data = parser.read()
    401     parser.close()
    402     return data

/Users/boyazhou/py_env/py27/lib/python2.7/site-packages/pandas/io/parsers.pyc in read(self, nrows)
    954             new_rows = len(index)
    955 
--> 956         df = DataFrame(col_dict, columns=columns, index=index)
    957 
    958         self._currow += new_rows

/Users/boyazhou/py_env/py27/lib/python2.7/site-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
    264                                  dtype=dtype, copy=copy)
    265         elif isinstance(data, dict):
--> 266             mgr = self._init_dict(data, index, columns, dtype=dtype)
    267         elif isinstance(data, ma.MaskedArray):
    268             import numpy.ma.mrecords as mrecords

/Users/boyazhou/py_env/py27/lib/python2.7/site-packages/pandas/core/frame.pyc in _init_dict(self, data, index, columns, dtype)
    400             arrays = [data[k] for k in keys]
    401 
--> 402         return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
    403 
    404     def _init_ndarray(self, values, index, columns, dtype=None, copy=False):

/Users/boyazhou/py_env/py27/lib/python2.7/site-packages/pandas/core/frame.pyc in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
   5387     axes = [_ensure_index(columns), _ensure_index(index)]
   5388 
-> 5389     return create_block_manager_from_arrays(arrays, arr_names, axes)
   5390 
   5391 

/Users/boyazhou/py_env/py27/lib/python2.7/site-packages/pandas/core/internals.pyc in create_block_manager_from_arrays(arrays, names, axes)
   4237 
   4238     try:
-> 4239         blocks = form_blocks(arrays, names, axes)
   4240         mgr = BlockManager(blocks, axes)
   4241         mgr._consolidate_inplace()

/Users/boyazhou/py_env/py27/lib/python2.7/site-packages/pandas/core/internals.pyc in form_blocks(arrays, names, axes)
   4314 
   4315     if len(int_items):
-> 4316         int_blocks = _multi_blockify(int_items)
   4317         blocks.extend(int_blocks)
   4318 

/Users/boyazhou/py_env/py27/lib/python2.7/site-packages/pandas/core/internals.pyc in _multi_blockify(tuples, dtype)
   4383     for dtype, tup_block in grouper:
   4384 
-> 4385         values, placement = _stack_arrays(list(tup_block), dtype)
   4386 
   4387         block = make_block(values, placement=placement)

/Users/boyazhou/py_env/py27/lib/python2.7/site-packages/pandas/core/internals.pyc in _stack_arrays(tuples, dtype)
   4428     stacked = np.empty(shape, dtype=dtype)
   4429     for i, arr in enumerate(arrays):
-> 4430         stacked[i] = _asarray_compat(arr)
   4431 
   4432     return stacked, placement

KeyboardInterrupt: 

In [15]:
# Keep a 100,000-row random sample of the test frame. random_state pins the
# draw so that re-running the notebook reproduces the same sample.
test_dataset = test_dataset.sample(100000, random_state=42)

In [16]:
# Display (rows, columns) of the sampled test frame.
test_dataset.shape


Out[16]:
(100000, 6)

In [17]:
# Persist the sample under the project's interim data folder. The location is
# derived from `path` so it follows the project root instead of repeating it.
test_dataset.to_csv(path + "/data/interim/test_sample.csv", index=False)

In [ ]: