In [3]:
import numpy as np
import pandas as pd
import math

from sklearn.cross_validation import cross_val_score
from subprocess import check_output

from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import normalize
import xgboost as xgb

In [4]:
dtypes = {'Semana' : 'int32',
                              'Agencia_ID' :'int32',
                              'Canal_ID' : 'int32',
                              'Ruta_SAK' : 'int32',
                              'Cliente-ID' : 'int32',
                              'Producto_ID':'int32',
                              'Venta_hoy':'float32',
                              'Venta_uni_hoy': 'int32',
                              'Dev_uni_proxima':'int32',
                              'Dev_proxima':'float32',
                              'Demanda_uni_equil':'int32'}

train = pd.read_csv('train.csv', dtype  = dtypes, usecols=["Semana", "Agencia_ID", "Canal_ID", 'Ruta_SAK',
                                                             'Cliente_ID', 'Producto_ID','Demanda_uni_equil'])
train = train.loc[train['Demanda_uni_equil'] < 85,:]

test = pd.read_csv('test.csv', dtype  = dtypes) #, nrows=numsToRead)
ids = test['id']
test.drop(['id'], axis =1, inplace = True)

In [5]:
#get dummies variables
#train = train[np.append(test.columns.values, 'Demanda_uni_equil')]
train = train.loc[train['Demanda_uni_equil'] < 85,:]
shapeTrain = train.shape[0]
shapeTest = test.shape[0]

# аппендим чтобы единые преобразования и для теста и для трейна
train = train.append(test)

#дропаем по сути коллинеарные колонки, они не нужны
train = pd.concat([train, pd.get_dummies(train['Semana'],sparse=True)], axis=1, join_axes=[train.index])
train.drop([11,'Semana'],axis=1, inplace = True)

train = pd.concat([train, pd.get_dummies(train['Producto_ID'],sparse=True)], axis=1, join_axes=[train.index])
train.drop([123,'Producto_ID'],axis=1, inplace = True)


train = pd.concat([train, pd.get_dummies(train['Canal_ID'],sparse=True)], axis=1, join_axes=[train.index])
train.drop([11,'Canal_ID'],axis=1, inplace = True)

train = pd.concat([train, pd.get_dummies(train['Agencia_ID'],sparse=True)], axis=1, join_axes=[train.index])
train.drop([1382,'Agencia_ID'],axis=1, inplace = True)

#для того чтобы сделать названия колонок уникальными
train.columns = range(train.shape[1])
train.reset_index(drop=True, inplace= True)


test = train[shapeTrain:shapeTrain+shapeTest]
train = train[0:shapeTrain]

test['id'] = ids

train.to_csv('train_1.csv', index=False)
test.to_csv('test_1.csv', index=False)


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-5-0921779d7793> in <module>()
      6 
      7 # аппендим чтобы единые преобразования и для теста и для трейна
----> 8 train = train.append(test)
      9 
     10 #дропаем по сути коллинеарные колонки, они не нужны

/home/analyst/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in append(self, other, ignore_index, verify_integrity)
   4336             to_concat = [self, other]
   4337         return concat(to_concat, ignore_index=ignore_index,
-> 4338                       verify_integrity=verify_integrity)
   4339 
   4340     def join(self, other, on=None, how='left', lsuffix='', rsuffix='',

/home/analyst/anaconda2/lib/python2.7/site-packages/pandas/tools/merge.pyc in concat(objs, axis, join, join_axes, ignore_index, keys, levels, names, verify_integrity, copy)
    843                        keys=keys, levels=levels, names=names,
    844                        verify_integrity=verify_integrity,
--> 845                        copy=copy)
    846     return op.get_result()
    847 

/home/analyst/anaconda2/lib/python2.7/site-packages/pandas/tools/merge.pyc in __init__(self, objs, axis, join, join_axes, keys, levels, names, ignore_index, verify_integrity, copy)
    982         self.copy = copy
    983 
--> 984         self.new_axes = self._get_new_axes()
    985 
    986     def get_result(self):

/home/analyst/anaconda2/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_new_axes(self)
   1071                 new_axes[i] = ax
   1072 
-> 1073         new_axes[self.axis] = self._get_concat_axis()
   1074         return new_axes
   1075 

/home/analyst/anaconda2/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_concat_axis(self)
   1125 
   1126         if self.keys is None:
-> 1127             concat_axis = _concat_indexes(indexes)
   1128         else:
   1129             concat_axis = _make_concat_multiindex(indexes, self.keys,

/home/analyst/anaconda2/lib/python2.7/site-packages/pandas/tools/merge.pyc in _concat_indexes(indexes)
   1143 
   1144 def _concat_indexes(indexes):
-> 1145     return indexes[0].append(indexes[1:])
   1146 
   1147 

/home/analyst/anaconda2/lib/python2.7/site-packages/pandas/indexes/base.pyc in append(self, other)
   1322         attribs['name'] = name
   1323         return self._shallow_copy_with_infer(
-> 1324             np.concatenate(to_concat), **attribs)
   1325 
   1326     @staticmethod

MemoryError: 

In [ ]: