In [1]:
import numpy as np
import pandas as pd
import math

from sklearn.cross_validation import cross_val_score
from subprocess import check_output

from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import normalize
import xgboost as xgb

In [2]:
numsToRead = 2000000 # колво строк, которой будет прочитано из трейна

# явно указываем типы данных для оптимизации памяти
dtypes = {'Semana' : 'int32',
                              'Agencia_ID' :'int32',
                              'Canal_ID' : 'int32',
                              'Ruta_SAK' : 'int32',
                              'Cliente-ID' : 'int32',
                              'Producto_ID':'int32',
                              'Venta_hoy':'float32',
                              'Venta_uni_hoy': 'int32',
                              'Dev_uni_proxima':'int32',
                              'Dev_proxima':'float32',
                              'Demanda_uni_equil':'int32'}

# обучаемся на Demanda_uni_equil

train = pd.read_csv('train.csv', dtype  = dtypes, nrows=numsToRead)
train.drop(['Venta_uni_hoy','Venta_hoy','Dev_uni_proxima','Dev_proxima'], axis =1, inplace = True)
train = train.loc[train['Demanda_uni_equil'] < 85,:]

#для соотсвествия функции ошибки тут и на lb
#train.Demanda_uni_equil = train.Demanda_uni_equil.apply(lambda x: math.log(x + 1))
#df_test = pd.read_csv('test.csv'), nrows=5000000)

In [3]:
train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1991036 entries, 0 to 1999999
Data columns (total 7 columns):
Semana               int32
Agencia_ID           int32
Canal_ID             int32
Ruta_SAK             int32
Cliente_ID           int64
Producto_ID          int32
Demanda_uni_equil    int32
dtypes: int32(6), int64(1)
memory usage: 76.0 MB

In [4]:
# по хорошему нужно использовать вот эту функцию ошибки
def rmsle_func(actual, predicted):
    """ Computes the root mean squared log error.
    This function computes the root mean squared log error between two lists
    of numbers. """  
    return np.sqrt(msle(actual, predicted))
def msle(actual, predicted):
    return np.mean(sle(actual, predicted))
def sle(actual, predicted):
    return (np.power(np.log(np.array(actual)+1) - 
            np.log(np.array(predicted)+1), 2))

In [5]:
test = pd.read_csv('test.csv', dtype  = dtypes) #, nrows=numsToRead)
ids = test['id']
test.drop(['id'], axis =1, inplace = True)
print test.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6999251 entries, 0 to 6999250
Data columns (total 6 columns):
Semana         int32
Agencia_ID     int32
Canal_ID       int32
Ruta_SAK       int32
Cliente_ID     int64
Producto_ID    int32
dtypes: int32(5), int64(1)
memory usage: 186.9 MB
None

In [6]:
#get dummies variables
#train = train[np.append(test.columns.values, 'Demanda_uni_equil')]
#меньше 1% имеют значение больше 51

train = train.loc[train['Demanda_uni_equil'] < 85,:]
shapeTrain = train.shape[0]
shapeTest = test.shape[0]

# аппендим чтобы единые преобразования и для теста и для трейна
train = train.append(test)

#дропаем по сути коллинеарные колонки, они не нужны
train = pd.concat([train, pd.get_dummies(train['Semana'],sparse=True)], axis=1, join_axes=[train.index])
train.drop([11,'Semana'],axis=1, inplace = True)

# train = pd.concat([train, pd.get_dummies(train['Producto_ID'],sparse=True)], axis=1, join_axes=[train.index])
# train.drop([123,'Producto_ID'],axis=1, inplace = True)


train = pd.concat([train, pd.get_dummies(train['Canal_ID'],sparse=True)], axis=1, join_axes=[train.index])
train.drop([11,'Canal_ID'],axis=1, inplace = True)

# train = pd.concat([train, pd.get_dummies(train['Agencia_ID'],sparse=True)], axis=1, join_axes=[train.index])
# train.drop([1382,'Agencia_ID'],axis=1, inplace = True)

#для того чтобы сделать названия колонок уникальными
train.columns = range(train.shape[1])

# индексы в датафрейме полетели, поступим вот так
train.reset_index(drop=True, inplace= True)


test = train[shapeTrain:shapeTrain+shapeTest]
train = train[0:shapeTrain]
print train.info()


<class 'pandas.sparse.frame.SparseDataFrame'>
RangeIndex: 1991036 entries, 0 to 1991035
Data columns (total 15 columns):
0     float64
1     float64
2     float64
3     float64
4     float64
5     float64
6     float64
7     float64
8     float64
9     float64
10    float64
11    float64
12    float64
13    float64
14    float64
dtypes: float64(15)
memory usage: 227.9 MB
None

In [7]:
train.index.get_duplicates()


Out[7]:
[]

In [8]:
y = train[train.columns[1]]
train.drop([1],axis=1, inplace = True)
test.drop([1],axis=1, inplace = True)

X = train[test.columns.values]
print(X.shape, y.shape)

# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=777)
print(X_train.shape, X_test.shape)


((1991036, 14), (1991036,))
((1592828, 14), (398208, 14))

In [ ]:
# вот нужно использовать  нашу функцию ошибки
#rmsle  = make_scorer(rmsle_func, greater_is_better=False)

xlf = xgb.XGBRegressor(objective="reg:linear", seed=1729, n_estimators= 2500, learning_rate=0.05, max_depth=5, 
                      nthread=2)
xlf.fit(X_train, y_train, eval_metric = 'rmse',eval_set = [(X_test, y_test)])


[0]	validation_0-rmse:2053305.750000
[1]	validation_0-rmse:2003695.875000
[2]	validation_0-rmse:1957720.250000
[3]	validation_0-rmse:1915332.125000
[4]	validation_0-rmse:1876247.375000
[5]	validation_0-rmse:1840094.875000
[6]	validation_0-rmse:1806735.125000
[7]	validation_0-rmse:1776083.500000
[8]	validation_0-rmse:1748319.500000
[9]	validation_0-rmse:1722845.500000
[10]	validation_0-rmse:1699517.875000
[11]	validation_0-rmse:1677487.000000
[12]	validation_0-rmse:1657396.000000
[13]	validation_0-rmse:1639166.500000
[14]	validation_0-rmse:1622670.125000
[15]	validation_0-rmse:1607295.000000
[16]	validation_0-rmse:1593513.250000
[17]	validation_0-rmse:1580330.750000
[18]	validation_0-rmse:1568860.125000
[19]	validation_0-rmse:1558259.250000
[20]	validation_0-rmse:1548567.625000
[21]	validation_0-rmse:1539986.875000
[22]	validation_0-rmse:1531203.750000
[23]	validation_0-rmse:1523606.375000
[24]	validation_0-rmse:1516943.250000
[25]	validation_0-rmse:1510747.500000
[26]	validation_0-rmse:1505212.625000
[27]	validation_0-rmse:1499894.750000
[28]	validation_0-rmse:1495159.000000
[29]	validation_0-rmse:1490787.375000
[30]	validation_0-rmse:1486972.875000
[31]	validation_0-rmse:1482968.000000
[32]	validation_0-rmse:1479554.500000
[33]	validation_0-rmse:1476313.750000
[34]	validation_0-rmse:1473500.000000
[35]	validation_0-rmse:1470690.875000
[36]	validation_0-rmse:1467570.375000
[37]	validation_0-rmse:1465201.750000
[38]	validation_0-rmse:1462628.750000
[39]	validation_0-rmse:1460755.375000
[40]	validation_0-rmse:1458995.875000
[41]	validation_0-rmse:1457398.250000
[42]	validation_0-rmse:1455325.500000
[43]	validation_0-rmse:1454010.000000
[44]	validation_0-rmse:1452344.750000
[45]	validation_0-rmse:1451220.750000

In [ ]:
preds = xlf.predict(X_test)

In [ ]:
# отриц. ответы занулим
def nonnegative(x):
    if x > 0:
        return x
    else: 
        return 0
    


# submission
test_preds = np.around(xlf.predict(test), decimals=1) # округлим до десятых
#test_preds = xlf.predict(test)
submission = pd.DataFrame({"id":ids, "Demanda_uni_equil": test_preds})

test_preds = map(nonnegative, test_preds)
print('rmsle: " ', rmsle_func(y_test,preds))

submission = pd.DataFrame({"id":ids, "Demanda_uni_equil": test_preds})
cols = ['id',"Demanda_uni_equil"]
submission = submission[cols]
submission.to_csv("submission.csv", index=False)


print('Completed!')

In [ ]:
print test.shape, submission.shape

In [ ]:
submission.Demanda_uni_equil.min()

In [ ]:


In [ ]: