In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.cross_validation import cross_val_score
from subprocess import check_output
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import normalize
import xgboost as xgb
In [2]:
numsToRead = 2000000  # number of rows to read from the training file

# Explicit dtypes keep the memory footprint down.
# The keys must match the CSV header exactly. The original dict spelled the
# client column 'Cliente-ID' (hyphen) while every other column uses an
# underscore, so that dtype hint never matched a column.
# NOTE(review): confirm the spelling against the train.csv header.
dtypes = {
    'Semana': 'int32',
    'Agencia_ID': 'int32',
    'Canal_ID': 'int32',
    'Ruta_SAK': 'int32',
    'Cliente_ID': 'int32',
    'Producto_ID': 'int32',
    'Venta_hoy': 'float32',
    'Venta_uni_hoy': 'int32',
    'Dev_uni_proxima': 'int32',
    'Dev_proxima': 'float32',
    'Demanda_uni_equil': 'int32',
}

# The model is trained on Demanda_uni_equil.
train = pd.read_csv('train.csv', dtype=dtypes, nrows=numsToRead)

# Reassign instead of dropping in place so the cell has no hidden mutation.
train = train.drop(
    ['Venta_uni_hoy', 'Venta_hoy', 'Dev_uni_proxima', 'Dev_proxima'], axis=1)

# Keep only rows whose target is below 85.
train = train.loc[train['Demanda_uni_equil'] < 85, :]

# To match the error function used here with the one on the leaderboard:
#train.Demanda_uni_equil = train.Demanda_uni_equil.apply(lambda x: math.log(x + 1))
#df_test = pd.read_csv('test.csv'), nrows=5000000)
In [3]:
# Print the column dtypes, non-null counts and memory usage of the filtered training frame.
train.info()
In [4]:
# Ideally this is the error function that should be used.
def rmsle_func(actual, predicted):
    """Return the root mean squared logarithmic error (RMSLE).

    ``actual`` and ``predicted`` are two equally long sequences of numbers.
    The result is the square root of ``msle(actual, predicted)``.
    """
    mean_squared_log_error = msle(actual, predicted)
    return np.sqrt(mean_squared_log_error)
def msle(actual, predicted):
    """Return the mean of the element-wise squared log errors from ``sle``."""
    squared_log_errors = sle(actual, predicted)
    return np.mean(squared_log_errors)
def sle(actual, predicted):
    """Return the element-wise squared logarithmic error.

    Computes ``(log(1 + actual) - log(1 + predicted)) ** 2`` for every pair of
    values. Both arguments may be any array-like of numbers (lists, NumPy
    arrays, pandas Series); they are converted to float64 arrays first.

    ``np.log1p`` is used instead of ``np.log(x + 1)`` because the latter loses
    all precision when ``x`` is very close to zero.
    """
    log_actual = np.log1p(np.asarray(actual, dtype=float))
    log_predicted = np.log1p(np.asarray(predicted, dtype=float))
    return np.square(log_actual - log_predicted)
In [5]:
# Load the full test set with the same dtype hints as the training data.
test = pd.read_csv('test.csv', dtype=dtypes)  #, nrows=numsToRead)

# Keep the row ids for the submission file; they are not a model feature.
ids = test['id']
test = test.drop(['id'], axis=1)

# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print (that also was a Python 3 syntax error).
test.info()
In [6]:
# Build dummy (one-hot) variables identically for train and test.
#train = train[np.append(test.columns.values, 'Demanda_uni_equil')]
TARGET = 'Demanda_uni_equil'

# (column to one-hot encode, level dropped as the reference category).
# One level per column is dropped because the full set of dummies is
# collinear and carries no extra information.
ENCODED = (
    ('Semana', 11),
    # ('Producto_ID', 123),   # disabled in the original notebook
    ('Canal_ID', 11),
    # ('Agencia_ID', 1382),   # disabled in the original notebook
)
encoded_names = [name for name, _ in ENCODED]

# Original note: fewer than 1% of the rows have a value above 51.
# The cut-off actually applied is 85 (same filter as at load time).
train = train.loc[train[TARGET] < 85, :]
shapeTrain = train.shape[0]
shapeTest = test.shape[0]

# Stack train on top of test so both receive exactly the same transformation.
# ignore_index=True yields a clean 0..n-1 index immediately, so the dummy
# frames below align row by row without join_axes / reset_index.
combined = pd.concat([train, test], ignore_index=True)

# Pin the column layout explicitly. The union of train/test columns comes back
# in a pandas-version-dependent order, and the following cell addresses the
# target purely by position (column 1 after the rename below). Place the
# target second among the columns that survive the encoding.
plain = [c for c in combined.columns if c != TARGET and c not in encoded_names]
combined = combined[plain[:1] + [TARGET] + plain[1:] + encoded_names]

for column, baseline in ENCODED:
    dummies = pd.get_dummies(combined[column], sparse=True)
    combined = pd.concat([combined, dummies], axis=1)
    # Remove the reference level and the now-redundant source column.
    combined = combined.drop([baseline, column], axis=1)

# Dummy columns are labelled by their integer level, so different source
# columns can produce the same label; renumber to make column names unique.
combined.columns = range(combined.shape[1])

# Split back into the two original parts.
test = combined[shapeTrain:shapeTrain + shapeTest]
train = combined[0:shapeTrain]
train.info()
In [7]:
# Sanity check: list index labels that occur more than once (expected: none).
# Index.get_duplicates() is deprecated and absent from current pandas; this is
# the documented equivalent.
train.index[train.index.duplicated()].unique()
Out[7]:
In [8]:
# NOTE(review): the target is addressed by position -- column 1 after the
# columns were renumbered in the encoding cell. Verify that column 1 really is
# Demanda_uni_equil (it should be all-NaN in `test`).
target_col = train.columns[1]
y = train[target_col]

# `train` and `test` are slices of one combined frame; reassign rather than
# dropping in place so no slice is mutated (avoids SettingWithCopyWarning).
train = train.drop([target_col], axis=1)
test = test.drop([target_col], axis=1)

# Use test's column order so train and test features line up exactly.
X = train[test.columns.values]
print('{} {}'.format(X.shape, y.shape))

# Hold out 20% of the training rows for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=777)
print('{} {}'.format(X_train.shape, X_test.shape))
In [ ]:
# This is where our own error function should be used.
#rmsle = make_scorer(rmsle_func, greater_is_better=False)

# Hyper-parameters collected in one place so they are easy to tune.
booster_params = dict(
    objective="reg:linear",
    seed=1729,
    n_estimators=2500,
    learning_rate=0.05,
    max_depth=5,
    nthread=2,
)
xlf = xgb.XGBRegressor(**booster_params)

# Fit on the training split and report RMSE on the hold-out split each round.
validation_sets = [(X_test, y_test)]
xlf.fit(X_train, y_train, eval_metric='rmse', eval_set=validation_sets)
In [ ]:
# Predictions for the hold-out split (X_test); used later for the RMSLE report.
preds = xlf.predict(X_test)
In [ ]:
# Negative answers are set to zero.
def nonnegative(x):
    """Return ``x`` when it is strictly positive, otherwise 0."""
    return x if x > 0 else 0
# submission
def _postprocess(raw_preds):
    """Round predictions to one decimal and replace non-positive values by 0.

    Vectorised form of rounding followed by an element-wise "negative -> 0"
    rule; anything that is not strictly positive (including NaN) becomes 0.
    """
    rounded = np.around(raw_preds, decimals=1)  # round to tenths
    return np.where(rounded > 0, rounded, 0)

test_preds = _postprocess(xlf.predict(test))
#test_preds = xlf.predict(test)

# Score the hold-out predictions after the same post-processing that is
# applied to the submission. Raw regression outputs can be negative, and any
# value <= -1 would turn the logarithm (and the whole RMSLE) into NaN.
print('rmsle: {}'.format(rmsle_func(y_test, _postprocess(preds))))

# Build the submission once, from the post-processed predictions.
submission = pd.DataFrame({"id": ids, "Demanda_uni_equil": test_preds})
cols = ['id', "Demanda_uni_equil"]
submission = submission[cols]
submission.to_csv("submission.csv", index=False)
print('Completed!')
In [ ]:
# Row counts of the test features and the submission must match.
# Single-argument print call: identical output on Python 2 and Python 3.
print('{} {}'.format(test.shape, submission.shape))
In [ ]:
# Sanity check: smallest submitted value (negative predictions were replaced by 0 when the submission was built).
submission.Demanda_uni_equil.min()
In [ ]:
In [ ]: