In [4]:
import os
# First of all, set the workspace: we need to point at the folder
# that contains the train and test files.
os.path.abspath("data")
Out[4]:
In [5]:
# Verify that we are in the correct folder.
os.getcwd()
Out[5]:
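In [ ]:
# Added sketch (not in the original notebook): os.path.abspath only resolves a
# path string and does not change directories; to actually move into the
# project folder you would use os.chdir with your own path:
# os.chdir("/path/to/project")   # placeholder; adjust to the folder holding data/
os.path.isdir("data")            # sanity check: True when data/ is reachable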
In [12]:
import numpy as np
import pandas
import matplotlib.pyplot as plt
filename_train = 'data/TrainingDataset.csv'
filename_test = 'data/TestDataset.csv'
# Using pandas, read the two CSV files.
dataframe_train = pandas.read_csv(filename_train)
dataframe_test = pandas.read_csv(filename_test)
# Concatenate them into a single dataframe.
dataframe = pandas.concat([dataframe_train, dataframe_test])
quantitative_columns = [s for s in dataframe.columns if s.startswith("Quan")]
plt.figure()
# List of variables to plot on a logarithmic scale:
#to_log = ["Quan_4", "Quan_5", "Quan_6", "Quan_7", "Quan_8", "Quan_9", "Quan_10", "Quan_11", "Quan_12", "Quan_13", "Quan_14", "Quan_15", "Quan_16", "Quan_17", "Quan_18", "Quan_19", "Quan_21", "Quan_22", "Quan_27", "Quan_28", "Quan_29", "Quant_22", "Quant_24", "Quant_25"]
to_log = ["Quan_4", "Quan_15", "Quan_16", "Quan_17", "Quan_18", "Quan_19", "Quan_21", "Quan_22", "Quant_22", "Quant_24", "Quant_25"]
# Loop over all quantitative columns and draw a histogram for each.
for i, col in enumerate(quantitative_columns):
    a = dataframe[col]
    print(col, pandas.isnull(a).sum())
    plt.subplot(4, 8, i + 1)  # subplot indices are 1-based
    if col in to_log:
        a = np.log(a)
    plt.hist(a[pandas.notnull(a)].tolist(), bins=30, label=col)
    plt.legend()
print(len(quantitative_columns))
plt.show()  # You will need this if you are not in interactive mode.
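In [ ]:
# A possible way to derive the to_log list automatically instead of by eye
# (a minimal sketch, assuming that strictly positive, strongly right-skewed
# columns benefit from the log scale; suggest_log_columns and its threshold
# are hypothetical, not part of the original notebook):
def suggest_log_columns(df, skew_threshold=2.0):
    """Return quantitative columns that are strictly positive and right-skewed."""
    candidates = []
    for col in df.columns:
        if not col.startswith("Quan"):
            continue
        values = df[col].dropna()
        if len(values) > 0 and (values > 0).all() and values.skew() > skew_threshold:
            candidates.append(col)
    return candidates

# Example: suggest_log_columns(dataframe) should roughly reproduce to_log.
print(suggest_log_columns(dataframe))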
In [1]:
import numpy as np
import pandas
import pickle
import gzip
import datetime
# List of quantitative columns that are better represented on a log scale
# (determined beforehand with explore.py):
to_log = ["Quan_4", "Quan_5", "Quan_6", "Quan_7", "Quan_8", "Quan_9", "Quan_10", "Quan_11", "Quan_12", "Quan_13", "Quan_14", "Quan_15", "Quan_16", "Quan_17", "Quan_18", "Quan_19", "Quan_21", "Quan_22", "Quan_27", "Quan_28", "Quan_29", "Quant_22", "Quant_24", "Quant_25"]
def create_dataset(dataframe_train, dataframe_test):
    # to_log is defined at module level; this statement just makes that explicit
    global to_log
    # Join the two dataframes to build the complete dataset
dataframe = pandas.concat([dataframe_train, dataframe_test])
    # Compute the difference between the two dates
dataframe['Date_3'] = dataframe.Date_1 - dataframe.Date_2
train_size = dataframe_train.shape[0]
X_categorical = []
X_quantitative = []
X_date = []
X_id = []
    # Matrix of zeros that will hold the 12 monthly outcome targets
    ys = np.zeros((train_size, 12), dtype=int)
columns = []
for col in dataframe.columns:
if col.startswith('Cat_'):
columns.append(col)
uni = np.unique(dataframe[col])
uni = uni.tolist()
if len(uni) > 1:
                # One-hot encode the categorical variable: broadcasting the
                # column against its unique values yields a boolean matrix
                X_categorical.append(uni == dataframe[col].values[:, None])
elif col.startswith('Quan_') or col.startswith('Quant_'):
columns.append(col)
            # Move the column to log scale if it appears in to_log
            if col in to_log:
                dataframe[col] = np.log(dataframe[col])
            tmp = dataframe[col].copy()
            # If the column contains missing values, fill them with the median
            if (pandas.isnull(tmp)).sum() > 0:
                tmp = tmp.fillna(tmp.median())
            X_quantitative.append(tmp.values)
elif col.startswith('Date_'):
columns.append(col)
            # Copy the column and fill any missing values with the median:
            tmp = dataframe[col].copy()
            if (pandas.isnull(tmp)).sum() > 0:
                tmp = tmp.fillna(tmp.median())
X_date.append(tmp.values[:,None])
            # Extract day, month and year to capture seasonal effects in sales:
year = np.zeros((tmp.size,1))
month = np.zeros((tmp.size,1))
day = np.zeros((tmp.size,1))
for i, date_number in enumerate(tmp):
date = datetime.date.fromordinal(int(date_number))
year[i,0] = date.year
month[i,0] = date.month
day[i,0] = date.day
X_date.append(year)
X_date.append(month)
X_date.append(day)
            # Treat year, month and day as categorical variables and
            # build their binary (one-hot) representation:
            X_date.append((np.unique(year) == year).astype(int))
            X_date.append((np.unique(month) == month).astype(int))
            X_date.append((np.unique(day) == day).astype(int))
elif col=='id':
pass # X_id.append(dataframe[col].values)
elif col.startswith('Outcome_'):
outcome_col_number = int(col.split('M')[1]) - 1
tmp = dataframe[col][:train_size].copy()
            # Fill missing outcomes with the median:
            tmp = tmp.fillna(tmp.median())
            ys[:, outcome_col_number] = tmp.values
        else:
            raise NameError(col)
X_categorical = np.hstack(X_categorical).astype(np.float32)
X_quantitative = np.vstack(X_quantitative).astype(np.float32).T
X_date = np.hstack(X_date).astype(np.float32)
X = np.hstack([X_categorical, X_quantitative, X_date])
X_train = X[:train_size,:]
X_test = X[train_size:,:]
return X_train, X_test, ys, columns
def redundant_columns(X):
"""Identificar columnas redundantes.
"""
idx = []
for i in range(X.shape[1]-1):
for j in range(i+1, X.shape[1]):
            if (X[:, i] == X[:, j]).all():
                print(i, '==', j)
                idx.append(j)
return np.unique(idx)
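# A small worked example (hypothetical data, added for illustration): column 2
# below duplicates column 0, so redundant_columns prints "0 == 2" and returns
# array([2]); we would then keep only columns 0 and 1.
# >>> redundant_columns(np.array([[1., 5., 1.],
# ...                             [2., 6., 2.]]))
# 0 == 2
# array([2])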
if __name__ == '__main__':
np.random.seed(0)
filename_train = 'data/TrainingDataset.csv'
filename_test = 'data/TestDataset.csv'
dataframe_train = pandas.read_csv(filename_train)
dataframe_test = pandas.read_csv(filename_test)
    # Note that dataframe ends up with its columns in a different order
    # than dataframe_train and dataframe_test.
    """print("dataframe_train:", dataframe_train)
    print()
    print("dataframe_test:", dataframe_test)
    """
    ids = dataframe_test.values[:, 0].astype(int)
X_train, X_test, ys, columns = create_dataset(dataframe_train, dataframe_test)
print "Este es el dataset de entrenamiento: ", X_train
print
print "este es el dataset de test: ", X_test
print
print "Calculando columnas redundantes"
X = np.vstack([X_train, X_test])
idx = redundant_columns(X)
columns_to_keep = list(set(range(X.shape[1])).difference(set(idx.tolist())))
X = X[:,columns_to_keep]
X_train = X[:X_train.shape[0], :]
X_test = X[X_train.shape[0]:, :]
print "Saving dataset."
all_data = {"X_train": X_train,
"X_test": X_test,
"columns": columns,
"ys": ys,
"ids": ids,
"redundant": idx}
    pickle.dump(all_data, gzip.open('all_data.pickle.gz', 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
print("Dataset saved. Everything OK")
In [ ]:
"""
Simple blender para los valores de regresion deseados durante meses
"""
import numpy as np
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
import load_data
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, RidgeCV, LinearRegression
import pickle
import gzip
import math
def rmsle_loop(y, y_pred):
    assert len(y) == len(y_pred)
    terms_to_sum = [(math.log(p + 1) - math.log(a + 1)) ** 2.0 for p, a in zip(y_pred, y)]
    return (sum(terms_to_sum) * (1.0 / len(y))) ** 0.5
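# Quick sanity check of rmsle_loop (hypothetical values): identical predictions
# give 0, and the error grows with the log-ratio between prediction and truth.
# >>> rmsle_loop([100.0, 200.0], [100.0, 200.0])
# 0.0
# Connection to the pipeline below: with t = log(y/500 + 1), ordinary squared
# error on t equals the squared log-error on the rescaled targets, which is
# why the blender fits its regressors in log space.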
if __name__ == '__main__':
    # Seed the RNG and set up k-fold cross-validation (3 folds here)
    np.random.seed(0)
    n_folds = 3
    # Load the dataset
X, X_submission, ys, ids, idx = load_data.load()
    # Move the targets to log scale so that fitting squared error
    # approximates the RMSLE evaluation metric:
    ys = np.log(ys / 500.0 + 1.0)
y_submission = np.zeros((X_submission.shape[0], 12))
# regs = [RandomForestRegressor(n_estimators=100, n_jobs=-1, max_features='auto'),
# ExtraTreesRegressor(n_estimators=100, n_jobs=-1, max_features='auto'),
# GradientBoostingRegressor(learning_rate=0.001, subsample=0.5, max_depth=6, n_estimators=20000)]
    # Use n_estimators=1000 so that it runs faster
regs = [GradientBoostingRegressor(learning_rate=0.001, subsample=0.5, max_depth=6, n_estimators=1000)]
dataset_blend_train = np.zeros((X.shape[0], 12*len(regs)), dtype=np.double)
dataset_blend_submission = np.zeros((X_submission.shape[0], 12*len(regs), n_folds), dtype=np.double)
for i in range(12):
print "Month", i
y = ys[:,i]
        kfcv = KFold(n_splits=n_folds)
        for j, (train, test) in enumerate(kfcv.split(X)):
            print("Fold", j)
            for k, reg in enumerate(regs):
                print(reg)
                # Make sure all infinite or NaN values are removed
                y[train] = np.nan_to_num(y[train])
                X[train] = np.nan_to_num(X[train])
                X[test] = np.nan_to_num(X[test])
                X_submission = np.nan_to_num(X_submission)
                # Check for NaN or infinite values
                print("y has infinite values:", np.isinf(y[train]).any())
                print("y has NaN values:", np.isnan(y[train]).any())
                print("X has NaN values:", np.isnan(X[train]).any())
                print("X has infinite values:", np.isinf(X[train]).any())
reg.fit(X[train], y[train])
                # Run the trained predictor on the held-out fold and the submission set
dataset_blend_train[test,12*k+i] = reg.predict(X[test])
dataset_blend_submission[:,12*k+i,j] = reg.predict(X_submission)
dataset_blend_submission_final = dataset_blend_submission.mean(2)
print "dataset_blend_submission_final:", dataset_blend_submission_final.shape
print "Blending."
for i in range(12):
print "Month", i, '-',
y = ys[:,i]
reg = RidgeCV(alphas=np.logspace(-2,4,40))
reg.fit(dataset_blend_train, y)
print "best_alpha =", reg.alpha_
y_submission[:,i] = reg.predict(dataset_blend_submission_final)
    # Convert the results back to the original scale:
    y_submission = (np.exp(y_submission) - 1.0) * 500.0
    print("Saving results to test.csv...")
    np.savetxt("test.csv", np.hstack([ids[:, None], y_submission]), fmt="%d", delimiter=',')
    print("Results saved to test.csv")
    # Caveat: ys holds the training targets and y_submission the test
    # predictions, so their row counts differ; the RMSLE check below only
    # makes sense when the two arrays are aligned.
    ys = (np.exp(ys) - 1.0) * 500.0
    print(rmsle_loop(ys, y_submission))
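In [ ]:
# Round-trip check of the target transform used above (added sketch,
# hypothetical values): the forward log scaling followed by the inverse
# reconstruction recovers the original targets exactly.
import numpy as np

y = np.array([0.0, 500.0, 10000.0])
t = np.log(y / 500.0 + 1.0)          # forward transform applied before fitting
y_back = (np.exp(t) - 1.0) * 500.0   # inverse transform applied to predictions
print(np.allclose(y, y_back))        # True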
In [3]:
"""
Un cargados simple del conjunto de datos y que mezcla las lineas para crear
un conjunto de datos aleatorio
"""
import pickle
import gzip
import numpy as np
def load(filename='all_data.pickle.gz', shuffle_train=False):
"""Load dataset. Shuffle train data if requested
"""
f = gzip.open(filename)
all_data = pickle.load(f)
X_train = all_data['X_train']
X_test = all_data['X_test']
ys = all_data['ys']
ids = all_data['ids']
idx = np.arange(X_train.shape[0])
if shuffle_train:
idx = np.random.permutation(idx)
X_train = X_train[idx, :]
ys = ys[idx, :]
return X_train, X_test, ys, ids, idx
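In [ ]:
# Example usage (assumes all_data.pickle.gz was produced by the script above):
X_train, X_test, ys, ids, idx = load(shuffle_train=True)
print(X_train.shape, X_test.shape, ys.shape)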