In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

# Load both splits and remember how many rows belong to the training set,
# so the combined frame can be cut back apart after encoding.
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
train_samples = len(train)

# One-hot encode train and test together so both slices end up with the
# same set of columns, then discard the row identifier.
data = pd.get_dummies(pd.concat([train, test])).drop(columns=['Id'])

# Training rows were concatenated first, so they occupy the leading rows.
train = data.iloc[:train_samples]
test = data.iloc[train_samples:]

# Arrange the training columns as: numeric columns first, everything else after.
datanum = train.select_dtypes(include=[np.number])
datatxt = train.select_dtypes(exclude=[np.number])
data_mix = pd.concat([datanum, datatxt], axis=1)

# Fill missing feature values with the per-column mean; the imputer is
# fitted on the training features only and is reused later for the test set.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train = imp_mean.fit_transform(data_mix.drop(columns=['SalePrice']))

# The regression target is the natural log of the sale price.
y_train = np.log(data_mix['SalePrice'])
In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

# Candidate regularisation strengths: 0.00, 0.01, ..., 1.00 (101 values).
parameters = {'alpha': [step / 100 for step in range(101)]}

# No scoring or cv arguments are passed, so GridSearchCV's defaults apply.
ridge = Ridge()
regr = GridSearchCV(estimator=ridge, param_grid=parameters)
regr.fit(X_train, y_train)
Out[2]:
In [3]:
# Evaluate the best estimator selected by the grid search.
# NOTE: both metrics are computed on the same data the model was fitted on,
# so they measure fit quality, not generalisation.
best_ridge = regr.best_estimator_

# Previously this score was computed as a bare expression in the middle of the
# cell, so its value was discarded; capture it and report it explicitly.
r2_train = best_ridge.score(X_train, y_train)

# y_train holds log prices, so the RMSE of the predictions is an error in
# log space.
rmse = mean_squared_error(y_train, best_ridge.predict(X_train)) ** 0.5

print("R^2 (train): {:.3f}".format(r2_train))
print("RMSLE: {:.3f}".format(rmse))
In [4]:
# Imports kept from the original cell so that any name it brought into the
# notebook namespace is still available; `os` is no longer used below.
import os
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

# Arrange the test columns the same way as the training columns
# (numeric first, everything else after) before imputing.
datanum = test.select_dtypes([np.number])
datatxt = test.select_dtypes(exclude=[np.number])
data_mix = pd.concat([datanum, datatxt], axis=1)

# Reuse the imputer fitted on the training features (transform only, no refit).
X_test = imp_mean.transform(data_mix.drop(['SalePrice'], axis=1))

# The model was trained on log prices, so exponentiate to get prices back.
predictions = np.exp(regr.best_estimator_.predict(X_test))

# The 'Id' column was dropped earlier, so ids are regenerated sequentially.
# NOTE(review): assumes test ids are consecutive starting at 1461 -- confirm
# against input/test.csv.
startId = 1461

# Build all rows once and join them, instead of growing a string in a loop.
# Prices are truncated to whole numbers with int(), as before.
rows = ["Id,SalePrice"]
rows.extend(
    "{},{}".format(row_id, int(price))
    for row_id, price in enumerate(predictions, start=startId)
)

# Use "\n" rather than os.linesep: the file is opened in text mode, which
# already translates "\n" to the platform line ending. Writing os.linesep
# through text mode produces "\r\r\n" on Windows.
file = "\n".join(rows) + "\n"

# Save to file
with open('attempt.txt', 'w') as f:
    f.write(file)
In [ ]:
# Score: 0.13182 on the Kaggle leaderboard