In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Encode train and test together so both get exactly the same dummy columns;
# the test rows end up with NaN in SalePrice, which is dropped before fitting.
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
data = pd.concat([train, test])
data = pd.get_dummies(data)
data.drop(['Id'], axis=1, inplace=True)

# number of training rows, used to split the combined frame back apart
train_samples = train.shape[0]

train = data.iloc[:train_samples, :]
test = data.iloc[train_samples:, :]

# After get_dummies every column is already numeric, so the text selection below is empty
# and data_mix is effectively just the training frame.
datanum = train.select_dtypes([np.number])
datatxt = train.select_dtypes(exclude=[np.number])
data_mix = pd.concat([datanum, datatxt], axis=1)
#data_mix = pd.get_dummies(data_mix)

from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

X_train = imp_mean.fit_transform(data_mix.drop(['SalePrice'], axis=1))
# log-transform the target so that RMSE on it corresponds to RMSLE on the prices
y_train = data_mix.SalePrice.apply(np.log)
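
Because the dummies were created on the combined frame, the train and test slices share exactly the same columns, with SalePrice simply missing on the test side. A quick sanity check along these lines (not part of the original run) would confirm it:

# both slices expose the same dummy columns; SalePrice is NaN throughout the test slice
assert list(train.columns) == list(test.columns)
print(train.shape, test.shape)
print(test['SalePrice'].isna().all())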

In [2]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

ridge = Ridge()
parameters = {'alpha': [x / 100 for x in range(101)]}  # 0.00 to 1.00 in steps of 0.01
regr = GridSearchCV(ridge, parameters)
regr.fit(X_train, y_train)


/Users/samuel/anaconda/envs/py3/lib/python3.5/site-packages/sklearn/model_selection/_search.py:841: DeprecationWarning: The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.
  DeprecationWarning)
Out[2]:
GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': [0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0...0.84, 0.85, 0.86, 0.87, 0.88, 0.89, 0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)
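
The fitted search keeps the winning hyperparameter; under the standard GridSearchCV API it can be inspected as below (values not reproduced here). The DeprecationWarning above is specific to scikit-learn 0.20/0.21 and could be silenced by passing iid=False and an explicit cv when constructing the search.

print(regr.best_params_)   # the selected alpha
print(regr.best_score_)    # mean cross-validated R^2 for that alpha (Ridge's default scoring)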

In [3]:
# R^2 on the training set (computed but not displayed, since it is not the cell's last expression)
regr.best_estimator_.score(X_train, y_train)

# y_train holds log prices, so RMSE on it is the RMSLE used by the competition,
# here measured on the training data itself
rmse = mean_squared_error(y_train, regr.best_estimator_.predict(X_train))**0.5
print("RMSLE: {:.3f}".format(rmse))


RMSLE: 0.101
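
The 0.101 above is measured on the same data the model was fitted on, so it is optimistic. A cross-validated estimate, sketched here with scikit-learn's cross_val_score (not part of the original run), should land closer to the leaderboard figure:

from sklearn.model_selection import cross_val_score

# cross-validated RMSLE for the tuned ridge model
neg_mse = cross_val_score(regr.best_estimator_, X_train, y_train,
                          scoring='neg_mean_squared_error', cv=5)
print("CV RMSLE: {:.3f}".format(np.mean(np.sqrt(-neg_mse))))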

In [4]:
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer


# Same numeric/text split for the test slice (again a no-op after get_dummies);
# reuse the imputer fitted on the training data rather than refitting it.
datanum = test.select_dtypes([np.number])
datatxt = test.select_dtypes(exclude=[np.number])
data_mix = pd.concat([datanum, datatxt], axis=1)

X_test = imp_mean.transform(data_mix.drop(['SalePrice'], axis=1))

predictions = regr.best_estimator_.predict(X_test)

import os

# predictions are log prices; exponentiate to recover actual sale prices
predictions = np.exp(predictions)

file = "Id,SalePrice" + os.linesep

startId = 1461  # first Id of the test set (the training set covers Ids 1-1460)
for i in range(len(X_test)):
    file += "{},{}".format(startId, int(predictions[i])) + os.linesep
    startId += 1

# Save to file
with open('attempt.txt', 'w') as f:
    f.write(file)
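
The same submission could be built with pandas instead of manual string concatenation; a minimal sketch, assuming input/test.csv is re-read for its Id column (which the dummy-encoded frame dropped):

ids = pd.read_csv('input/test.csv')['Id']
submission = pd.DataFrame({'Id': ids, 'SalePrice': predictions.astype(int)})
submission.to_csv('attempt.csv', index=False)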

In [ ]:
# Scores 0.13182 on the Kaggle public leaderboard