In [4]:
import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# machine learning: SalePrice is a continuous target, so a regressor is used
from sklearn.ensemble import RandomForestRegressor
In [5]:
train = pd.read_csv("../input/train.csv.gz")
test = pd.read_csv("../input/test.csv.gz")
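As a quick optional check (a minimal sketch using only the frames just loaded), the next cell prints the shapes so the train/test row counts are visible up front.
In [ ]:
# Sketch: basic shape check on the raw data.
print("train:", train.shape)
print("test :", test.shape)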
In [6]:
print(train.head())
In [7]:
# Stack the feature columns of train and test so that one-hot encoding
# creates the same dummy columns for both sets.
data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                  test.loc[:, 'MSSubClass':'SaleCondition']))
data = pd.get_dummies(data)
print(data.shape)
# Fill remaining missing values with the column means.
data = data.fillna(data.mean())
print(data.describe())
data.info()
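As an optional sanity check (a minimal sketch; it relies only on the `data` frame built above), the cell below confirms that one-hot encoding plus mean imputation left no missing values and that the row count matches train plus test.
In [ ]:
# Sketch: verify the encoded feature matrix before splitting it back up.
print("any NaNs left:", data.isnull().sum().sum() > 0)
print("rows == train + test:", data.shape[0] == train.shape[0] + test.shape[0])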
In [8]:
# Re-attach Id and SalePrice to the encoded training rows.
xtrain = [train['Id'], data[:train.shape[0]], train['SalePrice']]
print(train.shape[0])
In [9]:
print(len(train['Id']))
ntrain = pd.concat(xtrain, axis=1)
print(ntrain.head())
# Same idea for the test set: Id plus the encoded test rows.
xtest = [test['Id'], data[train.shape[0]:]]
print(len(data[train.shape[0]:]) == test.shape[0])
ntest = pd.concat(xtest, axis=1)
print(ntest.head())
In [10]:
# Feature matrix: every encoded column from MSSubClass up to the last
# dummy column (SaleCondition_Partial); target: SalePrice.
X_train = ntrain.loc[:, 'MSSubClass':'SaleCondition_Partial']
Y_train = ntrain.loc[:, 'SalePrice']
X_test = ntest.loc[:, 'MSSubClass':'SaleCondition_Partial']
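Because the train and test rows were one-hot encoded together, X_train and X_test should end up with identical columns in the same order; the optional sketch below checks this.
In [ ]:
# Sketch: the label-based column slices should produce matching designs.
print(X_train.shape, X_test.shape)
print("columns aligned:", list(X_train.columns) == list(X_test.columns))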
In [11]:
# Random forest regression
random_forest = RandomForestRegressor(n_estimators=2900)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
# For a regressor, score() is R^2 on the training data, not classification accuracy.
r2_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
print(r2_random_forest)
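The R^2 above is computed on the same rows the forest was trained on, so it is optimistic. A hedged sketch of a fairer estimate using cross_val_score follows; the cv=5 and the smaller n_estimators=300 are arbitrary choices to keep the run time reasonable.
In [ ]:
# Sketch: 5-fold cross-validated R^2 for a smaller forest (assumed settings).
from sklearn.model_selection import cross_val_score

cv_model = RandomForestRegressor(n_estimators=300, random_state=0)
cv_scores = cross_val_score(cv_model, X_train, Y_train, cv=5, scoring='r2')
print("CV R^2: %.3f +/- %.3f" % (cv_scores.mean(), cv_scores.std()))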
In [12]:
submission = pd.DataFrame({"Id": ntest['Id'], "SalePrice": Y_pred})
In [13]:
print(len(submission))
In [14]:
submission.to_csv('submission.csv', index=False)
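Optionally, the written file can be re-read and sanity-checked before uploading; this sketch only confirms the expected columns and row count.
In [ ]:
# Sketch: re-read the submission and check its shape and columns.
check = pd.read_csv('submission.csv')
print(check.shape)
print("columns:", list(check.columns) == ['Id', 'SalePrice'])
print("row count matches test:", len(check) == test.shape[0])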