In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random as rnd

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import figure

%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [5]:
# Read the training and test tables from the gzip-compressed CSV files
# (paths are relative to the notebook's working directory).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv.gz", "../input/test.csv.gz")
)

In [6]:
# Peek at the first five rows as a quick sanity check of the load.
print(train.head(n=5))


   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities    ...     PoolArea PoolQC Fence MiscFeature MiscVal  \
0         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
1         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
2         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
3         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   
4         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   

  MoSold YrSold  SaleType  SaleCondition  SalePrice  
0      2   2008        WD         Normal     208500  
1      5   2007        WD         Normal     181500  
2      9   2008        WD         Normal     223500  
3      2   2006        WD        Abnorml     140000  
4     12   2008        WD         Normal     250000  

[5 rows x 81 columns]

In [7]:
# Stack the feature columns of both splits into one table so that one-hot
# encoding yields the same dummy columns for train and test. 'Id' and
# 'SalePrice' fall outside the MSSubClass..SaleCondition label range and are
# therefore left out here.
# The original row labels are kept on purpose (no ignore_index): later cells
# slice this frame by position and re-attach 'Id' through index alignment.
data = pd.concat(
    (
        train.loc[:, 'MSSubClass':'SaleCondition'],
        test.loc[:, 'MSSubClass':'SaleCondition'],
    )
)

# One-hot encode the non-numeric columns; numeric columns pass through as-is.
data = pd.get_dummies(data)

# Fill the remaining missing values with each column's mean.
# NOTE(review): the means are computed over train and test rows together,
# so test-set statistics leak into the training features -- confirm this
# is acceptable for the intended evaluation.
data = data.fillna(data.mean())

# Summary of the final feature table (row count, column count, dtypes).
data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 0 to 1458
Columns: 288 entries, MSSubClass to SaleCondition_Partial
dtypes: float64(11), int64(25), uint8(252)
memory usage: 1.5 MB

In [8]:
# Pieces of the training table in column order: ids, the encoded features
# for the training rows (they come first in `data`), and the target.
xtrain = [train['Id'], data.iloc[:len(train)], train['SalePrice']]
print(len(train))


1460

In [9]:
# Number of training ids; should match the row count printed earlier.
print(train['Id'].size)

# Glue ids, encoded features and target side by side (aligned on row label).
ntrain = pd.concat(xtrain, axis='columns')
print(ntrain.head())

# Everything after the training rows in `data` belongs to the test split.
xtest = [test['Id'], data.iloc[len(train):]]
# Prints True when the remaining rows match the test set one-to-one.
print(len(xtest[1]) == len(test))
ntest = pd.concat(xtest, axis='columns')
print(ntest.head())


1460
   Id  MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  YearBuilt  \
0   1          60         65.0     8450            7            5       2003   
1   2          20         80.0     9600            6            8       1976   
2   3          60         68.0    11250            7            5       2001   
3   4          70         60.0     9550            7            5       1915   
4   5          60         84.0    14260            8            5       2000   

   YearRemodAdd  MasVnrArea  BsmtFinSF1    ...      SaleType_New  \
0          2003       196.0       706.0    ...                 0   
1          1976         0.0       978.0    ...                 0   
2          2002       162.0       486.0    ...                 0   
3          1970         0.0       216.0    ...                 0   
4          2000       350.0       655.0    ...                 0   

   SaleType_Oth  SaleType_WD  SaleCondition_Abnorml  SaleCondition_AdjLand  \
0             0            1                      0                      0   
1             0            1                      0                      0   
2             0            1                      0                      0   
3             0            1                      1                      0   
4             0            1                      0                      0   

   SaleCondition_Alloca  SaleCondition_Family  SaleCondition_Normal  \
0                     0                     0                     1   
1                     0                     0                     1   
2                     0                     0                     1   
3                     0                     0                     0   
4                     0                     0                     1   

   SaleCondition_Partial  SalePrice  
0                      0     208500  
1                      0     181500  
2                      0     223500  
3                      0     140000  
4                      0     250000  

[5 rows x 290 columns]
True
     Id  MSSubClass  LotFrontage  LotArea  OverallQual  OverallCond  \
0  1461          20         80.0    11622            5            6   
1  1462          20         81.0    14267            6            6   
2  1463          60         74.0    13830            5            5   
3  1464          60         78.0     9978            6            6   
4  1465         120         43.0     5005            8            5   

   YearBuilt  YearRemodAdd  MasVnrArea  BsmtFinSF1          ...            \
0       1961          1961         0.0       468.0          ...             
1       1958          1958       108.0       923.0          ...             
2       1997          1998         0.0       791.0          ...             
3       1998          1998        20.0       602.0          ...             
4       1992          1992         0.0       263.0          ...             

   SaleType_ConLw  SaleType_New  SaleType_Oth  SaleType_WD  \
0               0             0             0            1   
1               0             0             0            1   
2               0             0             0            1   
3               0             0             0            1   
4               0             0             0            1   

   SaleCondition_Abnorml  SaleCondition_AdjLand  SaleCondition_Alloca  \
0                      0                      0                     0   
1                      0                      0                     0   
2                      0                      0                     0   
3                      0                      0                     0   
4                      0                      0                     0   

   SaleCondition_Family  SaleCondition_Normal  SaleCondition_Partial  
0                     0                     1                      0  
1                     0                     1                      0  
2                     0                     1                      0  
3                     0                     1                      0  
4                     0                     1                      0  

[5 rows x 289 columns]

In [10]:
# Model inputs: every column from 'MSSubClass' through
# 'SaleCondition_Partial' (inclusive label slice), taken from each split.
X_train, X_test = (
    frame.loc[:, 'MSSubClass':'SaleCondition_Partial']
    for frame in (ntrain, ntest)
)
# Regression target, available for the training split only.
Y_train = ntrain['SalePrice']

In [11]:
# Random Forest
# - random_state pins the bootstrap samples and feature sub-sampling, so the
#   fitted forest, the scores below and the submission are reproducible.
# - n_jobs=-1 builds the 2900 trees on all available cores; with a fixed
#   random_state this does not change the result.
# - oob_score=True makes the forest score every training row using only the
#   trees that did not see it, giving a held-out-style estimate for free.
random_forest = RandomForestRegressor(
    n_estimators=2900,
    oob_score=True,
    random_state=0,
    n_jobs=-1,
)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)

# For a regressor, .score() is the R^2 coefficient, not an accuracy, and it
# is evaluated here on the same rows the forest was fitted on, so it is
# optimistic. It is computed once (each call runs a full prediction pass).
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
print(acc_random_forest)

# Out-of-bag R^2: a more honest indication of how the model generalises.
print("OOB R^2 (%):", round(random_forest.oob_score_ * 100, 2))


98.16

In [12]:
# Two-column submission table: the test ids and the predicted sale prices
# (predictions are attached by position, one per test row).
submission = ntest[['Id']].copy()
submission['SalePrice'] = Y_pred

In [13]:
# Row count of the submission; expected to equal the number of test rows.
print(submission.shape[0])


1459

In [14]:
# Write the submission next to the notebook; index=False keeps the row
# labels out of the file so it contains only the Id and SalePrice columns.
submission.to_csv(path_or_buf='submission.csv', index=False)

In [ ]: