In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model

In [2]:
# Load the training and test tables from CSV files; the relative paths are
# resolved against the current working directory.
# NOTE(review): column names look like the Kaggle "House Prices" data — confirm provenance.
train_csv, test_csv = 'train.csv', 'test.csv'
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [3]:
# Preview the first five rows of the freshly loaded training frame.
df_train.head(n=5)


Out[3]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 81 columns


In [4]:
# Narrow the training frame to the candidate feature columns plus the
# 'SalePrice' column; every other column is discarded from df_train.
train_columns = ['LotFrontage', 'LotArea', 'SaleCondition', 'SalePrice']
df_train = df_train[train_columns]

In [5]:
# Confirm that only the selected columns remain (first five rows).
df_train.head(n=5)


Out[5]:
LotFrontage LotArea SaleCondition SalePrice
0 65.0 8450 Normal 208500
1 80.0 9600 Normal 181500
2 68.0 11250 Normal 223500
3 60.0 9550 Abnorml 140000
4 84.0 14260 Normal 250000

In [16]:
# Apply the same column reduction to the test frame. 'SalePrice' is not
# selected here, so df_test ends up with the three feature columns only.
test_columns = ['LotFrontage', 'LotArea', 'SaleCondition']
df_test = df_test[test_columns]
df_test.head(n=5)


Out[16]:
LotFrontage LotArea SaleCondition
0 80.0 11622 Normal
1 81.0 14267 Normal
2 74.0 13830 Normal
3 78.0 9978 Normal
4 43.0 5005 Normal

In [17]:
# Create an unfitted linear regression estimator with the library's default
# settings; it is trained in a later cell via linear_reg.fit(...).
linear_reg = linear_model.LinearRegression()

In [26]:
# Fit a single-feature model: 'LotArea' is the only predictor and 'SalePrice'
# the target. Both are passed as one-column DataFrames (double brackets) so
# they stay two-dimensional.
# Per the scikit-learn API, fit() returns the fitted estimator, which is kept
# in `val` because the plotting cell calls val.predict(...).
val = linear_reg.fit(
    X=df_train[['LotArea']],
    y=df_train[['SalePrice']],
)

In [27]:
# Visualise the single-feature fit: the raw training observations plus the
# fitted regression line evaluated at the same 'LotArea' values.
fig, ax = plt.subplots()
ax.scatter(df_train['LotArea'], df_train['SalePrice'], alpha=0.5, label='Training data')
# Draw the line in an explicit contrasting colour: scatter() and plot() each
# start from the first colour of the default cycle, so without this the line
# would have the same colour as the points and be hard to see.
ax.plot(
    df_train['LotArea'],
    val.predict(df_train[['LotArea']]),  # predict() needs the 2-D, named-column input
    color='red',
    label='Linear fit',
)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel('LotArea')
ax.set_ylabel('SalePrice')
ax.set_title('SalePrice vs. LotArea with linear regression fit')
ax.legend()
plt.show()



In [ ]: