notebook.community

Edit and run



In [1]:

    
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model



In [2]:

    
# Load the training and test CSV files (paths are relative to the working directory)
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))



In [3]:

    
# Preview the first rows of the freshly loaded training frame
df_train.head()









    Out[3]:






  
    
      
      Id
      MSSubClass
      MSZoning
      LotFrontage
      LotArea
      Street
      Alley
      LotShape
      LandContour
      Utilities
      ...
      PoolArea
      PoolQC
      Fence
      MiscFeature
      MiscVal
      MoSold
      YrSold
      SaleType
      SaleCondition
      SalePrice
    
  
  
    
      0
      1
      60
      RL
      65.0
      8450
      Pave
      NaN
      Reg
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      2
      2008
      WD
      Normal
      208500
    
    
      1
      2
      20
      RL
      80.0
      9600
      Pave
      NaN
      Reg
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      5
      2007
      WD
      Normal
      181500
    
    
      2
      3
      60
      RL
      68.0
      11250
      Pave
      NaN
      IR1
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      9
      2008
      WD
      Normal
      223500
    
    
      3
      4
      70
      RL
      60.0
      9550
      Pave
      NaN
      IR1
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      2
      2006
      WD
      Abnorml
      140000
    
    
      4
      5
      60
      RL
      84.0
      14260
      Pave
      NaN
      IR1
      Lvl
      AllPub
      ...
      0
      NaN
      NaN
      NaN
      0
      12
      2008
      WD
      Normal
      250000
    
  

5 rows × 81 columns



In [4]:

    
# Narrow the training frame to the candidate features plus the SalePrice target
train_columns = ['LotFrontage', 'LotArea', 'SaleCondition', 'SalePrice']
df_train = df_train[train_columns]



In [5]:

    
# Preview the training frame after the column selection
df_train.head()









    Out[5]:






  
    
      
      LotFrontage
      LotArea
      SaleCondition
      SalePrice
    
  
  
    
      0
      65.0
      8450
      Normal
      208500
    
    
      1
      80.0
      9600
      Normal
      181500
    
    
      2
      68.0
      11250
      Normal
      223500
    
    
      3
      60.0
      9550
      Abnorml
      140000
    
    
      4
      84.0
      14260
      Normal
      250000



In [16]:

    
# Apply the matching feature selection to the test frame
# (the SalePrice target is not selected here), then preview the result
test_columns = ['LotFrontage', 'LotArea', 'SaleCondition']
df_test = df_test[test_columns]
df_test.head()









    Out[16]:






  
    
      
      LotFrontage
      LotArea
      SaleCondition
    
  
  
    
      0
      80.0
      11622
      Normal
    
    
      1
      81.0
      14267
      Normal
    
    
      2
      74.0
      13830
      Normal
    
    
      3
      78.0
      9978
      Normal
    
    
      4
      43.0
      5005
      Normal



In [17]:

    
# Create an unfitted linear regression estimator with scikit-learn's
# default settings; it is fitted in a later cell.
linear_reg = linear_model.LinearRegression()



In [26]:

    
# Fit SalePrice as a linear function of LotArea alone.
# Both inputs are kept as single-column DataFrames (2-D), so predictions
# from this model come back with shape (n_samples, 1).
features = df_train[['LotArea']]
target = df_train[['SalePrice']]
# fit() returns the estimator itself, so `val` refers to the same object as `linear_reg`.
val = linear_reg.fit(features, target)



In [27]:

    
# Plot the training points and overlay the fitted regression line.
# The explicit fig/ax interface is used so labels, title and legend are
# attached to a known Axes instead of pyplot's implicit current one.
fig, ax = plt.subplots()
ax.scatter(df_train['LotArea'], df_train['SalePrice'], alpha=0.5, label='Training data')
# Flatten the predictions so the line is plotted from a 1-D array regardless
# of whether the model was fitted with a 1-D or 2-D target.
predicted_price = val.predict(df_train[['LotArea']]).ravel()
# Use a contrasting colour: scatter() and plot() would otherwise both take the
# first colour of the default cycle and the fit would blend into the points.
ax.plot(df_train['LotArea'], predicted_price, color='red', label='Linear fit')
ax.set_xlabel('LotArea')
ax.set_ylabel('SalePrice')
ax.set_title('SalePrice vs. LotArea with linear regression fit')
ax.legend()
plt.show()



In [ ]:

	Id	MSSubClass	MSZoning	LotFrontage	LotArea	Street	Alley	LotShape	LandContour	Utilities	...	PoolQC	Fence	MiscFeature	MoSold	YrSold	SaleType	SaleCondition	SalePrice
0	1	60	RL	65.0	8450	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	2	2008	WD	Normal	208500
1	2	20	RL	80.0	9600	Pave	NaN	Reg	Lvl	AllPub	...	NaN	NaN	NaN	5	2007	WD	Normal	181500
2	3	60	RL	68.0	11250	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	9	2008	WD	Normal	223500
3	4	70	RL	60.0	9550	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	2	2006	WD	Abnorml	140000
4	5	60	RL	84.0	14260	Pave	NaN	IR1	Lvl	AllPub	...	NaN	NaN	NaN	12	2008	WD	Normal	250000

	LotFrontage	LotArea	SaleCondition
0	80.0	11622	Normal
1	81.0	14267	Normal
2	74.0	13830	Normal
3	78.0	9978	Normal
4	43.0	5005	Normal