In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

Reading the full CSV along with the pre-split training and test CSV files


In [268]:
# Column dtypes applied when parsing each CSV, grouped by target type.
dtype_dict = {
    # identifiers and other columns kept as text
    # NOTE(review): 'floors' is read as text although its printed values look
    # numeric (e.g. 1.5) -- confirm this is intended.
    'id': str,
    'date': str,
    'zipcode': str,
    'floors': str,
    # floating-point columns
    'price': float,
    'bedrooms': float,
    'bathrooms': float,
    'sqft_living': float,
    'sqft_living15': float,
    'sqft_lot15': float,
    'lat': float,
    'long': float,
    # integer columns
    'sqft_lot': int,
    'sqft_above': int,
    'sqft_basement': int,
    'waterfront': int,
    'view': int,
    'condition': int,
    'grade': int,
    'yr_built': int,
    'yr_renovated': int,
}

# Full dataset, followed by the separate training and test files.
sales = pd.read_csv('kc_house_data.csv', dtype=dtype_dict)
train_data = pd.read_csv('kc_house_train_data.csv', dtype=dtype_dict)
test_data = pd.read_csv('kc_house_test_data.csv', dtype=dtype_dict)

In [269]:
print(train_data)


               id             date    price  bedrooms  bathrooms  sqft_living  \
0      7129300520  20141013T000000   221900         3       1.00         1180   
1      6414100192  20141209T000000   538000         3       2.25         2570   
2      5631500400  20150225T000000   180000         2       1.00          770   
3      2487200875  20141209T000000   604000         4       3.00         1960   
4      1954400510  20150218T000000   510000         3       2.00         1680   
5      7237550310  20140512T000000  1225000         4       4.50         5420   
6      1321400060  20140627T000000   257500         3       2.25         1715   
7      2008000270  20150115T000000   291850         3       1.50         1060   
8      2414600126  20150415T000000   229500         3       1.00         1780   
9      3793500160  20150312T000000   323000         3       2.50         1890   
10     1736800520  20150403T000000   662500         3       2.50         3560   
11     9212900260  20140527T000000   468000         2       1.00         1160   
12     6054650070  20141007T000000   400000         3       1.75         1370   
13     1175000570  20150312T000000   530000         5       2.00         1810   
14     1875500060  20140731T000000   395000         3       2.00         1890   
15     6865200140  20140529T000000   485000         4       1.00         1600   
16     0016000397  20141205T000000   189000         2       1.00         1200   
17     7983200060  20150424T000000   230000         3       1.00         1250   
18     6300500875  20140514T000000   385000         4       1.75         1620   
19     2524049179  20140826T000000  2000000         3       2.75         3050   
20     7137970340  20140703T000000   285000         5       2.50         2270   
21     8091400200  20140516T000000   252700         2       1.50         1070   
22     3814700200  20141120T000000   329000         3       2.25         2450   
23     1794500383  20140626T000000   937000         3       1.75         2450   
24     3303700376  20141201T000000   667000         3       1.00         1400   
25     5101402488  20140624T000000   438000         3       1.75         1520   
26     1873100390  20150302T000000   719000         4       2.50         2570   
27     2426039314  20141201T000000   280000         2       1.50         1190   
28     0461000390  20140624T000000   687500         4       1.75         2330   
29     7955080270  20141203T000000   322500         4       2.75         2060   
...           ...              ...      ...       ...        ...          ...   
17354  1931300090  20140507T000000   610950         3       3.00         1680   
17355  9253900271  20150107T000000  3567000         5       4.50         4850   
17356  0567000385  20140623T000000   362500         2       1.50          940   
17357  7011201004  20140529T000000   645000         3       3.25         1730   
17358  7853420110  20141003T000000   594866         3       3.00         2780   
17359  7853420110  20150504T000000   625000         3       3.00         2780   
17360  0952006823  20141202T000000   380000         3       2.50         1260   
17361  3832050760  20140828T000000   270000         3       2.50         1870   
17362  2767604724  20141015T000000   505000         2       2.50         1430   
17363  6632300207  20150305T000000   385000         3       2.50         1520   
17364  2767600688  20141113T000000   414500         2       1.50         1210   
17365  7570050450  20140910T000000   347500         3       2.50         2540   
17366  7430200100  20140514T000000  1222500         4       3.50         4910   
17367  4140940150  20141002T000000   572000         4       2.75         2770   
17368  1931300412  20150416T000000   475000         3       2.25         1190   
17369  1972201967  20141031T000000   520000         2       2.25         1530   
17370  7502800100  20140813T000000   679950         5       2.75         3600   
17371  0191100405  20150421T000000  1575000         4       3.25         3410   
17372  8956200760  20141013T000000   541800         4       2.50         3118   
17373  7202300110  20140915T000000   810000         4       3.00         3990   
17374  0249000205  20141015T000000  1537000         5       3.75         4470   
17375  5100403806  20150407T000000   467000         3       2.50         1425   
17376  0844000965  20140626T000000   224000         3       1.75         1500   
17377  7852140040  20140825T000000   507250         3       2.50         2270   
17378  9834201367  20150126T000000   429000         3       2.00         1490   
17379  7936000429  20150326T000000  1007500         4       3.50         3510   
17380  2997800021  20150219T000000   475000         3       2.50         1310   
17381  0263000018  20140521T000000   360000         3       2.50         1530   
17382  0291310100  20150116T000000   400000         3       2.50         1600   
17383  1523300157  20141015T000000   325000         2       0.75         1020   

       sqft_lot floors  waterfront  view     ...      grade  sqft_above  \
0          5650      1           0     0     ...          7        1180   
1          7242      2           0     0     ...          7        2170   
2         10000      1           0     0     ...          6         770   
3          5000      1           0     0     ...          7        1050   
4          8080      1           0     0     ...          8        1680   
5        101930      1           0     0     ...         11        3890   
6          6819      2           0     0     ...          7        1715   
7          9711      1           0     0     ...          7        1060   
8          7470      1           0     0     ...          7        1050   
9          6560      2           0     0     ...          7        1890   
10         9796      1           0     0     ...          8        1860   
11         6000      1           0     0     ...          7         860   
12         9680      1           0     0     ...          7        1370   
13         4850    1.5           0     0     ...          7        1810   
14        14040      2           0     0     ...          7        1890   
15         4300    1.5           0     0     ...          7        1600   
16         9850      1           0     0     ...          7        1200   
17         9774      1           0     0     ...          7        1250   
18         4980      1           0     0     ...          7         860   
19        44867      1           0     4     ...          9        2330   
20         6300      2           0     0     ...          8        2270   
21         9643      1           0     0     ...          7        1070   
22         6500      2           0     0     ...          8        2450   
23         2691      2           0     0     ...          8        1750   
24         1581    1.5           0     0     ...          8        1400   
25         6380      1           0     0     ...          7         790   
26         7173      2           0     0     ...          8        2570   
27         1265      3           0     0     ...          7        1190   
28         5000    1.5           0     0     ...          7        1510   
29         6659      1           0     0     ...          7        1280   
...         ...    ...         ...   ...     ...        ...         ...   
17354      1570      3           0     0     ...          8        1680   
17355     10584      2           1     4     ...         10        3540   
17356      1768      2           0     0     ...          7         940   
17357      1229      2           0     2     ...          9        1320   
17358      6000      2           0     0     ...          9        2780   
17359      6000      2           0     0     ...          9        2780   
17360       900      2           0     0     ...          7         940   
17361      5000      2           0     0     ...          7        1870   
17362      1201      3           0     0     ...          8        1430   
17363      1488      3           0     0     ...          8        1520   
17364      1278      2           0     0     ...          8        1020   
17365      4760      2           0     0     ...          8        2540   
17366      9444    1.5           0     0     ...         11        3110   
17367      3852      2           0     0     ...          8        2770   
17368      1200      3           0     0     ...          8        1190   
17369       981      3           0     0     ...          8        1480   
17370      9437      2           0     0     ...          9        3600   
17371     10125      2           0     0     ...         10        3410   
17372      7866      2           0     2     ...          9        3118   
17373      7838      2           0     0     ...          9        3990   
17374      8088      2           0     0     ...         11        4470   
17375      1179      3           0     0     ...          8        1425   
17376     11968      1           0     0     ...          6        1500   
17377      5536      2           0     0     ...          8        2270   
17378      1126      3           0     0     ...          8        1490   
17379      7200      2           0     0     ...          9        2600   
17380      1294      2           0     0     ...          8        1180   
17381      1131      3           0     0     ...          8        1530   
17382      2388      2           0     0     ...          8        1600   
17383      1076      2           0     0     ...          7        1020   

       sqft_basement  yr_built  yr_renovated  zipcode      lat     long  \
0                  0      1955             0    98178  47.5112 -122.257   
1                400      1951          1991    98125  47.7210 -122.319   
2                  0      1933             0    98028  47.7379 -122.233   
3                910      1965             0    98136  47.5208 -122.393   
4                  0      1987             0    98074  47.6168 -122.045   
5               1530      2001             0    98053  47.6561 -122.005   
6                  0      1995             0    98003  47.3097 -122.327   
7                  0      1963             0    98198  47.4095 -122.315   
8                730      1960             0    98146  47.5123 -122.337   
9                  0      2003             0    98038  47.3684 -122.031   
10              1700      1965             0    98007  47.6007 -122.145   
11               300      1942             0    98115  47.6900 -122.292   
12                 0      1977             0    98074  47.6127 -122.045   
13                 0      1900             0    98107  47.6700 -122.394   
14                 0      1994             0    98019  47.7277 -121.962   
15                 0      1916             0    98103  47.6648 -122.343   
16                 0      1921             0    98002  47.3089 -122.210   
17                 0      1969             0    98003  47.3343 -122.306   
18               760      1947             0    98133  47.7025 -122.341   
19               720      1968             0    98040  47.5316 -122.233   
20                 0      1995             0    98092  47.3266 -122.169   
21                 0      1985             0    98030  47.3533 -122.166   
22                 0      1985             0    98030  47.3739 -122.172   
23               700      1915             0    98119  47.6386 -122.360   
24                 0      1909             0    98112  47.6221 -122.314   
25               730      1948             0    98115  47.6950 -122.304   
26                 0      2005             0    98052  47.7073 -122.110   
27                 0      2005             0    98133  47.7274 -122.357   
28               820      1929             0    98117  47.6823 -122.368   
29               780      1981             0    98058  47.4276 -122.157   
...              ...       ...           ...      ...      ...      ...   
17354              0      2014             0    98103  47.6572 -122.346   
17355           1310      2007             0    98008  47.5943 -122.110   
17356              0      2009             0    98144  47.5925 -122.295   
17357            410      2008             0    98119  47.6374 -122.369   
17358              0      2013             0    98065  47.5184 -121.886   
17359              0      2013             0    98065  47.5184 -121.886   
17360            320      2007             0    98116  47.5621 -122.384   
17361              0      2009             0    98042  47.3339 -122.055   
17362              0      2009             0    98107  47.6707 -122.381   
17363              0      2006             0    98125  47.7337 -122.309   
17364            190      2007             0    98117  47.6756 -122.375   
17365              0      2010             0    98038  47.3452 -122.022   
17366           1800      2007             0    98074  47.6502 -122.066   
17367              0      2014             0    98178  47.5001 -122.232   
17368              0      2008             0    98103  47.6542 -122.346   
17369             50      2006             0    98103  47.6533 -122.346   
17370              0      2014             0    98059  47.4822 -122.131   
17371              0      2007             0    98040  47.5653 -122.223   
17372              0      2014             0    98001  47.2931 -122.264   
17373              0      2003             0    98053  47.6857 -122.046   
17374              0      2008             0    98004  47.6321 -122.200   
17375              0      2008             0    98125  47.6963 -122.318   
17376              0      2014             0    98010  47.3095 -122.002   
17377              0      2003             0    98065  47.5389 -121.881   
17378              0      2014             0    98144  47.5699 -122.288   
17379            910      2009             0    98136  47.5537 -122.398   
17380            130      2008             0    98116  47.5773 -122.409   
17381              0      2009             0    98103  47.6993 -122.346   
17382              0      2004             0    98027  47.5345 -122.069   
17383              0      2008             0    98144  47.5941 -122.299   

       sqft_living15  sqft_lot15  
0               1340        5650  
1               1690        7639  
2               2720        8062  
3               1360        5000  
4               1800        7503  
5               4760      101930  
6               2238        6819  
7               1650        9711  
8               1780        8113  
9               2390        7570  
10              2210        8925  
11              1330        6000  
12              1370       10208  
13              1360        4850  
14              1890       14018  
15              1610        4300  
16              1060        5095  
17              1280        8850  
18              1400        4980  
19              4110       20336  
20              2240        7005  
21              1220        8386  
22              2200        6865  
23              1760        3573  
24              1860        3861  
25              1520        6235  
26              2630        6026  
27              1390        1756  
28              1460        5000  
29              2020        8720  
...              ...         ...  
17354           1640        4800  
17355           3470       18270  
17356           1130        1159  
17357           1710        1686  
17358           2850        6000  
17359           2850        6000  
17360           1310        1415  
17361           2170        5399  
17362           1430        1249  
17363           1520        1497  
17364           1210        1118  
17365           2540        4571  
17366           4560       11063  
17367           1810        5641  
17368           1180        1224  
17369           1530        1282  
17370           3550        9421  
17371           2290       10125  
17372           2673        6500  
17373           3370        6814  
17374           2780        8964  
17375           1285        1253  
17376           1320       11303  
17377           2270        5731  
17378           1400        1230  
17379           2050        6200  
17380           1330        1265  
17381           1530        1509  
17382           1410        1287  
17383           1020        1357  

[17384 rows x 21 columns]

In [270]:
train_data['price'].mean()


Out[270]:
539366.6279337321

In [5]:
train_data['sqft_living'].mean()


Out[5]:
2080.0295098941556

In [6]:
train_data['sqft_living'].var()


Out[6]:
849403.4935202107

Learning multiple regression model


In [8]:
from sklearn import linear_model

In [53]:
# The estimator does not fit an intercept itself; the notebook instead feeds
# it a design matrix X whose first column is a constant 1.
regr = linear_model.LinearRegression(fit_intercept=False)

In [169]:
# Design matrix: column 0 is a constant 1 (acts as the intercept term),
# columns 1-3 hold sqft_living, bedrooms and bathrooms in that order.
X = np.empty((len(train_data), 4))
X[:, 0] = 1.0
X[:, 1:] = train_data[['sqft_living', 'bedrooms', 'bathrooms']].values
# Regression target.
y = train_data['price']

In [131]:
X


Out[131]:
array([[  1.00000000e+00,   1.18000000e+03,   3.00000000e+00,
          1.00000000e+00],
       [  1.00000000e+00,   2.57000000e+03,   3.00000000e+00,
          2.25000000e+00],
       [  1.00000000e+00,   7.70000000e+02,   2.00000000e+00,
          1.00000000e+00],
       ..., 
       [  1.00000000e+00,   1.53000000e+03,   3.00000000e+00,
          2.50000000e+00],
       [  1.00000000e+00,   1.60000000e+03,   3.00000000e+00,
          2.50000000e+00],
       [  1.00000000e+00,   1.02000000e+03,   2.00000000e+00,
          7.50000000e-01]])

In [170]:
regr.fit(X, y)


Out[170]:
LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [171]:
regr.coef_


Out[171]:
array([ 87912.86581493,    315.40669062, -65081.88711588,   6942.16598637])

Adding new features to the dataset


In [279]:
def add_new_features(dataset):
    """Add four derived feature columns to ``dataset`` in place.

    Columns written (existing columns of the same name are overwritten):

    * ``bedrooms_squared`` -- ``bedrooms`` multiplied by itself
    * ``bed_bath_rooms``   -- ``bedrooms`` multiplied by ``bathrooms``
    * ``log_sqft_living``  -- natural logarithm of ``sqft_living``
    * ``lat_plus_long``    -- ``lat`` added to ``long``

    Parameters
    ----------
    dataset : mapping of column name -> column values (used here with a
        pandas DataFrame) providing ``bedrooms``, ``bathrooms``,
        ``sqft_living``, ``lat`` and ``long``.

    Returns
    -------
    None
        The frame passed in is modified directly.
    """
    beds = dataset['bedrooms']
    dataset['bedrooms_squared'] = beds * beds
    dataset['bed_bath_rooms'] = beds * dataset['bathrooms']

    living_area = dataset['sqft_living']
    dataset['log_sqft_living'] = np.log(living_area)

    latitude = dataset['lat']
    dataset['lat_plus_long'] = latitude + dataset['long']

In [280]:
add_new_features(train_data)
add_new_features(test_data)

In [281]:
train_data.columns.values


Out[281]:
array(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
       'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15',
       'bedrooms_squared', 'bed_bath_rooms', 'log_sqft_living',
       'lat_plus_long'], dtype=object)

Computing mean for newly added features


In [282]:
np.mean(test_data['bedrooms_squared'])


Out[282]:
12.4466777015843

In [283]:
np.mean(train_data['bedrooms_squared'])


Out[283]:
12.174240681086056

In [284]:
np.mean(test_data['bed_bath_rooms'])


Out[284]:
7.5039016315913925

In [285]:
np.mean(test_data['log_sqft_living'])


Out[285]:
7.550274679645921

In [286]:
np.mean(test_data['lat_plus_long'])


Out[286]:
-74.65333355403185

Evaluating the models with new features


In [287]:
# Nested feature lists: every set contains the previous one plus extras.
feature_set1 = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']

# Set 2 = set 1 + the bedrooms*bathrooms product column.
feature_set2 = list(feature_set1)
feature_set2.append('bed_bath_rooms')

# Set 3 = set 2 + the remaining derived columns.
feature_set3 = list(feature_set2)
feature_set3.extend(['bedrooms_squared', 'log_sqft_living', 'lat_plus_long'])

In [288]:
def train_linear_regression(dataset, feature_set):
    """Fit a linear regression of ``price`` on the columns in ``feature_set``.

    The design matrix is given an explicit leading column of ones and the
    estimator is created with ``fit_intercept=False``, so the fitted
    ``coef_`` has ``len(feature_set) + 1`` entries: ``coef_[0]`` is the
    weight of the constant column and ``coef_[i + 1]`` is the weight of
    ``feature_set[i]``.

    Parameters
    ----------
    dataset : table with a ``price`` column and every column named in
        ``feature_set`` (used here with a pandas DataFrame).
    feature_set : list of column names to use as regressors.

    Returns
    -------
    The fitted ``linear_model.LinearRegression`` instance.
    """
    target = dataset['price']

    # Column 0 stays at 1.0; the remaining columns receive the features.
    design = np.empty((len(dataset), len(feature_set) + 1))
    design[:, 0] = 1.0
    design[:, 1:] = dataset[feature_set]

    model = linear_model.LinearRegression(fit_intercept=False)
    model.fit(design, target)
    return model

In [289]:
model1 = train_linear_regression(train_data, feature_set1)

In [290]:
model2 = train_linear_regression(train_data, feature_set2)

In [291]:
model3 = train_linear_regression(train_data, feature_set3)

What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 1?


In [292]:
# Look the weight up by feature name instead of a hard-coded position.
# The +1 skips coef_[0], which belongs to the constant column of ones that
# train_linear_regression puts in front of the features.
model1.coef_[1 + feature_set1.index('bathrooms')]


Out[292]:
15706.742082734683

What is the sign (positive or negative) for the coefficient/weight for ‘bathrooms’ in Model 2?


In [293]:
# Look the weight up by feature name instead of a hard-coded position.
# The +1 skips coef_[0], which belongs to the constant column of ones that
# train_linear_regression puts in front of the features.
model2.coef_[1 + feature_set2.index('bathrooms')]


Out[293]:
-71461.308292758564

Estimating RSS for Linear Regression Models


In [294]:
def predict(model, dataset, feature_set):
    """Return ``model``'s predictions for the rows of ``dataset``.

    Builds a float matrix with ``len(feature_set) + 1`` columns -- a
    constant 1 in column 0 followed by the ``feature_set`` columns of
    ``dataset`` -- and passes it to ``model.predict``.

    Parameters
    ----------
    model : object exposing ``predict(matrix)``.
    dataset : table providing every column named in ``feature_set``
        (used here with a pandas DataFrame).
    feature_set : list of column names, in the order the model expects.

    Returns
    -------
    Whatever ``model.predict`` returns for the constructed matrix.
    """
    n_columns = 1 + len(feature_set)
    design = np.empty((len(dataset), n_columns))
    design[:, 0] = 1.0
    design[:, 1:] = dataset[feature_set]
    predictions = model.predict(design)
    return predictions

In [295]:
def compute_rss(model, dataset, feature_set):
    """Residual sum of squares of ``model`` on ``dataset``.

    Predictions come from ``predict(model, dataset, feature_set)``; they are
    subtracted from the ``price`` column and the resulting residual vector
    is dotted with itself.

    Parameters
    ----------
    model : fitted model accepted by ``predict``.
    dataset : table with a ``price`` column and the ``feature_set`` columns.
    feature_set : list of column names the model was trained on.

    Returns
    -------
    The sum of squared differences between ``price`` and the predictions.
    """
    fitted = predict(model, dataset, feature_set)
    residuals = dataset['price'] - fitted
    return residuals.T.dot(residuals)

For train data


In [300]:
# RSS of each model on the training data, ordered model1, model2, model3.
train_rss = [
    compute_rss(fitted_model, train_data, features)
    for fitted_model, features in zip(
        (model1, model2, model3),
        (feature_set1, feature_set2, feature_set3),
    )
]
train_rss


Out[300]:
[967879963049545.38, 958419635074069.5, 903436455050478.25]

In [304]:
train_rss.index(min(train_rss))


Out[304]:
2

For test data


In [301]:
# RSS of each model on the test data, ordered model1, model2, model3.
test_rss = [
    compute_rss(fitted_model, test_data, features)
    for fitted_model, features in zip(
        (model1, model2, model3),
        (feature_set1, feature_set2, feature_set3),
    )
]
test_rss


Out[301]:
[225500469795490.34, 223377462976467.63, 259236319207171.28]

In [299]:
test_rss.index(min(test_rss))


Out[299]:
1

In [303]:
np.array(test_rss) - np.array(train_rss)


Out[303]:
array([ -7.42379493e+14,  -7.35042172e+14,  -6.44200136e+14])

In [ ]: