In [1]:
from __future__ import division
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import pysal as ps
import geopandas as gpd
from geopandas import GeoSeries, GeoDataFrame
from shapely.geometry import Point
from sklearn import neighbors
# --- Display configuration ---------------------------------------------------
# Seaborn "white" style, a (24, 10) figure size, and two-decimal float
# formatting for pandas output.
sns.set(style="white")
sns.set_context({"figure.figsize": (24, 10)})
pd.options.display.float_format = '{:.2f}'.format

# --- Input files (paths are relative to the notebook's working directory) ----
abb_link = './tfg/dbases/development3.csv'
zc_link = './tfg/mapas/barrios_area.shp'
muestra = pd.read_csv(abb_link)
barrios = gpd.read_file(zc_link)

# --- Georeference the sample -------------------------------------------------
# One shapely Point per row, built as (lon, lat), tagged with EPSG:4326.
geometry = [Point(xy) for xy in zip(muestra['lon'], muestra['lat'])]
crs = {'init': 'epsg:4326'}
geo_df = GeoDataFrame(muestra, crs=crs, geometry=geometry)

# Inner spatial join: only points that intersect a geometry in `barrios` are
# kept, and the matching polygon's attributes are attached to each row.
# NOTE(review): a point intersecting several polygons would presumably be
# repeated once per match -- verify the polygons do not overlap.
db = gpd.sjoin(geo_df, barrios, how="inner", op='intersects')

# --- Attach the metro distance table -----------------------------------------
# Left join: metro rows are indexed by 'InputID' and matched against db['id'].
metro = pd.read_csv('./tfg/dbases/distance_matrix_metro.csv')
db = db.join(metro.set_index('InputID'),
on='id', how='left')

# Convert index labels to str and give three columns descriptive names.
# NOTE(review): DESBDT presumably comes from the barrios shapefile and
# Distance / NUMPOINTS from the metro CSV -- TODO confirm.
db = db.rename(index=str, columns={"DESBDT": "subdistrict_f", "Distance": "metro_distance", "NUMPOINTS": "metro_number"})

# Re-wrap the joined result with the plain pandas DataFrame constructor
# (presumably to continue without GeoDataFrame-specific behaviour -- confirm).
db = pd.DataFrame(db)

# --- Clean the 'floor' column ------------------------------------------------
# Textual floor labels (ground floor, mezzanine, basements, negative floors)
# are replaced by 0. regex=True means each list entry is used as a regular
# expression pattern, not as an exact-match literal.
db['floor']=db['floor'].replace(['Ground floor', 'Mezzanine', 'Semi-basement', 'Basement', 'ground', 'Floor -2', 'Floor -1'], 0,regex=True)
# Disabled experiment, kept for reference only (it is not executed):
#db.replace(u'\xe', 'A')
# Convert the cleaned column to a numeric dtype so it can enter the regressions.
db['floor'] = pd.to_numeric(db['floor'])
In [2]:
# Columns used by the regression cells of this notebook. 'pricems' is the
# column that gets log-transformed into the dependent variable; it is dropped
# from the design matrix before fitting, so the remaining seven names are the
# regressors.
varis = [
    'pricems',
    'rooms',
    'floor',
    'needs_renovating',
    'garden',
    'terrace',
    'new_dev',
    'garage',
]
In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the model
# columns; the bare expression is the cell's displayed output.
db[varis].describe()
Out[3]:
pricems
rooms
floor
needs_renovating
garden
terrace
new_dev
garage
count
19177.00
19177.00
19177.00
19177.00
19177.00
19177.00
19177.00
19177.00
mean
3288.34
3.01
3.02
0.18
0.27
0.42
0.03
0.34
std
1755.78
1.28
2.65
0.38
0.45
0.49
0.17
0.47
min
468.85
1.00
0.00
0.00
0.00
0.00
0.00
0.00
25%
2000.00
2.00
1.00
0.00
0.00
0.00
0.00
0.00
50%
3000.00
3.00
2.00
0.00
0.00
0.00
0.00
0.00
75%
4109.59
4.00
4.00
0.00
1.00
1.00
0.00
1.00
max
23787.53
25.00
60.00
1.00
1.00
1.00
1.00
1.00
In [3]:
# Dependent variable of the pooled model: natural log of 'pricems'.
#
# The design matrix is built from a complete-case (dropna) selection of
# `varis` + 'pricems', so the same row filter is applied here. Taking the log
# over every row of `db` only lines up with that matrix while no model column
# has missing values; with any NaN present, y and X would have different
# lengths or y would carry NaN into the regression.
y = np.log(db.dropna(subset=varis + ['pricems'])['pricems'])
In [4]:
# Complete-case modelling frame: dependent-variable source column ('pricems')
# plus the regressors, with every row containing a missing value removed.
#
# `varis` already lists 'pricems', so appending it unconditionally selected the
# column twice and made `yxs['pricems']` return a two-column DataFrame instead
# of a Series. It is appended only when absent; dropping 'pricems' from this
# frame still yields exactly the same regressor columns as before.
model_cols = list(varis) if 'pricems' in varis else list(varis) + ['pricems']
yxs = db.loc[:, model_cols].dropna()
In [ ]:
Regresión no espacial sin cluster
In [5]:
# Pooled (all clusters together) non-spatial OLS of ln(pricems) on the
# regressors. The design matrix is every column of `yxs` except 'pricems';
# the CONSTANT term shown in the report is not one of these columns.
x_global = yxs.drop('pricems', axis=1)
m1 = ps.spreg.ols.OLS(
    y.values[:, None],                    # (n, 1) column vector
    x_global.values,
    name_x=x_global.columns.tolist(),
    name_y='ln(pricems)'
)
In [6]:
# Print the text report of the pooled model (fit statistics, coefficient
# table and regression diagnostics).
print(m1.summary)
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set : unknown
Weights matrix : None
Dependent Variable : ln(pricems) Number of Observations: 19177
Mean dependent var : 7.9693 Number of Variables : 8
S.D. dependent var : 0.5114 Degrees of Freedom : 19169
R-squared : 0.0936
Adjusted R-squared : 0.0932
Sum squared residual: 4545.057 F-statistic : 282.6218
Sigma-square : 0.237 Prob(F-statistic) : 0
S.E. of regression : 0.487 Log likelihood : -13406.695
Sigma-square ML : 0.237 Akaike info criterion : 26829.391
S.E of regression ML: 0.4868 Schwarz criterion : 26892.283
------------------------------------------------------------------------------------
Variable Coefficient Std.Error t-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 7.6957623 0.0097237 791.4406643 0.0000000
rooms 0.0566068 0.0028965 19.5434489 0.0000000
floor 0.0253068 0.0013540 18.6903222 0.0000000
needs_renovating -0.0232518 0.0096208 -2.4168162 0.0156662
garden -0.0299240 0.0089228 -3.3536435 0.0007991
terrace -0.0782592 0.0073291 -10.6778549 0.0000000
new_dev -0.2137107 0.0209672 -10.1926412 0.0000000
garage 0.2300741 0.0085285 26.9770397 0.0000000
------------------------------------------------------------------------------------
REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER 7.432
TEST ON NORMALITY OF ERRORS
TEST DF VALUE PROB
Jarque-Bera 2 19.330 0.0001
DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST DF VALUE PROB
Breusch-Pagan test 7 714.195 0.0000
Koenker-Bassett test 7 774.239 0.0000
================================ END OF REPORT =====================================
In [7]:
# Per-cluster containers, keyed by cluster id:
#   zona -> subset of the data belonging to the cluster
#   mreg -> fitted OLS model for the cluster
zona = {}
mreg = {}
In [8]:
# Fit and print one non-spatial OLS of ln(pricems) per cluster (db["cl"] in 0..7).
#
# Two defects of the earlier version are fixed here:
#   * y was computed from every row of the cluster while the design matrix came
#     from a dropna()-filtered frame, so any missing value would misalign y and
#     X. Both are now derived from the same complete-case frame `yxs`.
#   * `varis` already contains 'pricems', so `varis + ['pricems']` selected the
#     column twice. It is now appended only when absent.
# With no missing values the fitted models are identical to the earlier ones.
model_cols = list(varis) if 'pricems' in varis else list(varis) + ['pricems']

for clu in range(0, 8):
    # Rows assigned to this cluster; kept in `zona` for later inspection.
    zona[clu] = db[db["cl"] == clu]

    # Complete cases only; y and X are both taken from this one frame so their
    # rows always correspond.
    yxs = zona[clu].loc[:, model_cols].dropna()
    y = np.log(yxs['pricems'])
    regressors = yxs.drop('pricems', axis=1)

    # name_ds is kept as 'zona ' + str([clu]) (renders as e.g. "zona [0]") so
    # the report headers stay identical to the recorded output.
    mreg[clu] = ps.spreg.ols.OLS(y.values[:, None], regressors.values,
                                 name_x=regressors.columns.tolist(),
                                 name_y='ln(pricems)',
                                 name_ds='zona ' + str([clu]))
    print(mreg[clu].summary)
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set : zona [0]
Weights matrix : None
Dependent Variable : ln(pricems) Number of Observations: 1927
Mean dependent var : 7.9418 Number of Variables : 8
S.D. dependent var : 0.3581 Degrees of Freedom : 1919
R-squared : 0.1440
Adjusted R-squared : 0.1409
Sum squared residual: 211.443 F-statistic : 46.1128
Sigma-square : 0.110 Prob(F-statistic) : 1.129e-60
S.E. of regression : 0.332 Log likelihood : -605.186
Sigma-square ML : 0.110 Akaike info criterion : 1226.372
S.E of regression ML: 0.3312 Schwarz criterion : 1270.882
------------------------------------------------------------------------------------
Variable Coefficient Std.Error t-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 7.9405081 0.0218724 363.0373411 0.0000000
rooms -0.0439766 0.0074622 -5.8932121 0.0000000
floor 0.0227006 0.0028456 7.9775498 0.0000000
needs_renovating -0.0819731 0.0204012 -4.0180594 0.0000609
garden 0.1285873 0.0190909 6.7355318 0.0000000
terrace 0.0053045 0.0159785 0.3319785 0.7399417
new_dev 0.0026693 0.0577180 0.0462469 0.9631183
garage 0.1713121 0.0199865 8.5714097 0.0000000
------------------------------------------------------------------------------------
REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER 7.818
TEST ON NORMALITY OF ERRORS
TEST DF VALUE PROB
Jarque-Bera 2 109.780 0.0000
DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST DF VALUE PROB
Breusch-Pagan test 7 152.885 0.0000
Koenker-Bassett test 7 138.405 0.0000
================================ END OF REPORT =====================================
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set : zona [1]
Weights matrix : None
Dependent Variable : ln(pricems) Number of Observations: 3762
Mean dependent var : 8.2282 Number of Variables : 8
S.D. dependent var : 0.2850 Degrees of Freedom : 3754
R-squared : 0.0925
Adjusted R-squared : 0.0909
Sum squared residual: 277.291 F-statistic : 54.6945
Sigma-square : 0.074 Prob(F-statistic) : 8.298e-75
S.E. of regression : 0.272 Log likelihood : -433.076
Sigma-square ML : 0.074 Akaike info criterion : 882.153
S.E of regression ML: 0.2715 Schwarz criterion : 932.015
------------------------------------------------------------------------------------
Variable Coefficient Std.Error t-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 8.2097272 0.0119656 686.1113164 0.0000000
rooms -0.0151536 0.0032845 -4.6136124 0.0000041
floor 0.0136679 0.0015127 9.0354079 0.0000000
needs_renovating -0.0925550 0.0115126 -8.0394516 0.0000000
garden -0.0056561 0.0105446 -0.5363964 0.5917164
terrace -0.0202202 0.0094246 -2.1454744 0.0319789
new_dev 0.1776062 0.0323922 5.4829842 0.0000000
garage 0.1108651 0.0104008 10.6592961 0.0000000
------------------------------------------------------------------------------------
REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER 7.432
TEST ON NORMALITY OF ERRORS
TEST DF VALUE PROB
Jarque-Bera 2 236.755 0.0000
DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST DF VALUE PROB
Breusch-Pagan test 7 18.522 0.0098
Koenker-Bassett test 7 11.634 0.1133
================================ END OF REPORT =====================================
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set : zona [2]
Weights matrix : None
Dependent Variable : ln(pricems) Number of Observations: 2730
Mean dependent var : 7.5247 Number of Variables : 8
S.D. dependent var : 0.3304 Degrees of Freedom : 2722
R-squared : 0.3166
Adjusted R-squared : 0.3149
Sum squared residual: 203.538 F-statistic : 180.1533
Sigma-square : 0.075 Prob(F-statistic) : 1.13e-219
S.E. of regression : 0.273 Log likelihood : -329.882
Sigma-square ML : 0.075 Akaike info criterion : 675.764
S.E of regression ML: 0.2730 Schwarz criterion : 723.061
------------------------------------------------------------------------------------
Variable Coefficient Std.Error t-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 7.5939377 0.0184533 411.5211696 0.0000000
rooms -0.0744436 0.0063689 -11.6885149 0.0000000
floor 0.0157955 0.0024600 6.4209155 0.0000000
needs_renovating -0.2091500 0.0176097 -11.8769597 0.0000000
garden 0.1223859 0.0134157 9.1225995 0.0000000
terrace -0.0147373 0.0108665 -1.3562066 0.1751459
new_dev -0.1222995 0.0226672 -5.3954381 0.0000001
garage 0.1982731 0.0133659 14.8341972 0.0000000
------------------------------------------------------------------------------------
REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER 9.508
TEST ON NORMALITY OF ERRORS
TEST DF VALUE PROB
Jarque-Bera 2 68.223 0.0000
DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST DF VALUE PROB
Breusch-Pagan test 7 134.525 0.0000
Koenker-Bassett test 7 97.178 0.0000
================================ END OF REPORT =====================================
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set : zona [3]
Weights matrix : None
Dependent Variable : ln(pricems) Number of Observations: 1963
Mean dependent var : 7.6186 Number of Variables : 8
S.D. dependent var : 0.3286 Degrees of Freedom : 1955
R-squared : 0.3029
Adjusted R-squared : 0.3005
Sum squared residual: 147.671 F-statistic : 121.3803
Sigma-square : 0.076 Prob(F-statistic) : 2.847e-148
S.E. of regression : 0.275 Log likelihood : -245.997
Sigma-square ML : 0.075 Akaike info criterion : 507.994
S.E of regression ML: 0.2743 Schwarz criterion : 552.652
------------------------------------------------------------------------------------
Variable Coefficient Std.Error t-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 7.6332504 0.0199630 382.3701255 0.0000000
rooms -0.0456407 0.0068365 -6.6760402 0.0000000
floor 0.0123698 0.0027239 4.5412727 0.0000059
needs_renovating -0.1848885 0.0179786 -10.2838070 0.0000000
garden 0.1702656 0.0166229 10.2428437 0.0000000
terrace 0.0133446 0.0128581 1.0378334 0.2994760
new_dev -0.1172667 0.0884838 -1.3252907 0.1852296
garage 0.2514333 0.0170547 14.7427377 0.0000000
------------------------------------------------------------------------------------
REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER 8.505
TEST ON NORMALITY OF ERRORS
TEST DF VALUE PROB
Jarque-Bera 2 91.663 0.0000
DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST DF VALUE PROB
Breusch-Pagan test 7 62.596 0.0000
Koenker-Bassett test 7 42.234 0.0000
================================ END OF REPORT =====================================
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set : zona [4]
Weights matrix : None
Dependent Variable : ln(pricems) Number of Observations: 1935
Mean dependent var : 7.3348 Number of Variables : 8
S.D. dependent var : 0.3230 Degrees of Freedom : 1927
R-squared : 0.2929
Adjusted R-squared : 0.2903
Sum squared residual: 142.726 F-statistic : 114.0057
Sigma-square : 0.074 Prob(F-statistic) : 4.085e-140
S.E. of regression : 0.272 Log likelihood : -223.433
Sigma-square ML : 0.074 Akaike info criterion : 462.867
S.E of regression ML: 0.2716 Schwarz criterion : 507.410
------------------------------------------------------------------------------------
Variable Coefficient Std.Error t-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 7.4005908 0.0220058 336.3017216 0.0000000
rooms -0.0698449 0.0073275 -9.5318928 0.0000000
floor 0.0081365 0.0027482 2.9606876 0.0031070
needs_renovating -0.1782097 0.0212691 -8.3788028 0.0000000
garden 0.1774203 0.0168747 10.5139911 0.0000000
terrace 0.0320155 0.0126456 2.5317514 0.0114283
new_dev -0.0862337 0.0301832 -2.8570062 0.0043225
garage 0.2268694 0.0162452 13.9652814 0.0000000
------------------------------------------------------------------------------------
REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER 9.604
TEST ON NORMALITY OF ERRORS
TEST DF VALUE PROB
Jarque-Bera 2 23.625 0.0000
DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST DF VALUE PROB
Breusch-Pagan test 7 88.259 0.0000
Koenker-Bassett test 7 77.838 0.0000
================================ END OF REPORT =====================================
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set : zona [5]
Weights matrix : None
Dependent Variable : ln(pricems) Number of Observations: 2426
Mean dependent var : 7.9228 Number of Variables : 8
S.D. dependent var : 0.2946 Degrees of Freedom : 2418
R-squared : 0.2992
Adjusted R-squared : 0.2971
Sum squared residual: 147.507 F-statistic : 147.4442
Sigma-square : 0.061 Prob(F-statistic) : 1.724e-181
S.E. of regression : 0.247 Log likelihood : -45.792
Sigma-square ML : 0.061 Akaike info criterion : 107.584
S.E of regression ML: 0.2466 Schwarz criterion : 153.936
------------------------------------------------------------------------------------
Variable Coefficient Std.Error t-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 7.8437717 0.0142562 550.2005941 0.0000000
rooms -0.0261764 0.0050592 -5.1739850 0.0000002
floor 0.0196845 0.0018708 10.5217408 0.0000000
needs_renovating -0.1886605 0.0158824 -11.8785861 0.0000000
garden 0.1252929 0.0116687 10.7375523 0.0000000
terrace -0.0299305 0.0107729 -2.7783121 0.0055064
new_dev -0.0826065 0.0212817 -3.8815704 0.0001066
garage 0.1676376 0.0117594 14.2556526 0.0000000
------------------------------------------------------------------------------------
REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER 8.282
TEST ON NORMALITY OF ERRORS
TEST DF VALUE PROB
Jarque-Bera 2 113.280 0.0000
DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST DF VALUE PROB
Breusch-Pagan test 7 80.973 0.0000
Koenker-Bassett test 7 55.592 0.0000
================================ END OF REPORT =====================================
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set : zona [6]
Weights matrix : None
Dependent Variable : ln(pricems) Number of Observations: 2441
Mean dependent var : 8.3833 Number of Variables : 8
S.D. dependent var : 0.3048 Degrees of Freedom : 2433
R-squared : 0.1427
Adjusted R-squared : 0.1402
Sum squared residual: 194.354 F-statistic : 57.8427
Sigma-square : 0.080 Prob(F-statistic) : 5.69e-77
S.E. of regression : 0.283 Log likelihood : -375.173
Sigma-square ML : 0.080 Akaike info criterion : 766.347
S.E of regression ML: 0.2822 Schwarz criterion : 812.748
------------------------------------------------------------------------------------
Variable Coefficient Std.Error t-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 8.3596846 0.0150597 555.1042118 0.0000000
rooms -0.0179375 0.0038939 -4.6065923 0.0000043
floor 0.0288913 0.0021639 13.3512615 0.0000000
needs_renovating -0.1112053 0.0134319 -8.2791765 0.0000000
garden -0.0587619 0.0234966 -2.5008669 0.0124542
terrace -0.0034244 0.0128854 -0.2657576 0.7904484
new_dev 0.2275301 0.0712863 3.1917775 0.0014320
garage 0.1183715 0.0152672 7.7533183 0.0000000
------------------------------------------------------------------------------------
REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER 6.730
TEST ON NORMALITY OF ERRORS
TEST DF VALUE PROB
Jarque-Bera 2 145.527 0.0000
DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST DF VALUE PROB
Breusch-Pagan test 7 68.252 0.0000
Koenker-Bassett test 7 43.261 0.0000
================================ END OF REPORT =====================================
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set : zona [7]
Weights matrix : None
Dependent Variable : ln(pricems) Number of Observations: 1993
Mean dependent var : 8.6270 Number of Variables : 8
S.D. dependent var : 0.3274 Degrees of Freedom : 1985
R-squared : 0.0859
Adjusted R-squared : 0.0827
Sum squared residual: 195.163 F-statistic : 26.6593
Sigma-square : 0.098 Prob(F-statistic) : 3.886e-35
S.E. of regression : 0.314 Log likelihood : -512.516
Sigma-square ML : 0.098 Akaike info criterion : 1041.031
S.E of regression ML: 0.3129 Schwarz criterion : 1085.811
------------------------------------------------------------------------------------
Variable Coefficient Std.Error t-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 8.6416231 0.0214403 403.0558477 0.0000000
rooms -0.0145895 0.0050758 -2.8743334 0.0040919
floor 0.0129099 0.0026868 4.8049678 0.0000017
needs_renovating -0.1337112 0.0164456 -8.1304973 0.0000000
garden -0.0063801 0.0229057 -0.2785380 0.7806284
terrace -0.0119363 0.0149175 -0.8001508 0.4237192
new_dev 0.1873795 0.0911719 2.0552322 0.0399872
garage 0.1081096 0.0154383 7.0027039 0.0000000
------------------------------------------------------------------------------------
REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER 8.256
TEST ON NORMALITY OF ERRORS
TEST DF VALUE PROB
Jarque-Bera 2 613.745 0.0000
DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST DF VALUE PROB
Breusch-Pagan test 7 34.376 0.0000
Koenker-Bassett test 7 14.662 0.0406
================================ END OF REPORT =====================================
In [ ]:
In [ ]:
In [ ]:
Content source: cmmarti/housing-madrid
Similar notebooks: