In [1]:
from __future__ import division
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import pysal as ps
import geopandas as gpd
from geopandas import GeoSeries, GeoDataFrame
from shapely.geometry import Point
from sklearn import neighbors

# --- Plot and display defaults ----------------------------------------------
sns.set(style="white")
sns.set_context({"figure.figsize": (24, 10)})
pd.options.display.float_format = '{:.2f}'.format

# --- Input locations ---------------------------------------------------------
abb_link = './tfg/dbases/development3.csv'
zc_link = './tfg/mapas/barrios_area.shp'

# --- Load the listings table and the shapefile layer -------------------------
muestra = pd.read_csv(abb_link)
barrios = gpd.read_file(zc_link)

# Build one point geometry per row from its (lon, lat) pair, tagged EPSG:4326.
crs = {'init': 'epsg:4326'}
geometry = [Point(lon_lat) for lon_lat in zip(muestra['lon'], muestra['lat'])]
geo_df = GeoDataFrame(muestra, crs=crs, geometry=geometry)

# Inner spatial join: only points that intersect a geometry in `barrios`
# are kept, and they pick up that geometry's attribute columns.
db = gpd.sjoin(geo_df, barrios, how="inner", op='intersects')

# Left-join the metro table, matching db['id'] against the InputID column.
metro = pd.read_csv('./tfg/dbases/distance_matrix_metro.csv')
metro_by_input_id = metro.set_index('InputID')
db = db.join(metro_by_input_id, on='id', how='left')

# Give the joined columns readable names; index labels are converted with str.
column_renames = {
    "DESBDT": "subdistrict_f",
    "Distance": "metro_distance",
    "NUMPOINTS": "metro_number",
}
db = db.rename(index=str, columns=column_renames)

# Pass the join result through the plain pandas DataFrame constructor.
db = pd.DataFrame(db)

# Textual floor labels are recoded to 0 (patterns are applied as regular
# expressions), after which the whole column is parsed as numbers.
floor_labels_as_zero = ['Ground floor', 'Mezzanine', 'Semi-basement', 'Basement', 'ground', 'Floor -2', 'Floor -1']
db['floor'] = db['floor'].replace(floor_labels_as_zero, 0, regex=True)
db['floor'] = pd.to_numeric(db['floor'])

In [2]:
# Columns summarised in the descriptive table; note that the price column
# 'pricems' is part of this list.
varis = [
    'pricems',
    'rooms',
    'floor',
    'needs_renovating',
    'garden',
    'terrace',
    'new_dev',
    'garage',
]

In [3]:
# Summary statistics (count, mean, std, quartiles, min/max) for the columns
# listed in `varis`; as the last expression it is rendered as the cell output.
db.loc[:,varis].describe()


Out[3]:
pricems rooms floor needs_renovating garden terrace new_dev garage
count 19177.00 19177.00 19177.00 19177.00 19177.00 19177.00 19177.00 19177.00
mean 3288.34 3.01 3.02 0.18 0.27 0.42 0.03 0.34
std 1755.78 1.28 2.65 0.38 0.45 0.49 0.17 0.47
min 468.85 1.00 0.00 0.00 0.00 0.00 0.00 0.00
25% 2000.00 2.00 1.00 0.00 0.00 0.00 0.00 0.00
50% 3000.00 3.00 2.00 0.00 0.00 0.00 0.00 0.00
75% 4109.59 4.00 4.00 0.00 1.00 1.00 0.00 1.00
max 23787.53 25.00 60.00 1.00 1.00 1.00 1.00 1.00

In [3]:
# Dependent variable: natural log of the price column.
# Only rows with no missing value in any model column are used, so that `y`
# has exactly the same rows as the design matrix built with `.dropna()`;
# taking the log over all of `db` would leave the two with different lengths
# as soon as a single value is missing.
y = np.log(db.dropna(subset=varis + ['pricems'])['pricems'])

In [4]:
# Modelling table: every column in `varis` plus the price column, keeping
# only complete rows. 'pricems' is appended only when `varis` does not
# already list it, so the column is never selected twice (a duplicated label
# would make yxs['pricems'] return a two-column frame instead of a Series).
model_cols = varis + [col for col in ['pricems'] if col not in varis]
yxs = db.loc[:, model_cols].dropna()

In [ ]:

Regresión no espacial sin clúster


In [5]:
# Pooled OLS on the whole sample: ln(pricems) regressed on every column of
# `yxs` other than 'pricems'.
x_pooled = yxs.drop('pricems', axis=1)
m1 = ps.spreg.ols.OLS(
    y.values[:, None],                  # add a trailing axis -> column vector
    x_pooled.values,
    name_x=x_pooled.columns.tolist(),   # labels shown in the summary table
    name_y='ln(pricems)',
)

In [6]:
# Print the text report stored on the pooled model `m1`.
print(m1.summary)


REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :     unknown
Weights matrix      :        None
Dependent Variable  : ln(pricems)                Number of Observations:       19177
Mean dependent var  :      7.9693                Number of Variables   :           8
S.D. dependent var  :      0.5114                Degrees of Freedom    :       19169
R-squared           :      0.0936
Adjusted R-squared  :      0.0932
Sum squared residual:    4545.057                F-statistic           :    282.6218
Sigma-square        :       0.237                Prob(F-statistic)     :           0
S.E. of regression  :       0.487                Log likelihood        :  -13406.695
Sigma-square ML     :       0.237                Akaike info criterion :   26829.391
S.E of regression ML:      0.4868                Schwarz criterion     :   26892.283

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     t-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       7.6957623       0.0097237     791.4406643       0.0000000
               rooms       0.0566068       0.0028965      19.5434489       0.0000000
               floor       0.0253068       0.0013540      18.6903222       0.0000000
    needs_renovating      -0.0232518       0.0096208      -2.4168162       0.0156662
              garden      -0.0299240       0.0089228      -3.3536435       0.0007991
             terrace      -0.0782592       0.0073291     -10.6778549       0.0000000
             new_dev      -0.2137107       0.0209672     -10.1926412       0.0000000
              garage       0.2300741       0.0085285      26.9770397       0.0000000
------------------------------------------------------------------------------------

REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER            7.432

TEST ON NORMALITY OF ERRORS
TEST                             DF        VALUE           PROB
Jarque-Bera                       2          19.330           0.0001

DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST                             DF        VALUE           PROB
Breusch-Pagan test                7         714.195           0.0000
Koenker-Bassett test              7         774.239           0.0000
================================ END OF REPORT =====================================

In [7]:
# Two independent, initially empty containers keyed by cluster id:
# `zona` for the per-cluster data subsets, `mreg` for the per-cluster models.
zona, mreg = {}, {}

In [8]:
# One OLS model per cluster id 0..7 (column `cl` of `db`).
#
# For each cluster the subset is stored in zona[clu], the fitted model in
# mreg[clu], and the text report is printed.
#
# The dependent variable and the regressors are both derived from the same
# NaN-free frame, so they always have identical rows; previously the log
# price was taken from the unfiltered subset while the regressors came from
# a `.dropna()` copy, which breaks as soon as one value is missing.
# 'pricems' is added to the selection only if `varis` does not already list
# it, so the column is not selected twice.
#
# NOTE: `y` and `yxs` are rebound on every pass, so after this cell they
# refer to the last cluster, not to the full sample.
model_cols = varis + [col for col in ['pricems'] if col not in varis]

for clu in range(0, 8):
    # Rows of db whose cluster label equals the current id.
    zona[clu] = db[db["cl"] == clu]

    # Complete rows only; y and X are both cut from this frame.
    yxs = zona[clu].loc[:, model_cols].dropna()
    y = np.log(yxs['pricems'])
    x_clu = yxs.drop('pricems', axis=1)

    mreg[clu] = ps.spreg.ols.OLS(
        y.values[:, None],               # add a trailing axis -> column vector
        x_clu.values,
        name_x=x_clu.columns.tolist(),
        name_y='ln(pricems)',
        name_ds='zona ' + str([clu]),    # label kept as in the recorded output
    )
    print(mreg[clu].summary)


REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :    zona [0]
Weights matrix      :        None
Dependent Variable  : ln(pricems)                Number of Observations:        1927
Mean dependent var  :      7.9418                Number of Variables   :           8
S.D. dependent var  :      0.3581                Degrees of Freedom    :        1919
R-squared           :      0.1440
Adjusted R-squared  :      0.1409
Sum squared residual:     211.443                F-statistic           :     46.1128
Sigma-square        :       0.110                Prob(F-statistic)     :   1.129e-60
S.E. of regression  :       0.332                Log likelihood        :    -605.186
Sigma-square ML     :       0.110                Akaike info criterion :    1226.372
S.E of regression ML:      0.3312                Schwarz criterion     :    1270.882

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     t-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       7.9405081       0.0218724     363.0373411       0.0000000
               rooms      -0.0439766       0.0074622      -5.8932121       0.0000000
               floor       0.0227006       0.0028456       7.9775498       0.0000000
    needs_renovating      -0.0819731       0.0204012      -4.0180594       0.0000609
              garden       0.1285873       0.0190909       6.7355318       0.0000000
             terrace       0.0053045       0.0159785       0.3319785       0.7399417
             new_dev       0.0026693       0.0577180       0.0462469       0.9631183
              garage       0.1713121       0.0199865       8.5714097       0.0000000
------------------------------------------------------------------------------------

REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER            7.818

TEST ON NORMALITY OF ERRORS
TEST                             DF        VALUE           PROB
Jarque-Bera                       2         109.780           0.0000

DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST                             DF        VALUE           PROB
Breusch-Pagan test                7         152.885           0.0000
Koenker-Bassett test              7         138.405           0.0000
================================ END OF REPORT =====================================
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :    zona [1]
Weights matrix      :        None
Dependent Variable  : ln(pricems)                Number of Observations:        3762
Mean dependent var  :      8.2282                Number of Variables   :           8
S.D. dependent var  :      0.2850                Degrees of Freedom    :        3754
R-squared           :      0.0925
Adjusted R-squared  :      0.0909
Sum squared residual:     277.291                F-statistic           :     54.6945
Sigma-square        :       0.074                Prob(F-statistic)     :   8.298e-75
S.E. of regression  :       0.272                Log likelihood        :    -433.076
Sigma-square ML     :       0.074                Akaike info criterion :     882.153
S.E of regression ML:      0.2715                Schwarz criterion     :     932.015

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     t-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       8.2097272       0.0119656     686.1113164       0.0000000
               rooms      -0.0151536       0.0032845      -4.6136124       0.0000041
               floor       0.0136679       0.0015127       9.0354079       0.0000000
    needs_renovating      -0.0925550       0.0115126      -8.0394516       0.0000000
              garden      -0.0056561       0.0105446      -0.5363964       0.5917164
             terrace      -0.0202202       0.0094246      -2.1454744       0.0319789
             new_dev       0.1776062       0.0323922       5.4829842       0.0000000
              garage       0.1108651       0.0104008      10.6592961       0.0000000
------------------------------------------------------------------------------------

REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER            7.432

TEST ON NORMALITY OF ERRORS
TEST                             DF        VALUE           PROB
Jarque-Bera                       2         236.755           0.0000

DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST                             DF        VALUE           PROB
Breusch-Pagan test                7          18.522           0.0098
Koenker-Bassett test              7          11.634           0.1133
================================ END OF REPORT =====================================
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :    zona [2]
Weights matrix      :        None
Dependent Variable  : ln(pricems)                Number of Observations:        2730
Mean dependent var  :      7.5247                Number of Variables   :           8
S.D. dependent var  :      0.3304                Degrees of Freedom    :        2722
R-squared           :      0.3166
Adjusted R-squared  :      0.3149
Sum squared residual:     203.538                F-statistic           :    180.1533
Sigma-square        :       0.075                Prob(F-statistic)     :   1.13e-219
S.E. of regression  :       0.273                Log likelihood        :    -329.882
Sigma-square ML     :       0.075                Akaike info criterion :     675.764
S.E of regression ML:      0.2730                Schwarz criterion     :     723.061

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     t-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       7.5939377       0.0184533     411.5211696       0.0000000
               rooms      -0.0744436       0.0063689     -11.6885149       0.0000000
               floor       0.0157955       0.0024600       6.4209155       0.0000000
    needs_renovating      -0.2091500       0.0176097     -11.8769597       0.0000000
              garden       0.1223859       0.0134157       9.1225995       0.0000000
             terrace      -0.0147373       0.0108665      -1.3562066       0.1751459
             new_dev      -0.1222995       0.0226672      -5.3954381       0.0000001
              garage       0.1982731       0.0133659      14.8341972       0.0000000
------------------------------------------------------------------------------------

REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER            9.508

TEST ON NORMALITY OF ERRORS
TEST                             DF        VALUE           PROB
Jarque-Bera                       2          68.223           0.0000

DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST                             DF        VALUE           PROB
Breusch-Pagan test                7         134.525           0.0000
Koenker-Bassett test              7          97.178           0.0000
================================ END OF REPORT =====================================
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :    zona [3]
Weights matrix      :        None
Dependent Variable  : ln(pricems)                Number of Observations:        1963
Mean dependent var  :      7.6186                Number of Variables   :           8
S.D. dependent var  :      0.3286                Degrees of Freedom    :        1955
R-squared           :      0.3029
Adjusted R-squared  :      0.3005
Sum squared residual:     147.671                F-statistic           :    121.3803
Sigma-square        :       0.076                Prob(F-statistic)     :  2.847e-148
S.E. of regression  :       0.275                Log likelihood        :    -245.997
Sigma-square ML     :       0.075                Akaike info criterion :     507.994
S.E of regression ML:      0.2743                Schwarz criterion     :     552.652

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     t-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       7.6332504       0.0199630     382.3701255       0.0000000
               rooms      -0.0456407       0.0068365      -6.6760402       0.0000000
               floor       0.0123698       0.0027239       4.5412727       0.0000059
    needs_renovating      -0.1848885       0.0179786     -10.2838070       0.0000000
              garden       0.1702656       0.0166229      10.2428437       0.0000000
             terrace       0.0133446       0.0128581       1.0378334       0.2994760
             new_dev      -0.1172667       0.0884838      -1.3252907       0.1852296
              garage       0.2514333       0.0170547      14.7427377       0.0000000
------------------------------------------------------------------------------------

REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER            8.505

TEST ON NORMALITY OF ERRORS
TEST                             DF        VALUE           PROB
Jarque-Bera                       2          91.663           0.0000

DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST                             DF        VALUE           PROB
Breusch-Pagan test                7          62.596           0.0000
Koenker-Bassett test              7          42.234           0.0000
================================ END OF REPORT =====================================
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :    zona [4]
Weights matrix      :        None
Dependent Variable  : ln(pricems)                Number of Observations:        1935
Mean dependent var  :      7.3348                Number of Variables   :           8
S.D. dependent var  :      0.3230                Degrees of Freedom    :        1927
R-squared           :      0.2929
Adjusted R-squared  :      0.2903
Sum squared residual:     142.726                F-statistic           :    114.0057
Sigma-square        :       0.074                Prob(F-statistic)     :  4.085e-140
S.E. of regression  :       0.272                Log likelihood        :    -223.433
Sigma-square ML     :       0.074                Akaike info criterion :     462.867
S.E of regression ML:      0.2716                Schwarz criterion     :     507.410

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     t-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       7.4005908       0.0220058     336.3017216       0.0000000
               rooms      -0.0698449       0.0073275      -9.5318928       0.0000000
               floor       0.0081365       0.0027482       2.9606876       0.0031070
    needs_renovating      -0.1782097       0.0212691      -8.3788028       0.0000000
              garden       0.1774203       0.0168747      10.5139911       0.0000000
             terrace       0.0320155       0.0126456       2.5317514       0.0114283
             new_dev      -0.0862337       0.0301832      -2.8570062       0.0043225
              garage       0.2268694       0.0162452      13.9652814       0.0000000
------------------------------------------------------------------------------------

REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER            9.604

TEST ON NORMALITY OF ERRORS
TEST                             DF        VALUE           PROB
Jarque-Bera                       2          23.625           0.0000

DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST                             DF        VALUE           PROB
Breusch-Pagan test                7          88.259           0.0000
Koenker-Bassett test              7          77.838           0.0000
================================ END OF REPORT =====================================
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :    zona [5]
Weights matrix      :        None
Dependent Variable  : ln(pricems)                Number of Observations:        2426
Mean dependent var  :      7.9228                Number of Variables   :           8
S.D. dependent var  :      0.2946                Degrees of Freedom    :        2418
R-squared           :      0.2992
Adjusted R-squared  :      0.2971
Sum squared residual:     147.507                F-statistic           :    147.4442
Sigma-square        :       0.061                Prob(F-statistic)     :  1.724e-181
S.E. of regression  :       0.247                Log likelihood        :     -45.792
Sigma-square ML     :       0.061                Akaike info criterion :     107.584
S.E of regression ML:      0.2466                Schwarz criterion     :     153.936

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     t-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       7.8437717       0.0142562     550.2005941       0.0000000
               rooms      -0.0261764       0.0050592      -5.1739850       0.0000002
               floor       0.0196845       0.0018708      10.5217408       0.0000000
    needs_renovating      -0.1886605       0.0158824     -11.8785861       0.0000000
              garden       0.1252929       0.0116687      10.7375523       0.0000000
             terrace      -0.0299305       0.0107729      -2.7783121       0.0055064
             new_dev      -0.0826065       0.0212817      -3.8815704       0.0001066
              garage       0.1676376       0.0117594      14.2556526       0.0000000
------------------------------------------------------------------------------------

REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER            8.282

TEST ON NORMALITY OF ERRORS
TEST                             DF        VALUE           PROB
Jarque-Bera                       2         113.280           0.0000

DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST                             DF        VALUE           PROB
Breusch-Pagan test                7          80.973           0.0000
Koenker-Bassett test              7          55.592           0.0000
================================ END OF REPORT =====================================
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :    zona [6]
Weights matrix      :        None
Dependent Variable  : ln(pricems)                Number of Observations:        2441
Mean dependent var  :      8.3833                Number of Variables   :           8
S.D. dependent var  :      0.3048                Degrees of Freedom    :        2433
R-squared           :      0.1427
Adjusted R-squared  :      0.1402
Sum squared residual:     194.354                F-statistic           :     57.8427
Sigma-square        :       0.080                Prob(F-statistic)     :    5.69e-77
S.E. of regression  :       0.283                Log likelihood        :    -375.173
Sigma-square ML     :       0.080                Akaike info criterion :     766.347
S.E of regression ML:      0.2822                Schwarz criterion     :     812.748

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     t-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       8.3596846       0.0150597     555.1042118       0.0000000
               rooms      -0.0179375       0.0038939      -4.6065923       0.0000043
               floor       0.0288913       0.0021639      13.3512615       0.0000000
    needs_renovating      -0.1112053       0.0134319      -8.2791765       0.0000000
              garden      -0.0587619       0.0234966      -2.5008669       0.0124542
             terrace      -0.0034244       0.0128854      -0.2657576       0.7904484
             new_dev       0.2275301       0.0712863       3.1917775       0.0014320
              garage       0.1183715       0.0152672       7.7533183       0.0000000
------------------------------------------------------------------------------------

REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER            6.730

TEST ON NORMALITY OF ERRORS
TEST                             DF        VALUE           PROB
Jarque-Bera                       2         145.527           0.0000

DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST                             DF        VALUE           PROB
Breusch-Pagan test                7          68.252           0.0000
Koenker-Bassett test              7          43.261           0.0000
================================ END OF REPORT =====================================
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set            :    zona [7]
Weights matrix      :        None
Dependent Variable  : ln(pricems)                Number of Observations:        1993
Mean dependent var  :      8.6270                Number of Variables   :           8
S.D. dependent var  :      0.3274                Degrees of Freedom    :        1985
R-squared           :      0.0859
Adjusted R-squared  :      0.0827
Sum squared residual:     195.163                F-statistic           :     26.6593
Sigma-square        :       0.098                Prob(F-statistic)     :   3.886e-35
S.E. of regression  :       0.314                Log likelihood        :    -512.516
Sigma-square ML     :       0.098                Akaike info criterion :    1041.031
S.E of regression ML:      0.3129                Schwarz criterion     :    1085.811

------------------------------------------------------------------------------------
            Variable     Coefficient       Std.Error     t-Statistic     Probability
------------------------------------------------------------------------------------
            CONSTANT       8.6416231       0.0214403     403.0558477       0.0000000
               rooms      -0.0145895       0.0050758      -2.8743334       0.0040919
               floor       0.0129099       0.0026868       4.8049678       0.0000017
    needs_renovating      -0.1337112       0.0164456      -8.1304973       0.0000000
              garden      -0.0063801       0.0229057      -0.2785380       0.7806284
             terrace      -0.0119363       0.0149175      -0.8001508       0.4237192
             new_dev       0.1873795       0.0911719       2.0552322       0.0399872
              garage       0.1081096       0.0154383       7.0027039       0.0000000
------------------------------------------------------------------------------------

REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER            8.256

TEST ON NORMALITY OF ERRORS
TEST                             DF        VALUE           PROB
Jarque-Bera                       2         613.745           0.0000

DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST                             DF        VALUE           PROB
Breusch-Pagan test                7          34.376           0.0000
Koenker-Bassett test              7          14.662           0.0406
================================ END OF REPORT =====================================

In [ ]:


In [ ]:


In [ ]: