In [1]:
from __future__ import division
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import pysal as ps
import geopandas as gpd
from geopandas import GeoSeries, GeoDataFrame
from shapely.geometry import Point
from sklearn import neighbors
sns.set(style="white")
sns.set_context({"figure.figsize": (24, 10)})
pd.options.display.float_format = '{:.2f}'.format
abb_link = './tfg/dbases/development3.csv'
zc_link = './tfg/mapas/barrios_area.shp'
muestra = pd.read_csv(abb_link)
barrios = gpd.read_file(zc_link)
geometry = [Point(xy) for xy in zip(muestra['lon'], muestra['lat'])]
crs = {'init': 'epsg:4326'}
geo_df = GeoDataFrame(muestra, crs=crs, geometry=geometry)
db = gpd.sjoin(geo_df, barrios, how="inner", op='intersects')
metro = pd.read_csv('./tfg/dbases/distance_matrix_metro.csv')
db = db.join(metro.set_index('InputID'),
on='id', how='left')
db = db.rename(index=str, columns={"DESBDT": "subdistrict_f", "Distance": "metro_distance", "NUMPOINTS": "metro_number"})
db = pd.DataFrame(db)
db['floor']=db['floor'].replace(['Ground floor', 'Mezzanine', 'Semi-basement', 'Basement', 'ground', 'Floor -2', 'Floor -1'], 0,regex=True)
#db.replace(u'\xe', 'A')
db['floor'] = pd.to_numeric(db['floor'])
In [4]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pysal as ps
import geopandas as gpd
from sklearn import cluster
from sklearn.preprocessing import scale
Agregaci\'on de variables a nivel barrio
In [50]:
varis = ['pricems', 'rooms', 'floor', 'needs_renovating', 'garden', 'terrace', 'new_dev', 'garage']
In [10]:
aves = db.groupby('GEOCODIGO')[varis].mean()
aves.info()
In [12]:
types = pd.get_dummies(db['metro_number'])
prop_types = types.join(db['GEOCODIGO'])\
.groupby('GEOCODIGO')\
.sum()
prop_types_pct = (prop_types * 100).div(prop_types)
prop_types_pct.info()
In [13]:
aves_props = aves.join(prop_types_pct)
#eliminar valores nulos
aves_props = aves_props.fillna(value=0)
In [14]:
db1 = pd.DataFrame(\
scale(aves_props), \
index=aves_props.index, \
columns=aves_props.columns)\
#.rename(lambda x: str(int(x)) )
In [16]:
#zc = gpd.read_file(zc_link)
#zc.plot(color='green')
#sns.plt.show()
In [162]:
db1.info()
In [17]:
#zdb = db1.set_index('subdistrict_f').join(zc[['DESBDT', 'geometry']], on='DESBDT').dropna()
zdb = zc[['geometry', 'GEOCODIGO']].join(db1, on='GEOCODIGO')\
.dropna()
In [46]:
km5 = cluster.KMeans(n_clusters=3)
km5cls = km5.fit(zdb.drop(['geometry', 'GEOCODIGO'], axis=1).values)
f, ax = plt.subplots(1, figsize=(9, 9))
zdb.assign(cl=km5cls.labels_)\
.plot(column='cl', categorical=True, legend=True, \
linewidth=0.1, edgecolor='white', ax=ax)
ax.set_axis_off()
plt.show()
In [47]:
km5cls.labels_
Out[47]:
In [48]:
cl_pcts = prop_types_pct.reindex(zdb['GEOCODIGO'])\
.assign(cl=km5cls.labels_)\
.groupby('cl')\
.count()
N\'umero de bocas de metro en los barrios que componen la zona
In [49]:
cl_pcts
Out[49]:
In [44]:
f, ax = plt.subplots(1, figsize=(18, 9))
cl_pcts.plot(kind='bar', stacked=False, ax=ax, \
cmap='Set2', linewidth=2)
ax.legend(ncol=1, loc="right");
In [45]:
plt.show()
In [176]:
type(cl_pcts)
Out[176]:
In [226]:
zdb.info()
In [51]:
rt_av = db.groupby('GEOCODIGO')[varis]\
.mean()\
.rename(lambda x: str(int(x)))
#pasar a int para join
#rt_av.index = rt_av.index.astype(int)
In [52]:
rt_av.describe()
Out[52]:
In [53]:
zc['GEOCODIGO'] = pd.to_numeric(zc['GEOCODIGO'])
In [54]:
#pasar a int para join
rt_av.index = rt_av.index.astype(int)
zrt = zc[['geometry', 'GEOCODIGO']].join(rt_av, on='GEOCODIGO')\
.dropna()
zrt.info()
In [230]:
zrt
Out[230]:
In [55]:
zrt.to_file('tmp')
#matriz de pesos espaciales
w = ps.queen_from_shapefile('tmp/tmp.shp', idVariable='GEOCODIGO')
#rm -r tmp
w
Out[55]:
Establecer minimo de viviendas para region
In [56]:
n_rev = db.groupby('GEOCODIGO')\
.count()\
['price']\
thr = np.round(0.1 * n_rev.sum())
thr
Out[56]:
In [57]:
np.random.seed(1234)
z = zrt.drop(['geometry', 'GEOCODIGO'], axis=1).values
maxp = ps.region.Maxp(w, z, thr, n_rev, initial=1000)
Inferencia para comprobar que los resultados son mejores que definiendo zonas al azar
In [220]:
np.random.seed(1234)
maxp.cinference(nperm=20)
maxp.cpvalue
Out[220]:
In [58]:
lbls = pd.Series(maxp.area2region).reindex(zrt['GEOCODIGO'])
In [59]:
f, ax = plt.subplots(1, figsize=(9, 9))
zrt.assign(cl=lbls.values)\
.plot(column='cl', categorical=True, legend=True, \
linewidth=0.1, edgecolor='white', ax=ax)
ax.set_axis_off()
plt.show()
In [217]:
lbls
Out[217]:
In [60]:
#ver stats
zrt[varis].groupby(lbls.values).mean().T
Out[60]:
In [1]:
#ver stats
zrt[varis].groupby(lbls.values).mean().T
In [ ]: