In [21]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns;
from numpy import nan
from math import sqrt, sin, cos, atan2, log
plt.style.use('ggplot')
from IPython.core.display import HTML
css = open('style-table.css').read()+ open('style-notebook.css').read()
HTML('<style>{}</style>'.format(css))


Out[21]:

In [7]:
test = pd.read_csv('TR_EXP_INMUEBLE_CONTROL.txt', header=0, sep='|', encoding='latin-1')


/Users/luis/anaconda/lib/python3.4/site-packages/IPython/core/interactiveshell.py:2902: DtypeWarning: Columns (8,15,21,22) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [13]:
test['LATITUD_CARGA']


Out[13]:
0         19.682048
1         19.683822
2         19.620886
3         19.632750
4         19.632500
5         19.632500
6         19.632500
7         19.604167
8         19.604167
9         19.604167
10        19.604167
11        19.631368
12        19.631353
13        19.635681
14        19.599752
15        19.606337
16        19.663932
17        19.661334
18        19.668352
19        19.667823
20        19.663295
21        19.664250
22        19.664605
23        19.663278
24        19.668044
25        19.669455
26        19.665754
27        19.661474
28        19.672186
29        19.674479
            ...    
127869    19.525079
127870    19.522568
127871    19.520602
127872    19.634842
127873    19.637520
127874    19.632951
127875    19.633307
127876    19.631217
127877    19.613686
127878    19.622406
127879    19.616849
127880    19.618915
127881    19.626261
127882    19.619439
127883    19.622323
127884    19.621413
127885    19.621057
127886    19.631667
127887    19.626309
127888    19.636517
127889    19.612617
127890    19.612617
127891    19.624244
127892    19.621978
127893    19.618838
127894    19.620057
127895    19.619488
127896    19.615727
127897    19.675489
127898    19.674545
Name: LATITUD_CARGA, dtype: float64

In [87]:
infr = pd.read_csv('infra_corregido.csv')

In [63]:
df = pd.read_excel('INMUEBLES.xlsx')

In [67]:
df = df[df['P3']<6]

In [68]:
df.ix[df.P12>2, 'P12'] = 3
df['P12'].replace({1: 1.5, 2: 1, 3: 0},inplace=True)

In [69]:
df.ix[df.P20>2,'P20']=0
df.ix[df.P22>2,'P22']=0

In [70]:
df.ix[(df.P22==1),'P20']=0 #se ignora p20 si p22=1

In [71]:
df['P20'].replace({1: 3, 2: 0},inplace=True)

In [99]:
df.ix[df['P12'].isnull(),'P12']=0
df.ix[df['P20'].isnull(),'P20']=0

In [109]:
infr['P20'].unique()


Out[109]:
array([ 0.,  3.])

In [107]:
infr.ix[:,'P20'] = df['P20'].values
infr.ix[:,'P12'] = df['P12'].values

In [95]:
infr.to_csv('infra_corregido.csv')

In [8]:
#Init
secciones = [['P3'], ['P11', 'P12', 'P13A', 'P14', 'P15', 'P16'], ['P17A', 'P18A', 'P19', 'P20', 'P21', 'P22'], ['P23', 'P24'], ['P25', 'P26', 'P27', 'P28', 'P29', 'P30', 'P31', 'P32', 'P33', 'P34', 'P35', 'P36', 'P37', 'P38', 'P39', 'P40', 'P41'], ['P42', 'P44', 'P46', 'P47', 'P48', 'P49', 'P52', 'P62', 'P72', 'P82', 'P92', 'P102', 'P103', 'P112', 'P113', 'P117', 'P122', 'P123', 'P125'], ['P126'], ['P133', 'P134', 'P135', 'P136', 'P137', 'P138', 'P139', 'P140', 'P141', 'P142', 'P143'], ['P144', 'P145']]
max_cal = [11, 13, 13, 11, 14, 17, 2, 11, 8]
sec_lst = []
for i in range(1,10):
    sec_lst.append('seccion_'+str(i))

In [110]:
i=1
for seccion in secciones:
    infr['seccion_'+str(i)]=infr[seccion].sum(axis=1)
    i+=1

 
#dict(zip([1,2,3,4], [a,b,c,d]))

In [111]:
new_columns = ['ENT','MUN','LOC','AGEB','MZA','ID_INM']
new_columns.extend(sec_lst)
infr_secciones = infr[new_columns]

In [117]:
infr.to_csv('infra_corregido.csv')
infr_secciones.to_csv('infr_secciones.csv')

In [116]:
for j in range(9):
    print("Sección",j,":\t",infr[secciones[j]].sum(axis=1).max())


Sección 0 :	 5
Sección 1 :	 13.0
Sección 2 :	 13.0
Sección 3 :	 11.0
Sección 4 :	 14.0
Sección 5 :	 10.0
Sección 6 :	 2.0
Sección 7 :	 11.0
Sección 8 :	 8.0

In [125]:
infr_secciones[sec_lst].max()


Out[125]:
seccion_1     5
seccion_2    13
seccion_3    13
seccion_4    11
seccion_5    14
seccion_6    10
seccion_7     2
seccion_8    11
seccion_9     8
dtype: float64

In [14]:
infr = pd.read_csv('infr_secciones.csv', index_col=0)
infr.head()


Out[14]:
ENT MUN LOC AGEB MZA ID_INM seccion_1 seccion_2 seccion_3 seccion_4 seccion_5 seccion_6 seccion_7 seccion_8 seccion_9
0 2 4 1 2876 23 1914 2 8.0 11 11.0 11.75 4.80 0 8.8 4
1 2 4 1 2880 103 1915 2 8.5 11 5.5 11.50 2.75 0 4.8 8
2 2 4 1 3677 5 1916 2 9.0 11 5.5 11.75 2.75 0 4.8 4
3 2 4 1 3677 9 1917 2 8.0 11 11.0 11.75 2.65 0 10.0 0
4 2 4 1 3677 23 1918 2 8.0 11 5.5 10.70 5.35 0 8.4 8

In [4]:
loc = pd.read_csv('TR_EXP_INMUEBLE_CONTROL.txt', header=0, sep='|', encoding='latin-1', low_memory=False)

In [5]:
#loc[~loc['DESCRUBIC'].isnull()]['DESCRUBIC']#loc[['NOMVIAL', 'NUMEXT1', 'NEXTALF1', 'NUMINT', 'NUMINTALF', 'ENTRECA', 'YCALLE', 'DESCRUBIC']]
Y=loc['Y'].map(lambda x: -float(str(x).replace(",","."))/10000)
X=loc['X'].map(lambda x: float(str(x).replace(",","."))/100000)
XY=pd.DataFrame({'Longitud':Y, 'Latitud':X})

In [63]:
loc_headers=['ENT', 'MUN','LOC', 'AGEB', 'MZA', 'X', 'Y']
common_headers=['ENT', 'MUN','LOC', 'AGEB', 'MZA']

In [64]:
location = loc[loc_headers]

In [65]:
len(infr[common_headers])/len(location)


Out[65]:
1.181369674508792

In [6]:
full_data = pd.merge(infr, loc[loc_headers], how='outer',left_on=common_headers, right_on=common_headers)
inner_data = pd.merge(infr, loc[loc_headers], how='inner',left_on=common_headers, right_on=common_headers)
left_data = pd.merge(infr, loc[loc_headers], how='left',left_on=common_headers, right_on=common_headers)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-819c5ab440c6> in <module>()
----> 1 full_data = pd.merge(infr, loc[loc_headers], how='outer',left_on=common_headers, right_on=common_headers)
      2 inner_data = pd.merge(infr, loc[loc_headers], how='inner',left_on=common_headers, right_on=common_headers)
      3 left_data = pd.merge(infr, loc[loc_headers], how='left',left_on=common_headers, right_on=common_headers)

NameError: name 'infr' is not defined

In [67]:
print(len(full_data))
print(len(inner_data))


232236
147671

In [68]:
full_data.to_csv('full_data.csv')
inner_data.to_csv('inner_data.csv')
left_data.to_csv('left_data.csv')

In [69]:
len(left_data)


Out[69]:
204656

In [78]:
#inner_data['X'].apply(lambda x: float(str(x).replace(",","."))/100000)
print(min(inner_data['X']))
print(max(inner_data['X']))
print(min(inner_data['Y']))
print(max(inner_data['Y']))


1076159.9242
4066239.0212
1000010.4931
999994.69286

In [80]:
inner_data['Y'].map(lambda x: -float(str(x).replace(",","."))/10000).hist()


Out[80]:
<matplotlib.axes._subplots.AxesSubplot at 0x104ba1f60>

In [81]:
inner_data['X'].map(lambda x: float(str(x).replace(",","."))/10000).hist()


Out[81]:
<matplotlib.axes._subplots.AxesSubplot at 0x106f06f28>

In [108]:
sns.jointplot(x='Latitud', y='Longitud', data=XY)


Out[108]:
<seaborn.axisgrid.JointGrid at 0x129304b00>
/Users/luis/anaconda/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [128]:
sns.jointplot(x='Latitud', y='Longitud', data=XY, size=10, ratio=5,s=1)


Out[128]:
<seaborn.axisgrid.JointGrid at 0x13359ceb8>
/Users/luis/anaconda/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [129]:
XY.head()


Out[129]:
Latitud Longitud
0 28.034619 85.692449
1 28.026875 85.679924
2 27.992703 84.988437
3 27.992288 85.114371
4 27.994609 84.962842

In [7]:
diferencias = ((loc['LATITUD_CARGA'].map(lambda x: float(str(x).replace(",",".")))- loc['X'].map(lambda x: float(str(x).replace(",","."))))**2+(loc['LONGITUD_CARGA'].map(lambda x: float(str(x).replace(",",".")))-loc['Y'].map(lambda x: float(str(x).replace(",","."))))**2).map(lambda x: sqrt(x))

In [8]:
x1 = loc['LATITUD_CARGA'].map(lambda x: float(str(x).replace(",",".")))
x2 = loc['X'].map(lambda x: float(str(x).replace(",",".")))
y1 = loc['LONGITUD_CARGA'].map(lambda x: float(str(x).replace(",",".")))
y2 = loc['Y'].map(lambda x: float(str(x).replace(",",".")))

In [9]:
plt.figure(figsize=(12,4))
diferencias.hist(bins = int(sqrt(len(diferencias))))


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x11766def0>

In [10]:
def heaversine(x1,y1,x2,y2):
    a = sin((x2-x1)/2)**2+cos(x1)*cos(x2)*sin((y2-y1)/2)**2
    c = 2*atan2(sqrt(a),sqrt(1-a))
    return 6371*c

In [212]:
vals=loc[['LATITUD_CARGA', 'LONGITUD_CARGA','X']].groupby(['LATITUD_CARGA', 'LONGITUD_CARGA']).count()['X'].values

In [215]:
vals = pd.DataFrame(vals)

In [276]:
vals.hist(bins=320)


Out[276]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x148a93da0>]], dtype=object)

In [11]:
data = pd.DataFrame({'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2})

In [12]:
distancias_transformadas = data.apply(lambda s: heaversine(s['x1'],s['y1'],s['x2'],s['y2']),axis=1)

In [17]:
distancias_transformadas.map(lambda x:log(x)).hist(bins=int(sqrt(len(distancias_transformadas))))


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x110bde588>

In [19]:
inner_data


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-19-b0eb0c17f2ca> in <module>()
----> 1 inner_data

NameError: name 'inner_data' is not defined

In [3]:
inner_data = pd.read_csv('inner_data.csv')


/Users/luis/anaconda/lib/python3.4/site-packages/IPython/core/interactiveshell.py:2902: DtypeWarning: Columns (16,17) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [106]:
secciones = ['seccion_1', 'seccion_2', 'seccion_3', 'seccion_4', 'seccion_5', 'seccion_6', 'seccion_7', 'seccion_8', 'seccion_9']
resumen = ['ENT']
resumen.extend(secciones)
resumen


Out[106]:
['ENT',
 'seccion_1',
 'seccion_2',
 'seccion_3',
 'seccion_4',
 'seccion_5',
 'seccion_6',
 'seccion_7',
 'seccion_8',
 'seccion_9']

In [5]:
x=300
tmp_data = inner_data[['ENT', 'seccion_1', 'seccion_2', 'seccion_3', 'seccion_4', 'seccion_5', 'seccion_6', 'seccion_7', 'seccion_8', 'seccion_9']]

In [ ]:


In [6]:
#plt.scatter([1, 2, 3, 4, 5, 6, 7, 8, 9], tmp_data['ENT'].values, s=tmp_data.ix[x:x+9][secciones].apply(lambda x: 10*x))
#tmp_data.groupby(['ENT']).quantile(0.99)
calificaciones = inner_data[secciones].sum(axis=1)

In [7]:
inner_data['cal_final'] = calificaciones

In [8]:
inner_data['cal_final'].median()


Out[8]:
45.8

In [53]:
inner_agg=inner_data[resumen].groupby(['ENT']).agg('mean')

In [54]:
tmp_sum=inner_agg.sum(axis=1)/100
tmp_sum


Out[54]:
ENT
2     0.537939
3     0.517053
4     0.446552
5     0.573427
6     0.516840
7     0.317523
8     0.441015
9     0.568376
10    0.426499
11    0.445975
12    0.375210
13    0.364790
14    0.466537
15    0.528691
16    0.447125
18    0.476000
19    0.517784
20    0.321473
21    0.462190
23    0.410151
24    0.411188
25    0.438324
26    0.485331
27    0.428555
29    0.370000
30    0.419388
31    0.474968
32    0.414102
dtype: float64

In [22]:
max_vals = np.array([11, 13, 13, 11, 14, 17, 2, 11, 8])

In [ ]:


In [10]:
new_names = dict(zip(secciones,['1', '2', '3', '4', '5', '6', '7', '8', '9']))
inner_agg = inner_agg.rename(columns = new_names)

In [55]:
for i in inner_agg.index.values:
    inner_agg.ix[i]=inner_agg.ix[i]/max_vals

In [56]:
inner_agg['total'] = tmp_sum

In [57]:
inner_agg


Out[57]:
seccion_1 seccion_2 seccion_3 seccion_4 seccion_5 seccion_6 seccion_7 seccion_8 seccion_9 total
ENT
2 0.121927 0.792511 0.788793 0.395282 0.753682 0.200733 0.218067 0.626755 0.781646 0.537939
3 0.120186 0.878856 0.830533 0.457436 0.720168 0.162555 0.291282 0.431375 0.619487 0.517053
4 0.114517 0.932916 0.760286 0.257106 0.581848 0.120410 0.114341 0.384056 0.488695 0.446552
5 0.159645 0.969043 0.911820 0.414634 0.806794 0.179412 0.658537 0.459867 0.731707 0.573427
6 0.113054 0.933925 0.876726 0.333333 0.734249 0.159766 0.282051 0.534615 0.474359 0.516840
7 0.138276 0.622362 0.585957 0.092025 0.412869 0.117713 0.034619 0.290845 0.307625 0.317523
8 0.126585 0.783828 0.738228 0.254687 0.662540 0.153636 0.108175 0.293837 0.598110 0.441015
9 0.122912 0.954153 0.903672 0.290823 0.789570 0.239824 0.184403 0.796426 0.484288 0.568376
10 0.112819 0.829115 0.769643 0.241010 0.655383 0.113312 0.118210 0.178135 0.584545 0.426499
11 0.114603 0.887870 0.801900 0.206186 0.666036 0.111531 0.065207 0.249409 0.625920 0.445975
12 0.121728 0.770769 0.677701 0.059082 0.491643 0.116673 0.041796 0.383866 0.441176 0.375210
13 0.103226 0.820968 0.732837 0.037097 0.515084 0.084244 0.001075 0.201779 0.483871 0.364790
14 0.121181 0.842609 0.841942 0.180758 0.673511 0.134244 0.048929 0.487586 0.532577 0.466537
15 0.117793 0.912617 0.856746 0.245453 0.712556 0.183394 0.079585 0.650293 0.683223 0.528691
16 0.125631 0.890558 0.826322 0.114149 0.639912 0.138128 0.062500 0.331179 0.585069 0.447125
18 0.090909 0.923077 0.923077 0.000000 0.671429 0.223529 1.000000 0.309091 0.500000 0.476000
19 0.120649 0.899570 0.801149 0.376322 0.745900 0.180982 0.245434 0.462030 0.638738 0.517784
20 0.121552 0.741799 0.666412 0.036084 0.492100 0.098757 0.018280 0.099370 0.301088 0.321473
21 0.119893 0.860686 0.777718 0.135310 0.604782 0.120335 0.057260 0.600843 0.609504 0.462190
23 0.120225 0.897394 0.724630 0.263844 0.635284 0.107262 0.078176 0.251969 0.257329 0.410151
24 0.119892 0.869823 0.741285 0.126167 0.580499 0.113492 0.045627 0.244430 0.578926 0.411188
25 0.118357 0.883553 0.777566 0.153712 0.648066 0.128725 0.144141 0.253224 0.613765 0.438324
26 0.113837 0.797429 0.817362 0.288593 0.723329 0.160189 0.248597 0.461150 0.586808 0.485331
27 0.120312 0.877890 0.735384 0.175096 0.590148 0.112030 0.167612 0.344259 0.543094 0.428555
29 0.181818 0.692308 0.846154 0.000000 0.442857 0.000000 0.000000 0.436364 0.500000 0.370000
30 0.115824 0.833389 0.740629 0.086622 0.569393 0.111261 0.047243 0.433832 0.565013 0.419388
31 0.117727 0.948770 0.806081 0.349465 0.686585 0.150235 0.133776 0.364025 0.388327 0.474968
32 0.111398 0.831928 0.802964 0.166247 0.628412 0.106737 0.085011 0.161060 0.568566 0.414102

In [164]:
coef = 1000
#COLORS
color = ['b', 'r', 'g']
colors = [[color[0]]*9, [color[1]]*9, [color[2]]*9 ]
for i in range(3):
    colors[i].extend('y')
#GRID
fig = plt.figure(figsize=(10,30))
ax = fig.add_subplot(1,1,1)
ax.set_axis_bgcolor('white')

major_yticks = np.arange(0, 33, 5)
ax.set_yticks(major_yticks)                                           
major_xticks = np.arange(0, 11, 5)
ax.set_xticks(major_xticks)


minor_yticks = np.arange(0, 33, 1)
ax.set_yticks(minor_yticks, minor=True)                                           
minor_xticks = np.arange(0, 11, 1)
ax.set_xticks(minor_xticks, minor=True)



#ax.grid(which='major', c = 'white', linestyle='-')
ax.grid(which='minor', c = 'green', linestyle=':')


for i in inner_agg.index.values:
    plt.scatter(range(1,11),[i]*10, s=coef, c = colors[i%3], alpha=0.1, edgecolor='k', lw=1)
    plt.scatter(range(1,11), [i]*10, s=inner_agg.ix[i].map(lambda x: coef*x), c = colors[i%3])
plt.savefig('test.png', bbox_inches='tight')
plt.show()


/Users/luis/anaconda/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

In [71]:
inner_agg['total'].plot()


Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x109198c18>

In [107]:
inner_agg[secciones].plot(figsize=(20,8), kind='bar', stacked=True)


Out[107]:
<matplotlib.axes._subplots.AxesSubplot at 0x10d0b7470>

In [125]:
fig=plt.figure()


<matplotlib.figure.Figure at 0x1176780b8>

In [126]:


In [127]:
ax


Out[127]:
<matplotlib.axes._subplots.AxesSubplot at 0x117678b00>

In [150]:
minor_xticks


Out[150]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [ ]: