notebook.community

Edit and run



In [21]:

    
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns;
from numpy import nan
from math import sqrt, sin, cos, atan2, log
plt.style.use('ggplot')
from IPython.core.display import HTML
css = open('style-table.css').read()+ open('style-notebook.css').read()
HTML('<style>{}</style>'.format(css))









    Out[21]:



In [7]:

    
test = pd.read_csv('TR_EXP_INMUEBLE_CONTROL.txt', header=0, sep='|', encoding='latin-1')









    



/Users/luis/anaconda/lib/python3.4/site-packages/IPython/core/interactiveshell.py:2902: DtypeWarning: Columns (8,15,21,22) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)



In [13]:

    
test['LATITUD_CARGA']









    Out[13]:





0         19.682048
1         19.683822
2         19.620886
3         19.632750
4         19.632500
5         19.632500
6         19.632500
7         19.604167
8         19.604167
9         19.604167
10        19.604167
11        19.631368
12        19.631353
13        19.635681
14        19.599752
15        19.606337
16        19.663932
17        19.661334
18        19.668352
19        19.667823
20        19.663295
21        19.664250
22        19.664605
23        19.663278
24        19.668044
25        19.669455
26        19.665754
27        19.661474
28        19.672186
29        19.674479
            ...    
127869    19.525079
127870    19.522568
127871    19.520602
127872    19.634842
127873    19.637520
127874    19.632951
127875    19.633307
127876    19.631217
127877    19.613686
127878    19.622406
127879    19.616849
127880    19.618915
127881    19.626261
127882    19.619439
127883    19.622323
127884    19.621413
127885    19.621057
127886    19.631667
127887    19.626309
127888    19.636517
127889    19.612617
127890    19.612617
127891    19.624244
127892    19.621978
127893    19.618838
127894    19.620057
127895    19.619488
127896    19.615727
127897    19.675489
127898    19.674545
Name: LATITUD_CARGA, dtype: float64



In [87]:

    
infr = pd.read_csv('infra_corregido.csv')



In [63]:

    
df = pd.read_excel('INMUEBLES.xlsx')



In [67]:

    
df = df[df['P3']<6]



In [68]:

    
df.ix[df.P12>2, 'P12'] = 3
df['P12'].replace({1: 1.5, 2: 1, 3: 0},inplace=True)



In [69]:

    
df.ix[df.P20>2,'P20']=0
df.ix[df.P22>2,'P22']=0



In [70]:

    
df.ix[(df.P22==1),'P20']=0 #se ignora p20 si p22=1



In [71]:

    
df['P20'].replace({1: 3, 2: 0},inplace=True)



In [99]:

    
df.ix[df['P12'].isnull(),'P12']=0
df.ix[df['P20'].isnull(),'P20']=0



In [109]:

    
infr['P20'].unique()









    Out[109]:





array([ 0.,  3.])



In [107]:

    
infr.ix[:,'P20'] = df['P20'].values
infr.ix[:,'P12'] = df['P12'].values



In [95]:

    
infr.to_csv('infra_corregido.csv')



In [8]:

    
#Init
secciones = [['P3'], ['P11', 'P12', 'P13A', 'P14', 'P15', 'P16'], ['P17A', 'P18A', 'P19', 'P20', 'P21', 'P22'], ['P23', 'P24'], ['P25', 'P26', 'P27', 'P28', 'P29', 'P30', 'P31', 'P32', 'P33', 'P34', 'P35', 'P36', 'P37', 'P38', 'P39', 'P40', 'P41'], ['P42', 'P44', 'P46', 'P47', 'P48', 'P49', 'P52', 'P62', 'P72', 'P82', 'P92', 'P102', 'P103', 'P112', 'P113', 'P117', 'P122', 'P123', 'P125'], ['P126'], ['P133', 'P134', 'P135', 'P136', 'P137', 'P138', 'P139', 'P140', 'P141', 'P142', 'P143'], ['P144', 'P145']]
max_cal = [11, 13, 13, 11, 14, 17, 2, 11, 8]
sec_lst = []
for i in range(1,10):
    sec_lst.append('seccion_'+str(i))



In [110]:

    
i=1
for seccion in secciones:
    infr['seccion_'+str(i)]=infr[seccion].sum(axis=1)
    i+=1

 
#dict(zip([1,2,3,4], [a,b,c,d]))



In [111]:

    
new_columns = ['ENT','MUN','LOC','AGEB','MZA','ID_INM']
new_columns.extend(sec_lst)
infr_secciones = infr[new_columns]



In [117]:

    
infr.to_csv('infra_corregido.csv')
infr_secciones.to_csv('infr_secciones.csv')



In [116]:

    
for j in range(9):
    print("Sección",j,":\t",infr[secciones[j]].sum(axis=1).max())









    



Sección 0 :	 5
Sección 1 :	 13.0
Sección 2 :	 13.0
Sección 3 :	 11.0
Sección 4 :	 14.0
Sección 5 :	 10.0
Sección 6 :	 2.0
Sección 7 :	 11.0
Sección 8 :	 8.0



In [125]:

    
infr_secciones[sec_lst].max()









    Out[125]:





seccion_1     5
seccion_2    13
seccion_3    13
seccion_4    11
seccion_5    14
seccion_6    10
seccion_7     2
seccion_8    11
seccion_9     8
dtype: float64



In [14]:

    
infr = pd.read_csv('infr_secciones.csv', index_col=0)
infr.head()



In [4]:

    
loc = pd.read_csv('TR_EXP_INMUEBLE_CONTROL.txt', header=0, sep='|', encoding='latin-1', low_memory=False)



In [5]:

    
#loc[~loc['DESCRUBIC'].isnull()]['DESCRUBIC']#loc[['NOMVIAL', 'NUMEXT1', 'NEXTALF1', 'NUMINT', 'NUMINTALF', 'ENTRECA', 'YCALLE', 'DESCRUBIC']]
Y=loc['Y'].map(lambda x: -float(str(x).replace(",","."))/10000)
X=loc['X'].map(lambda x: float(str(x).replace(",","."))/100000)
XY=pd.DataFrame({'Longitud':Y, 'Latitud':X})



In [63]:

    
loc_headers=['ENT', 'MUN','LOC', 'AGEB', 'MZA', 'X', 'Y']
common_headers=['ENT', 'MUN','LOC', 'AGEB', 'MZA']



In [64]:

    
location = loc[loc_headers]



In [65]:

    
len(infr[common_headers])/len(location)









    Out[65]:





1.181369674508792



In [6]:

    
full_data = pd.merge(infr, loc[loc_headers], how='outer',left_on=common_headers, right_on=common_headers)
inner_data = pd.merge(infr, loc[loc_headers], how='inner',left_on=common_headers, right_on=common_headers)
left_data = pd.merge(infr, loc[loc_headers], how='left',left_on=common_headers, right_on=common_headers)









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-819c5ab440c6> in <module>()
----> 1 full_data = pd.merge(infr, loc[loc_headers], how='outer',left_on=common_headers, right_on=common_headers)
      2 inner_data = pd.merge(infr, loc[loc_headers], how='inner',left_on=common_headers, right_on=common_headers)
      3 left_data = pd.merge(infr, loc[loc_headers], how='left',left_on=common_headers, right_on=common_headers)

NameError: name 'infr' is not defined



In [67]:

    
print(len(full_data))
print(len(inner_data))



In [68]:

    
full_data.to_csv('full_data.csv')
inner_data.to_csv('inner_data.csv')
left_data.to_csv('left_data.csv')



In [69]:

    
len(left_data)









    Out[69]:





204656



In [78]:

    
#inner_data['X'].apply(lambda x: float(str(x).replace(",","."))/100000)
print(min(inner_data['X']))
print(max(inner_data['X']))
print(min(inner_data['Y']))
print(max(inner_data['Y']))









    



1076159.9242
4066239.0212
1000010.4931
999994.69286



In [80]:

    
inner_data['Y'].map(lambda x: -float(str(x).replace(",","."))/10000).hist()









    Out[80]:





<matplotlib.axes._subplots.AxesSubplot at 0x104ba1f60>



In [81]:

    
inner_data['X'].map(lambda x: float(str(x).replace(",","."))/10000).hist()









    Out[81]:





<matplotlib.axes._subplots.AxesSubplot at 0x106f06f28>



In [108]:

    
sns.jointplot(x='Latitud', y='Longitud', data=XY)









    Out[108]:





<seaborn.axisgrid.JointGrid at 0x129304b00>






    



/Users/luis/anaconda/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):



In [128]:

    
sns.jointplot(x='Latitud', y='Longitud', data=XY, size=10, ratio=5,s=1)









    Out[128]:





<seaborn.axisgrid.JointGrid at 0x13359ceb8>






    



/Users/luis/anaconda/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):



In [129]:

    
XY.head()



In [7]:

    
diferencias = ((loc['LATITUD_CARGA'].map(lambda x: float(str(x).replace(",",".")))- loc['X'].map(lambda x: float(str(x).replace(",","."))))**2+(loc['LONGITUD_CARGA'].map(lambda x: float(str(x).replace(",",".")))-loc['Y'].map(lambda x: float(str(x).replace(",","."))))**2).map(lambda x: sqrt(x))



In [8]:

    
x1 = loc['LATITUD_CARGA'].map(lambda x: float(str(x).replace(",",".")))
x2 = loc['X'].map(lambda x: float(str(x).replace(",",".")))
y1 = loc['LONGITUD_CARGA'].map(lambda x: float(str(x).replace(",",".")))
y2 = loc['Y'].map(lambda x: float(str(x).replace(",",".")))



In [9]:

    
plt.figure(figsize=(12,4))
diferencias.hist(bins = int(sqrt(len(diferencias))))









    Out[9]:





<matplotlib.axes._subplots.AxesSubplot at 0x11766def0>



In [10]:

    
def heaversine(x1,y1,x2,y2):
    a = sin((x2-x1)/2)**2+cos(x1)*cos(x2)*sin((y2-y1)/2)**2
    c = 2*atan2(sqrt(a),sqrt(1-a))
    return 6371*c



In [212]:

    
vals=loc[['LATITUD_CARGA', 'LONGITUD_CARGA','X']].groupby(['LATITUD_CARGA', 'LONGITUD_CARGA']).count()['X'].values



In [215]:

    
vals = pd.DataFrame(vals)



In [276]:

    
vals.hist(bins=320)









    Out[276]:





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x148a93da0>]], dtype=object)



In [11]:

    
data = pd.DataFrame({'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2})



In [12]:

    
distancias_transformadas = data.apply(lambda s: heaversine(s['x1'],s['y1'],s['x2'],s['y2']),axis=1)



In [17]:

    
distancias_transformadas.map(lambda x:log(x)).hist(bins=int(sqrt(len(distancias_transformadas))))









    Out[17]:





<matplotlib.axes._subplots.AxesSubplot at 0x110bde588>



In [19]:

    
inner_data









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-19-b0eb0c17f2ca> in <module>()
----> 1 inner_data

NameError: name 'inner_data' is not defined



In [3]:

    
inner_data = pd.read_csv('inner_data.csv')









    



/Users/luis/anaconda/lib/python3.4/site-packages/IPython/core/interactiveshell.py:2902: DtypeWarning: Columns (16,17) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)



In [106]:

    
secciones = ['seccion_1', 'seccion_2', 'seccion_3', 'seccion_4', 'seccion_5', 'seccion_6', 'seccion_7', 'seccion_8', 'seccion_9']
resumen = ['ENT']
resumen.extend(secciones)
resumen









    Out[106]:





['ENT',
 'seccion_1',
 'seccion_2',
 'seccion_3',
 'seccion_4',
 'seccion_5',
 'seccion_6',
 'seccion_7',
 'seccion_8',
 'seccion_9']



In [5]:

    
x=300
tmp_data = inner_data[['ENT', 'seccion_1', 'seccion_2', 'seccion_3', 'seccion_4', 'seccion_5', 'seccion_6', 'seccion_7', 'seccion_8', 'seccion_9']]



In [ ]:



In [6]:

    
#plt.scatter([1, 2, 3, 4, 5, 6, 7, 8, 9], tmp_data['ENT'].values, s=tmp_data.ix[x:x+9][secciones].apply(lambda x: 10*x))
#tmp_data.groupby(['ENT']).quantile(0.99)
calificaciones = inner_data[secciones].sum(axis=1)



In [7]:

    
inner_data['cal_final'] = calificaciones



In [8]:

    
inner_data['cal_final'].median()









    Out[8]:





45.8



In [53]:

    
inner_agg=inner_data[resumen].groupby(['ENT']).agg('mean')



In [54]:

    
tmp_sum=inner_agg.sum(axis=1)/100
tmp_sum









    Out[54]:





ENT
2     0.537939
3     0.517053
4     0.446552
5     0.573427
6     0.516840
7     0.317523
8     0.441015
9     0.568376
10    0.426499
11    0.445975
12    0.375210
13    0.364790
14    0.466537
15    0.528691
16    0.447125
18    0.476000
19    0.517784
20    0.321473
21    0.462190
23    0.410151
24    0.411188
25    0.438324
26    0.485331
27    0.428555
29    0.370000
30    0.419388
31    0.474968
32    0.414102
dtype: float64



In [22]:

    
max_vals = np.array([11, 13, 13, 11, 14, 17, 2, 11, 8])



In [ ]:



In [10]:

    
new_names = dict(zip(secciones,['1', '2', '3', '4', '5', '6', '7', '8', '9']))
inner_agg = inner_agg.rename(columns = new_names)



In [55]:

    
for i in inner_agg.index.values:
    inner_agg.ix[i]=inner_agg.ix[i]/max_vals



In [56]:

    
inner_agg['total'] = tmp_sum



In [57]:

    
inner_agg



In [164]:

    
coef = 1000
#COLORS
color = ['b', 'r', 'g']
colors = [[color[0]]*9, [color[1]]*9, [color[2]]*9 ]
for i in range(3):
    colors[i].extend('y')
#GRID
fig = plt.figure(figsize=(10,30))
ax = fig.add_subplot(1,1,1)
ax.set_axis_bgcolor('white')

major_yticks = np.arange(0, 33, 5)
ax.set_yticks(major_yticks)                                           
major_xticks = np.arange(0, 11, 5)
ax.set_xticks(major_xticks)


minor_yticks = np.arange(0, 33, 1)
ax.set_yticks(minor_yticks, minor=True)                                           
minor_xticks = np.arange(0, 11, 1)
ax.set_xticks(minor_xticks, minor=True)



#ax.grid(which='major', c = 'white', linestyle='-')
ax.grid(which='minor', c = 'green', linestyle=':')


for i in inner_agg.index.values:
    plt.scatter(range(1,11),[i]*10, s=coef, c = colors[i%3], alpha=0.1, edgecolor='k', lw=1)
    plt.scatter(range(1,11), [i]*10, s=inner_agg.ix[i].map(lambda x: coef*x), c = colors[i%3])
plt.savefig('test.png', bbox_inches='tight')
plt.show()









    



/Users/luis/anaconda/lib/python3.4/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):



In [71]:

    
inner_agg['total'].plot()









    Out[71]:





<matplotlib.axes._subplots.AxesSubplot at 0x109198c18>



In [107]:

    
inner_agg[secciones].plot(figsize=(20,8), kind='bar', stacked=True)









    Out[107]:





<matplotlib.axes._subplots.AxesSubplot at 0x10d0b7470>



In [125]:

    
fig=plt.figure()









    





<matplotlib.figure.Figure at 0x1176780b8>



In [126]:



In [127]:

    
ax









    Out[127]:





<matplotlib.axes._subplots.AxesSubplot at 0x117678b00>



In [150]:

    
minor_xticks









    Out[150]:





array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])



In [ ]:

	Latitud	Longitud
0	28.034619	85.692449
1	28.026875	85.679924
2	27.992703	84.988437
3	27.992288	85.114371
4	27.994609	84.962842

	seccion_1	seccion_2	seccion_3	seccion_4	seccion_5	seccion_6	seccion_7	seccion_8	seccion_9	total
ENT
2	0.121927	0.792511	0.788793	0.395282	0.753682	0.200733	0.218067	0.626755	0.781646	0.537939
3	0.120186	0.878856	0.830533	0.457436	0.720168	0.162555	0.291282	0.431375	0.619487	0.517053
4	0.114517	0.932916	0.760286	0.257106	0.581848	0.120410	0.114341	0.384056	0.488695	0.446552
5	0.159645	0.969043	0.911820	0.414634	0.806794	0.179412	0.658537	0.459867	0.731707	0.573427
6	0.113054	0.933925	0.876726	0.333333	0.734249	0.159766	0.282051	0.534615	0.474359	0.516840
7	0.138276	0.622362	0.585957	0.092025	0.412869	0.117713	0.034619	0.290845	0.307625	0.317523
8	0.126585	0.783828	0.738228	0.254687	0.662540	0.153636	0.108175	0.293837	0.598110	0.441015
9	0.122912	0.954153	0.903672	0.290823	0.789570	0.239824	0.184403	0.796426	0.484288	0.568376
10	0.112819	0.829115	0.769643	0.241010	0.655383	0.113312	0.118210	0.178135	0.584545	0.426499
11	0.114603	0.887870	0.801900	0.206186	0.666036	0.111531	0.065207	0.249409	0.625920	0.445975
12	0.121728	0.770769	0.677701	0.059082	0.491643	0.116673	0.041796	0.383866	0.441176	0.375210
13	0.103226	0.820968	0.732837	0.037097	0.515084	0.084244	0.001075	0.201779	0.483871	0.364790
14	0.121181	0.842609	0.841942	0.180758	0.673511	0.134244	0.048929	0.487586	0.532577	0.466537
15	0.117793	0.912617	0.856746	0.245453	0.712556	0.183394	0.079585	0.650293	0.683223	0.528691
16	0.125631	0.890558	0.826322	0.114149	0.639912	0.138128	0.062500	0.331179	0.585069	0.447125
18	0.090909	0.923077	0.923077	0.000000	0.671429	0.223529	1.000000	0.309091	0.500000	0.476000
19	0.120649	0.899570	0.801149	0.376322	0.745900	0.180982	0.245434	0.462030	0.638738	0.517784
20	0.121552	0.741799	0.666412	0.036084	0.492100	0.098757	0.018280	0.099370	0.301088	0.321473
21	0.119893	0.860686	0.777718	0.135310	0.604782	0.120335	0.057260	0.600843	0.609504	0.462190
23	0.120225	0.897394	0.724630	0.263844	0.635284	0.107262	0.078176	0.251969	0.257329	0.410151
24	0.119892	0.869823	0.741285	0.126167	0.580499	0.113492	0.045627	0.244430	0.578926	0.411188
25	0.118357	0.883553	0.777566	0.153712	0.648066	0.128725	0.144141	0.253224	0.613765	0.438324
26	0.113837	0.797429	0.817362	0.288593	0.723329	0.160189	0.248597	0.461150	0.586808	0.485331
27	0.120312	0.877890	0.735384	0.175096	0.590148	0.112030	0.167612	0.344259	0.543094	0.428555
29	0.181818	0.692308	0.846154	0.000000	0.442857	0.000000	0.000000	0.436364	0.500000	0.370000
30	0.115824	0.833389	0.740629	0.086622	0.569393	0.111261	0.047243	0.433832	0.565013	0.419388
31	0.117727	0.948770	0.806081	0.349465	0.686585	0.150235	0.133776	0.364025	0.388327	0.474968
32	0.111398	0.831928	0.802964	0.166247	0.628412	0.106737	0.085011	0.161060	0.568566	0.414102

	ENT	MUN	LOC	AGEB	MZA	ID_INM	seccion_1	seccion_2	seccion_3	seccion_4	seccion_5	seccion_6	seccion_8	seccion_9
0	2	4	1	2876	23	1914	2	8.0	11	11.0	11.75	4.80	8.8	4
1	2	4	1	2880	103	1915	2	8.5	11	5.5	11.50	2.75	4.8	8
2	2	4	1	3677	5	1916	2	9.0	11	5.5	11.75	2.75	4.8	4
3	2	4	1	3677	9	1917	2	8.0	11	11.0	11.75	2.65	10.0	0
4	2	4	1	3677	23	1918	2	8.0	11	5.5	10.70	5.35	8.4	8