In [21]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns;
from numpy import nan
from math import sqrt, sin, cos, atan2, log
plt.style.use('ggplot')
from IPython.core.display import HTML
css = open('style-table.css').read()+ open('style-notebook.css').read()
HTML('<style>{}</style>'.format(css))
Out[21]:
In [7]:
test = pd.read_csv('TR_EXP_INMUEBLE_CONTROL.txt', header=0, sep='|', encoding='latin-1')
In [13]:
test['LATITUD_CARGA']
Out[13]:
In [87]:
infr = pd.read_csv('infra_corregido.csv')
In [63]:
df = pd.read_excel('INMUEBLES.xlsx')
In [67]:
df = df[df['P3']<6]
In [68]:
df.ix[df.P12>2, 'P12'] = 3
df['P12'].replace({1: 1.5, 2: 1, 3: 0},inplace=True)
In [69]:
df.ix[df.P20>2,'P20']=0
df.ix[df.P22>2,'P22']=0
In [70]:
df.ix[(df.P22==1),'P20']=0 #se ignora p20 si p22=1
In [71]:
df['P20'].replace({1: 3, 2: 0},inplace=True)
In [99]:
df.ix[df['P12'].isnull(),'P12']=0
df.ix[df['P20'].isnull(),'P20']=0
In [109]:
infr['P20'].unique()
Out[109]:
In [107]:
infr.ix[:,'P20'] = df['P20'].values
infr.ix[:,'P12'] = df['P12'].values
In [95]:
infr.to_csv('infra_corregido.csv')
In [8]:
#Init
secciones = [['P3'], ['P11', 'P12', 'P13A', 'P14', 'P15', 'P16'], ['P17A', 'P18A', 'P19', 'P20', 'P21', 'P22'], ['P23', 'P24'], ['P25', 'P26', 'P27', 'P28', 'P29', 'P30', 'P31', 'P32', 'P33', 'P34', 'P35', 'P36', 'P37', 'P38', 'P39', 'P40', 'P41'], ['P42', 'P44', 'P46', 'P47', 'P48', 'P49', 'P52', 'P62', 'P72', 'P82', 'P92', 'P102', 'P103', 'P112', 'P113', 'P117', 'P122', 'P123', 'P125'], ['P126'], ['P133', 'P134', 'P135', 'P136', 'P137', 'P138', 'P139', 'P140', 'P141', 'P142', 'P143'], ['P144', 'P145']]
max_cal = [11, 13, 13, 11, 14, 17, 2, 11, 8]
sec_lst = []
for i in range(1,10):
sec_lst.append('seccion_'+str(i))
In [110]:
i=1
for seccion in secciones:
infr['seccion_'+str(i)]=infr[seccion].sum(axis=1)
i+=1
#dict(zip([1,2,3,4], [a,b,c,d]))
In [111]:
new_columns = ['ENT','MUN','LOC','AGEB','MZA','ID_INM']
new_columns.extend(sec_lst)
infr_secciones = infr[new_columns]
In [117]:
infr.to_csv('infra_corregido.csv')
infr_secciones.to_csv('infr_secciones.csv')
In [116]:
for j in range(9):
print("Sección",j,":\t",infr[secciones[j]].sum(axis=1).max())
In [125]:
infr_secciones[sec_lst].max()
Out[125]:
In [14]:
infr = pd.read_csv('infr_secciones.csv', index_col=0)
infr.head()
Out[14]:
In [4]:
loc = pd.read_csv('TR_EXP_INMUEBLE_CONTROL.txt', header=0, sep='|', encoding='latin-1', low_memory=False)
In [5]:
#loc[~loc['DESCRUBIC'].isnull()]['DESCRUBIC']#loc[['NOMVIAL', 'NUMEXT1', 'NEXTALF1', 'NUMINT', 'NUMINTALF', 'ENTRECA', 'YCALLE', 'DESCRUBIC']]
Y=loc['Y'].map(lambda x: -float(str(x).replace(",","."))/10000)
X=loc['X'].map(lambda x: float(str(x).replace(",","."))/100000)
XY=pd.DataFrame({'Longitud':Y, 'Latitud':X})
In [63]:
loc_headers=['ENT', 'MUN','LOC', 'AGEB', 'MZA', 'X', 'Y']
common_headers=['ENT', 'MUN','LOC', 'AGEB', 'MZA']
In [64]:
location = loc[loc_headers]
In [65]:
len(infr[common_headers])/len(location)
Out[65]:
In [6]:
full_data = pd.merge(infr, loc[loc_headers], how='outer',left_on=common_headers, right_on=common_headers)
inner_data = pd.merge(infr, loc[loc_headers], how='inner',left_on=common_headers, right_on=common_headers)
left_data = pd.merge(infr, loc[loc_headers], how='left',left_on=common_headers, right_on=common_headers)
In [67]:
print(len(full_data))
print(len(inner_data))
In [68]:
full_data.to_csv('full_data.csv')
inner_data.to_csv('inner_data.csv')
left_data.to_csv('left_data.csv')
In [69]:
len(left_data)
Out[69]:
In [78]:
#inner_data['X'].apply(lambda x: float(str(x).replace(",","."))/100000)
print(min(inner_data['X']))
print(max(inner_data['X']))
print(min(inner_data['Y']))
print(max(inner_data['Y']))
In [80]:
inner_data['Y'].map(lambda x: -float(str(x).replace(",","."))/10000).hist()
Out[80]:
In [81]:
inner_data['X'].map(lambda x: float(str(x).replace(",","."))/10000).hist()
Out[81]:
In [108]:
sns.jointplot(x='Latitud', y='Longitud', data=XY)
Out[108]:
In [128]:
sns.jointplot(x='Latitud', y='Longitud', data=XY, size=10, ratio=5,s=1)
Out[128]:
In [129]:
XY.head()
Out[129]:
In [7]:
diferencias = ((loc['LATITUD_CARGA'].map(lambda x: float(str(x).replace(",",".")))- loc['X'].map(lambda x: float(str(x).replace(",","."))))**2+(loc['LONGITUD_CARGA'].map(lambda x: float(str(x).replace(",",".")))-loc['Y'].map(lambda x: float(str(x).replace(",","."))))**2).map(lambda x: sqrt(x))
In [8]:
x1 = loc['LATITUD_CARGA'].map(lambda x: float(str(x).replace(",",".")))
x2 = loc['X'].map(lambda x: float(str(x).replace(",",".")))
y1 = loc['LONGITUD_CARGA'].map(lambda x: float(str(x).replace(",",".")))
y2 = loc['Y'].map(lambda x: float(str(x).replace(",",".")))
In [9]:
plt.figure(figsize=(12,4))
diferencias.hist(bins = int(sqrt(len(diferencias))))
Out[9]:
In [10]:
def heaversine(x1,y1,x2,y2):
a = sin((x2-x1)/2)**2+cos(x1)*cos(x2)*sin((y2-y1)/2)**2
c = 2*atan2(sqrt(a),sqrt(1-a))
return 6371*c
In [212]:
vals=loc[['LATITUD_CARGA', 'LONGITUD_CARGA','X']].groupby(['LATITUD_CARGA', 'LONGITUD_CARGA']).count()['X'].values
In [215]:
vals = pd.DataFrame(vals)
In [276]:
vals.hist(bins=320)
Out[276]:
In [11]:
data = pd.DataFrame({'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2})
In [12]:
distancias_transformadas = data.apply(lambda s: heaversine(s['x1'],s['y1'],s['x2'],s['y2']),axis=1)
In [17]:
distancias_transformadas.map(lambda x:log(x)).hist(bins=int(sqrt(len(distancias_transformadas))))
Out[17]:
In [19]:
inner_data
In [3]:
inner_data = pd.read_csv('inner_data.csv')
In [106]:
secciones = ['seccion_1', 'seccion_2', 'seccion_3', 'seccion_4', 'seccion_5', 'seccion_6', 'seccion_7', 'seccion_8', 'seccion_9']
resumen = ['ENT']
resumen.extend(secciones)
resumen
Out[106]:
In [5]:
x=300
tmp_data = inner_data[['ENT', 'seccion_1', 'seccion_2', 'seccion_3', 'seccion_4', 'seccion_5', 'seccion_6', 'seccion_7', 'seccion_8', 'seccion_9']]
In [ ]:
In [6]:
#plt.scatter([1, 2, 3, 4, 5, 6, 7, 8, 9], tmp_data['ENT'].values, s=tmp_data.ix[x:x+9][secciones].apply(lambda x: 10*x))
#tmp_data.groupby(['ENT']).quantile(0.99)
calificaciones = inner_data[secciones].sum(axis=1)
In [7]:
inner_data['cal_final'] = calificaciones
In [8]:
inner_data['cal_final'].median()
Out[8]:
In [53]:
inner_agg=inner_data[resumen].groupby(['ENT']).agg('mean')
In [54]:
tmp_sum=inner_agg.sum(axis=1)/100
tmp_sum
Out[54]:
In [22]:
max_vals = np.array([11, 13, 13, 11, 14, 17, 2, 11, 8])
In [ ]:
In [10]:
new_names = dict(zip(secciones,['1', '2', '3', '4', '5', '6', '7', '8', '9']))
inner_agg = inner_agg.rename(columns = new_names)
In [55]:
for i in inner_agg.index.values:
inner_agg.ix[i]=inner_agg.ix[i]/max_vals
In [56]:
inner_agg['total'] = tmp_sum
In [57]:
inner_agg
Out[57]:
In [164]:
coef = 1000
#COLORS
color = ['b', 'r', 'g']
colors = [[color[0]]*9, [color[1]]*9, [color[2]]*9 ]
for i in range(3):
colors[i].extend('y')
#GRID
fig = plt.figure(figsize=(10,30))
ax = fig.add_subplot(1,1,1)
ax.set_axis_bgcolor('white')
major_yticks = np.arange(0, 33, 5)
ax.set_yticks(major_yticks)
major_xticks = np.arange(0, 11, 5)
ax.set_xticks(major_xticks)
minor_yticks = np.arange(0, 33, 1)
ax.set_yticks(minor_yticks, minor=True)
minor_xticks = np.arange(0, 11, 1)
ax.set_xticks(minor_xticks, minor=True)
#ax.grid(which='major', c = 'white', linestyle='-')
ax.grid(which='minor', c = 'green', linestyle=':')
for i in inner_agg.index.values:
plt.scatter(range(1,11),[i]*10, s=coef, c = colors[i%3], alpha=0.1, edgecolor='k', lw=1)
plt.scatter(range(1,11), [i]*10, s=inner_agg.ix[i].map(lambda x: coef*x), c = colors[i%3])
plt.savefig('test.png', bbox_inches='tight')
plt.show()
In [71]:
inner_agg['total'].plot()
Out[71]:
In [107]:
inner_agg[secciones].plot(figsize=(20,8), kind='bar', stacked=True)
Out[107]:
In [125]:
fig=plt.figure()
In [126]:
In [127]:
ax
Out[127]:
In [150]:
minor_xticks
Out[150]:
In [ ]: