In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
This script reads CSV files exported from REDATAM (each export is a SYLK file, which first has to be opened and saved as a CSV file). Then we go through the folder containing the different files, one per comuna, and append all of them into a single data frame, which we save so that it can then be mapped.
In [3]:
def readRedatamCSV(asciiFile):
    """Parse a REDATAM export saved as CSV into an area/measure DataFrame.

    The file is scanned line by line. Each line is split on whitespace and
    its first token decides how the line is handled:

    * a first token containing 'RESUMEN' stops the scan;
    * an 'AREA' line contributes an area code: the text before the first
      comma of the third whitespace-separated token;
    * a 'Total' line contributes a measure: the third comma-separated field
      of the third whitespace-separated token, with '-' stored as NaN.

    Blank lines and every other line are ignored.

    Parameters
    ----------
    asciiFile : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame or None
        A frame with columns 'area' and 'measure' (measures are kept as the
        strings read from the file, or NaN). If the frame cannot be built
        because the number of areas and measures differ, the file path is
        printed and None is returned.
    """
    areas = []
    measures = []
    # The context manager closes the handle even if a line fails to parse.
    with open(asciiFile, 'r') as f:
        for line in f:
            columns = line.strip().split()
            if not columns:
                continue
            if 'RESUMEN' in columns[0]:
                break
            elif columns[0] == 'AREA':
                areas.append(columns[2].split(',')[0])
            elif columns[0] == 'Total':
                measure = columns[2].split(',')[2]
                if measure == '-':
                    measure = np.nan
                measures.append(measure)
    try:
        return pd.DataFrame({'area': areas, 'measure': measures})
    except ValueError:
        # Lists of different length cannot form a frame: report which file
        # was affected and let the caller decide what to do with None.
        print(asciiFile)
        return None
In [4]:
# Index file: it is parsed into a frame whose `area` column supplies the
# names of the per-area files that are read further down.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
comunasFile = '/home/pipe/Dropbox/NYU/classes/Applied Data Science/adsProject/data/indecOnline/headEducYjobs/comuna.csv'
comunas = readRedatamCSV(comunasFile)
In [5]:
# Show the area codes that were read from the index file.
comunas['area']
Out[5]:
In [6]:
# Seed frame: the rows of `comunas` whose measure equals 0, so the combined
# frame starts out with the same columns as the per-area frames.
# NOTE(review): presumably this selection is meant to be empty — confirm.
baseMadre = comunas.loc[comunas.measure==0,:]
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
ruta = '/home/pipe/Dropbox/NYU/classes/Applied Data Science/adsProject/data/indecOnline/MODELO1E/'
# Read one file per area code and stack everything with a single concat.
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame
# on every iteration of the loop.
marcos = [baseMadre]
for i in comunas.area:
    archivoCSV = ruta + i + '.csv'
    data = readRedatamCSV(archivoCSV)
    # readRedatamCSV yields None when it could not build a frame for a file.
    if data is not None:
        marcos.append(data)
baseMadre = pd.concat(marcos)
In [8]:
# Convert every measure to a float, element by element.
baseMadre['measure'] = baseMadre['measure'].apply(float)
In [35]:
# Blank out (set to NaN) every column of the rows whose area code is
# '020041801'; the rows themselves stay in the frame.
# NOTE(review): the reason this particular area is excluded is not stated
# anywhere in the notebook — confirm and document it.
baseMadre[baseMadre.area=='020041801'] = np.nan
In [36]:
# Save the combined frame as 'modelo1e.csv' inside the `ruta` folder,
# leaving the index out of the file.
baseMadre.to_csv(path_or_buf=ruta + 'modelo1e.csv', index=False)
In [37]:
# Summary statistics of the measure, computed only over rows with no missing values.
baseMadre.dropna()['measure'].describe()
Out[37]:
In [ ]: