In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

This script takes a CSV file exported from REDATAM (the export is a SYLK file, so we first need to open it and save it as a CSV file). Then we go through the folder that holds the different files for each comuna and append all of them into one data frame, which we save so that it can be mapped.


In [3]:
def readRedatamCSV(asciiFile):
    """Parse a REDATAM table export into an area/measure DataFrame.

    The file is read line by line and every line is split on whitespace.
    Only three kinds of lines matter, recognised by their first token:

    * ``AREA``  -- the third token is split on commas and its first field
      is stored as the area code.
    * ``Total`` -- the third token is split on commas and its third field
      is stored as the measure; a ``'-'`` placeholder becomes ``np.nan``.
    * a first token containing ``RESUMEN`` -- parsing stops at that line.

    Parameters
    ----------
    asciiFile : str
        Path of the text file to parse.

    Returns
    -------
    pandas.DataFrame or None
        A frame with columns ``area`` and ``measure``. Measures are kept as
        the strings found in the file (or NaN); no numeric conversion is
        done here. If the number of ``AREA`` lines and ``Total`` lines
        differs, the two columns cannot be paired: the path is printed and
        ``None`` is returned.
    """
    areas = []
    measures = []
    # The context manager guarantees the handle is closed, even if a
    # malformed line raises while parsing.
    with open(asciiFile, 'r') as f:
        for line in f:
            columns = line.strip().split()
            if not columns:
                continue
            if 'RESUMEN' in columns[0]:
                break
            elif columns[0] == 'AREA':
                areas.append(columns[2].split(',')[0])
            elif columns[0] == 'Total':
                measure = columns[2].split(',')[2]
                measures.append(np.nan if measure == '-' else measure)

    # Unequal lengths are the one condition under which the frame cannot be
    # built; report the offending file instead of hiding every error.
    if len(areas) != len(measures):
        print(asciiFile)
        return None
    return pd.DataFrame({'area': areas, 'measure': measures})

In [4]:
# Parse the comuna-level file; the resulting `area` column is what later
# cells iterate over to locate the per-comuna files.
# NOTE(review): absolute, machine-specific path -- the notebook will not run
# elsewhere without editing it.
comunasFile = '/home/pipe/Dropbox/NYU/classes/Applied Data Science/adsProject/data/indecOnline/headEducYjobs/comuna.csv'
comunas = readRedatamCSV(comunasFile)

In [5]:
# Display the area codes parsed from the comuna file (one file name per code
# is built from these further down).
comunas.area


Out[5]:
0     02001
1     02002
2     02003
3     02004
4     02005
5     02006
6     02007
7     02008
8     02009
9     02010
10    02011
11    02012
12    02013
13    02014
14    02015
Name: area, dtype: object

In [6]:
# Build `baseMadre`: one table holding the rows of every per-comuna file.
# Each area code in `comunas` names a CSV file inside `ruta`.
# NOTE(review): absolute, machine-specific path; `ruta` must keep its
# trailing slash because file names are appended by plain concatenation.
ruta = '/home/pipe/Dropbox/NYU/classes/Applied Data Science/adsProject/data/indecOnline/MODELO1E/'
tablas = []
for i in comunas.area:
    archivoCSV = ruta + i + '.csv'
    data = readRedatamCSV(archivoCSV)
    # readRedatamCSV yields None for a file it could not tabulate (it prints
    # the path itself), so such files are left out of the combined table.
    if data is not None:
        tablas.append(data)

# Concatenate once instead of appending inside the loop: DataFrame.append
# copied the whole frame on every iteration and no longer exists in
# pandas >= 2.0. Row labels are kept as they come from each file, as before.
# With nothing to combine, fall back to an empty frame with the same columns.
baseMadre = pd.concat(tablas) if tablas else comunas.iloc[:0].copy()

In [8]:
# Measures were read as text (with NaN for missing values); turn the whole
# column into floats so it can be summarised numerically.
baseMadre['measure'] = baseMadre['measure'].astype(float)

In [35]:
# Blank out every column of the rows whose area code is '020041801', so the
# later dropna() discards them.
# NOTE(review): why this particular area is excluded is not stated in the
# notebook, and the `area` value itself is overwritten too -- confirm that
# is intended.
baseMadre.loc[baseMadre['area'] == '020041801', :] = np.nan

In [36]:
# Save the combined table into the same folder the per-comuna files were
# read from; index=False keeps the row labels out of the CSV.
baseMadre.to_csv(ruta + 'modelo1e.csv',index=False)

In [37]:
# Summary statistics of `measure`, computed only over rows without any
# missing value.
baseMadre.dropna().measure.describe()


Out[37]:
count    3551.000000
mean     5472.165472
std       486.080279
min      2733.180000
25%      5233.565000
50%      5513.410000
75%      5767.475000
max      7291.030000
Name: measure, dtype: float64

In [ ]: