notebook.community

Edit and run



In [6]:

    
import pandas as pd
import numpy as np
import os
import sys
import simpledbf

def getEPHdbf_i(censusstring):
    print ("Downloading", censusstring)
    ### First I will check that it is not already there
    if not os.path.isfile("data/Individual_" + censusstring + ".dbf"):
        if os.path.isfile('Individual_' + censusstring + ".dbf"):
            # if in the current dir just move it
            if os.system("mv " + 'Individual_' + censusstring + ".dbf " + "data/"):
                print ("Error moving file!, Please check!")
        # otherwise start looking for the zip file
        else:
            if not os.path.isfile("data/" + censusstring + "_dbf.zip"):
                if not os.path.isfile(censusstring + "_dbf.zip"):
                    os.system(
                        "curl -O http://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/" + censusstring + "_dbf.zip")
                ###  To move it I use the os.system() functions to run bash commands with arguments
                os.system("mv " + censusstring + "_dbf.zip " + "data/")
            ### unzip the csv
            os.system("unzip " + "data/" + censusstring + "_dbf.zip -d data/")

    if not os.path.isfile("data/" + 'Individual_' + censusstring + ".dbf"):
        print ("WARNING!!! something is wrong: the file is not there!")

    else:
        print ("file in place, creating CSV file")

    trimestre = censusstring

    dbf = simpledbf.Dbf5('data/Individual_' + trimestre + '.dbf',codec='latin1')
    indRaw = dbf.to_dataframe()

    indNoW = indRaw.loc[indRaw.REGION == 1,['CODUSU',
                        'NRO_HOGAR',
                        'PONDERA',
                        'CH03',
                        'CH04',
                        'CH06',
                        'CH07',
                        'CH15',
                        'NIVEL_ED',
                        'ESTADO',
                        'CAT_OCUP',
#                         'ITF',
#                         'IPCF',
                        'P47T']]

    indNoW.columns = ['CODUSU',
                        'NRO_HOGAR',
                        'PONDERA',
                        'Parentesco',
                        'Sexo',
                        'Edad',
                        'Estado_Civil',
                        'Lugar_Nac',
                        'NIVEL_ED',
                        'Trabajo',
                        'CAT_OCUP',
#                         'Ingr_Total_Fam',
#                         'Ingr_Per_Capita_Fam',
                        'Ingr_Total_Ind']
    indNoW.index =range(0,indNoW.shape[0])

    indNoW.to_csv('data/cleanData' + trimestre + '.csv', index = False)
    print 'csv file cleanData',trimestre,'.csv successfully created in folder data/'
    return



In [7]:

    
def dummy_variables(data, data_type_dict):
    #Loop over nominal variables.
    for variable in filter(lambda x: data_type_dict[x]=='nominal',
                           data_type_dict.keys()):
 
        #First we create the columns with dummy variables.
        #Note that the argument 'prefix' means the column names will be
        #prefix_value for each unique value in the original column, so
        #we set the prefix to be the name of the original variable.
        dummy_df=pd.get_dummies(data[variable], prefix=variable)
 
        #Remove old variable from dictionary.
        data_type_dict.pop(variable)
 
        #Add new dummy variables to dictionary.
        for dummy_variable in dummy_df.columns:
            data_type_dict[dummy_variable] = 'nominal'
 
        #Add dummy variables to main df.
        data=data.drop(variable, axis=1)
        data=data.join(dummy_df)
 
    return [data, data_type_dict]



In [69]:

    
def make_dummy(data):
    data['edad-sexo'] = 'no-data'
    data.loc[(data['Sexo'] == 0) & (data['Edad'] < 14), 'edad-sexo'] = 'male_0_14'
    data.loc[(data['Sexo'] == 0) & (data['Edad'] >= 14) & (data['Edad'] < 25), 'edad-sexo'] = 'male_14_24'
    data.loc[(data['Sexo'] == 0) & (data['Edad'] >= 25) & (data['Edad'] < 35), 'edad-sexo'] = 'male_25_34'
    data.loc[(data['Sexo'] == 0) & (data['Edad'] >= 35), 'edad-sexo'] = 'male_35_more'
    data.loc[(data['Sexo'] == 1) & (data['Edad'] < 14), 'edad-sexo'] = 'female_0_14'
    data.loc[(data['Sexo'] == 1) & (data['Edad'] >= 14) & (data['Edad'] < 25), 'edad-sexo'] = 'female_14_24'
    data.loc[(data['Sexo'] == 1) & (data['Edad'] >= 25) & (data['Edad'] < 35), 'edad-sexo'] = 'female_25_34'
    data.loc[(data['Sexo'] == 1) & (data['Edad'] >= 35), 'edad-sexo'] = 'female_35_more'
    return



In [73]:

    
getEPHdbf_i('t310')









    



('Downloading', 't310')
file in place, creating CSV file
csv file cleanData t310 .csv successfully created in folder data/



In [126]:

    
data = pd.read_csv('data/cleanDatat310.csv')
data.head()









    Out[126]:






  
    
      
      CODUSU
      NRO_HOGAR
      PONDERA
      Parentesco
      Sexo
      Edad
      Estado_Civil
      Lugar_Nac
      NIVEL_ED
      Trabajo
      CAT_OCUP
      Ingr_Total_Ind
    
  
  
    
      0
      302468
      1
      1287
      1
      2
      20
      5
      1
      5
      3
      0
      2000
    
    
      1
      302468
      1
      1287
      10
      2
      20
      5
      1
      5
      3
      0
      2000
    
    
      2
      307861
      1
      1674
      1
      1
      42
      2
      2
      2
      1
      3
      3000
    
    
      3
      307861
      1
      1674
      2
      2
      44
      2
      2
      6
      1
      3
      2800
    
    
      4
      307861
      1
      1674
      3
      1
      13
      5
      1
      3
      3
      0
      0



In [127]:

    
cols = data.columns.tolist()
cols = cols[-1:] + cols[:-1]
data = data[cols]



In [128]:

    
data['Parentesco'] = data['Parentesco'].map({1:'Jefe', 2:'Conyuge', 3:'Hijo',4:'Yerno',5:'Nieto', 6:'Madre-padre',
                                             7:'Suegro', 8:'Hermano',9:'Otro', 10:'No-familia'})
data['Sexo'] = data['Sexo'].map({1:0,2:1})
data['Estado_Civil'] = data['Estado_Civil'].map({1:'Unido',2:'Casado',3:'Divorciado',4:'Viudo',5:'Soltero'})
data.Estado_Civil.replace(to_replace=[9], value=[np.nan], inplace=True, axis=None) 
data['Lugar_Nac'] = data['Lugar_Nac'].map({1:'Localidad',2:'Otra_loc',3:'Otra_prov',4:'Pais_limit',5:'Otro_pais'})
data.Lugar_Nac.replace(to_replace=[9], value=[np.nan], inplace=True, axis=None) 
data['NIVEL_ED'] = data['NIVEL_ED'].map({1:'Primaria_I',2:'Primaria_C',3:'Secundaria_I',4:'Secundaria_C',
                                          5:'Univ_I',6:'Univ_C',7:'Sin_Edu'})
data['Trabajo'] = data['Trabajo'].map({1:'Ocupado',2:'Desocupado',3:'Inactivo',4:'Menor'})
data.Trabajo.replace(to_replace=[0], value=[np.nan], inplace=True, axis=None) 
data['CAT_OCUP'] = data['CAT_OCUP'].map({0:'No_empleo',1:'Patron',2:'Cuenta_propia', 3:'Empleado',4:'Sin_sueldo'})



In [129]:

    
make_dummy(data)



In [130]:

    
data_type_dict = {'Parentesco':'nominal','Estado_Civil':'nominal','Lugar_Nac':'nominal','NIVEL_ED':'nominal',
                 'Trabajo':'nominal','CAT_OCUP':'nominal'}
dummy_var = dummy_variables(data, data_type_dict)
df = dummy_var[0]



In [131]:

    
df = df.drop('edad-sexo', axis=1)



In [132]:

    
df.head()









    Out[132]:






  
    
      
      Ingr_Total_Ind
      CODUSU
      NRO_HOGAR
      PONDERA
      Sexo
      Edad
      Parentesco_Conyuge
      Parentesco_Hermano
      Parentesco_Hijo
      Parentesco_Jefe
      ...
      Estado_Civil_Viudo
      Lugar_Nac_Localidad
      Lugar_Nac_Otra_loc
      Lugar_Nac_Otra_prov
      Lugar_Nac_Otro_pais
      Lugar_Nac_Pais_limit
      Trabajo_Desocupado
      Trabajo_Inactivo
      Trabajo_Menor
      Trabajo_Ocupado
    
  
  
    
      0
      2000
      302468
      1
      1287
      1
      20
      0.0
      0.0
      0.0
      1.0
      ...
      0.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
      0.0
      0.0
    
    
      1
      2000
      302468
      1
      1287
      1
      20
      0.0
      0.0
      0.0
      0.0
      ...
      0.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
      0.0
      0.0
    
    
      2
      3000
      307861
      1
      1674
      0
      42
      0.0
      0.0
      0.0
      1.0
      ...
      0.0
      0.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
    
    
      3
      2800
      307861
      1
      1674
      1
      44
      1.0
      0.0
      0.0
      0.0
      ...
      0.0
      0.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
    
    
      4
      0
      307861
      1
      1674
      0
      13
      0.0
      0.0
      1.0
      0.0
      ...
      0.0
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      1.0
      0.0
      0.0
    
  

5 rows × 42 columns



In [122]:

    
# Run linear regression
x=df.iloc[:,1:]
y=np.asarray(df.Ingr_Total_Ind)
# weights=np.asarray(df.PONDERA)
# df = df.drop('PONDERA', axis=1)
weights_train = weights[:5016]



In [124]:

    
# sk-learn (Y ~ x) with intercept
from sklearn.linear_model import LinearRegression

R_IS=[]
R_OS=[]

n = 100
from sklearn.cross_validation import train_test_split
for i in range(n):
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.40)
    
    res=LinearRegression(fit_intercept=False)
    res.fit(X_train,y_train,sample_weight=weights_train)
    R_IS.append(1-((np.asarray(res.predict(X_train))-y_train)**2).sum()/((y_train-np.mean(y_train))**2).sum())                                                                     
    R_OS.append(1-((np.asarray(res.predict(X_test))-y_test)**2).sum()/((y_test-np.mean(y_test))**2).sum())
    
print("IS R-squared for {} times is {}".format(n,np.mean(R_IS)))
print("OS R-squared for {} times is {}".format(n,np.mean(R_OS)))









    



IS R-squared for 100 times is 0.491594989235
OS R-squared for 100 times is 0.48973903576



In [26]:

    
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.datasets import make_blobs
from matplotlib.pylab import plt
%matplotlib inline









    



/usr/local/lib/python2.7/dist-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')



In [125]:

    
from sklearn.decomposition import PCA
import matplotlib.pylab as plt

pca = PCA(2)
Xproj = pca.fit_transform(df)
plt.scatter(Xproj[:,0],Xproj[:,1])
plt.show()



In [112]:

    
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans

X3 = np.asarray(df)

Xproj2 = pca.fit_transform(df)

km1=KMeans(n_clusters = 4, random_state = 324)
res5=km1.fit(X3)

plt.figure(figsize=(8,8))

plt.scatter(Xproj2[:, 0], Xproj2[:, 1], c=res5.predict(X3), cmap=plt.cm.cool)
plt.show()



In [114]:

    
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans

X = np.asarray(df)
range_n_clusters = [2, 3, 4, 5, 6, 7]
for n_clusters in range_n_clusters:
    km = KMeans(n_clusters=n_clusters, random_state=324)
    cluster_labels = km.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters = {},".format(n_clusters)+" the average silhouette_score is :{}".format(silhouette_avg))









    



For n_clusters = 2, the average silhouette_score is :0.741589321074
For n_clusters = 3, the average silhouette_score is :0.724995299497
For n_clusters = 4, the average silhouette_score is :0.716886263101
For n_clusters = 5, the average silhouette_score is :0.7119781235
For n_clusters = 6, the average silhouette_score is :0.697278095436
For n_clusters = 7, the average silhouette_score is :0.689042684383



In [133]:

    
df.to_csv('data_sobre_EPH.csv', index = False)

	CODUSU	NRO_HOGAR	PONDERA	Parentesco	Sexo	Edad	Estado_Civil	Lugar_Nac	NIVEL_ED	Trabajo	CAT_OCUP	Ingr_Total_Ind
0	302468	1	1287	1	2	20	5	1	5	3	0	2000
1	302468	1	1287	10	2	20	5	1	5	3	0	2000
2	307861	1	1674	1	1	42	2	2	2	1	3	3000
3	307861	1	1674	2	2	44	2	2	6	1	3	2800
4	307861	1	1674	3	1	13	5	1	3	3	0	0