In [6]:
import pandas as pd
import numpy as np
import os
import sys
import simpledbf
def getEPHdbf_i(censusstring):
    """Download the EPH individual-level DBF for quarter ``censusstring``
    (e.g. 't310'), stage it under data/, and export a trimmed CSV as
    data/cleanData<censusstring>.csv.

    Side effects: runs curl/mv/unzip via os.system and writes files under
    data/. Returns None.

    NOTE(review): shell commands are built by plain string concatenation —
    safe only for trusted quarter codes like 't310'; do not pass
    user-supplied strings.
    """
    print("Downloading", censusstring)
    ### First I will check that it is not already there
    if not os.path.isfile("data/Individual_" + censusstring + ".dbf"):
        if os.path.isfile('Individual_' + censusstring + ".dbf"):
            # if in the current dir just move it
            # (os.system returns a non-zero exit status on failure)
            if os.system("mv " + 'Individual_' + censusstring + ".dbf " + "data/"):
                print("Error moving file!, Please check!")
        # otherwise start looking for the zip file
        else:
            if not os.path.isfile("data/" + censusstring + "_dbf.zip"):
                if not os.path.isfile(censusstring + "_dbf.zip"):
                    # fetch the quarterly zip from INDEC
                    os.system(
                        "curl -O http://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/" + censusstring + "_dbf.zip")
                ### To move it I use the os.system() functions to run bash commands with arguments
                os.system("mv " + censusstring + "_dbf.zip " + "data/")
            ### unzip the csv
            os.system("unzip " + "data/" + censusstring + "_dbf.zip -d data/")
    if not os.path.isfile("data/" + 'Individual_' + censusstring + ".dbf"):
        print("WARNING!!! something is wrong: the file is not there!")
    else:
        print("file in place, creating CSV file")
        trimestre = censusstring
        # latin1 codec: the DBF text fields are not UTF-8 encoded
        dbf = simpledbf.Dbf5('data/Individual_' + trimestre + '.dbf', codec='latin1')
        indRaw = dbf.to_dataframe()
        # Keep only REGION == 1 rows (presumably Gran Buenos Aires — TODO
        # confirm against the EPH codebook) and the columns of interest.
        indNoW = indRaw.loc[indRaw.REGION == 1, ['CODUSU',
                                                 'NRO_HOGAR',
                                                 'PONDERA',
                                                 'CH03',
                                                 'CH04',
                                                 'CH06',
                                                 'CH07',
                                                 'CH15',
                                                 'NIVEL_ED',
                                                 'ESTADO',
                                                 'CAT_OCUP',
                                                 'P47T']]
        # Rename the cryptic EPH variable codes to readable column names
        # (positional, so the order must match the selection above).
        indNoW.columns = ['CODUSU',
                          'NRO_HOGAR',
                          'PONDERA',
                          'Parentesco',
                          'Sexo',
                          'Edad',
                          'Estado_Civil',
                          'Lugar_Nac',
                          'NIVEL_ED',
                          'Trabajo',
                          'CAT_OCUP',
                          'Ingr_Total_Ind']
        indNoW.index = range(0, indNoW.shape[0])
        indNoW.to_csv('data/cleanData' + trimestre + '.csv', index=False)
        # Bug fix: this line previously used the Python 2 print statement,
        # which is a SyntaxError under Python 3 (the rest of the file uses
        # the print() function).
        print('csv file cleanData', trimestre, '.csv successfully created in folder data/')
    return
In [7]:
def dummy_variables(data, data_type_dict):
    """One-hot encode every column tagged 'nominal' in ``data_type_dict``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns named in ``data_type_dict``.
    data_type_dict : dict
        Maps column name -> type string. Mutated in place: each 'nominal'
        key is removed and replaced by one 'nominal' entry per generated
        dummy column.

    Returns
    -------
    list
        ``[transformed DataFrame, updated data_type_dict]``.
    """
    # Bug fix: the original iterated over filter(..., data_type_dict.keys())
    # while popping keys inside the loop. In Python 3 that view/iterator is
    # lazy, so mutating the dict raises "RuntimeError: dictionary changed
    # size during iteration". Materialize the nominal column list up front.
    nominal_vars = [col for col, kind in data_type_dict.items() if kind == 'nominal']
    for variable in nominal_vars:
        # Column names become "<variable>_<value>" for each unique value,
        # so we set the prefix to the name of the original variable.
        dummy_df = pd.get_dummies(data[variable], prefix=variable)
        # Remove old variable from dictionary.
        data_type_dict.pop(variable)
        # Add new dummy variables to dictionary.
        for dummy_variable in dummy_df.columns:
            data_type_dict[dummy_variable] = 'nominal'
        # Replace the original column with its dummies in the frame.
        data = data.drop(variable, axis=1)
        data = data.join(dummy_df)
    return [data, data_type_dict]
In [69]:
def make_dummy(data):
    """Add an 'edad-sexo' column labelling each row with a combined
    sex / age-band category (in place).

    Rows whose Sexo is neither 0 nor 1, or whose Edad is missing, keep the
    default label 'no-data'. Assumes Sexo has already been recoded to
    0 = male / 1 = female. Returns None.
    """
    data['edad-sexo'] = 'no-data'
    # Age bands as (lower inclusive, upper exclusive, suffix); None means
    # unbounded on that side.
    age_bands = [
        (None, 14, '0_14'),
        (14, 25, '14_24'),
        (25, 35, '25_34'),
        (35, None, '35_more'),
    ]
    for sexo_code, sexo_label in ((0, 'male'), (1, 'female')):
        is_sexo = data['Sexo'] == sexo_code
        for lower, upper, suffix in age_bands:
            mask = is_sexo
            if lower is not None:
                mask = mask & (data['Edad'] >= lower)
            if upper is not None:
                mask = mask & (data['Edad'] < upper)
            data.loc[mask, 'edad-sexo'] = sexo_label + '_' + suffix
    return
In [73]:
# Download quarter t310 and produce data/cleanDatat310.csv.
getEPHdbf_i('t310')
In [126]:
# Load the cleaned extract written by getEPHdbf_i and preview it.
data = pd.read_csv('data/cleanDatat310.csv')
data.head()
Out[126]:
In [127]:
# Rotate the column order so the last column (Ingr_Total_Ind) comes first.
reordered_cols = data.columns.tolist()
reordered_cols = reordered_cols[-1:] + reordered_cols[:-1]
data = data[reordered_cols]
In [128]:
# Recode the integer survey codes into readable labels.
# Note: Series.map returns NaN for any code absent from the mapping, so the
# old explicit .replace([9]/[0] -> NaN, inplace=True, axis=None) calls that
# followed some of these lines were no-ops (those codes were already NaN
# after the map) and relied on an `axis` argument that newer pandas no
# longer accepts on Series.replace — they have been removed.
data['Parentesco'] = data['Parentesco'].map({1: 'Jefe', 2: 'Conyuge', 3: 'Hijo', 4: 'Yerno', 5: 'Nieto',
                                             6: 'Madre-padre', 7: 'Suegro', 8: 'Hermano', 9: 'Otro',
                                             10: 'No-familia'})
# Recode Sexo from {1, 2} to {0, 1}.
data['Sexo'] = data['Sexo'].map({1: 0, 2: 1})
data['Estado_Civil'] = data['Estado_Civil'].map({1: 'Unido', 2: 'Casado', 3: 'Divorciado',
                                                 4: 'Viudo', 5: 'Soltero'})
data['Lugar_Nac'] = data['Lugar_Nac'].map({1: 'Localidad', 2: 'Otra_loc', 3: 'Otra_prov',
                                           4: 'Pais_limit', 5: 'Otro_pais'})
data['NIVEL_ED'] = data['NIVEL_ED'].map({1: 'Primaria_I', 2: 'Primaria_C', 3: 'Secundaria_I',
                                         4: 'Secundaria_C', 5: 'Univ_I', 6: 'Univ_C', 7: 'Sin_Edu'})
data['Trabajo'] = data['Trabajo'].map({1: 'Ocupado', 2: 'Desocupado', 3: 'Inactivo', 4: 'Menor'})
data['CAT_OCUP'] = data['CAT_OCUP'].map({0: 'No_empleo', 1: 'Patron', 2: 'Cuenta_propia',
                                         3: 'Empleado', 4: 'Sin_sueldo'})
In [129]:
# Add the combined 'edad-sexo' age/sex band column (mutates data in place).
make_dummy(data)
In [130]:
# Tag the recoded categorical columns and expand them into dummy columns.
# ('edad-sexo' is intentionally not listed, so it stays a plain string
# column at this point.)
nominal_columns = ['Parentesco', 'Estado_Civil', 'Lugar_Nac',
                   'NIVEL_ED', 'Trabajo', 'CAT_OCUP']
data_type_dict = {column: 'nominal' for column in nominal_columns}
dummy_var = dummy_variables(data, data_type_dict)
df = dummy_var[0]
In [131]:
# Drop the string-valued 'edad-sexo' helper column (it was not tagged
# 'nominal' above, so it was never dummy-encoded).
df = df.drop('edad-sexo', axis=1)
In [132]:
# Preview the encoded frame.
df.head()
Out[132]:
In [122]:
# Run linear regression: build the design matrix, target, and weights.
# Column 0 is the target (Ingr_Total_Ind); everything else is a feature.
# NOTE(review): x still includes identifier-like columns (CODUSU,
# NRO_HOGAR, PONDERA) — confirm these belong in the regression.
x = df.iloc[:, 1:]
y = np.asarray(df.Ingr_Total_Ind)
# Bug fix: `weights` was commented out but is used below — define it from
# the PONDERA survey expansion factors.
weights = np.asarray(df.PONDERA)
# NOTE(review): train_test_split shuffles rows, so this positional slice
# does not align with X_train — the weights should be passed through the
# split together with x and y instead of sliced here.
weights_train = weights[:5016]
In [124]:
# sk-learn (Y ~ x), no intercept term (fit_intercept=False below).
from sklearn.linear_model import LinearRegression
# Bug fix: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

R_IS = []
R_OS = []
n = 100
for i in range(n):
    # Bug fix: split the sample weights together with X and y so they stay
    # aligned with the shuffled training rows (the old code used a fixed
    # positional slice weights[:5016], which does not match X_train).
    X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
        x, y, weights, test_size=0.40)
    res = LinearRegression(fit_intercept=False)
    res.fit(X_train, y_train, sample_weight=w_train)
    # In-sample / out-of-sample R^2 = 1 - SSE / SST.
    R_IS.append(1 - ((np.asarray(res.predict(X_train)) - y_train) ** 2).sum()
                / ((y_train - np.mean(y_train)) ** 2).sum())
    R_OS.append(1 - ((np.asarray(res.predict(X_test)) - y_test) ** 2).sum()
                / ((y_test - np.mean(y_test)) ** 2).sum())
print("IS R-squared for {} times is {}".format(n, np.mean(R_IS)))
print("OS R-squared for {} times is {}".format(n, np.mean(R_OS)))
In [26]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.datasets import make_blobs
from matplotlib.pylab import plt
%matplotlib inline
In [125]:
from sklearn.decomposition import PCA
import matplotlib.pylab as plt
# Project the frame onto its first two principal components for a quick
# 2-D view of the data.
# NOTE(review): df may still contain non-numeric columns (e.g. CODUSU);
# PCA requires an all-numeric matrix — confirm or select numeric columns.
pca = PCA(2)
Xproj = pca.fit_transform(df)
plt.scatter(Xproj[:,0],Xproj[:,1])
plt.show()
In [112]:
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans
# Cluster the full feature matrix into 4 groups with a fixed seed, then
# color the 2-D PCA projection by cluster assignment.
X3 = np.asarray(df)
Xproj2 = pca.fit_transform(df)
km1=KMeans(n_clusters = 4, random_state = 324)
res5=km1.fit(X3)
plt.figure(figsize=(8,8))
# Each projected point is colored by the cluster of the original row.
plt.scatter(Xproj2[:, 0], Xproj2[:, 1], c=res5.predict(X3), cmap=plt.cm.cool)
plt.show()
In [114]:
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.cluster import KMeans

# Silhouette analysis: score the k-means partition for each candidate
# number of clusters (same fixed seed as the clustering cell above).
X = np.asarray(df)
for k in (2, 3, 4, 5, 6, 7):
    labels = KMeans(n_clusters=k, random_state=324).fit_predict(X)
    avg_score = silhouette_score(X, labels)
    print("For n_clusters = {},".format(k) + " the average silhouette_score is :{}".format(avg_score))
In [133]:
# Persist the final encoded frame for downstream use.
df.to_csv('data_sobre_EPH.csv', index = False)