In [1]:

    
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Algunas limitaciones de trabajar con NumPy



In [2]:

    
# Read the training CSV into a plain NumPy array using only the csv module.
# The context manager guarantees the file handle is closed once the rows have
# been read, and newline='' is what the csv module documents for files handed
# to csv.reader.
with open('Titanic/train.csv', 'r', newline='') as csv_file:
    csv_file_object = csv.reader(csv_file)
    header = next(csv_file_object)  # first row: column names

    # Remaining rows, one list of string fields per record.
    data = np.array(list(csv_file_object))



In [ ]:

    
# Pure matrices, without column names
print(data)



In [ ]:

    
# First 15 rows of column index 5; the array carries no column labels,
# so the column has to be addressed by position.
data[0:15,5]

Tipos de dato de NumPy:



In [ ]:

    
# Inspect the Python type of a column slice taken from the array.
type(data[0::,5])



In [ ]:

    
# Problems with missing values: csv.reader yields every field as a string, so
# rows without an age presumably hold '' here and this conversion is expected
# to fail with a ValueError (that failure is the point of this cell).
# The builtin float is used because the np.float alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
ages_onboard = data[0::,5].astype(float)

Pandas



In [3]:

    
# Load the same training file with pandas; header=0 takes the column names
# from the first line of the file.
df = pd.read_csv('Titanic/train.csv', header=0)



In [ ]:

    
# Display the DataFrame.
df



In [ ]:

    
# Inspect the Python type of the object returned by pd.read_csv.
type(df)



In [17]:

    
# Column labels of the DataFrame.
df.columns









    Out[17]:





Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')



In [ ]:

    
df.dtypes  # Data type inference



In [ ]:

    
df.info()
# Most of the columns have all of their values present



In [ ]:

    
# Summary statistics produced by pandas for the DataFrame.
df.describe()

Selección



In [ ]:

    
# Select a single column by its label.
df['Age']



In [ ]:

    
# Inspect the Python type of a single selected column.
type(df['Age'])



In [15]:

    
# First ten values of the Age column.
df['Age'][0:10]
# Alternative spellings of the same selection
# (NOTE(review): equivalent only while df keeps the default 0..n-1 index):
#df['Age'].iloc[0:10]
#df[0:10]['Age']
#df.iloc[0:10]['Age']









    Out[15]:





0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64



In [ ]:

    
# Mean, median and sum of the Age column, printed one per line.
age_column = df['Age']
print("Media:", age_column.mean())
print("Mediana:", age_column.median())
print("Suma:", age_column.sum())



In [ ]:

    
# Select several columns at once by passing a list of labels.
df.loc[:, ['Sex', 'Pclass', 'Age']]

Métodos de selección: https://pandas.pydata.org/docs/user_guide/indexing.html

Filtrado



In [ ]:

    
# Boolean mask: True for the rows whose Age is greater than 60.
df['Age'].gt(60)



In [ ]:

    
# Keep only the rows whose Age is greater than 60.
df.loc[df['Age'].gt(60)]



In [ ]:

    
# Rows with Age greater than 60, restricted to four columns, in one .loc call.
df.loc[df['Age'] > 60, ['Sex', 'Pclass', 'Age', 'Survived']]



In [19]:

    
# Combine two conditions: Age greater than 60 AND Sex equal to 'female'.
df.loc[df['Age'].gt(60) & df['Sex'].eq('female')]









    Out[19]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      275
      276
      1
      1
      Andrews, Miss. Kornelia Theodosia
      female
      63.0
      1
      0
      13502
      77.9583
      D7
      S
    
    
      483
      484
      1
      3
      Turkula, Mrs. (Hedwig)
      female
      63.0
      0
      0
      4134
      9.5875
      NaN
      S
    
    
      829
      830
      1
      1
      Stone, Mrs. George Nelson (Martha Evelyn)
      female
      62.0
      0
      0
      113572
      80.0000
      B28
      NaN



In [ ]:

    
# Rows whose Age is missing, restricted to three columns.
df.loc[df['Age'].isnull(), ['Sex', 'Pclass', 'Age']]

Graficación rápida



In [ ]:

    
# Quick histogram of the Age column with pandas' default settings.
# matplotlib.pyplot is already imported as plt in the notebook's first cell,
# so it is not imported again here.
df['Age'].hist()
plt.show()



In [ ]:

    
# Same histogram with explicit binning: 16 bins over the range 0-80
# (5 units per bin), drawn at 50% opacity.
df['Age'].hist(bins=16, range=(0,80), alpha = .5)
plt.show()

Manipulación de datos



In [ ]:

    
# Encode Sex as an integer column: female -> 0, male -> 1.
df['Gender'] = df['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
# Alternative with a function instead of a dict (same encoding as above):
#df['Gender'] = df['Sex'].map( lambda x: 1 if x[0] == 'm' else 0 )



In [ ]:

    
# Show the first rows, now including the new Gender column.
df.head()



In [ ]:

    
# Distinct values present in the Embarked column.
df['Embarked'].unique()



In [ ]:

    
# Encode the Embarked value as an integer; a missing value (NaN) is given the
# placeholder code 3.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2, np.nan: 3}
df['Embarked_N'] = df['Embarked'].map(embarked_codes)



In [ ]:

    
# Distinct codes present in the new Embarked_N column.
df['Embarked_N'].unique()

Imputación



In [ ]:

    
# Remember which rows had no Age before imputing. The flag is created only
# once: if this cell is re-run after Age has been filled, recomputing it would
# overwrite the flag with all False and lose the record of the imputed rows.
if 'AgeNull' not in df.columns:
    df['AgeNull'] = df['Age'].isnull()

# Fill the missing ages with the mean of the known ages (a no-op on re-run,
# because no missing values remain).
df.loc[df['Age'].isnull(), 'Age'] = df['Age'].mean()



In [ ]:

    
# Age values of the rows flagged in AgeNull (the ones that were imputed).
df.loc[df['AgeNull'] == 1, 'Age']



In [ ]:

    
# Most frequent value(s) of Embarked_N.
df['Embarked_N'].mode()



In [ ]:

    
# Inspect the Python type of the object returned by mode().
type(df['Embarked_N'].mode())



In [ ]:

    
# Replace the placeholder code 3 (missing Embarked) with the most frequent
# real code. The mode is computed only over the rows that do not hold the
# placeholder, so the placeholder can never be picked as its own replacement.
embarked_missing = df['Embarked_N'] == 3
df.loc[embarked_missing, 'Embarked_N'] = df.loc[~embarked_missing, 'Embarked_N'].mode().iloc[0]



In [ ]:

    
# Distinct codes left in Embarked_N after replacing the placeholder.
df['Embarked_N'].unique()

Predicciones basadas en género



In [10]:

    
# Fraction of survivors within each sex, computed by counting boolean masks.
is_female = df['Sex'] == 'female'
is_male = df['Sex'] == 'male'
did_survive = df['Survived'] == 1

# Women: group size, survivors, and the resulting ratio.
total_mujeres = int(is_female.sum())
mujeres_supervivientes = int((is_female & did_survive).sum())
print("Porcentaje de mujeres supervivientes:", mujeres_supervivientes / total_mujeres)

# Men: same computation.
total_hombres = int(is_male.sum())
hombres_supervivientes = int((is_male & did_survive).sum())
print("Porcentaje de hombres supervivientes:", hombres_supervivientes / total_hombres)









    



Porcentaje de mujeres supervivientes: 0.7420382165605095
Porcentaje de hombres supervivientes: 0.18890814558058924



In [ ]:

    
# Load the test set into its own DataFrame.
test = pd.read_csv('Titanic/test.csv')



In [ ]:

    
# Predict that a passenger survived if female and died if male, then write
# only PassengerId and Survived to the output file, without the index.
survival_by_sex = {'female': 1, 'male': 0}
test['Survived'] = test['Sex'].map(survival_by_sex)
test[['PassengerId', 'Survived']].to_csv('genderbasedpred.csv', index=False)

Documentación de pandas: https://pandas.pydata.org/docs/

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
275	276	1	1	Andrews, Miss. Kornelia Theodosia	female	63.0	1	13502	77.9583	D7	S
483	484	1	3	Turkula, Mrs. (Hedwig)	female	63.0	0	4134	9.5875	NaN	S
829	830	1	1	Stone, Mrs. George Nelson (Martha Evelyn)	female	62.0	0	113572	80.0000	B28	NaN