In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
# Load the training CSV with the standard-library csv module.
# The file is opened in a `with` block so the handle is closed as soon as reading
# finishes, and with newline='' as the csv module documentation requires.
with open('Titanic/train.csv', 'r', newline='') as csv_file:
    csv_file_object = csv.reader(csv_file)
    header = next(csv_file_object)  # first row of the file (the header line)
    # Remaining rows; csv.reader yields every field as a string.
    data = np.array(list(csv_file_object))
In [ ]:
# Plain arrays, with no column names
print(data)
In [ ]:
# First fifteen entries of the column at index 5.
data[:15, 5]
In [ ]:
# Type of the object obtained by slicing out the whole column at index 5.
type(data[:, 5])
In [ ]:
# Problems with missing values: converting the column at index 5 to float fails
# when an entry is not a valid number (presumably the blank fields of the CSV).
# `np.float` was only an alias of the builtin `float` and was removed in
# NumPy 1.24, so the builtin is used here. The ValueError is caught and shown so
# the demonstration does not stop a Restart & Run All.
try:
    ages_onboard = data[0::, 5].astype(float)
except ValueError as err:
    print("No se pudo convertir la columna a float:", err)
In [3]:
# Load the same CSV with pandas; header=0 takes the column names from the first line.
df = pd.read_csv('Titanic/train.csv', header=0)
In [ ]:
# Display the DataFrame (the notebook renders the value of the cell's last expression).
df
In [ ]:
# Show the Python type of the object returned by pd.read_csv.
type(df)
In [17]:
# List the column labels of the DataFrame.
df.columns
Out[17]:
In [ ]:
df.dtypes  # Data types inferred for each column
In [ ]:
# Per-column non-null counts and dtypes.
df.info()
# Most of the columns have all of their values present
In [ ]:
# Summary statistics of the DataFrame's columns.
df.describe()
In [ ]:
# Select a single column by its label.
df['Age']
In [ ]:
# Check the type of a single selected column.
type(df['Age'])
In [15]:
# First ten entries of the Age column. The commented-out lines below are
# alternative spellings of this selection (equivalent assuming the default
# integer index — TODO confirm if the index is ever changed).
df['Age'][0:10]
#df['Age'].iloc[0:10]
#df[0:10]['Age']
#df.iloc[0:10]['Age']
Out[15]:
In [ ]:
# Basic aggregates of the Age column: mean, median and sum.
age_column = df['Age']
print("Media:", age_column.mean())
print("Mediana:", age_column.median())
print("Suma:", age_column.sum())
In [ ]:
# Select several columns at once by passing a list of labels.
df[ ['Sex', 'Pclass', 'Age'] ]
Métodos de selección e indexación en pandas: https://pandas.pydata.org/docs/user_guide/indexing.html
In [ ]:
# Boolean Series: True where Age is greater than 60.
df['Age'].gt(60)
In [ ]:
# Rows whose Age is greater than 60, selected with a boolean mask.
df.loc[df['Age'] > 60]
In [ ]:
# Rows with Age over 60, restricted to four columns, in a single .loc selection.
df.loc[df['Age'] > 60, ['Sex', 'Pclass', 'Age', 'Survived']]
In [19]:
# Rows that satisfy both conditions: Age over 60 and Sex equal to 'female'.
df.loc[df['Age'].gt(60) & df['Sex'].eq('female')]
Out[19]:
In [ ]:
# Rows where Age is missing, showing only Sex, Pclass and Age.
df.loc[df['Age'].isna(), ['Sex', 'Pclass', 'Age']]
In [ ]:
# Histogram of the Age column with the default binning.
# matplotlib.pyplot is already imported as `plt` in the notebook's first cell,
# so it is not re-imported here. The Axes returned by hist() is labelled so the
# figure can be read on its own.
ax = df['Age'].hist()
ax.set(title="Distribución de edades", xlabel="Edad", ylabel="Número de pasajeros")
plt.show()
In [ ]:
# Same histogram with explicit binning: 16 bins over 0-80 (5 units each),
# drawn with semi-transparent bars.
df['Age'].hist(
    bins=16,
    range=(0, 80),
    alpha=0.5,
)
plt.show()
In [ ]:
# Encode Sex as an integer column: female -> 0, male -> 1.
gender_codes = {'female': 0, 'male': 1}
df['Gender'] = df['Sex'].map(gender_codes).astype(int)
In [ ]:
# Preview the first rows of the DataFrame.
df.head()
In [ ]:
# Distinct values found in the Embarked column.
df['Embarked'].unique()
In [ ]:
# Encode Embarked as a number: S -> 0, C -> 1, Q -> 2.
# Missing values (NaN) are given the placeholder code 3.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2, np.nan: 3}
df['Embarked_N'] = df['Embarked'].map(embarked_codes)
In [ ]:
# Distinct codes present in the Embarked_N column.
df['Embarked_N'].unique()
In [ ]:
# Remember which ages were originally missing, then fill them with the mean age.
# The flag column is created only once: re-running this cell after the fill
# would otherwise overwrite AgeNull with all False, because Age no longer has
# missing values by then.
if 'AgeNull' not in df.columns:
    df['AgeNull'] = df['Age'].isnull()
df.loc[df['Age'].isnull(), 'Age'] = df['Age'].mean()
In [ ]:
# Age values of the rows flagged in AgeNull.
df.loc[df['AgeNull'] == 1, 'Age']
In [ ]:
# Most frequent value(s) of Embarked_N.
df['Embarked_N'].mode()
In [ ]:
# Inspect the type of the object returned by mode().
type(df['Embarked_N'].mode())
In [ ]:
# Replace the placeholder code 3 with the most frequent Embarked_N code
# (the first entry of mode()).
placeholder_rows = df['Embarked_N'] == 3
most_common_code = df['Embarked_N'].mode().iloc[0]
df.loc[placeholder_rows, 'Embarked_N'] = most_common_code
In [ ]:
# Distinct codes in Embarked_N at this point of the notebook.
df['Embarked_N'].unique()
In [10]:
# Survival rate for women and for men.
# The ratio is scaled by 100 so the printed value matches the "Porcentaje" label
# (the bare ratio is a fraction between 0 and 1).
# Each boolean mask is built once and reused for both counts.
sobrevivio = df['Survived'] == 1

es_mujer = df['Sex'] == 'female'
total_mujeres = int(es_mujer.sum())
mujeres_supervivientes = int((es_mujer & sobrevivio).sum())
print("Porcentaje de mujeres supervivientes:", 100 * mujeres_supervivientes / total_mujeres)

es_hombre = df['Sex'] == 'male'
total_hombres = int(es_hombre.sum())
hombres_supervivientes = int((es_hombre & sobrevivio).sum())
print("Porcentaje de hombres supervivientes:", 100 * hombres_supervivientes / total_hombres)
In [ ]:
# Load Titanic/test.csv into its own DataFrame.
test = pd.read_csv('Titanic/test.csv')
In [ ]:
# Predict that a passenger survived if female and died if male,
# then write only PassengerId and Survived to a CSV file without the index.
survival_by_sex = {'female': 1, 'male': 0}
test['Survived'] = test['Sex'].map(survival_by_sex)
test.to_csv(
    'genderbasedpred.csv',
    columns=['PassengerId', 'Survived'],
    index=False,
)
Documentación de pandas: https://pandas.pydata.org/docs/