In [1]:
import pandas as pd

In [2]:
# Load the "titanic3" sheet of the workbook into a DataFrame with a default
# integer index (index_col=None).
# The keyword for extra missing-value markers is `na_values` (plural);
# `na_value` is not a read_excel parameter.
titanic = pd.read_excel(
    "./data/titanic3.xls",
    sheet_name="titanic3",
    index_col=None,
    na_values=["NA"],
)

In [3]:
# Preview the first five rows to check that the sheet was parsed as expected.
titanic.head()


Out[3]:
pclass survived name sex age sibsp parch ticket fare cabin embarked boat body home.dest
0 1 1 Allen, Miss. Elisabeth Walton female 29.0000 0 0 24160 211.3375 B5 S 2 NaN St Louis, MO
1 1 1 Allison, Master. Hudson Trevor male 0.9167 1 2 113781 151.5500 C22 C26 S 11 NaN Montreal, PQ / Chesterville, ON
2 1 0 Allison, Miss. Helen Loraine female 2.0000 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON
3 1 0 Allison, Mr. Hudson Joshua Creighton male 30.0000 1 2 113781 151.5500 C22 C26 S NaN 135.0 Montreal, PQ / Chesterville, ON
4 1 0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female 25.0000 1 2 113781 151.5500 C22 C26 S NaN NaN Montreal, PQ / Chesterville, ON

In [4]:
# Column dtypes and non-null counts; the gap between a column's non-null count
# and the total number of entries shows how many values are missing.
titanic.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pclass     1309 non-null   int64  
 1   survived   1309 non-null   int64  
 2   name       1309 non-null   object 
 3   sex        1309 non-null   object 
 4   age        1046 non-null   float64
 5   sibsp      1309 non-null   int64  
 6   parch      1309 non-null   int64  
 7   ticket     1309 non-null   object 
 8   fare       1308 non-null   float64
 9   cabin      295 non-null    object 
 10  embarked   1307 non-null   object 
 11  boat       486 non-null    object 
 12  body       121 non-null    float64
 13  home.dest  745 non-null    object 
dtypes: float64(3), int64(4), object(7)
memory usage: 143.3+ KB

In [ ]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
titanic.describe()

In [ ]:
# (number of rows, number of columns) of the loaded frame.
titanic.shape

In [ ]:
# Select a single column by label; the result is a Series, not a DataFrame.
titanic.loc[:, 'name']

In [ ]:
# Two-column subset (name and age) of the full frame; passing a list of labels
# yields a DataFrame rather than a Series.
tit = titanic.loc[:, ['name', 'age']]

In [ ]:
# Preview the first five rows of the name/age subset.
tit.head()

In [ ]:
# Label-based selection of rows and columns in one step. Unlike positional
# slicing, a .loc label slice includes both endpoints, so 5:10 covers labels
# 5 through 10.
titanic.loc[5:10,['name','age']]

In [ ]:
# Youngest passengers strictly older than 20. Rows with a missing age fail the
# comparison and are therefore not part of the result.
(
    titanic
    .loc[lambda frame: frame['age'] > 20]
    .sort_values('age')
    .head()
)

In [ ]:
# Add a column holding the fare rescaled by a fixed factor of 26.62.
# NOTE(review): presumably a conversion of the original fare to 2018 prices,
# judging by the column name -- confirm the source of the factor.
titanic['fare_2018'] = 26.62 * titanic['fare']

In [ ]:
# Five most expensive tickets, showing the original fare next to the rescaled
# fare_2018 column (which must already have been added to `titanic`).
(
    titanic[['name', 'age', 'fare', 'fare_2018']]
    .sort_values(by='fare', ascending=False)
    .head()
)

In [ ]:
# Number of passengers whose age is missing (True counts as 1 when summed).
titanic['age'].isna().sum()

In [ ]:
# Mean imputation: keep the original `age` column untouched and store a filled
# copy in `corrected_age`, where every missing age is replaced by the mean of
# the known ages (mean() skips missing values by default).
average_age = titanic['age'].mean()
titanic['corrected_age'] = titanic['age'].fillna(value=average_age)

# Sanity check: the filled column should contain no missing values.
titanic['corrected_age'].isna().sum()

In [ ]:
# Number of passengers per value of `sex`, most frequent first.
titanic['sex'].value_counts()

In [ ]:
# Sum of the `survived` column within each sex.
# NOTE(review): if `survived` is coded 0/1, as the preview rows suggest, this
# is the number of survivors per sex -- confirm the coding.
titanic.groupby(by='sex')['survived'].sum()

In [ ]:
# titanic.to_excel('new_titanic_data.xls')

In [ ]:
# Per-group averages for every (sex, passenger class) combination.
# numeric_only=True restricts the aggregation to numeric columns: the frame
# also holds text columns (name, ticket, cabin, ...), and without the flag
# recent pandas versions raise a TypeError when trying to average them.
titanic.groupby(["sex", "pclass"]).mean(numeric_only=True)

In [ ]:
# Same (sex, passenger class) averages, restricted to passengers younger than
# 18. Rows with a missing age fail the comparison and are excluded.
# numeric_only=True restricts the aggregation to numeric columns: the frame
# also holds text columns (name, ticket, cabin, ...), and without the flag
# recent pandas versions raise a TypeError when trying to average them.
titanic[titanic["age"] < 18].groupby(["sex", "pclass"]).mean(numeric_only=True)

In [ ]: