In [1]:
import pandas as pd
In [2]:
# Load the "titanic3" sheet of the workbook into a DataFrame.
# The keyword is `na_values` (plural): `na_value` is not a read_excel
# parameter, so the "NA" marker was never registered as missing (and
# current pandas rejects the unknown keyword with a TypeError).
titanic = pd.read_excel(
    "./data/titanic3.xls",
    "titanic3",          # sheet to read
    index_col=None,      # keep the default integer index
    na_values=["NA"],    # treat the literal string "NA" as missing
)
In [3]:
# Peek at the first five rows to sanity-check the load.
titanic.head(n=5)
Out[3]:
In [4]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
titanic.info()
In [ ]:
# Descriptive statistics (count, mean, std, quartiles, ...) for the columns
# that describe() summarises by default.
titanic.describe()
In [ ]:
# (number of rows, number of columns) of the loaded frame.
titanic.shape
In [ ]:
# Select a single column; the result is a Series.
titanic.loc[:, 'name']
In [ ]:
# Selecting with a list of labels returns a DataFrame restricted to those columns.
selected_columns = ['name', 'age']
tit = titanic[selected_columns]
In [ ]:
# First five rows of the name/age subset.
tit.head(n=5)
In [ ]:
# Label-based selection: .loc slices include BOTH endpoints, so this
# returns the rows labelled 5 through 10 for the two chosen columns.
titanic.loc[slice(5, 10), ['name', 'age']]
In [ ]:
# Passengers with age strictly above 20, ordered by ascending age; show the
# first five. Rows with a missing age compare False and are excluded.
(
    titanic
    .loc[titanic['age'].gt(20)]
    .sort_values(by='age')
    .head()
)
In [ ]:
# Derive a rescaled fare column.
# NOTE(review): 26.62 is presumably a multiplier converting the original
# fare into 2018 money — confirm the source of this factor.
titanic['fare_2018'] = titanic['fare'].mul(26.62)
In [ ]:
# The five most expensive tickets, with the original and rescaled fare side by side.
(
    titanic[['name', 'age', 'fare', 'fare_2018']]
    .sort_values(by='fare', ascending=False)
    .head()
)
In [ ]:
# Number of rows whose age is missing.
titanic['age'].isna().sum()
In [ ]:
# Impute missing ages with the mean of the known ages (mean() skips NaN),
# storing the result in a new column so the raw 'age' column stays untouched.
average_age = titanic['age'].mean()
titanic['corrected_age'] = titanic['age'].fillna(value=average_age)

# Check how many missing values remain in the imputed column.
titanic['corrected_age'].isna().sum()
In [ ]:
# Passenger count per value of 'sex', most frequent first.
titanic['sex'].value_counts(sort=True, ascending=False)
In [ ]:
# Sum of the 'survived' column within each sex group.
# NOTE(review): presumably 'survived' is a 0/1 flag, making this the number
# of survivors per sex — confirm against the data dictionary.
titanic.groupby(by='sex')['survived'].agg('sum')
In [ ]:
# titanic.to_excel('new_titanic_data.xls')
In [ ]:
# Per-(sex, pclass) mean of every numeric column.
# numeric_only=True is required: the frame also holds text columns (e.g.
# 'name'), and since pandas 2.0 GroupBy.mean() raises TypeError on them
# instead of silently dropping them.
titanic.groupby(["sex", "pclass"]).mean(numeric_only=True)
In [ ]:
# Same per-(sex, pclass) averages, restricted to passengers under 18.
# Rows with a missing age compare False and are therefore excluded.
# numeric_only=True keeps the aggregation to numeric columns; without it
# pandas >= 2.0 raises TypeError on the text columns (e.g. 'name').
(
    titanic[titanic["age"] < 18]
    .groupby(["sex", "pclass"])
    .mean(numeric_only=True)
)
In [ ]: