Notes on pandas


In [ ]:
%load_ext autoreload
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as skd
import sklearn as skl

In [ ]:
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html#sklearn.datasets.load_diabetes
# X, Y = skd.load_diabetes(return_X_y=True)
# XY = np.hstack((X, Y[:, np.newaxis]))
# print('X size: {}'.format(X.shape))
# print('Y size: {}'.format(Y.shape))

In [ ]:
# Load titanic data into a data frame
# Read the Titanic training set (path is relative to the notebook's working
# directory) into a DataFrame, then preview its first five rows.
df = pd.read_csv('data/titanic-train.csv')
df.head(n=5)

In [ ]:
# Print a concise summary of the frame: column names, non-null counts,
# dtypes and memory usage. Handy for spotting columns with missing values.
df.info()

In [ ]:
# Summary statistics (count, mean, std, min, quartiles, max); by default
# only numeric columns are included.
df.describe()

Indexing


In [ ]:
# iloc is purely position-based: this picks the third row (position 2).
# A single row comes back as a Series indexed by the column names.
df.iloc[2]

In [ ]:
# Positional slice with Python semantics: the end is EXCLUDED, so this
# returns the rows at positions 2 and 3 as a DataFrame.
df.iloc[2:4]

In [ ]:
# loc is label-based, and unlike iloc its slices INCLUDE the end label:
# this returns the rows labelled 2 through 4.
df.loc[2:4]

In [ ]:
# Row label + column label -> the single value stored in that cell.
df.loc[2, 'Name']

In [ ]:
# Label slice on rows (end label included) restricted to one column:
# the result is a Series of 'Name' values.
df.loc[2:4, 'Name']

In [ ]:
# Select one column with single brackets and show its first five entries.
df['Name'].head()

In [ ]:
# Selecting a single column with single brackets yields a pandas Series,
# not a DataFrame.
type(df['Name'])

A Series is a single labelled column. A DataFrame is a table made of such columns, all sharing the same row index.


In [ ]:
# Passing a LIST of column names (note the double brackets) returns a
# DataFrame containing just those columns, in the order given.
df[['Name', 'Age']].head()

Selections


In [ ]:
# Boolean-mask selection: keep only the rows whose Age exceeds 70.
# Rows with a missing Age compare as False and are dropped.
df.loc[df['Age'] > 70]

In [ ]:
# Combine two element-wise conditions with `&` (not `and`): rows where
# Age is exactly 11 and SibSp is exactly 5.
df.loc[df['Age'].eq(11) & df['SibSp'].eq(5)]

In [ ]:
# Distinct values of the column as an array, in order of first appearance.
# NaN is reported as a value too if the column has missing entries.
df['Age'].unique()

Aggregation


In [ ]:
# Frequency of each distinct value, most common first (NaN is excluded
# by default).
df['Survived'].value_counts()

In [ ]:
# Number of passengers in every (Survived, Pclass) combination.
# count() tallies non-null PassengerId values within each group.
(
    df
    .groupby(['Survived', 'Pclass'])['PassengerId']
    .count()
)

In [ ]:
# Smallest value in the column; missing values are skipped by default.
df['Age'].min()

In [ ]:
# Average Age within each Survived group. The result is a Series named
# 'Age' whose index is the group key 'Survived'.
mean_gb_survived = (
    df
    .groupby('Survived')['Age']
    .mean()
)
mean_gb_survived

In [ ]:
# Standard deviation of Age within each Survived group, shaped like the
# per-group mean: a Series named 'Age' indexed by 'Survived'.
std_gb_survived = (
    df
    .groupby('Survived')['Age']
    .std()
)
std_gb_survived

Join DataFrames


In [ ]:
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reset_index.html
# reset_index() moves the 'Survived' group key out of the index and into an
# ordinary column, turning each grouped Series into a two-column DataFrame
# (Survived, Age) that can be joined on 'Survived'.
df1 = mean_gb_survived.reset_index(drop=False)
df2 = std_gb_survived.reset_index(drop=False)
df1

In [ ]:
# Display the second frame (built from std_gb_survived) for comparison.
df2

In [ ]:
# join
df3 = pd.merge(df1, df2, on='Survived')
df3

In [ ]:
# Rename all columns positionally (the list must match the current column
# count and order: key, mean, std). This mutates df3 in place, so the
# merged frame shown earlier in the notebook no longer reflects its state.
df3.columns = ['Survived', 'Avg_age', 'Std_age']
df3

Plot


In [ ]:
# Histogram of the frame's numeric columns, all overlaid on a single axes
# with shared bins.
df.plot.hist()

In [ ]:


In [ ]:


In [ ]: