A note for pandas
In [ ]:
%load_ext autoreload
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as skd
import sklearn as skl
In [ ]:
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html#sklearn.datasets.load_diabetes
# X, Y = skd.load_diabetes(return_X_y=True)
# XY = np.hstack((X, Y[:, np.newaxis]))
# print('X size: {}'.format(X.shape))
# print('Y size: {}'.format(Y.shape))
In [ ]:
# Load the Titanic training CSV into a DataFrame. The path is relative to the
# notebook's working directory; no index column is given, so rows get the
# default 0..n-1 integer index. The last line previews the first five rows.
df = pd.read_csv('data/titanic-train.csv')
df.head()
In [ ]:
# Print a structural summary: column names, non-null counts, dtypes and
# memory usage. Useful for spotting columns with missing values.
df.info()
In [ ]:
# Summary statistics (count, mean, std, min, quartiles, max); by default only
# the numeric columns are described.
df.describe()
In [ ]:
# Position-based lookup: the third row (position 2), returned as a Series
# whose index is the column names.
df.iloc[2]
In [ ]:
# Position-based slice: rows at positions 2 and 3. As with Python slices,
# the end position (4) is excluded.
df.iloc[2:4]
In [ ]:
# Label-based slice: unlike iloc, BOTH endpoints are included, so this returns
# the rows labelled 2, 3 and 4 (one more row than df.iloc[2:4]).
df.loc[2:4]
In [ ]:
# Single cell by row label and column name: returns the scalar value itself.
df.loc[2, 'Name']
In [ ]:
# Row-label slice (end-inclusive) restricted to one column: returns a Series
# of 'Name' values for the rows labelled 2 through 4.
df.loc[2:4, 'Name']
In [ ]:
# Select one column with a string key and preview its first five values.
df['Name'].head()
In [ ]:
# A single-column selection is a pandas Series, not a DataFrame.
type(df['Name'])
A `Series` is a single column. A `DataFrame` is a collection of such columns that share one index.
In [ ]:
# Passing a LIST of column names (note the double brackets) keeps the result
# a DataFrame containing only those columns, in the order listed.
df[['Name', 'Age']].head()
In [ ]:
# Boolean-mask row filter: keep passengers older than 70. Rows whose Age is
# missing compare as False and are therefore dropped.
df.loc[df['Age'] > 70]
In [ ]:
# Combine two element-wise conditions with `&`: passengers aged exactly 11
# who have exactly 5 siblings/spouses recorded in 'SibSp'.
df[df['Age'].eq(11) & df['SibSp'].eq(5)]
In [ ]:
# Distinct values of 'Age' as an array, in order of first appearance
# (a missing value, if present, appears as NaN).
df['Age'].unique()
In [ ]:
# Frequency of each distinct 'Survived' value, most common first.
df['Survived'].value_counts()
In [ ]:
# Passenger tally for every (Survived, Pclass) combination. count() only
# counts non-null 'PassengerId' entries within each group.
(
    df
    .groupby(['Survived', 'Pclass'])['PassengerId']
    .count()
)
In [ ]:
# Smallest recorded age; missing values are skipped by default.
df['Age'].min()
In [ ]:
# Average 'Age' within each 'Survived' group; the result is a Series indexed
# by the group label.
mean_gb_survived = df.groupby('Survived')['Age'].agg('mean')
mean_gb_survived
In [ ]:
# Standard deviation of 'Age' within each 'Survived' group, as a Series
# indexed by the group label.
std_gb_survived = df.groupby('Survived')['Age'].agg('std')
std_gb_survived
In [ ]:
# Turn each grouped Series back into a DataFrame: reset_index() moves the
# 'Survived' index into an ordinary column so the two results can be merged.
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reset_index.html
df1, df2 = mean_gb_survived.reset_index(), std_gb_survived.reset_index()
df1
In [ ]:
# Display the per-group standard-deviation frame built from std_gb_survived.
df2
In [ ]:
# Join the mean and std frames on their shared 'Survived' key (inner join by
# default). Both inputs carry an 'Age' column, so pandas disambiguates the two
# with its default '_x' / '_y' suffixes.
df3 = df1.merge(df2, on='Survived')
df3
In [ ]:
# Replace the merged frame's column labels with descriptive names. The
# assignment is positional, so it relies on the merge output order:
# key column first, then the mean column, then the std column.
df3.columns = ['Survived', 'Avg_age', 'Std_age']
df3
In [ ]:
# Overlaid histograms of the frame's numeric columns on one shared axis.
# NOTE(review): the columns likely have very different scales, so a
# single-column histogram (e.g. df['Age']) may be easier to read — confirm intent.
df.plot.hist()
In [ ]:
In [ ]:
In [ ]: