In [42]:
import pandas as pd
# learn more about pandas http://pandas.pydata.org/pandas-docs/stable/indexing.html
In [2]:
# Import the Pandas library
import pandas as pd
# Remote locations of the train and test CSV files
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
# Read each CSV into its own DataFrame (train first, then test)
train, test = pd.read_csv(train_url), pd.read_csv(test_url)
# Display the first rows of the train DataFrame
train.head()
Out[2]:
In [3]:
# Display the first rows of the test DataFrame
test.head()
Out[3]:
In [5]:
# Save local copies of both DataFrames.
# index=False keeps the row index out of the files; otherwise reading them back
# with a plain pd.read_csv adds a spurious "Unnamed: 0" column.
# NOTE(review): these are absolute, machine-specific paths; the cell that reloads
# `train` reads the same train path, so change both together.
train.to_csv('/Users/chengjun/github/cjc/data/tatanic_train.csv', index=False)
test.to_csv('/Users/chengjun/github/cjc/data/tatanic_test.csv', index=False)
You can easily explore a DataFrame
In [7]:
# Dimensions of the train DataFrame as (rows, columns)
train.shape
Out[7]:
In [6]:
# Summary statistics for the columns of train
train.describe()
Out[6]:
In [8]:
import pandas as pd
# Reload train from the local CSV copy (comma-separated, first row holds the column names)
train = pd.read_csv(
    '/Users/chengjun/github/cjc/data/tatanic_train.csv',
    sep=",",
    header=0,
)
In [9]:
# Show the first three rows of train
train.head(3)
Out[9]:
In [37]:
# Peek at the first five values of the Survived column
train["Survived"].head(5)
Out[37]:
In [11]:
# Count of passengers for each value of Survived (survived vs passed away)
train.Survived.value_counts()
Out[11]:
In [12]:
# Same counts expressed as proportions of all passengers
train.Survived.value_counts(normalize=True)
Out[12]:
In [13]:
# Count of passengers for each value of Sex
train.Sex.value_counts()
Out[13]:
In [14]:
# Survived counts restricted to male passengers
train.loc[train["Sex"] == 'male', "Survived"].value_counts()
Out[14]:
In [15]:
# Survived counts restricted to female passengers
train.loc[train["Sex"] == 'female', "Survived"].value_counts()
Out[15]:
In [16]:
# Survived proportions among male passengers
train.loc[train["Sex"] == 'male', "Survived"].value_counts(normalize=True)
Out[16]:
In [17]:
# Survived proportions among female passengers
train.loc[train["Sex"] == 'female', "Survived"].value_counts(normalize=True)
Out[17]:
In [18]:
# Create the column Child: 1 for passengers younger than 5, 0 for those aged 5
# or older, and NaN where Age is missing. Print the first three values.
train["Child"] = float('NaN')
# Assign through .loc: the chained form `train.Child[mask] = value` may write to
# a temporary object instead of `train` (SettingWithCopyWarning).
train.loc[train.Age < 5, "Child"] = 1
train.loc[train.Age >= 5, "Child"] = 0
# The call form of print works on both Python 2 and Python 3
print(train.Child[:3])
In [19]:
# Normalized survival rates for children (Child == 1, i.e. Age < 5)
train.loc[train.Child == 1, "Survived"].value_counts(normalize=True)
Out[19]:
In [20]:
# Normalized survival rates for non-children (Child == 0, i.e. Age >= 5)
train.loc[train.Child == 0, "Survived"].value_counts(normalize=True)
Out[20]:
In [21]:
# Create a copy of test: test_one.
# .copy() is required; a plain `test_one = test` only adds a second name for the
# same DataFrame, so the new Survived column would also appear in `test`.
test_one = test.copy()
# Initialize a Survived column to 0
test_one['Survived'] = 0
# Set Survived to 1 if Sex equals "female"; .loc writes into test_one directly
# rather than through a chained assignment on a possibly temporary Series
test_one.loc[test_one.Sex == 'female', 'Survived'] = 1
# Print the first three values of the `Survived` column from `test_one`
# (the call form of print works on both Python 2 and Python 3)
print(test_one.Survived[:3])
In [22]:
# Convert the male and female groups to integer form.
# One lookup per column replaces the chained `train[col][mask] = value`
# assignments, which may write to a temporary object instead of `train`
# (SettingWithCopyWarning). Values without a code are passed through unchanged,
# so running this cell a second time leaves the column as it is.
sex_codes = {"male": 0, "female": 1}
train["Sex"] = train["Sex"].map(lambda value: sex_codes.get(value, value))
# Impute the Embarked variable: missing values are filled with 'S'
train["Embarked"] = train["Embarked"].fillna('S')
# Convert the Embarked classes to integer form
embarked_codes = {"S": 0, "C": 1, "Q": 2}
train["Embarked"] = train["Embarked"].map(lambda value: embarked_codes.get(value, value))
In [25]:
# First five rows of train when ordered by Age, ascending
train.sort_values(by=['Age']).head(5)
Out[25]:
In [26]:
# First five rows of train when ordered by Age, descending
train.sort_values(by=['Age'], ascending=False).head(5)
Out[26]:
In [34]:
# Order by Pclass, then by Age within each Pclass (both descending); show the first five rows
train.sort_values(by=['Pclass', 'Age'], ascending=False).head(5)
Out[34]:
In [39]:
# Column sums for each SibSp group.
# numeric_only=True restricts the result to numeric columns; without it, recent
# pandas versions also "sum" (concatenate) the text columns.
train.groupby(['SibSp']).sum(numeric_only=True)
Out[39]:
In [40]:
# Column sums for each (SibSp, Survived) combination.
# numeric_only=True restricts the result to numeric columns; without it, recent
# pandas versions also "sum" (concatenate) the text columns.
train.groupby(['SibSp', 'Survived']).sum(numeric_only=True)
Out[40]:
Learn more about indexing and selecting data in pandas: http://pandas.pydata.org/pandas-docs/stable/indexing.html
In [ ]: