In [7]:
import pandas as pd
import numpy as np
titanic_survival = pd.read_csv("titanic_train.csv")
titanic_survival.head()


Out[7]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S

In [8]:
#The Pandas library uses NaN, which stands for "not a number", to indicate a missing value.
#we can use the pandas.isnull() function which takes a pandas series and returns a series of True and False values
age = titanic_survival["Age"]
#print(age.loc[0:10])
age_is_null = pd.isnull(age)
#print age_is_null
age_null_true = age[age_is_null]
#print age_null_true
age_null_count = len(age_null_true)
print(age_null_count)


177

In [11]:
#The result of this is that mean_age would be nan. This is because any calculations we do with a null value also result in a null value
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print (mean_age)


nan

In [12]:
#we have to filter out the missing values before we calculate the mean.
good_ages = titanic_survival["Age"][age_is_null == False]
#print good_ages
correct_mean_age = sum(good_ages) / len(good_ages)
print (correct_mean_age)


29.6991176471

In [13]:
# missing data is so common that many pandas methods automatically filter for it
correct_mean_age = titanic_survival["Age"].mean()
print (correct_mean_age)


29.69911764705882

In [14]:
#mean fare for each class
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class] = fare_for_class
print (fares_by_class)


{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}

In [15]:
#index tells the method which column to group by
#values is the column that we want to apply the calculation to
#aggfunc specifies the calculation we want to perform
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc=np.mean)
print (passenger_survival)


Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64

In [16]:
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_age)


Pclass
1    38.233441
2    29.877630
3    25.140620
Name: Age, dtype: float64

In [17]:
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc=np.sum)
print(port_stats)


                Fare  Survived
Embarked                      
C         10072.2962        93
Q          1022.2543        30
S         17439.3988       217

In [18]:
#specifying axis=1 or axis='columns' will drop any columns that have null values
drop_na_columns = titanic_survival.dropna(axis=1)
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"])
#print new_titanic_survival

In [20]:
row_index_83_age = titanic_survival.loc[83,"Age"]
row_index_1000_pclass = titanic_survival.loc[766,"Pclass"]
print (row_index_83_age)
print (row_index_1000_pclass)


28.0
1

In [21]:
new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)
print (new_titanic_survival[0:10])
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print('-------------')
print(titanic_reindexed.loc[0:10])


     PassengerId  Survived  Pclass                                  Name  \
630          631         1       1  Barkworth, Mr. Algernon Henry Wilson   
851          852         0       3                   Svensson, Mr. Johan   
493          494         0       1               Artagaveytia, Mr. Ramon   
96            97         0       1             Goldschmidt, Mr. George B   
116          117         0       3                  Connors, Mr. Patrick   
672          673         0       2           Mitchell, Mr. Henry Michael   
745          746         0       1          Crosby, Capt. Edward Gifford   
33            34         0       2                 Wheadon, Mr. Edward H   
54            55         0       1        Ostby, Mr. Engelhart Cornelius   
280          281         0       3                      Duane, Mr. Frank   

      Sex   Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
630  male  80.0      0      0       27042  30.0000   A23        S  
851  male  74.0      0      0      347060   7.7750   NaN        S  
493  male  71.0      0      0    PC 17609  49.5042   NaN        C  
96   male  71.0      0      0    PC 17754  34.6542    A5        C  
116  male  70.5      0      0      370369   7.7500   NaN        Q  
672  male  70.0      0      0  C.A. 24580  10.5000   NaN        S  
745  male  70.0      1      1   WE/P 5735  71.0000   B22        S  
33   male  66.0      0      0  C.A. 24579  10.5000   NaN        S  
54   male  65.0      0      1      113509  61.9792   B30        C  
280  male  65.0      0      0      336439   7.7500   NaN        Q  
-------------
    PassengerId  Survived  Pclass                                  Name   Sex  \
0           631         1       1  Barkworth, Mr. Algernon Henry Wilson  male   
1           852         0       3                   Svensson, Mr. Johan  male   
2           494         0       1               Artagaveytia, Mr. Ramon  male   
3            97         0       1             Goldschmidt, Mr. George B  male   
4           117         0       3                  Connors, Mr. Patrick  male   
5           673         0       2           Mitchell, Mr. Henry Michael  male   
6           746         0       1          Crosby, Capt. Edward Gifford  male   
7            34         0       2                 Wheadon, Mr. Edward H  male   
8            55         0       1        Ostby, Mr. Engelhart Cornelius  male   
9           281         0       3                      Duane, Mr. Frank  male   
10          457         0       1             Millet, Mr. Francis Davis  male   

     Age  SibSp  Parch      Ticket     Fare Cabin Embarked  
0   80.0      0      0       27042  30.0000   A23        S  
1   74.0      0      0      347060   7.7750   NaN        S  
2   71.0      0      0    PC 17609  49.5042   NaN        C  
3   71.0      0      0    PC 17754  34.6542    A5        C  
4   70.5      0      0      370369   7.7500   NaN        Q  
5   70.0      0      0  C.A. 24580  10.5000   NaN        S  
6   70.0      1      1   WE/P 5735  71.0000   B22        S  
7   66.0      0      0  C.A. 24579  10.5000   NaN        S  
8   65.0      0      1      113509  61.9792   B30        C  
9   65.0      0      0      336439   7.7500   NaN        Q  
10  65.0      0      0       13509  26.5500   E38        S  

In [23]:
# This function returns the hundredth item from a series
def hundredth_row(column):
    # Extract the hundredth item
    hundredth_item = column.loc[99]
    return hundredth_item

# Return the hundredth item from each column
hundredth_row = titanic_survival.apply(hundredth_row)
print (hundredth_row)


PassengerId                  100
Survived                       0
Pclass                         2
Name           Kantor, Mr. Sinai
Sex                         male
Age                           34
SibSp                          1
Parch                          0
Ticket                    244367
Fare                          26
Cabin                        NaN
Embarked                       S
dtype: object

In [24]:
def not_null_count(column):
    column_null = pd.isnull(column)
    null = column[column_null]
    return len(null)

column_null_count = titanic_survival.apply(not_null_count)
print (column_null_count)


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [26]:
#By passing in the axis=1 argument, we can use the DataFrame.apply() method to iterate over rows instead of columns.
def which_class(row):
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    elif pclass == 1:
        return "First Class"
    elif pclass == 2:
        return "Second Class"
    elif pclass == 3:
        return "Third Class"

classes = titanic_survival.apply(which_class, axis=1)
print (classes)


0       Third Class
1       First Class
2       Third Class
3       First Class
4       Third Class
5       Third Class
6       First Class
7       Third Class
8       Third Class
9      Second Class
10      Third Class
11      First Class
12      Third Class
13      Third Class
14      Third Class
15     Second Class
16      Third Class
17     Second Class
18      Third Class
19      Third Class
20     Second Class
21     Second Class
22      Third Class
23      First Class
24      Third Class
25      Third Class
26      Third Class
27      First Class
28      Third Class
29      Third Class
           ...     
861    Second Class
862     First Class
863     Third Class
864    Second Class
865    Second Class
866    Second Class
867     First Class
868     Third Class
869     Third Class
870     Third Class
871     First Class
872     First Class
873     Third Class
874    Second Class
875     Third Class
876     Third Class
877     Third Class
878     Third Class
879     First Class
880    Second Class
881     Third Class
882     Third Class
883    Second Class
884     Third Class
885     Third Class
886    Second Class
887     First Class
888     Third Class
889     First Class
890     Third Class
dtype: object

In [27]:
def is_minor(row):
    if row["Age"] < 18:
        return True
    else:
        return False

minors = titanic_survival.apply(is_minor, axis=1)
#print minors

def generate_age_label(row):
    age = row["Age"]
    if pd.isnull(age):
        return "unknown"
    elif age < 18:
        return "minor"
    else:
        return "adult"

age_labels = titanic_survival.apply(generate_age_label, axis=1)
print (age_labels)


0        adult
1        adult
2        adult
3        adult
4        adult
5      unknown
6        adult
7        minor
8        adult
9        minor
10       minor
11       adult
12       adult
13       adult
14       minor
15       adult
16       minor
17     unknown
18       adult
19     unknown
20       adult
21       adult
22       minor
23       adult
24       minor
25       adult
26     unknown
27       adult
28     unknown
29     unknown
        ...   
861      adult
862      adult
863    unknown
864      adult
865      adult
866      adult
867      adult
868    unknown
869      minor
870      adult
871      adult
872      adult
873      adult
874      adult
875      minor
876      adult
877      adult
878    unknown
879      adult
880      adult
881      adult
882      adult
883      adult
884      adult
885      adult
886      adult
887      adult
888    unknown
889      adult
890      adult
dtype: object

In [28]:
titanic_survival['age_labels'] = age_labels
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
print (age_group_survival)


age_labels
adult      0.381032
minor      0.539823
unknown    0.293785
Name: Survived, dtype: float64

In [ ]: