In [1]:
# Imports
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import csv
import calendar
import webbrowser
from datetime import datetime
# Constants
DATA_FOLDER = 'Data/'
The `DATA_FOLDER/ebola` folder contains summarized reports of Ebola cases from three countries (Guinea, Liberia and Sierra Leone) during the recent outbreak of the disease in West Africa. For each country, there are daily reports that contain various information about the outbreak in several cities in each country.

Use pandas to import these data files into a single DataFrame.
Using this DataFrame, calculate for each country the daily average per month of new cases and deaths.
Make sure you handle all the different expressions for new cases and deaths that are used in the reports.
In [2]:
'''
Functions needed to solve task 1
'''
# Helper: load all daily CSV reports for one country into a single DataFrame.
def importdata(path, date):
    """Read every CSV report under DATA_FOLDER/path and stack them into one
    DataFrame, parsing the column named by `date` as datetimes."""
    report_files = glob.glob(DATA_FOLDER + path + '/*.csv')
    daily_reports = [pd.read_csv(f, parse_dates=[date]) for f in report_files]
    return pd.concat(daily_reports)
# Helper: derive the English month name for every row's Date.
def add_month(df):
    """Return a copy of `df` with an extra 'Month' column holding the English
    month name (e.g. 'January') taken from each row's Date value.
    The input frame is left untouched."""
    result = df.copy()
    result['Month'] = result.Date.apply(lambda d: calendar.month_name[d.month])
    return result
# Helper: select the rows for one country / description label / month.
def chooseCountry_month(dataframe, country, descr, month):
    """Return the subset of `dataframe` whose 'Country' equals `country`,
    whose 'Description' equals `descr` and whose 'Month' equals `month`."""
    country_rows = (dataframe['Country'] == country) & (dataframe['Description'] == descr)
    subset = dataframe.loc[country_rows]
    return subset.loc[subset['Month'] == month]
# Create a one-row dataframe with the number of deaths, the new cases and the
# daily averages for a country and a specified month.
def getmonthresults(dataframe, country, month):
    """Compute the monthly new cases / deaths and their daily averages for
    `country` during `month`.

    The three countries' reports use different description labels for the
    cumulative confirmed cases / deaths, so the right pair is picked per
    country.  Returns a one-row DataFrame.  NaN marks inconsistent (negative)
    monthly differences; the string 'notEnoughdatas' marks months whose
    reports span a single day (no daily average can be computed).
    """
    if country == 'Liberia':
        descr_kill = 'Total death/s in confirmed cases'
        descr_cases = 'Total confirmed cases'
    if country == 'Guinea':
        descr_kill = 'Total deaths of confirmed'
        descr_cases = 'Total cases of confirmed'
    if country == 'Sierra Leone':
        descr_kill = 'death_confirmed'
        descr_cases = 'cum_confirmed'
    df_kill = chooseCountry_month(dataframe, country, descr_kill, month)
    df_cases = chooseCountry_month(dataframe, country, descr_cases, month)
    # Monthly difference between the last and the first cumulative report.
    res_kill = int(df_kill.iloc[len(df_kill)-1].Totals) - int(df_kill.iloc[0].Totals)
    res_cases = int(df_cases.iloc[len(df_cases)-1].Totals) - int(df_cases.iloc[0].Totals)
    # Number of days covered = last report day - first report day (same month).
    nb_day = df_kill.iloc[len(df_kill)-1].Date.day - df_kill.iloc[0].Date.day
    # Some excel files are inconsistent and yield negative differences;
    # those values are replaced by NaN.
    if (res_cases < 0) & (res_kill < 0):
        monthreport = pd.DataFrame({'New cases':[np.nan],'Deaths':[np.nan],'daily average of New cases':[np.nan],'daily average of Deaths':[np.nan],'month':[month],'Country':[country]})
    # BUG FIX: the single-day check must come BEFORE any division by nb_day;
    # previously it was the fourth branch, so the two mixed-sign branches
    # below could raise ZeroDivisionError when nb_day == 0.
    elif nb_day == 0:
        monthreport = pd.DataFrame({'New cases':'notEnoughdatas','Deaths':'notEnoughdatas','daily average of New cases':'notEnoughdatas','daily average of Deaths':'notEnoughdatas','month':[month],'Country':[country]})
    elif (res_cases >= 0) & (res_kill < 0):
        monthreport = pd.DataFrame({'New cases':[res_cases],'Deaths':[np.nan],'daily average of New cases':[res_cases/nb_day],'daily average of Deaths':[np.nan],'month':[month],'Country':[country]})
    elif (res_cases < 0) & (res_kill >= 0):
        monthreport = pd.DataFrame({'New cases':[np.nan],'Deaths':[res_kill],'daily average of New cases':[np.nan],'daily average of Deaths':[res_kill/nb_day],'month':[month],'Country':[country]})
    else:
        monthreport = pd.DataFrame({'New cases':[res_cases],'Deaths':[res_kill],'daily average of New cases':[res_cases/nb_day],'daily average of Deaths':[res_kill/nb_day],'month':[month],'Country':[country]})
    return monthreport
# Helper: test whether any rows exist for a given country and month.
def checkData(df, month, country):
    """Return the subset of `df` for (`country`, `month`); an empty frame
    means there is no data for that combination."""
    selection = (df['Country'] == country) & (df['Month'] == month)
    return df.loc[selection]
# Return a dataframe with all the infos (monthly new cases / deaths and daily
# averages) for each month and each country.
def getResults(data):
    """Assemble the per-month reports produced by getmonthresults() for the
    three countries into one DataFrame, skipping every (country, month)
    combination that has no data."""
    countries = ['Guinea', 'Liberia', 'Sierra Leone']
    months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
              'August', 'September', 'October', 'November', 'December']
    results = []
    # (the unused `compteur` counter of the original version was removed)
    for country in countries:
        for month in months:
            # only build a report when data exists for this country and month
            if not checkData(data, month, country).empty:
                results.append(getmonthresults(data, country, month))
    return pd.concat(results)
In [3]:
# ---- Guinea ----
# import data from guinea (one CSV per daily report)
path_guinea = 'Ebola/guinea_data/'
data_guinea = importdata(path_guinea,'Date')
# keep only the relevant columns and tag every row with the country name
data_guinea = data_guinea[['Date', 'Description','Totals']]
data_guinea['Country'] = ['Guinea']*len(data_guinea)
# keep only the rows holding cumulative confirmed cases and confirmed deaths;
# descr(newcases): "Total cases of confirmed" // descr(deaths): "Total deaths of confirmed"
data_guinea = data_guinea.loc[(data_guinea.Description=='Total cases of confirmed')|(data_guinea.Description=='Total deaths of confirmed')]
# ---- Liberia ----
# import data from liberia; its files use 'Variable'/'National' instead of
# 'Description'/'Totals'
path_liberia = 'Ebola/liberia_data/'
data_liberia = importdata(path_liberia,'Date')
# keep only the relevant columns and tag every row with the country name
data_liberia = data_liberia[['Date', 'Variable','National']]
data_liberia['Country'] = ['Liberia']*len(data_liberia)
# keep only the confirmed cases/deaths rows;
# descr(newcases): "Total confirmed cases" // descr(deaths): "Total death/s in confirmed cases"
data_liberia = data_liberia.loc[(data_liberia.Variable=='Total confirmed cases')|(data_liberia.Variable=='Total death/s in confirmed cases')]
# rename the columns so the three data sets can be concatenated
data_liberia = data_liberia.rename(columns={'Date': 'Date', 'Variable': 'Description','National':'Totals'})
# ---- Sierra Leone ----
# import data from sierra leone (note the lower-case 'date' column name)
path_sl = 'Ebola/sl_data/'
data_sl = importdata(path_sl,'date')
# keep only the relevant columns and tag every row with the country name
data_sl = data_sl[['date', 'variable','National']]
data_sl['Country'] = ['Sierra Leone']*len(data_sl)
# keep only the confirmed cases/deaths rows;
# descr(newcases): "cum_confirmed" // descr(deaths): "death_confirmed"
data_sl = data_sl.loc[(data_sl.variable=='cum_confirmed')|(data_sl.variable=='death_confirmed')]
# rename the columns so the three data sets can be concatenated
data_sl = data_sl.rename(columns={'date': 'Date', 'variable': 'Description','National':'Totals'})
# concatenate the 3 dataframes into ONE on which the analysis is applied
dataFrame = [data_guinea,data_liberia,data_sl]
data = pd.concat(dataFrame)
# Replace the NaN by 0 so 'Totals' can later be cast to int in getmonthresults
data = data.fillna(0)
# add a 'Month' column with the English month name (see add_month)
data = add_month(data)
# compute the monthly new cases / deaths per country (see getResults)
results = getResults(data)
# display the results table
results
Out[3]:
In the `DATA_FOLDER/microbiome` subdirectory, there are 9 spreadsheets of microbiome data that were acquired from high-throughput RNA sequencing procedures, along with a 10th file that describes the content of each.

Use pandas to import the first 9 spreadsheets into a single DataFrame.
Then, add the metadata information from the 10th spreadsheet as columns in the combined DataFrame.
Make sure that the final DataFrame has a unique index and that all the NaN values have been replaced by the tag `unknown`.
In [4]:
# Load the 10th spreadsheet: the metadata describing each of the nine MID files
Sheet10_Meta = pd.read_excel(DATA_FOLDER +'microbiome/metadata.xls')
# Collect the paths of the nine data spreadsheets (MID1.xls .. MID9.xls)
allFiles = glob.glob(DATA_FOLDER + 'microbiome' + "/MID*.xls")
# NOTE(review): glob does not guarantee a sorted order, yet the metadata rows
# are later matched to these files by position — verify this ordering assumption
allFiles
Out[4]:
In [5]:
# Read the nine spreadsheets and attach the matching metadata row to each.
frames = []
for K, file_path in enumerate(allFiles):
    # Each spreadsheet has no header row: column 0 is the taxon name,
    # column 1 the measured value.
    df = pd.read_excel(file_path, header=None)
    # Metadata row K is assumed to describe file K of the glob listing —
    # TODO confirm the ordering (glob order is not guaranteed to be sorted).
    df['BARCODE'] = Sheet10_Meta.at[K, 'BARCODE']
    df['GROUP'] = Sheet10_Meta.at[K, 'GROUP']
    df['SAMPLE'] = Sheet10_Meta.at[K, 'SAMPLE']
    frames.append(df)
# A single pd.concat is linear instead of quadratic, and DataFrame.append
# (used previously) was removed in pandas 2.0.
Combined_data = pd.concat(frames)
# Renaming the columns with meaningful names
Combined_data.columns = ['Name', 'Value', 'BARCODE', 'GROUP', 'SAMPLE']
Combined_data.head()
Out[5]:
3. Cleaning and reindexing
First we get rid of the NaN values by replacing them with "unknown". Then, in order to have a more meaningful index, we reset it to be the name of the RNA sequence.
In [6]:
# Replacing the NaN values with the tag 'unknown'
Combined_data = Combined_data.fillna('unknown')
# Using the RNA-sequence name as the index
# NOTE(review): the task requires a *unique* index — the same taxon name can
# appear in several of the nine files, so verify uniqueness of this index
Combined_data = Combined_data.set_index('Name')
# Showing the result
Combined_data
Out[6]:
Import the titanic data into a DataFrame with a unique index, describe the value ranges of its attributes, and convert the suitable attributes to Categorical.

Assumptions:
- "For each exercise, please provide both a written explanation of the steps you will apply to manipulate the data, and the corresponding code." We assume that the written explanation can come in the form of commented code as well as text.
- We assume that we must not describe the value range of attributes that contain strings, as string lengths or ASCII values don't give any insight.
In [7]:
'''
Here is a sample of the information in the titanic dataframe
'''
# Importing titanic.xls info with Pandas
titanic = pd.read_excel('Data/titanic.xls')
# Show the first 30 rows.  The original `print(titanic.head)` was missing the
# call parentheses, so it printed the bound method instead of the data, and
# never honoured the intended row count.
titanic.head(30)
In [8]:
'''
To describe the INTENDED values and types of the data we will show you the titanic.html file that was provided to us
Notice:
- 'age' is of type double, so someone can be 17.5 years old, mostly used with babies that are 0.x years old
- 'cabin' is stored as integer, but it har characters and letters
- By this model, embarked is stored as an integer, witch has to be interpreted as the 3 different embarkation ports
- It says that 'boat' is stored as a integer even though it has spaces and letters, it should be stored as string
PS: it might be that the information stored as integer is supposed to be categorical data,
...because they have a "small" amount of valid options
'''
# Display html info in Jupyter Notebook.
# IPython.core.display is a deprecated import path; IPython.display is public.
from IPython.display import display, HTML
htmlFile = 'Data/titanic.html'
# HTML(htmlFile) would render the literal string 'Data/titanic.html' as markup;
# filename= tells IPython to read the file and render its contents.
display(HTML(filename=htmlFile))
In [9]:
'''
The default types of the data after import:
Notice:
- the strings and characters are imported as objects
- 'survived' is imported as int instead of double (which is in our opinion better since it's only 0 and 1
- 'sex' is imported as object not integer because it is a string
'''
# Show the dtype pandas inferred for each column
titanic.dtypes
Out[9]:
In [10]:
'''
Below you can see the value range of the different numerical values.
name, sex, ticket, cabin, embarked, boat and home.dest is not included because they can't be quantified numerically.
'''
# Summary statistics (count/mean/std/min/quartiles/max) of the numeric columns
titanic.describe()
Out[10]:
In [11]:
'''
Additional information that is important to remember when manipulation the data
is if/where there are NaN values in the dataset
'''
# Count the NaN entries in every column
print(titanic.isnull().sum())
'''
Some of this data is missing while some is meant to describe 'No' or something of meaning.
Example:
Cabin has 1014 NaN in its column, it might be that every passenger had a cabin and the data is missing.
Or it could mean that most passengers did not have a cabin or a mix. The displayed titanic.html file
give us some insight if it is correct. It says that there are 0 NaN in the column. This indicates that
there are 1014 people without a cabin. Boat has also 823 NaN's, while the titanic lists 0 NaN's.
It is probably because most of those who died probably weren't in a boat.
'''
Out[11]:
In [12]:
'''
What attributes should be stored as categorical information?
Categorical data is essentially 8-bit integers which means it can store up to 2^8 = 256 categories
Benefit is that it makes memory usage lower and it has a performance increase in calculations.
'''
# Count distinct values (NaN included) per column to find categorical candidates
print('Number of unique values in... :')
for column in titanic:
    distinct_values = len(titanic[column].unique())
    print(f" {column}: {distinct_values}")
In [13]:
'''
We think it will be smart to categorize: 'pclass', 'survived', 'sex', 'cabin', 'embarked' and 'boat'
because they have under 256 categories and don't have a strong numerical value like 'age'
'survived' is a bordercase because it might be more practical to work with integers in some settings
'''
# convert each chosen attribute to the pandas 'category' dtype
for column in ('pclass', 'survived', 'sex', 'cabin', 'embarked', 'boat'):
    titanic[column] = titanic[column].astype('category')
# illustrate the change by showing the new dtypes
titanic.dtypes
Out[13]:
In [14]:
# Plotting how many passengers travelled in each class (1st, 2nd and 3rd)
pc = titanic.pclass.value_counts().sort_index().plot(kind='bar')
pc.set_title('Travel classes')
pc.set_ylabel('Number of passengers')
pc.set_xlabel('Travel class')
pc.set_xticklabels(('1st class', '2nd class', '3rd class'))
# plt.show() takes no data argument; the original plt.show(pc) passed the Axes
# into the `block` parameter (positional args are rejected by modern matplotlib)
plt.show()
# Plotting the amount of people that embarked from different cities (C=Cherbourg, Q=Queenstown, S=Southampton)
em = titanic.embarked.value_counts().sort_index().plot(kind='bar')
em.set_title('Ports of embarkation')
em.set_ylabel('Number of passengers')
em.set_xlabel('Port of embarkation')
em.set_xticklabels(('Cherbourg', 'Queenstown', 'Southampton'))
plt.show()
# Plotting the gender distribution of the passengers
sex = titanic.sex.value_counts().plot(kind='bar')
sex.set_title('Gender of the passengers')
sex.set_ylabel('Number of Passengers')
sex.set_xlabel('Gender')
sex.set_xticklabels(('Female', 'Male'))
plt.show()
# Plotting the age distribution of the passengers in 10-year bins
bins = [0,10,20,30,40,50,60,70,80]
age_grouped = pd.DataFrame(pd.cut(titanic.age, bins))
ag = age_grouped.age.value_counts().sort_index().plot.bar()
ag.set_title('Age of Passengers ')
ag.set_ylabel('Number of passengers')
ag.set_xlabel('Age groups')
plt.show()
assumptions:
In [15]:
'''
Parsing the cabinfloor, into floors A, B, C, D, E, F, G, T and display in a pie chart
'''
# Dropping NaN (people without a cabin)
cabin_floors = titanic.cabin.dropna()
# remove digits and spaces; regex=True must be explicit because since
# pandas 2.0 Series.str.replace defaults to literal (non-regex) matching
cabin_floors = cabin_floors.str.replace(r'[\d ]+', '', regex=True)
# remove duplicated letters, keeping one occurrence (CC -> C)
cabin_floors = cabin_floors.str.replace(r'(.)(?=.*\1)', '', regex=True)
# tag ambiguous multi-letter entries with the literal string 'NaN' (FE -> NaN)
cabin_floors = cabin_floors.str.replace(r'([A-Z]{1})\w+', 'NaN', regex=True)
# Recategorizing (since the entries were altered, the categories changed)
cabin_floors = cabin_floors.astype('category')
# Removing the 'NaN' category (the ambiguous data) and the resulting NaN rows
cabin_floors = cabin_floors.cat.remove_categories('NaN')
cabin_floors = cabin_floors.dropna()
# Preparing data for plt.pie
numberOfCabinPlaces = cabin_floors.count()
grouped = cabin_floors.groupby(cabin_floors).count()
sizes = np.array(grouped)
labels = np.array(grouped.index)
# Plotting the pie chart
plt.pie(sizes, labels=labels, autopct='%1.1f%%', pctdistance=0.75, labeldistance=1.1)
print("There are {cabin} passengers that have cabins and {nocabin} passengers without a cabin"
      .format(cabin=numberOfCabinPlaces, nocabin=(len(titanic) - numberOfCabinPlaces)))
In [16]:
# function that returns the number of people that survived and died given a specific travelclass
def survivedPerClass(pclass):
    """Return [survived, died] counts for one travel class.

    Uses a single combined boolean mask instead of the previous chained
    indexing (s[mask1][mask2]), which reindexed the second mask and raised
    a pandas 'Boolean Series key will be reindexed' warning.
    """
    in_class = titanic.pclass == pclass
    survived = int(((titanic.survived == 1) & in_class).sum())
    died = int(((titanic.survived == 0) & in_class).sum())
    return [survived, died]
# Fixing the layout horizontal
the_grid = plt.GridSpec(1, 3)
labels = ["Survived", "Died"]
# Each iteration plots one pie chart (one per travel class)
for p in titanic.pclass.unique():
    sizes = survivedPerClass(p)
    plt.subplot(the_grid[0, p-1], aspect=1)
    plt.pie(sizes, labels=labels, autopct='%1.1f%%')
plt.show()
In [23]:
# group by selected data and get a count for each category
survivalrate = titanic.groupby(['pclass', 'sex', 'survived']).size()
# calculate the percentage within each (pclass, sex) group
survivalpercentage = survivalrate.groupby(level=['pclass', 'sex']).apply(lambda x: x / x.sum() * 100)
# keep only the survived==1 rows.  The previous `.filter(like='1', axis=0)`
# substring-matched the stringified index labels, so every pclass-1 row
# (survivors AND deaths) was also selected; .xs selects exactly on the
# 'survived' index level.
histogram = survivalpercentage.xs(1, level='survived').plot(kind='bar')
histogram.set_title('Proportion of the passengers that survived by travel class and sex')
histogram.set_ylabel('Percent likelyhood of surviving titanic')
histogram.set_xlabel('class/gender group')
# plt.show() takes no data argument (plt.show(histogram) misused `block`)
plt.show()
assumptions:
In [24]:
# drop the rows with a missing age
age_without_nan = titanic.age.dropna()
# split into two (roughly) equal-frequency bins around the median
age_categories = pd.qcut(age_without_nan, 2, labels=["Younger", "Older"])
# Numbers to explain the difference between the two bins
median = int(age_without_nan.median())
# BUG FIX: `age_without_nan[median]` looked up the passenger whose *index
# label* equals the median age (returning that passenger's age), not the
# number of passengers whose age equals the median.
amount = int((age_without_nan == median).sum())
print("The Median age is {median} years old".format(median = median))
print("and there are {amount} passengers that are {median} year old \n".format(amount=amount, median=median))
print(age_categories.groupby(age_categories).count())
print("\nAs you can see the pd.qcut does not cut into entirely equal sized bins, because the age is of a discreet nature")
In [25]:
# extract the relevant attributes; an explicit .copy() avoids the
# SettingWithCopyWarning that previously had to be silenced with a global
# warnings.filterwarnings('ignore')
csas = titanic[['pclass', 'sex', 'age', 'survived']].copy()
csas = csas.dropna(subset=['age'])
# Defining the two age categories relative to the median age
csas['age_group'] = csas.age > csas.age.median()
csas['age_group'] = csas['age_group'].map(lambda age_category: 'older' if age_category else "younger")
# Converting to int so the mean() below yields a survival proportion
csas['survived'] = csas['survived'].astype(int)
g_categories = csas.groupby(['pclass', 'age_group', 'sex'])
result = pd.DataFrame(g_categories.survived.mean()).rename(columns={'survived': 'survived proportion'})
# reset the current index and build one unique index string per group
result = result.reset_index()
unique_index = result.pclass.astype(str) + ': ' + result.age_group.astype(str) + ' ' + result.sex.astype(str)
# Finalize the unique-index dataframe (non-inplace: no hidden mutation of a slice)
result_w_unique = result[['survived proportion']].set_index(unique_index)
print(result_w_unique)