In [1]:
# Imports
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import csv
import calendar
import webbrowser
from datetime import datetime
# Constants
DATA_FOLDER = 'Data/'
The `DATA_FOLDER/ebola` folder contains summarized reports of Ebola cases from three countries (Guinea, Liberia and Sierra Leone) during the recent outbreak of the disease in West Africa. For each country, there are daily reports that contain various information about the outbreak in several cities in each country.

Use pandas to import these data files into a single DataFrame.
Using this DataFrame, calculate for each country the daily average per month of new cases and deaths.
Make sure you handle all the different expressions for new cases and deaths that are used in the reports.
In [2]:
'''
Functions needed to solve task 1
'''
# Helper: load all daily CSV reports for one country into a single DataFrame.
def importdata(path, date):
    """Read every CSV report under DATA_FOLDER/path and stack them into one
    DataFrame, parsing the column named by `date` as datetimes."""
    report_files = glob.glob(DATA_FOLDER + path + '/*.csv')
    daily_reports = [pd.read_csv(f, parse_dates=[date]) for f in report_files]
    return pd.concat(daily_reports)
# Helper: derive the English month name for every row's Date.
def add_month(df):
    """Return a copy of `df` with an extra 'Month' column holding the English
    month name (e.g. 'January') taken from each row's Date value.
    The input frame is left untouched."""
    result = df.copy()
    result['Month'] = result.Date.apply(lambda d: calendar.month_name[d.month])
    return result
# Helper: select the rows for one country / description label / month.
def chooseCountry_month(dataframe, country, descr, month):
    """Return the subset of `dataframe` whose 'Country' equals `country`,
    whose 'Description' equals `descr` and whose 'Month' equals `month`."""
    country_rows = (dataframe['Country'] == country) & (dataframe['Description'] == descr)
    subset = dataframe.loc[country_rows]
    return subset.loc[subset['Month'] == month]
# Create a one-row dataframe with the number of deaths, the new cases and the
# daily averages for a country and a specified month.
def getmonthresults(dataframe, country, month):
    """Compute the monthly new cases / deaths and their daily averages for
    `country` during `month`.

    The three countries' reports use different description labels for the
    cumulative confirmed cases / deaths, so the right pair is picked per
    country.  Returns a one-row DataFrame.  NaN marks inconsistent (negative)
    monthly differences; the string 'notEnoughdatas' marks months whose
    reports span a single day (no daily average can be computed).
    """
    if country == 'Liberia':
        descr_kill = 'Total death/s in confirmed cases'
        descr_cases = 'Total confirmed cases'
    if country == 'Guinea':
        descr_kill = 'Total deaths of confirmed'
        descr_cases = 'Total cases of confirmed'
    if country == 'Sierra Leone':
        descr_kill = 'death_confirmed'
        descr_cases = 'cum_confirmed'
    df_kill = chooseCountry_month(dataframe, country, descr_kill, month)
    df_cases = chooseCountry_month(dataframe, country, descr_cases, month)
    # Monthly difference between the last and the first cumulative report.
    res_kill = int(df_kill.iloc[len(df_kill)-1].Totals) - int(df_kill.iloc[0].Totals)
    res_cases = int(df_cases.iloc[len(df_cases)-1].Totals) - int(df_cases.iloc[0].Totals)
    # Number of days covered = last report day - first report day (same month).
    nb_day = df_kill.iloc[len(df_kill)-1].Date.day - df_kill.iloc[0].Date.day
    # Some excel files are inconsistent and yield negative differences;
    # those values are replaced by NaN.
    if (res_cases < 0) & (res_kill < 0):
        monthreport = pd.DataFrame({'New cases':[np.nan],'Deaths':[np.nan],'daily average of New cases':[np.nan],'daily average of Deaths':[np.nan],'month':[month],'Country':[country]})
    # BUG FIX: the single-day check must come BEFORE any division by nb_day;
    # previously it was the fourth branch, so the two mixed-sign branches
    # below could raise ZeroDivisionError when nb_day == 0.
    elif nb_day == 0:
        monthreport = pd.DataFrame({'New cases':'notEnoughdatas','Deaths':'notEnoughdatas','daily average of New cases':'notEnoughdatas','daily average of Deaths':'notEnoughdatas','month':[month],'Country':[country]})
    elif (res_cases >= 0) & (res_kill < 0):
        monthreport = pd.DataFrame({'New cases':[res_cases],'Deaths':[np.nan],'daily average of New cases':[res_cases/nb_day],'daily average of Deaths':[np.nan],'month':[month],'Country':[country]})
    elif (res_cases < 0) & (res_kill >= 0):
        monthreport = pd.DataFrame({'New cases':[np.nan],'Deaths':[res_kill],'daily average of New cases':[np.nan],'daily average of Deaths':[res_kill/nb_day],'month':[month],'Country':[country]})
    else:
        monthreport = pd.DataFrame({'New cases':[res_cases],'Deaths':[res_kill],'daily average of New cases':[res_cases/nb_day],'daily average of Deaths':[res_kill/nb_day],'month':[month],'Country':[country]})
    return monthreport
# Helper: test whether any rows exist for a given country and month.
def checkData(df, month, country):
    """Return the subset of `df` for (`country`, `month`); an empty frame
    means there is no data for that combination."""
    selection = (df['Country'] == country) & (df['Month'] == month)
    return df.loc[selection]
# Return a dataframe with all the infos (monthly new cases / deaths and daily
# averages) for each month and each country.
def getResults(data):
    """Assemble the per-month reports produced by getmonthresults() for the
    three countries into one DataFrame, skipping every (country, month)
    combination that has no data."""
    countries = ['Guinea', 'Liberia', 'Sierra Leone']
    months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
              'August', 'September', 'October', 'November', 'December']
    results = []
    # (the unused `compteur` counter of the original version was removed)
    for country in countries:
        for month in months:
            # only build a report when data exists for this country and month
            if not checkData(data, month, country).empty:
                results.append(getmonthresults(data, country, month))
    return pd.concat(results)
In [3]:
# ---- Guinea ----
# import data from guinea (one CSV per daily report)
path_guinea = 'Ebola/guinea_data/'
data_guinea = importdata(path_guinea,'Date')
# keep only the relevant columns and tag every row with the country name
data_guinea = data_guinea[['Date', 'Description','Totals']]
data_guinea['Country'] = ['Guinea']*len(data_guinea)
# keep only the rows holding cumulative confirmed cases and confirmed deaths;
# descr(newcases): "Total cases of confirmed" // descr(deaths): "Total deaths of confirmed"
data_guinea = data_guinea.loc[(data_guinea.Description=='Total cases of confirmed')|(data_guinea.Description=='Total deaths of confirmed')]
# ---- Liberia ----
# import data from liberia; its files use 'Variable'/'National' instead of
# 'Description'/'Totals'
path_liberia = 'Ebola/liberia_data/'
data_liberia = importdata(path_liberia,'Date')
# keep only the relevant columns and tag every row with the country name
data_liberia = data_liberia[['Date', 'Variable','National']]
data_liberia['Country'] = ['Liberia']*len(data_liberia)
# keep only the confirmed cases/deaths rows;
# descr(newcases): "Total confirmed cases" // descr(deaths): "Total death/s in confirmed cases"
data_liberia = data_liberia.loc[(data_liberia.Variable=='Total confirmed cases')|(data_liberia.Variable=='Total death/s in confirmed cases')]
# rename the columns so the three data sets can be concatenated
data_liberia = data_liberia.rename(columns={'Date': 'Date', 'Variable': 'Description','National':'Totals'})
# ---- Sierra Leone ----
# import data from sierra leone (note the lower-case 'date' column name)
path_sl = 'Ebola/sl_data/'
data_sl = importdata(path_sl,'date')
# keep only the relevant columns and tag every row with the country name
data_sl = data_sl[['date', 'variable','National']]
data_sl['Country'] = ['Sierra Leone']*len(data_sl)
# keep only the confirmed cases/deaths rows;
# descr(newcases): "cum_confirmed" // descr(deaths): "death_confirmed"
data_sl = data_sl.loc[(data_sl.variable=='cum_confirmed')|(data_sl.variable=='death_confirmed')]
# rename the columns so the three data sets can be concatenated
data_sl = data_sl.rename(columns={'date': 'Date', 'variable': 'Description','National':'Totals'})
# concatenate the 3 dataframes into ONE on which the analysis is applied
dataFrame = [data_guinea,data_liberia,data_sl]
data = pd.concat(dataFrame)
# Replace the NaN by 0 so 'Totals' can later be cast to int in getmonthresults
data = data.fillna(0)
# add a 'Month' column with the English month name (see add_month)
data = add_month(data)
# compute the monthly new cases / deaths per country (see getResults)
results = getResults(data)
# display the results table
results
Out[3]:
In the `DATA_FOLDER/microbiome` subdirectory, there are 9 spreadsheets of microbiome data that were acquired from high-throughput RNA sequencing procedures, along with a 10th file that describes the content of each.

Use pandas to import the first 9 spreadsheets into a single DataFrame.
Then, add the metadata information from the 10th spreadsheet as columns in the combined DataFrame.
Make sure that the final DataFrame has a unique index and that all the NaN values have been replaced by the tag `unknown`.
In [4]:
# Load the 10th spreadsheet: the metadata describing each of the nine MID files
Sheet10_Meta = pd.read_excel(DATA_FOLDER +'microbiome/metadata.xls')
# Collect the paths of the nine data spreadsheets (MID1.xls .. MID9.xls)
allFiles = glob.glob(DATA_FOLDER + 'microbiome' + "/MID*.xls")
# NOTE(review): glob does not guarantee a sorted order, yet the metadata rows
# are later matched to these files by position — verify this ordering assumption
allFiles
Out[4]:
In [5]:
# Read the nine spreadsheets and attach the matching metadata row to each.
frames = []
for K, file_path in enumerate(allFiles):
    # Each spreadsheet has no header row: column 0 is the taxon name,
    # column 1 the measured value.
    df = pd.read_excel(file_path, header=None)
    # Metadata row K is assumed to describe file K of the glob listing —
    # TODO confirm the ordering (glob order is not guaranteed to be sorted).
    df['BARCODE'] = Sheet10_Meta.at[K, 'BARCODE']
    df['GROUP'] = Sheet10_Meta.at[K, 'GROUP']
    df['SAMPLE'] = Sheet10_Meta.at[K, 'SAMPLE']
    frames.append(df)
# A single pd.concat is linear instead of quadratic, and DataFrame.append
# (used previously) was removed in pandas 2.0.
Combined_data = pd.concat(frames)
# Renaming the columns with meaningful names
Combined_data.columns = ['Name', 'Value', 'BARCODE', 'GROUP', 'SAMPLE']
Combined_data.head()
Out[5]:
3. Cleaning and reindexing
First we get rid of the NaN values by replacing them with "unknown". Then, in order to have a more meaningful index, we reset it to be the name of the RNA sequence.
In [6]:
# Replacing the NaN values with the tag 'unknown'
Combined_data = Combined_data.fillna('unknown')
# Using the RNA-sequence name as the index
# NOTE(review): the task requires a *unique* index — the same taxon name can
# appear in several of the nine files, so verify uniqueness of this index
Combined_data = Combined_data.set_index('Name')
# Showing the result
Combined_data
Out[6]:
Import the titanic data into a DataFrame with a unique index, describe the value ranges of its attributes, and convert the suitable attributes to Categorical.

Assumptions:
- "For each exercise, please provide both a written explanation of the steps you will apply to manipulate the data, and the corresponding code." We assume that the written explanation can come in the form of commented code as well as text.
- We assume that we must not describe the value range of attributes that contain strings, as string lengths or ASCII values don't give any insight.
In [7]:
'''
Here is a sample of the information in the titanic dataframe
'''
# Importing titanic.xls info with Pandas
titanic = pd.read_excel('Data/titanic.xls')
# Show the first 30 rows.  The original `print(titanic.head)` was missing the
# call parentheses, so it printed the bound method instead of the data, and
# never honoured the intended row count.
titanic.head(30)
In [8]:
'''
To describe the INTENDED values and types of the data we will show you the titanic.html file that was provided to us
Notice:
- 'age' is of type double, so someone can be 17.5 years old, mostly used with babies that are 0.x years old
- 'cabin' is stored as integer, but it har characters and letters
- By this model, embarked is stored as an integer, witch has to be interpreted as the 3 different embarkation ports
- It says that 'boat' is stored as a integer even though it has spaces and letters, it should be stored as string
PS: it might be that the information stored as integer is supposed to be categorical data,
...because they have a "small" amount of valid options
'''
# Display html info in Jupyter Notebook.
# IPython.core.display is a deprecated import path; IPython.display is public.
from IPython.display import display, HTML
htmlFile = 'Data/titanic.html'
# HTML(htmlFile) would render the literal string 'Data/titanic.html' as markup;
# filename= tells IPython to read the file and render its contents.
display(HTML(filename=htmlFile))
In [9]:
'''
The default types of the data after import:
Notice:
- the strings and characters are imported as objects
- 'survived' is imported as int instead of double (which is in our opinion better since it's only 0 and 1
- 'sex' is imported as object not integer because it is a string
'''
# Show the dtype pandas inferred for each column
titanic.dtypes
Out[9]:
In [10]:
'''
Below you can see the value range of the different numerical values.
name, sex, ticket, cabin, embarked, boat and home.dest is not included because they can't be quantified numerically.
'''
# Summary statistics (count/mean/std/min/quartiles/max) of the numeric columns
titanic.describe()
Out[10]:
In [11]:
'''
Additional information that is important to remember when manipulation the data
is if/where there are NaN values in the dataset
'''
# Count the NaN entries in every column
print(titanic.isnull().sum())
'''
Some of this data is missing while some is meant to describe 'No' or something of meaning.
Example:
Cabin has 1014 NaN in its column, it might be that every passenger had a cabin and the data is missing.
Or it could mean that most passengers did not have a cabin or a mix. The displayed titanic.html file
give us some insight if it is correct. It says that there are 0 NaN in the column. This indicates that
there are 1014 people without a cabin. Boat has also 823 NaN's, while the titanic lists 0 NaN's.
It is probably because most of those who died probably weren't in a boat.
'''
Out[11]:
In [12]:
'''
What attributes should be stored as categorical information?
Categorical data is essentially 8-bit integers which means it can store up to 2^8 = 256 categories
Benefit is that it makes memory usage lower and it has a performance increase in calculations.
'''
# Count distinct values (NaN included) per column to find categorical candidates
print('Number of unique values in... :')
for column in titanic:
    distinct_values = len(titanic[column].unique())
    print(f" {column}: {distinct_values}")
In [13]:
'''
We think it will be smart to categorize: 'pclass', 'survived', 'sex', 'cabin', 'embarked' and 'boat'
because they have under 256 categories and don't have a strong numerical value like 'age'
'survived' is a bordercase because it might be more practical to work with integers in some settings
'''
# convert each chosen attribute to the pandas 'category' dtype
for column in ('pclass', 'survived', 'sex', 'cabin', 'embarked', 'boat'):
    titanic[column] = titanic[column].astype('category')
# illustrate the change by showing the new dtypes
titanic.dtypes
Out[13]:
In [14]:
# Plotting how many passengers travelled in each class (1st, 2nd and 3rd)
pc = titanic.pclass.value_counts().sort_index().plot(kind='bar')
pc.set_title('Travel classes')
pc.set_ylabel('Number of passengers')
pc.set_xlabel('Travel class')
pc.set_xticklabels(('1st class', '2nd class', '3rd class'))
# plt.show() takes no data argument; the original plt.show(pc) passed the Axes
# into the `block` parameter (positional args are rejected by modern matplotlib)
plt.show()
# Plotting the amount of people that embarked from different cities (C=Cherbourg, Q=Queenstown, S=Southampton)
em = titanic.embarked.value_counts().sort_index().plot(kind='bar')
em.set_title('Ports of embarkation')
em.set_ylabel('Number of passengers')
em.set_xlabel('Port of embarkation')
em.set_xticklabels(('Cherbourg', 'Queenstown', 'Southampton'))
plt.show()
# Plotting the gender distribution of the passengers
sex = titanic.sex.value_counts().plot(kind='bar')
sex.set_title('Gender of the passengers')
sex.set_ylabel('Number of Passengers')
sex.set_xlabel('Gender')
sex.set_xticklabels(('Female', 'Male'))
plt.show()
# Plotting the age distribution of the passengers in 10-year bins
bins = [0,10,20,30,40,50,60,70,80]
age_grouped = pd.DataFrame(pd.cut(titanic.age, bins))
ag = age_grouped.age.value_counts().sort_index().plot.bar()
ag.set_title('Age of Passengers ')
ag.set_ylabel('Number of passengers')
ag.set_xlabel('Age groups')
plt.show()
assumptions:
In [15]:
'''
Parsing the cabinfloor, into floors A, B, C, D, E, F, G, T and display in a pie chart
'''
# Dropping NaN (people without a cabin)
cabin_floors = titanic.cabin.dropna()
# remove digits and spaces; regex=True must be explicit because since
# pandas 2.0 Series.str.replace defaults to literal (non-regex) matching
cabin_floors = cabin_floors.str.replace(r'[\d ]+', '', regex=True)
# remove duplicated letters, keeping one occurrence (CC -> C)
cabin_floors = cabin_floors.str.replace(r'(.)(?=.*\1)', '', regex=True)
# tag ambiguous multi-letter entries with the literal string 'NaN' (FE -> NaN)
cabin_floors = cabin_floors.str.replace(r'([A-Z]{1})\w+', 'NaN', regex=True)
# Recategorizing (since the entries were altered, the categories changed)
cabin_floors = cabin_floors.astype('category')
# Removing the 'NaN' category (the ambiguous data) and the resulting NaN rows
cabin_floors = cabin_floors.cat.remove_categories('NaN')
cabin_floors = cabin_floors.dropna()
# Preparing data for plt.pie
numberOfCabinPlaces = cabin_floors.count()
grouped = cabin_floors.groupby(cabin_floors).count()
sizes = np.array(grouped)
labels = np.array(grouped.index)
# Plotting the pie chart
plt.pie(sizes, labels=labels, autopct='%1.1f%%', pctdistance=0.75, labeldistance=1.1)
print("There are {cabin} passengers that have cabins and {nocabin} passengers without a cabin"
      .format(cabin=numberOfCabinPlaces, nocabin=(len(titanic) - numberOfCabinPlaces)))
In [16]:
# function that returns the number of people that survived and died given a specific travelclass
def survivedPerClass(pclass):
    """Return [survived, died] counts for one travel class.

    Uses a single combined boolean mask instead of the previous chained
    indexing (s[mask1][mask2]), which reindexed the second mask and raised
    a pandas 'Boolean Series key will be reindexed' warning.
    """
    in_class = titanic.pclass == pclass
    survived = int(((titanic.survived == 1) & in_class).sum())
    died = int(((titanic.survived == 0) & in_class).sum())
    return [survived, died]
# Fixing the layout horizontal
the_grid = plt.GridSpec(1, 3)
labels = ["Survived", "Died"]
# Each iteration plots one pie chart (one per travel class)
for p in titanic.pclass.unique():
    sizes = survivedPerClass(p)
    plt.subplot(the_grid[0, p-1], aspect=1)
    plt.pie(sizes, labels=labels, autopct='%1.1f%%')
plt.show()
In [23]:
# group by selected data and get a count for each category
survivalrate = titanic.groupby(['pclass', 'sex', 'survived']).size()
# calculate the percentage within each (pclass, sex) group
survivalpercentage = survivalrate.groupby(level=['pclass', 'sex']).apply(lambda x: x / x.sum() * 100)
# keep only the survived==1 rows.  The previous `.filter(like='1', axis=0)`
# substring-matched the stringified index labels, so every pclass-1 row
# (survivors AND deaths) was also selected; .xs selects exactly on the
# 'survived' index level.
histogram = survivalpercentage.xs(1, level='survived').plot(kind='bar')
histogram.set_title('Proportion of the passengers that survived by travel class and sex')
histogram.set_ylabel('Percent likelyhood of surviving titanic')
histogram.set_xlabel('class/gender group')
# plt.show() takes no data argument (plt.show(histogram) misused `block`)
plt.show()
assumptions:
In [24]:
# drop the rows with a missing age
age_without_nan = titanic.age.dropna()
# split into two (roughly) equal-frequency bins around the median
age_categories = pd.qcut(age_without_nan, 2, labels=["Younger", "Older"])
# Numbers to explain the difference between the two bins
median = int(age_without_nan.median())
# BUG FIX: `age_without_nan[median]` looked up the passenger whose *index
# label* equals the median age (returning that passenger's age), not the
# number of passengers whose age equals the median.
amount = int((age_without_nan == median).sum())
print("The Median age is {median} years old".format(median = median))
print("and there are {amount} passengers that are {median} year old \n".format(amount=amount, median=median))
print(age_categories.groupby(age_categories).count())
print("\nAs you can see the pd.qcut does not cut into entirely equal sized bins, because the age is of a discreet nature")
In [25]:
# extract the relevant attributes; an explicit .copy() avoids the
# SettingWithCopyWarning that previously had to be silenced with a global
# warnings.filterwarnings('ignore')
csas = titanic[['pclass', 'sex', 'age', 'survived']].copy()
csas = csas.dropna(subset=['age'])
# Defining the two age categories relative to the median age
csas['age_group'] = csas.age > csas.age.median()
csas['age_group'] = csas['age_group'].map(lambda age_category: 'older' if age_category else "younger")
# Converting to int so the mean() below yields a survival proportion
csas['survived'] = csas['survived'].astype(int)
g_categories = csas.groupby(['pclass', 'age_group', 'sex'])
result = pd.DataFrame(g_categories.survived.mean()).rename(columns={'survived': 'survived proportion'})
# reset the current index and build one unique index string per group
result = result.reset_index()
unique_index = result.pclass.astype(str) + ': ' + result.age_group.astype(str) + ' ' + result.sex.astype(str)
# Finalize the unique-index dataframe (non-inplace: no hidden mutation of a slice)
result_w_unique = result[['survived proportion']].set_index(unique_index)
print(result_w_unique)