In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
matplotlib.style.use('ggplot')
In [2]:
%%javascript
$.getScript('misc/kmahelona_ipython_notebook_toc.js')
Task: Load the following file as a DataFrame.
In [3]:
fn = r"data/drinks.csv"
In [4]:
# Answer:
df = pd.read_csv(fn, sep=",")
Task: See the first 10 entries
In [5]:
# Answer:
df.head(10)
Out[5]:
Task: Which country has the highest alcohol consumption (total litres of pure alcohol)?
In [6]:
# Answer
df.sort_values("total_litres_of_pure_alcohol", ascending=False).head()
Out[6]:
Task: Which continent drinks the most beer on average?
In [7]:
# Answer
# df.groupby("continent").beer_servings.mean()
# or
df.groupby("continent")["beer_servings"].mean()
Out[7]:
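To answer the question directly, idxmax can pick the continent with the largest average from that result; a small follow-up sketch, not part of the original answer:
# continent with the highest average beer consumption
df.groupby("continent")["beer_servings"].mean().idxmax()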
Task: List all unique continents.
In [8]:
# Answer:
df["continent"].unique()
# or
# set(df["continent"].tolist())
Out[8]:
Task: Which countries have missing values in the continent column?
In [9]:
# Answer:
df[df["continent"].isnull()]
Out[9]:
Task: Fill the missing continent values with a name of your choice.
In [10]:
# Answer:
df.loc[df["continent"].isnull(), "continent"] = "NAs"
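A side note on why the continent column has missing values at all: in common copies of this dataset the continent for North American countries is stored as the literal string "NA", which read_csv treats as a missing value by default. Assuming that is the case for your file (an assumption, not verified here), a minimal sketch that keeps "NA" as an ordinary string:
# hypothetical re-read: only empty fields count as missing, the string "NA" is kept
df_raw = pd.read_csv(fn, keep_default_na=False, na_values=[""])
df_raw["continent"].unique()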
Task: For each continent, print the summary statistics (using df.describe()) for wine consumption.
In [11]:
# Answer:
# df.groupby('continent').wine_servings.describe()
# or
df.groupby('continent')["wine_servings"].describe()
Out[11]:
Task: Print the median alcohol consumption per continent for every column.
In [12]:
# Answer:
df.groupby('continent').median(numeric_only=True)  # numeric_only skips the non-numeric "country" column (required in newer pandas)
Out[12]:
Task: Print the mean, min and max values for spirit consumption.
In [13]:
# Answer:
df.groupby('continent').spirit_servings.agg(['mean', 'min', 'max'])
Out[13]:
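If you prefer self-documenting column names in the result, named aggregation (available since pandas 0.25) is an alternative; a sketch:
# same statistics, with explicit names for the output columns
df.groupby('continent').agg(
    spirit_mean=('spirit_servings', 'mean'),
    spirit_min=('spirit_servings', 'min'),
    spirit_max=('spirit_servings', 'max'),
)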
Task: Group by continent and create a Boxplot. (Hint: using e.g. figsize=(12, 9), rot=90 might help with legibility.)
In [14]:
# Answer:
df.groupby("continent").boxplot(figsize=(12, 9), rot=90)
Out[14]:
Task: Import the datasets cars1.csv and cars2.csv. Assign each to a variable called cars1 and cars2, respectively.
In [15]:
# Answer:
cars1 = pd.read_csv("data/cars1.csv")
cars2 = pd.read_csv("data/cars2.csv")
Task: It seems our first dataset has some unnamed blank columns, fix cars1.
In [16]:
# Answer:
cars1 = cars1.loc[:, "mpg":"car"]
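An equivalent fix, assuming the blank columns were auto-named "Unnamed: N" by read_csv, is to drop them by name; a sketch:
# keep only columns whose name does not start with "Unnamed"
cars1 = cars1.loc[:, ~cars1.columns.str.startswith("Unnamed")]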
Task: Join cars1 and cars2 into a single DataFrame called cars
In [17]:
# Answer:
cars = pd.concat([cars1, cars2], axis=0)
# or, in pandas versions before 2.0 (DataFrame.append has since been removed):
# cars_ = cars1.append(cars2)
# cars_.equals(cars)  # check that both approaches give the same result
Out[17]:
Task: Create a function that returns the first word of the string in the "car" column, i.e. the manufacturer name. Use the "apply" method to create a new column in the DataFrame.
In [18]:
# Answer
def grep_first_word(string_):
    return string_.split()[0]

cars["vendor"] = cars["car"].apply(grep_first_word)
# or
cars["vendor"] = cars["car"].apply(lambda s: s.split()[0])
Consider the following DataFrames for the next exercises
In [19]:
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3'],
                    'C': ['C0', 'C1', 'C2', 'C3'],
                    'D': ['D0', 'D1', 'D2', 'D3']},
                   index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                    'B': ['B4', 'B5', 'B6', 'B7'],
                    'C': ['C4', 'C5', 'C6', 'C7'],
                    'D': ['D4', 'D5', 'D6', 'D7']},
                   index=[4, 5, 6, 7])
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                    'B': ['B8', 'B9', 'B10', 'B11'],
                    'C': ['C8', 'C9', 'C10', 'C11'],
                    'D': ['D8', 'D9', 'D10', 'D11']},
                   index=[8, 9, 10, 11])
Task: Concatenate the three DataFrames along the rows.
In [20]:
# Answer:
pd.concat([df1, df2, df3], axis=0)
# or
# df1.append(df2).append(df3)
Out[20]:
Task: How many missing values (NaNs) are produced if you concatenate along the other axis (appending the columns)?
In [21]:
# Answer
print(sum(pd.concat([df1, df2, df3], axis=1).isnull().sum()))
# or
shape1 = df1.shape
shape2 = df2.shape
shape3 = df3.shape
num_true_values = shape1[0]*shape1[1] + shape2[0]*shape2[1] + shape3[0]*shape3[1]
total_num_values = (shape1[0] + shape2[0] + shape3[0]) * (shape1[1] + shape2[1] + shape3[1])
print(total_num_values - num_true_values)
assert sum(pd.concat([df1, df2, df3], axis=1).isnull().sum()) == total_num_values - num_true_values
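As a quick sanity check of that number: each frame is 4 rows by 4 columns and the three row indices do not overlap, so concatenating along the columns produces a 12 x 12 frame with 144 cells; only 3 x 16 = 48 of them hold real values, leaving 144 - 48 = 96 NaNs.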
Let's consider another data set to do some more Merge, Join & Concatenate exercises.
In [22]:
raw_data_1 = {
    'subject_id': ['1', '2', '3', '4', '5'],
    'first_name': ['Alex', 'Amy', 'Allen', 'Alice', 'Ayoung'],
    'last_name': ['Anderson', 'Ackerman', 'Ali', 'Aoni', 'Atiches']}
raw_data_2 = {
    'subject_id': ['4', '5', '6', '7', '8', '9', '10'],
    'first_name': ['Alice', 'Ayoung', 'Bran', 'Bryce', 'Betty', 'Jane', np.nan],
    'last_name': ['Aoni', 'Atiches', 'Balwner', 'Brice', 'Btisan', np.nan, 'Doe']}
raw_data_3 = {
    'subject_id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],
    'test_id': [51, 15, 15, 61, 16, 14, 15, 1, 61, 16]}
data1 = pd.DataFrame(raw_data_1, columns=['subject_id', 'first_name', 'last_name'])
data2 = pd.DataFrame(raw_data_2, columns=['subject_id', 'first_name', 'last_name'])
data3 = pd.DataFrame(raw_data_3, columns=['subject_id', 'test_id'])
Task: Join the two dataframes, data1 and data2, along rows and assign the result to all_data. Make sure that the row index is unique.
In [23]:
# Answer:
all_data = pd.concat([data1, data2], ignore_index=True)
assert len(set(all_data.index)) == all_data.shape[0]
# otherwise you might get unexpected behaviour, e.g. the label 0 selecting two rows:
all_data_dup = pd.concat([data1, data2], ignore_index=False)
all_data_dup.loc[0, :]
Out[23]:
Task: Join the two dataframes, data1 and data2, along columns and assign the result to all_data_col.
In [24]:
# Answer:
all_data_col = pd.concat([data1, data2], axis = 1)
all_data_col
Out[24]:
Task: Merge all_data and data3 along the subject_id value.
In [25]:
# Answer:
pd.merge(all_data, data3)
Out[25]:
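Written out explicitly, this is an inner join on the one column the two frames share; the same result, just naming the defaults that merge picks on its own:
# join on the common "subject_id" column, keeping only matching rows
pd.merge(all_data, data3, on="subject_id", how="inner")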
Task: How many test_ids have missing values in the first or last name column?
In [26]:
# Answer:
dfm = pd.merge(all_data, data3, how="outer")
cond = dfm["test_id"].notnull() & (dfm["first_name"].isnull() | dfm["last_name"].isnull())
sum(cond)
Out[26]:
Task: Merge only the data that has the same 'subject_id' in both data1 and data2.
In [27]:
# Answer:
pd.merge(data1, data2, how='inner')
Out[27]:
The transform method returns an object that is indexed the same (same size) as the one being grouped.
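A minimal illustration of that index alignment, using a small throwaway frame (the names tmp, g and v are made up for this example):
tmp = pd.DataFrame({"g": ["x", "x", "y"], "v": [1, 2, 3]})
tmp.groupby("g")["v"].agg("mean")        # one row per group (length 2)
tmp.groupby("g")["v"].transform("mean")  # broadcast back to every original row (length 3)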
Task: Given a DataFrame with a column of group IDs, 'groups', and a column of corresponding integer values, 'vals', replace any negative values in 'vals' with the group mean.
In [28]:
df = pd.DataFrame([[1, 1], [1, -1], [2, 1], [2, 2]], columns=["groups", "vals"])
df
Out[28]:
In [29]:
# Answer
def replace_within_group(group):
    mask = group < 0
    # Select those values where it is < 0, and replace
    # them with the mean of the values which are not < 0.
    group[mask] = group[~mask].mean()  # "~" is the "invert" or "complement" operation
    return group
df.groupby(['groups'])['vals'].transform(replace_within_group)
Out[29]:
Task: Use groupby in conjunction with transform across multiple columns. We want to group by one or more columns and apply a function on these groups, broadcasting the result back to the original rows.
1. Calculate the sum of `a` and `b` and assign it to a column named `e`.
2. Group by `c` and `d`, calculate the sum of `e` within each group, and assign the result to a new column `f` (use transform).
In [30]:
df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
                   'b': [1, 2, 3, 4, 5, 6],
                   'c': ['q', 'q', 'q', 'q', 'w', 'w'],
                   'd': ['z', 'z', 'z', 'o', 'o', 'o']})
In [31]:
# Answer:
df['e'] = df['a'] + df['b']
df['f'] = (df.groupby(['c', 'd'])['e'].transform('sum'))
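For comparison, the plain aggregation collapses each group to a single row instead of broadcasting the sums back to the original rows; a small sketch:
# one summed value per (c, d) group, not aligned to the original index
df.groupby(['c', 'd'])['e'].sum()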
Task: Normalize (standardize) the data by calculating the z-score. Group the data by year and calculate the z-score per group: z = (value - mean) / standard_deviation.
In [32]:
index = pd.date_range('10/1/1999', periods=1100)
ser = pd.Series(np.random.normal(0.5, 2, 1100), index=index)
ser = ser.rolling(window=100, min_periods=100).mean().dropna()
In [33]:
# Answer:
key = lambda x: x.year
zscore = lambda x: (x - x.mean()) / x.std()
transformed = ser.groupby(key).transform(zscore)
Task: We would expect the result to now have mean 0 and standard deviation 1 within each group, which we can easily check. Calculate the mean and standard deviation within each group.
In [34]:
# Answer:
grouped = transformed.groupby(key)
grouped.mean()
grouped.std()
Out[34]:
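Only the last expression of a notebook cell is displayed, so grouped.mean() above is evaluated but never shown. To see both statistics at once, one option is a combined aggregation; a sketch:
# mean should be close to 0 and std close to 1 within each year
grouped.agg(['mean', 'std'])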
Task: Visually compare the original and transformed data sets.
In [35]:
# Answer:
compare = pd.DataFrame({'Original': ser, 'Transformed': transformed}, columns=["Transformed", "Original"])
compare.plot(figsize=(12,9), color=["blue", "red"])
Out[35]:
Task: Let's reshape this small example DataFrame of "extended" ICD codes. Each person has different code associations; only positive associations are listed. Transform (reshape) the DataFrame to a wide format (one column per code) that lists positive and negative (missing) associations as Booleans.
In [36]:
df = pd.DataFrame({"Person": ["a", "a", "a", "b", "c", "c"], "Code": ["D99", "E32", "A41", "D99", "D99", "A41"]}, columns=["Person", "Code"])
df
Out[36]:
In [37]:
# Answer:
df["Values"] = pd.Series([True]*df.shape[0])
dfp = df.pivot(index="Person", columns="Code", values="Values")
dfp = dfp.fillna(value=False)
dfp = dfp.reset_index()
dfp
Out[37]:
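An alternative sketch that reaches the same wide Boolean layout with pd.crosstab, counting the (Person, Code) occurrences and converting the counts to Booleans (dfp_alt is just an illustration name):
dfp_alt = pd.crosstab(df["Person"], df["Code"]).astype(bool).reset_index()
dfp_alt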
Task: In the data/microbiome subdirectory, there are 9 spreadsheets of microbiome data that was acquired from high-throughput RNA sequencing procedures, along with a 10th file that describes the content of each. Write code that imports each of the data spreadsheets and combines them into a single DataFrame, adding the identifying information from the metadata spreadsheet as columns in the combined DataFrame.
In [38]:
# Answer
metadata = pd.read_excel(r"data/microbiome/metadata.xls", sheet_name='Sheet1')
chunks = []
for i in range(9):
    df_temp = pd.read_excel(r"data/microbiome/MID{0}.xls".format(i+1), 'Sheet 1',
                            header=None, names=['Taxon', 'Count'])
    df_temp = df_temp.set_index("Taxon")
    df_temp.columns = ['Count']
    df_temp.index.name = 'Taxon'
    # add the identifying information from the metadata sheet as constant columns
    for colname in metadata.columns:
        df_temp[colname] = metadata.loc[i, colname]
    chunks.append(df_temp)
df = pd.concat(chunks)
df = df.reset_index()
df.head()
Out[38]:
Load the dataset in titanic.xls. It contains data on all the passengers that travelled on the Titanic.
Task: Women and children first?
1. Use the groupby method to calculate the proportion of passengers that survived by sex.
2. Calculate the same proportion, but by class and sex.
3. Create age categories: children (up to 13), adolescents (14-20), adults (21-64) and seniors (65+), and calculate survival proportions by age category, class and sex.
In [39]:
# Answers
titanic = pd.read_excel("data/titanic.xls", "titanic")
# 1.
# explicit version
# titanic.groupby('sex')['survived'].agg("mean")
# or short version
titanic.groupby('sex')['survived'].mean()
# 2.
titanic.groupby(['pclass','sex'])['survived'].mean()
# 3.
titanic['agecat'] = pd.cut(titanic.age, [0, 13, 20, 64, 100], labels=['child', 'adolescent', 'adult', 'senior'])
titanic.groupby(['agecat', 'pclass','sex'])['survived'].mean()
titanic.groupby(['agecat', 'pclass','sex'])['survived'].agg(["mean", "count"])
Out[39]:
Let's plot the number of survivors grouped by sex and passenger class.
In [40]:
titanic.groupby(['sex','pclass'])["survived"].sum().plot(kind='barh')
Out[40]:
Task: Let's also look at the deaths (and not only at the survivors) within the groups and create a stacked Barplot of survivors vs. deaths grouped by sex and passenger class (as before).
In [41]:
# Answer
# 1.
titanic.survived = titanic.survived.astype(bool)
# 2.
death_counts = pd.crosstab([titanic.pclass, titanic.sex], titanic.survived)
# 3.
death_counts.plot(kind='bar', stacked=True)
Out[41]:
Task: Another way of comparing the groups is to look at the survival rate, adjusting for the number of people in each group. Create a stacked, horizontal Barplot of the adjusted death counts.
In [42]:
# Answer
# 1.
sum_death_counts = death_counts.sum(axis=1).astype(float)
# 2.
adjusted_death_counts = death_counts.div(sum_death_counts, axis=0)
# 3.
adjusted_death_counts.plot(kind='barh', stacked=True)
# or in short:
# death_counts.div(death_counts.sum(1).astype(float), axis=0).plot(kind='barh', stacked=True)
Out[42]: