In [1]:
from __future__ import print_function
import os
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
PROJ_ROOT = os.path.join(os.pardir)
print(os.path.abspath(PROJ_ROOT))
Tell everyone when your notebook was run, and with which packages. This is especially useful for nbviewer, blog posts, and other media where you are not sharing the notebook as executable code.
In [2]:
!pip install watermark
In [ ]:
# once it is installed, you'll just need this in future notebooks:
%load_ext watermark
In [ ]:
%watermark -a "Peter Bull" -d -t -v -p numpy,pandas
In [ ]:
%watermark?
Continuum's conda tool provides a way to create isolated environments. In fact, you've already seen this at work if you followed the pydata setup instructions to set up your machine for this tutorial. The conda env functionality lets you create an isolated environment on your machine for each project you work on.
To create an empty environment:
conda create -n <name> python=3
Note: python=2 will create a Python 2 environment; python=3 will create a Python 3 environment.
To work in a particular virtual environment:
source activate <name>
To leave a virtual environment:
source deactivate
Note: on Windows, the commands are just activate and deactivate; there is no need to type source.
There are other Python tools for environment isolation, but none of them are perfect. If you're interested in the other options, virtualenv and pyenv both provide environment isolation. There are sometimes compatibility issues between the Anaconda Python distribution and these packages, so if you've got Anaconda on your machine you can use conda env to create and manage environments.
#lifehack: create a new environment for every project you work on.
The pip requirements.txt file
It's a convention in the Python ecosystem to track a project's dependencies in a file called requirements.txt. We recommend using this file to keep track of your MRE, the "minimum reproducible environment".
#lifehack: never again run pip install <package>. Instead, update requirements.txt and run pip install -r requirements.txt.
In [ ]:
# what does requirements.txt look like?
print(open(os.path.join(PROJ_ROOT, 'requirements.txt')).read())
The format for a line in the requirements file is:
Syntax | Result
---|---
package_name | whatever the latest version on PyPI is
package_name==X.X.X | an exact match of version X.X.X
package_name>=X.X.X | at least version X.X.X
Now, contributors can create a new virtual environment (using conda or any other tool) and install your dependencies just by running:
pip install -r requirements.txt
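As a concrete example, a small requirements.txt might contain lines like the following, one dependency per line. The packages and version pins here are purely illustrative, not the actual contents of this project's file:

numpy==1.11.1
pandas>=0.18.0
seaborn
watermark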
In [ ]:
## Try adding the parameter index_col=0 to use the first column as the index
## (note: index=0 is not a valid read_csv keyword -- see pd.read_csv? below)
pump_data_path = os.path.join(PROJ_ROOT,
                              "data",
                              "raw",
                              "pumps_train_values.csv")
df = pd.read_csv(pump_data_path, index_col=0)
df.head(3)
In [ ]:
pd.read_csv?
In [ ]:
df.describe()
In [ ]:
## Paste for 'construction_year' and plot
## Paste for 'gps_height' and plot
plot_data = df['amount_tsh']
sns.kdeplot(plot_data, bw=100)
plt.show()
In [ ]:
def kde_plot(dataframe, variable, upper=None, lower=None, bw=0.1):
    """ Plots a density plot for a variable with optional upper and
        lower bounds on the data (inclusive).
    """
    plot_data = dataframe[variable]

    if upper is not None:
        plot_data = plot_data[plot_data <= upper]
    if lower is not None:
        plot_data = plot_data[plot_data >= lower]

    sns.kdeplot(plot_data, bw=bw)
    plt.savefig(os.path.join(PROJ_ROOT, 'reports', 'figures', '{}.png'.format(variable)))
    plt.show()
In [ ]:
kde_plot(df, 'amount_tsh', bw=100, lower=0)
kde_plot(df, 'construction_year', bw=1, lower=1000, upper=2016)
kde_plot(df, 'gps_height', bw=0.1)
So, we've got some invalid data in this dataset. For example, water pumps installed in the year 0. We'll want to have a function to load and clean this data, since we will probably be using it in multiple notebooks.
Here's a first pass at a function that will do that for us. Now, we've got the function implemented in the notebook, but let's bring it to a standalone file.
We'll copy these functions into:
src/features/build_features.py
In [ ]:
def awesome_function(s):
    from IPython.display import display, HTML

    css = """
    .blink {
        animation-duration: 1s;
        animation-name: blink;
        animation-iteration-count: infinite;
        animation-timing-function: steps(2, start);
    }
    @keyframes blink {
        80% {
            visibility: hidden;
        }
    }"""

    to_show = HTML(
        '<style>{}</style>'.format(css) +
        '<p class="blink"> {} IS AWESOME!!!!! </p>'.format(s)
    )
    display(to_show)
def remove_invalid_data(path):
    """ Takes a path to a water pumps csv, loads it in pandas, removes
        invalid rows and unneeded columns, and returns the dataframe.
    """
    df = pd.read_csv(path, index_col=0)

    # preselected columns
    useful_columns = ['amount_tsh',
                      'gps_height',
                      'longitude',
                      'latitude',
                      'region',
                      'population',
                      'construction_year',
                      'extraction_type_class',
                      'status_group',
                      'management_group',
                      'quality_group',
                      'source_type',
                      'waterpoint_type']
    df = df[useful_columns]

    # replace invalid values with NaN
    invalid_values = {
        'amount_tsh': {0: np.nan},
        'longitude': {0: np.nan},
        'installer': {0: np.nan},
        'construction_year': {0: np.nan},
    }
    df.replace(invalid_values, inplace=True)

    # drop any rows in the dataset that have NaNs
    df.dropna(how="any", inplace=True)

    # create categorical columns
    for c in df.columns:
        if df[c].dtype == 'object':
            df[c] = df[c].astype('category')

    # status_group is the label, so drop it from the features
    df = df.drop('status_group', axis=1)

    return pd.get_dummies(df)
If I'm just loading local Python files that I expect to use in this project, I often just add the src folder to the Python path using sys.path.append. This tells Python to look in that folder for modules that we can import. This works well for local code and notebooks.
In [ ]:
# add local python functions
import sys
# add the 'src' directory as one where we can import modules
src_dir = os.path.join(PROJ_ROOT, "src")
sys.path.append(src_dir)
# import my method from the source code
from features.build_features import remove_invalid_data
from features.build_features import awesome_function
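If you plan to edit build_features.py while the notebook is running (the next cell asks you to do exactly that), IPython's autoreload extension will re-import your changed code before each cell runs, so you don't have to restart the kernel:
In [ ]:
# reload imported modules automatically before executing each cell
%load_ext autoreload
%autoreload 2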
In [ ]:
# edit function in file!
awesome_function("ODSC")
In [ ]:
df = remove_invalid_data(pump_data_path)
In [ ]:
%debug
As mentioned in the slides, using sys.path.append is not the best way to distribute code that you want to run on other machines. For that, create a real Python package that can be separately developed, maintained, and deployed.
We can build a Python package to solve that! In fact, there is a cookiecutter to create Python packages. Once we create this package, we can install it in "editable" mode, which means that as we change the code, the changes will get picked up wherever the package is used. The process looks like:
cookiecutter https://github.com/wdm0006/cookiecutter-pipproject
cd package_name
pip install -e .
Now we can have a separate repository for this code and it can be used across projects without having to maintain code in multiple places.
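The cookiecutter writes the packaging boilerplate for you; at its core is a setup.py along these lines (the package name and dependency list below are hypothetical, not what the cookiecutter actually generates):

from setuptools import setup, find_packages

setup(
    name='pumps_features',                 # hypothetical package name
    version='0.1.0',
    packages=find_packages(),
    install_requires=['numpy', 'pandas'],  # illustrative dependencies
)

With a file like that in place, pip install -e . links the installed package back to the source checkout, which is why edits are picked up without reinstalling.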
In [ ]:
%%time

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

df = remove_invalid_data(pump_data_path)

labels = (pd.read_csv(os.path.join(PROJ_ROOT, 'data', 'raw', 'pumps_train_labels.csv'),
                      index_col=0)
            .loc[df.index])

pl = Pipeline([
    ('interactions', PolynomialFeatures(degree=2)),
    ('clf', LogisticRegression())
])

pl.fit(df, labels)
In [ ]:
pl.predict(df)
In [ ]:
import itertools
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.savefig(os.path.join(PROJ_ROOT, "reports", "figures", "confusion_matrix.png"))
    plt.show()
In [ ]:
# use the fitted pipeline (pl) to get predictions
cm = confusion_matrix(labels, pl.predict(df),
                      labels=['functional', 'non functional', 'functional needs repair'])

plot_confusion_matrix(cm,
                      ['functional', 'non functional', 'functional needs repair'])
In [ ]:
!tree {PROJ_ROOT}