In [ ]:
%matplotlib inline
from __future__ import print_function
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
PROJ_ROOT = os.path.join(os.pardir, os.pardir)
Don't forget all the fun debugging tools we covered while you work on these exercises:

- %debug: open the debugger post-mortem after an exception
- %pdb: automatically drop into the debugger on any uncaught exception
- import q;q.d(): set a breakpoint with the q library
- %prun: profile a statement to see where time is spent
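For example, here is a quick, illustrative way to exercise two of these in a cell (the statement being profiled is arbitrary):

In [ ]:
# drop into the debugger automatically on any uncaught exception
%pdb on
# profile an arbitrary statement to see where the time goes
%prun pd.Series(range(100000)).sum()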
You'll notice that our dataset actually has two different files, pumps_train_values.csv and pumps_train_labels.csv. We want to load both of these together in a single DataFrame for our exploratory analysis. Create a function that:

- uses the id column as the index
- parses the date_recorded column as dates
- joins the labels to the values
In [ ]:
def load_pumps_data(values_path, labels_path):
    # YOUR CODE HERE
    pass
values = os.path.join(PROJ_ROOT, "data", "raw", "pumps_train_values.csv")
labels = os.path.join(PROJ_ROOT, "data", "raw", "pumps_train_labels.csv")
df = load_pumps_data(values, labels)
assert df.shape == (59400, 40)
In [ ]:
#SOLUTION
def load_pumps_data(values_path, labels_path):
    # use id as the index and parse date_recorded as a datetime
    train = pd.read_csv(values_path, index_col='id', parse_dates=["date_recorded"])
    labels = pd.read_csv(labels_path, index_col='id')
    # join the labels onto the training values via the shared id index
    return train.join(labels)
values = os.path.join(PROJ_ROOT, "data", "raw", "pumps_train_values.csv")
labels = os.path.join(PROJ_ROOT, "data", "raw", "pumps_train_labels.csv")
df = load_pumps_data(values, labels)
assert df.shape == (59400, 40)
Now that we've loaded our data, we want to do some pre-processing before we model. From inspection of the data, we've noticed that there are some numeric values that are probably not valid, which we want to replace.
Select the relevant columns for modeling. For the purposes of this exercise, we'll select:
useful_columns = ['amount_tsh',
                  'gps_height',
                  'longitude',
                  'latitude',
                  'region',
                  'population',
                  'construction_year',
                  'extraction_type_class',
                  'management_group',
                  'quality_group',
                  'source_type',
                  'waterpoint_type',
                  'status_group']
Replace longitude and population values of 0 with the mean for that region.
zero_is_bad_value = ['longitude', 'population']
Replace the latitude where it is -2E-8 (a different bad value) with the mean for that region.
other_bad_value = ['latitude']
Replace construction_year values less than 1000 with the mean construction year.
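Before writing the cleaning code, it's worth confirming these bad values really show up. A quick, illustrative check (your counts will depend on the data):

In [ ]:
# count the suspicious values before cleaning
for column, bad_value in [("longitude", 0), ("population", 0), ("latitude", -2e-08)]:
    print(column, (df[column] == bad_value).sum())
print("construction_year < 1000:", (df.construction_year < 1000).sum())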
A skeleton for this work is below, where clean_raw_data calls replace_value_with_grouped_mean internally.
Copy and paste the skeleton into a Python file called preprocess.py in src/features/. Import and autoload the methods from that file to run the tests on your changes in this notebook.
In [ ]:
def clean_raw_data(df):
    """ Takes a dataframe and performs four steps:
        - Selects columns for modeling
        - For numeric variables, replaces 0 values with mean for that region
        - Fills invalid construction_year values with the mean construction_year
        - Converts strings to categorical variables

    :param df: A raw dataframe that has been read into pandas
    :returns: A dataframe with the preprocessing performed.
    """
    pass


def replace_value_with_grouped_mean(df, value, column, to_groupby):
    """ For a given numeric value (e.g., 0) in a particular column, take the
    mean of column (excluding value) grouped by to_groupby and return that
    column with the value replaced by that mean.

    :param df: The dataframe to operate on.
    :param value: The value in column that should be replaced.
    :param column: The column in which replacements need to be made.
    :param to_groupby: Groupby this variable and take the mean of column.
                       Replace value with the group's mean.
    :returns: The data frame with the invalid values replaced
    """
    pass
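If you get stuck, here is one way the two functions might be filled in. This is a sketch, not the course's preprocess_solution reference, and it assumes the useful_columns, zero_is_bad_value, and other_bad_value lists from above are defined alongside it:

In [ ]:
import pandas as pd

def replace_value_with_grouped_mean(df, value, column, to_groupby):
    # rows where the bad value appears
    invalid_mask = (df[column] == value)
    # group means computed over the valid rows only, so the bad value
    # does not contaminate the averages
    means_by_group = df.loc[~invalid_mask].groupby(to_groupby)[column].mean()
    # look up each invalid row's group and substitute that group's mean
    df.loc[invalid_mask, column] = (df.loc[invalid_mask, to_groupby]
                                      .map(means_by_group))
    return df

def clean_raw_data(df):
    # select the modeling columns
    df = df[useful_columns].copy()
    # replace 0s in longitude and population with the regional mean
    for column in zero_is_bad_value:
        df = replace_value_with_grouped_mean(df, 0, column, 'region')
    # latitude uses a different sentinel for "missing"
    for column in other_bad_value:
        df = replace_value_with_grouped_mean(df, -2e-08, column, 'region')
    # fill implausible construction years with the mean of the valid years
    # (excluding the invalid years here is a judgment call, not spelled
    # out in the exercise)
    invalid_years = df.construction_year < 1000
    valid_mean = df.loc[~invalid_years, 'construction_year'].mean()
    df.loc[invalid_years, 'construction_year'] = valid_mean
    # convert the remaining string columns to pandas categoricals
    for column in df.select_dtypes(include=['object']).columns:
        df[column] = df[column].astype('category')
    return df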
In [ ]:
#SOLUTION
# Load the "autoreload" extension
%load_ext autoreload
# always reload modules marked with "%aimport"
%autoreload 1
import os
import sys
# add the 'src' directory as one where we can import modules
src_dir = os.path.join(PROJ_ROOT, 'src')
sys.path.append(src_dir)
# import my method from the source code
%aimport features.preprocess_solution
from features.preprocess_solution import clean_raw_data
In [ ]:
cleaned_df = clean_raw_data(df)
# verify construction year
assert (cleaned_df.construction_year > 1000).all()
# verify filled-in values for the other numeric columns
for numeric_col in ["population", "longitude", "latitude"]:
    assert (cleaned_df[numeric_col] != 0).all()
# verify the dtypes are among the expected types
assert (cleaned_df.dtypes
        .astype(str)
        .isin(["int64", "float64", "category"])).all()
# check some actual values
assert cleaned_df.latitude.mean() == -5.970642969008563
assert cleaned_df.longitude.mean() == 35.14119354200863
assert cleaned_df.population.mean() == 277.3070009774711
Now that we've got a feature matrix, let's train a model! Add a function as defined below to the src/model/train_model.py file.
The function should use sklearn.linear_model.LogisticRegression to train a logistic regression model. For a dataframe with categorical variables, pd.get_dummies will do one-hot encoding that can be passed to sklearn.
The LogisticRegression class in sklearn handles multiclass models automatically, so there is no need to use get_dummies on status_group.
Finally, this method should return a GridSearchCV object that has been run with the following parameters for a logistic regression model:
params = {'C': [0.1, 1, 10]}
In [ ]:
def logistic(df):
    """ Trains a multinomial logistic regression model to predict the
    status of a water pump given characteristics about the pump.

    :param df: The dataframe with the features and the label.
    :returns: A trained GridSearchCV classifier
    """
    pass
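If you want a starting point, here is one way this function might look. This is a sketch under the exercise's assumptions (default GridSearchCV settings), not the reference implementation in model.train_model_solution:

In [ ]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

def logistic(df):
    # the label stays a single column; LogisticRegression handles
    # the multiclass target on its own, so no get_dummies here
    y = df.status_group
    # one-hot encode the categorical features for sklearn
    X = pd.get_dummies(df.drop('status_group', axis=1))
    # grid search over the regularization strengths from the exercise
    params = {'C': [0.1, 1, 10]}
    clf = GridSearchCV(LogisticRegression(), params)
    clf.fit(X, y)
    return clf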
In [ ]:
#SOLUTION
# import my method from the source code
%aimport model.train_model_solution
from model.train_model_solution import logistic
In [ ]:
%%time
clf = logistic(cleaned_df)
assert clf.best_score_ > 0.5
In [ ]:
# Just for fun, let's profile the whole stack and see what's slowest!
%prun logistic(clean_raw_data(load_pumps_data(values, labels)))