In [1]:
%matplotlib inline
from __future__ import print_function
from __future__ import division
import os
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from matplotlib import pyplot as plt
from matplotlib.colors import rgb2hex
import seaborn as sns
import statsmodels.api as sm
# let's not pollute this blog post with warnings
from warnings import filterwarnings
filterwarnings('ignore')
In [2]:
observations = pd.read_csv(os.path.join('data', 'training_set_observations.csv'), index_col=0)
observations.head()
Out[2]:
In [3]:
observations.columns
Out[3]:
In [4]:
observations.describe()
Out[4]:
In [5]:
corr = observations.select_dtypes(include=['float64', 'int64']).iloc[:, 1:].corr()
plt.figure(figsize=(12, 12))
sns.heatmap(corr, vmax=1, square=True)
Out[5]:
In [6]:
print(
    "We have {} penguin observations from {} to {} at {} unique sites in the Antarctic!"
    .format(observations.shape[0],
            observations.season_starting.min(),
            observations.season_starting.max(),
            observations.site_id.nunique())
)
In [7]:
# How many observations do we have for each species?
observations.common_name.value_counts()
Out[7]:
In [8]:
# How many different sites do we see each species at?
(observations.groupby("common_name")
             .site_id
             .nunique())
Out[8]:
In [9]:
# How many count types do we have for each species?
(observations.groupby("common_name")
             .count_type
             .value_counts())
Out[9]:
In [10]:
nest_counts = pd.read_csv(
    os.path.join('data', 'training_set_nest_counts.csv'),
    index_col=[0, 1]
)
# Let's look at the first 10 rows, and the last 10 columns
nest_counts.iloc[:10, -10:]
Out[10]:
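Before deciding how to handle the gaps, it is worth quantifying how sparse these series actually are. Here is a minimal sketch using plain pandas calls on the frame we just loaded (not part of the original walkthrough):
In [ ]:
# Fraction of missing values per season (column) and per series (row).
missing_by_year = nest_counts.isnull().mean(axis=0)
missing_by_series = nest_counts.isnull().mean(axis=1)
print(missing_by_year.tail())
print(missing_by_series.describe())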
In [11]:
# get a sort order for the sites with the most observations
sorted_idx = (pd.notnull(nest_counts)
                .sum(axis=1)
                .sort_values(ascending=False)
                .index)
# get the top 25 most common sites and divide by the per-series mean
to_plot = nest_counts.loc[sorted_idx].head(25)
to_plot = to_plot.divide(to_plot.mean(axis=1), axis=0)
# plot the data
plt.gca().matshow(to_plot, cmap='viridis')
plt.show()
In [12]:
def preprocess_timeseries(timeseries, first_year, fillna_value=0):
    """ Takes one of the timeseries dataframes, removes
        columns before `first_year`, and fills NaN values
        with the preceding value. Then backfills any
        remaining NaNs.

        As a courtesy, also turns the year column names into
        integers for easy comparisons.
    """
    # convert the year column names to integers
    timeseries.columns = timeseries.columns.astype(int)
    # subset to just the data from first_year onwards
    timeseries = timeseries.loc[:, timeseries.columns >= first_year]
    # Forward fill count values. This is a strong assumption.
    timeseries.fillna(method="ffill", axis=1, inplace=True)
    timeseries.fillna(method="bfill", axis=1, inplace=True)
    # For series with no observations at all, fill with fillna_value
    timeseries.fillna(fillna_value, inplace=True)
    return timeseries

nest_counts = preprocess_timeseries(nest_counts,
                                    1980,
                                    fillna_value=0.0)
nest_counts.head()
Out[12]:
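Forward- and back-filling is deliberately simple, but it flattens trends across long gaps. If you wanted something slightly less blunt, linear interpolation along the year axis is a drop-in alternative; the variant below is only a sketch of that idea and is not used in the rest of this post.
In [ ]:
def preprocess_timeseries_interp(timeseries, first_year, fillna_value=0):
    """ Variant of preprocess_timeseries that linearly interpolates
        between observed seasons instead of carrying values forward.
    """
    timeseries = timeseries.copy()
    timeseries.columns = timeseries.columns.astype(int)
    timeseries = timeseries.loc[:, timeseries.columns >= first_year]
    # interpolate along the year axis, filling the edges in both directions
    timeseries = timeseries.interpolate(axis=1, limit_direction='both')
    # series with no observations at all still need a constant fill
    return timeseries.fillna(fillna_value)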
In [13]:
# get the top 25 most common sites and divide by the per-series mean
to_plot = nest_counts.loc[sorted_idx].head(25)
to_plot = to_plot.divide(to_plot.mean(axis=1), axis=0)
plt.gca().matshow(to_plot, cmap='viridis')
plt.show()
In [14]:
e_n_values = pd.read_csv(
    os.path.join('data', 'training_set_e_n.csv'),
    index_col=[0, 1]
)
# Process error data to match our nest_counts data
e_n_values = preprocess_timeseries(e_n_values, 1980, fillna_value=0.05)
e_n_values.head()
Out[14]:
In [15]:
def amape(y_true, y_pred, accuracies):
    """ Adjusted MAPE
    """
    not_nan_mask = ~np.isnan(y_true)
    # calculate the absolute error
    abs_error = np.abs(y_true[not_nan_mask] - y_pred[not_nan_mask])
    # calculate the percent error (replacing 0 with 1
    # in the denominator to avoid divide-by-zero errors)
    pct_error = abs_error / np.maximum(1, y_true[not_nan_mask])
    # adjust the error by the count accuracies
    adj_error = pct_error / accuracies[not_nan_mask]
    # return the mean adjusted error
    return np.mean(adj_error)
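To get a feel for the metric, here is a tiny example with made-up numbers (not competition data): an absolute error of 10 on a true count of 100 is a 10% error, which the accuracy term then inflates when the underlying count is itself uncertain.
In [ ]:
# (10 / 100) / 0.9 = 0.111..., (0 / 50) / 0.9 = 0.0, so amape is roughly 0.056
y_true = np.array([100.0, 50.0])
y_pred = np.array([110.0, 50.0])
accuracies = np.array([0.9, 0.9])
amape(y_true, y_pred, accuracies)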
In [16]:
# Let's confirm the best possible score is 0!
amape(nest_counts.values,
      nest_counts.values,
      e_n_values.values)
Out[16]:
In [17]:
from sklearn.svm import SVR

def train_model_per_row(ts, acc, split_year=2010):
    # Split into train/test to tune our lag parameter
    train = ts.loc[ts.index < split_year]
    test = ts.loc[ts.index >= split_year]
    test_acc = acc.loc[acc.index >= split_year]
    # random state, only needed by the commented-out AdaBoost model below
    rng = np.random.RandomState(1)
    # Store the best lag parameter
    best_mape = np.inf
    best_lag = None
    # Try models fit on everything from the most recent
    # 2 points up through all of the training points
    for lag in range(2, train.shape[0]):
        # fit the model
        #temp_model = LinearRegression()
        #temp_model = DecisionTreeRegressor(max_depth=4)
        #temp_model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
        #                               n_estimators=300, random_state=rng)
        temp_model = SVR(kernel='poly', C=1e3, degree=2)
        temp_model.fit(
            train.index[-lag:].values.reshape(-1, 1),
            train.iloc[-lag:]
        )
        # make our predictions on the test set
        preds = temp_model.predict(
            test.index.values.reshape(-1, 1)
        )
        # calculate the score using the custom metric
        mape = amape(test.values,
                     preds,
                     test_acc.values)
        # if it's the best score yet, hold on to the lag
        if mape < best_mape:
            best_mape = mape
            best_lag = lag
    # return a model re-trained on the best lag over the full series
    #final_model = LinearRegression()
    final_model = SVR(kernel='poly', C=1e3, degree=2)
    final_model.fit(
        ts.index[-best_lag:].values.reshape(-1, 1),
        ts.iloc[-best_lag:]
    )
    return final_model, best_mape, best_lag
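Before looping over every series, it can be worth smoke-testing the function on a single row. The key below is just whichever (site_id, common_name) pair happens to come first in the index; this is a hypothetical spot check, not part of the original benchmark.
In [ ]:
# Fit one series end-to-end as a quick sanity check.
first_key = nest_counts.index[0]
single_model, single_mape, single_lag = train_model_per_row(
    nest_counts.loc[first_key],
    e_n_values.loc[first_key]
)
print("aMAPE: {:.3f}, lag: {}".format(single_mape, single_lag))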
In [19]:
models = {}
avg_best_mape = 0.0
avg_best_lag = 0.0
n_series = 0

for i, row in tqdm_notebook(nest_counts.iterrows(),
                            total=nest_counts.shape[0]):
    acc = e_n_values.loc[i]
    models[i], best_mape, best_lag = train_model_per_row(row, acc)
    avg_best_mape += best_mape
    avg_best_lag += best_lag
    n_series += 1

avg_best_mape /= n_series
avg_best_lag /= n_series
print("Average best aMAPE: {0}".format(avg_best_mape))
print("Average best lag: {0}".format(avg_best_lag))
In [20]:
submission_format = pd.read_csv(
    os.path.join('data', 'submission_format.csv'),
    index_col=[0, 1]
)
print(submission_format.shape)
submission_format.head()
Out[20]:
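The submission index is (site_id, common_name) pairs, so it is worth confirming that every row we need to predict actually has a trained model. This check is an assumption about how the two indexes line up, not something the original walkthrough does.
In [ ]:
# Hypothetical sanity check: every (site_id, common_name) pair in the
# submission should have a model trained above.
missing_keys = [key for key in submission_format.index if key not in models]
print("{} submission rows without a trained model".format(len(missing_keys)))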
In [21]:
preds = []
# The submission's column headers are year strings, so cast them to
# integers to match the years the models were trained on.
prediction_years = submission_format.columns.values.astype(int).reshape(-1, 1)

# For every row in the submission file
for i, row in tqdm_notebook(submission_format.iterrows(),
                            total=submission_format.shape[0]):
    # get the model for this site + common_name
    model = models[i]
    # make predictions using the model
    row_predictions = model.predict(prediction_years)
    # keep our predictions, rounded to the nearest whole number
    preds.append(np.round(row_predictions))

# Create a dataframe that we can write out to a CSV
prediction_df = pd.DataFrame(preds,
                             index=submission_format.index,
                             columns=submission_format.columns)
prediction_df.head()
Out[21]:
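One small post-processing step worth considering (not part of the original submission): a polynomial SVR can extrapolate below zero, and negative nest counts do not make sense, so clipping at zero before writing the file is a cheap safeguard.
In [ ]:
# Clip any negative extrapolations to zero before writing out.
prediction_df = prediction_df.clip(lower=0)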
In [22]:
prediction_df.to_csv('predictions_SVM_poly.csv')