In [1]:
%matplotlib inline

from __future__ import print_function
from __future__ import division

import os

import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

from matplotlib import pyplot as plt
from matplotlib.colors import rgb2hex
import seaborn as sns

import statsmodels.api as sm

# let's not pollute this blog post with warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
observations = pd.read_csv(os.path.join('data', 'training_set_observations.csv'), index_col=0)
observations.head()


Out[2]:
site_name site_id ccamlr_region longitude_epsg_4326 latitude_epsg_4326 common_name day month year season_starting penguin_count accuracy count_type vantage e_n
0 Acuna Island ACUN 48.2 -44.637 -60.7612 chinstrap penguin 28.0 12.0 1983 1983 4000.0 4.0 nests ground 0.50
1 Acuna Island ACUN 48.2 -44.637 -60.7612 adelie penguin NaN NaN 1993 1993 2008.0 1.0 nests ground 0.05
2 Acuna Island ACUN 48.2 -44.637 -60.7612 adelie penguin NaN NaN 1994 1994 1920.0 1.0 nests NaN 0.05
3 Acuna Island ACUN 48.2 -44.637 -60.7612 adelie penguin NaN NaN 2004 2004 1880.0 1.0 nests ground 0.05
4 Acuna Island ACUN 48.2 -44.637 -60.7612 adelie penguin 25.0 2.0 2011 2010 3079.0 5.0 nests vhr 0.90

In [3]:
observations.columns


Out[3]:
Index(['site_name', 'site_id', 'ccamlr_region', 'longitude_epsg_4326',
       'latitude_epsg_4326', 'common_name', 'day', 'month', 'year',
       'season_starting', 'penguin_count', 'accuracy', 'count_type', 'vantage',
       'e_n'],
      dtype='object')

In [4]:
observations.describe()


Out[4]:
longitude_epsg_4326 latitude_epsg_4326 day month year season_starting penguin_count accuracy e_n
count 2952.000000 2952.000000 1797.000000 2226.000000 2952.000000 2952.000000 2935.000000 2935.000000 2935.000000
mean -11.722006 -65.935391 15.593767 6.943845 1997.558604 1997.206978 8569.280409 1.826235 0.167291
std 85.926830 4.380338 8.818179 5.244836 12.724139 12.671414 29579.280288 1.258289 0.243676
min -140.326600 -77.578012 1.000000 1.000000 1895.000000 1895.000000 0.000000 1.000000 0.050000
25% -62.866700 -67.450250 8.000000 1.000000 1987.000000 1986.000000 272.000000 1.000000 0.050000
50% -58.933000 -64.794600 15.000000 11.000000 1999.000000 1999.000000 1098.000000 1.000000 0.050000
75% 39.425200 -62.596000 24.000000 12.000000 2009.000000 2009.000000 3612.000000 2.000000 0.100000
max 171.169200 -60.533300 31.000000 12.000000 2014.000000 2013.000000 428516.000000 5.000000 0.900000

In [5]:
# correlations between the numeric columns (skipping the first numeric column, longitude)
corr = observations.select_dtypes(include=['float64', 'int64']).iloc[:, 1:].corr()
plt.figure(figsize=(12, 12))
sns.heatmap(corr, vmax=1, square=True)


Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x170c099f2b0>

In [6]:
print(
    "We have {} penguin observations from {} to {} at {} unique sites in the Antarctic!" \
    .format(observations.shape[0],
            observations.season_starting.min(),
            observations.season_starting.max(),
            observations.site_id.nunique())
)


We have 2952 penguin observations from 1895 to 2013 at 619 unique sites in the Antarctic!

In [7]:
# How many observations do we have for each species?
observations.common_name.value_counts()


Out[7]:
adelie penguin       1387
gentoo penguin        791
chinstrap penguin     774
Name: common_name, dtype: int64

In [8]:
# How many different sites do we see each species at?
(observations.groupby("common_name")
             .site_id
             .nunique())


Out[8]:
common_name
adelie penguin       281
chinstrap penguin    340
gentoo penguin       105
Name: site_id, dtype: int64

In [9]:
# How many count types do we have for each species?
(observations.groupby("common_name")
             .count_type
             .value_counts())


Out[9]:
common_name        count_type
adelie penguin     nests         976
                   adults        223
                   chicks        188
chinstrap penguin  nests         608
                   adults         86
                   chicks         80
gentoo penguin     nests         629
                   chicks        161
                   adults          1
Name: count_type, dtype: int64

In [10]:
nest_counts = pd.read_csv(
                os.path.join('data', 'training_set_nest_counts.csv'),
                index_col=[0,1]
              )

# Let's look at the first 10 rows, and the last 10 columns
nest_counts.iloc[:10, -10:]


Out[10]:
2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
site_id common_name
ACUN adelie penguin 1880.0 NaN NaN NaN NaN NaN 3079.0 NaN NaN NaN
chinstrap penguin NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
ADAM adelie penguin NaN NaN NaN NaN NaN 76.0 NaN NaN NaN NaN
ADAR adelie penguin NaN NaN NaN NaN NaN NaN 338231.0 NaN NaN 428516.0
AILS chinstrap penguin NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
AITC chinstrap penguin NaN NaN NaN NaN 5620.0 NaN NaN 4047.0 NaN NaN
gentoo penguin NaN 1998.0 1639.0 1383.0 2210.0 1900.0 1319.0 2213.0 NaN NaN
AITK chinstrap penguin NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
AKAR adelie penguin NaN NaN NaN NaN NaN NaN 106.0 NaN NaN NaN
ALAS adelie penguin NaN NaN NaN NaN 1080.0 NaN NaN NaN NaN NaN
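
Most of these series are sparse, so before deciding how to fill the gaps it is worth quantifying how much of the (site, species) x year matrix is actually observed. A minimal sketch (not part of the original cells) that computes the share of non-missing counts:

# fraction of (site, species, year) cells that actually have an observed nest count
observed_fraction = pd.notnull(nest_counts).sum().sum() / nest_counts.size
print("Fraction of cells with an observed count: {:.3f}".format(observed_fraction))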

In [11]:
# get a sort order for the sites with the most observations
sorted_idx = (pd.notnull(nest_counts)
                .sum(axis=1)
                .sort_values(ascending=False)
                .index)

# get the top 25 most common sites and divide by the per-series mean
to_plot = nest_counts.loc[sorted_idx].head(25)
to_plot = to_plot.divide(to_plot.mean(axis=1), axis=0)

# plot the data
plt.gca().matshow(to_plot,
                  cmap='viridis')
plt.show()



In [12]:
def preprocess_timeseries(timeseries, first_year, fillna_value=0):
    """ Takes one of the timeseries dataframes, removes
        columns before `first_year`, and fills NaN values
        with the preceeding value. Then backfills any
        remaining NaNs.
        
        As a courtesy, also turns year column name into
        integers for easy comparisons.
    """
    # column type
    timeseries.columns = timeseries.columns.astype(int)
    
    # subset to just data after first_year
    timeseries = timeseries.loc[:, timeseries.columns >= first_year]
   
    # Forward fill count values. This is a strong assumption.
    timeseries.fillna(method="ffill", axis=1, inplace=True)
    timeseries.fillna(method="bfill", axis=1, inplace=True)
    
    # For series with no observations at all, fill with fillna_value
    timeseries.fillna(fillna_value, inplace=True)
    
    return timeseries

nest_counts = preprocess_timeseries(nest_counts,
                                    1980,
                                    fillna_value=0.0)
nest_counts.head()


Out[12]:
1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 ... 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
site_id common_name
ACUN adelie penguin 2008.0 2008.0 2008.0 2008.0 2008.0 2008.0 2008.0 2008.0 2008.0 2008.0 ... 1880.0 1880.0 1880.0 1880.0 1880.0 1880.0 3079.0 3079.0 3079.0 3079.0
chinstrap penguin 4000.0 4000.0 4000.0 4000.0 4000.0 4000.0 4000.0 4000.0 4000.0 4000.0 ... 4000.0 4000.0 4000.0 4000.0 4000.0 4000.0 4000.0 4000.0 4000.0 4000.0
ADAM adelie penguin 76.0 76.0 76.0 76.0 76.0 76.0 76.0 76.0 76.0 76.0 ... 76.0 76.0 76.0 76.0 76.0 76.0 76.0 76.0 76.0 76.0
ADAR adelie penguin 256806.0 256806.0 256806.0 256806.0 256806.0 256806.0 282307.0 282307.0 272338.0 272338.0 ... 338777.0 338777.0 338777.0 338777.0 338777.0 338777.0 338231.0 338231.0 338231.0 428516.0
AILS chinstrap penguin 6000.0 6000.0 6000.0 6000.0 6000.0 6000.0 6000.0 6000.0 6000.0 6000.0 ... 6000.0 6000.0 6000.0 6000.0 6000.0 6000.0 6000.0 6000.0 6000.0 6000.0

5 rows × 34 columns
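
To make the fill behavior concrete, here is a small sketch on a toy frame with hypothetical values: interior and trailing gaps are forward-filled, the leading gap is back-filled from the first observation, and a series with no observations at all falls back to `fillna_value`.

# toy example of the fill logic (hypothetical values, not from the dataset)
toy = pd.DataFrame(
    [[np.nan, 10.0, np.nan, 12.0, np.nan],       # leading, interior, trailing gaps
     [np.nan, np.nan, np.nan, np.nan, np.nan]],  # a series that was never observed
    columns=['2009', '2010', '2011', '2012', '2013']
)
# first row becomes [10, 10, 10, 12, 12]; second row becomes all zeros
print(preprocess_timeseries(toy, first_year=2009, fillna_value=0))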


In [13]:
# get the top 25 most common sites and divide by the per-series mean
to_plot = nest_counts.loc[sorted_idx].head(25)
to_plot = to_plot.divide(to_plot.mean(axis=1), axis=0)

plt.gca().matshow(to_plot,
                  cmap='viridis')
plt.show()



In [14]:
e_n_values = pd.read_csv(
                 os.path.join('data', 'training_set_e_n.csv'),
                 index_col=[0,1]
             )

# Process error data to match our nest_counts data
e_n_values = preprocess_timeseries(e_n_values, 1980, fillna_value=0.05)
e_n_values.head()


Out[14]:
1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 ... 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
site_id common_name
ACUN adelie penguin 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 0.05 ... 0.05 0.05 0.05 0.05 0.05 0.05 0.9 0.9 0.9 0.9
chinstrap penguin 0.50 0.50 0.50 0.50 0.50 0.50 0.50 0.50 0.50 0.50 ... 0.50 0.50 0.50 0.50 0.50 0.50 0.5 0.5 0.5 0.5
ADAM adelie penguin 0.90 0.90 0.90 0.90 0.90 0.90 0.90 0.90 0.90 0.90 ... 0.90 0.90 0.90 0.90 0.90 0.90 0.9 0.9 0.9 0.9
ADAR adelie penguin 0.10 0.10 0.10 0.10 0.10 0.10 0.10 0.10 0.10 0.10 ... 0.10 0.10 0.10 0.10 0.10 0.10 0.9 0.9 0.9 0.1
AILS chinstrap penguin 0.50 0.50 0.50 0.50 0.50 0.50 0.50 0.50 0.50 0.50 ... 0.50 0.50 0.50 0.50 0.50 0.50 0.5 0.5 0.5 0.5

5 rows × 34 columns


In [15]:
def amape(y_true, y_pred, accuracies):
    """ Adjusted MAPE
    """
    not_nan_mask = ~np.isnan(y_true)
    
    # calculate absolute error
    abs_error = (np.abs(y_true[not_nan_mask] - y_pred[not_nan_mask]))
    
    # calculate the percent error (replacing 0 with 1
    # in order to avoid divide-by-zero errors).
    pct_error = abs_error / np.maximum(1, y_true[not_nan_mask])
    
    # adjust error by count accuracies
    adj_error = pct_error / accuracies[not_nan_mask]
    
    # return the mean adjusted error
    return np.mean(adj_error)

In [16]:
# Let's confirm the best possible score is 0!
amape(nest_counts.values,
      nest_counts.values,
      e_n_values.values)


Out[16]:
0.0
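
To see how the accuracy values scale the penalty, here is a small sketch with made-up numbers: the same 10% miss costs twice as much on a count recorded with e_n = 0.05 as on one recorded with e_n = 0.10.

# made-up numbers: two counts, each missed by 10%
y_true = np.array([100.0, 200.0])
y_pred = np.array([110.0, 220.0])
accs = np.array([0.05, 0.10])

# pct errors are [0.10, 0.10]; dividing by the accuracies gives [2.0, 1.0],
# so amape returns their mean, 1.5
print(amape(y_true, y_pred, accs))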

In [38]:
# other regressors (LinearRegression, BayesianRidge, DecisionTreeRegressor,
# ElasticNet) were also tried during exploration; KNN is what we use below
from sklearn import neighbors

def train_model_per_row(ts, acc, split_year=2010):
    """ For a single site/species series, search over how many of the
        most recent points (the lag) to fit a 2-neighbor KNN regressor on,
        scoring each lag on a holdout split at `split_year`.
    """
    n_neighbors = 2

    # Split into train/test to tune our parameter
    train = ts.loc[ts.index < split_year]

    test = ts.loc[ts.index >= split_year]
    test_acc = acc.loc[acc.index >= split_year]

    # Store the best lag parameter
    best_mape = np.inf
    best_lag = None

    # Test models fit on the most recent 2 points
    # through using all of the points
    for lag in range(2, train.shape[0]):
        # fit the model
        temp_model = neighbors.KNeighborsRegressor(n_neighbors, weights="distance")
        temp_model.fit(
            train.index[-lag:].values.reshape(-1, 1),
            train.iloc[-lag:]
        )

        # make our predictions on the test set
        preds = temp_model.predict(
            test.index.values.reshape(-1, 1)
        )

        # calculate the score using the custom metric
        mape = amape(test.values,
                     preds,
                     test_acc.values)

        # if it's the best score yet, hold on to the parameter
        if mape < best_mape:
            best_mape = mape
            best_lag = lag

    # return a model re-trained on the entire series,
    # using only the best lag's most recent points
    final_model = neighbors.KNeighborsRegressor(n_neighbors, weights="distance")

    final_model.fit(
        ts.index[-best_lag:].values.reshape(-1, 1),
        ts.iloc[-best_lag:]
    )

    return final_model, best_mape, best_lag
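
As a quick check before looping over everything, the search can be run on a single series. The site/species pair below is just an illustrative choice from the index:

# try the per-row search on one series (illustrative site/species pair)
key = ('ACUN', 'adelie penguin')
single_model, single_mape, single_lag = train_model_per_row(nest_counts.loc[key],
                                                            e_n_values.loc[key])
print("Holdout aMAPE: {}, best lag: {}".format(single_mape, single_lag))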

In [39]:
models = {}
avg_best_mape = 0.0
avg_best_lag = 0.0
n_series = 0

for i, row in tqdm_notebook(nest_counts.iterrows(),
                            total=nest_counts.shape[0]):
    acc = e_n_values.loc[i]
    models[i], best_mape, best_lag = train_model_per_row(row, acc)
    avg_best_mape += best_mape
    avg_best_lag += best_lag
    n_series += 1

avg_best_mape /= n_series
avg_best_lag /= n_series
print("Avg Best Mape : {0}".format(avg_best_mape))
print("Avg Best Lag  : {0}".format(avg_best_lag))


Avg Best Mape : 66.61544716415143
Avg Best Lag  : 2.0

In [40]:
submission_format = pd.read_csv(
    os.path.join('data','submission_format.csv'),
    index_col=[0, 1]
)

print(submission_format.shape)
submission_format.head()


(648, 4)
Out[40]:
2014 2015 2016 2017
site_id common_name
ACUN adelie penguin 0.0 0.0 0.0 0.0
chinstrap penguin 0.0 0.0 0.0 0.0
ADAM adelie penguin 0.0 0.0 0.0 0.0
ADAR adelie penguin 0.0 0.0 0.0 0.0
AILS chinstrap penguin 0.0 0.0 0.0 0.0
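
The prediction loop below assumes there is a fitted model for every (site, species) pair in the submission file. A one-line sanity check, added here as a sketch:

# every submission row should have a model trained on nest_counts
assert all(idx in models for idx in submission_format.index)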

In [41]:
preds = []

# For every row in the submission file
for i, row in tqdm_notebook(submission_format.iterrows(),
                            total=submission_format.shape[0]):
    
    # get the model for this site + common_name
    model = models[i]
    
    # make predictions using the model; the submission columns are the
    # years 2014-2017 as strings, so cast them to ints to match the
    # integer years the models were trained on
    row_predictions = model.predict(
        submission_format.columns.values.astype(int).reshape(-1, 1)
    )
       
    # keep our predictions, rounded to nearest whole number
    preds.append(np.round(row_predictions))

# Create a dataframe that we can write out to a CSV
prediction_df = pd.DataFrame(preds,
                             index=submission_format.index,
                             columns=submission_format.columns)

prediction_df.head()



Out[41]:
2014 2015 2016 2017
site_id common_name
ACUN adelie penguin 3079.0 3079.0 3079.0 3079.0
chinstrap penguin 4000.0 4000.0 4000.0 4000.0
ADAM adelie penguin 76.0 76.0 76.0 76.0
ADAR adelie penguin 398421.0 392402.0 389822.0 388389.0
AILS chinstrap penguin 6000.0 6000.0 6000.0 6000.0
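
Before writing the file out, it is worth a quick check that the predictions line up exactly with the submission format: same index, same columns, no missing or negative values. A small sketch, not part of the original benchmark:

# sanity checks before writing the submission
assert prediction_df.index.equals(submission_format.index)
assert prediction_df.columns.equals(submission_format.columns)
assert not prediction_df.isnull().any().any()
assert (prediction_df >= 0).all().all()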

In [42]:
prediction_df.to_csv('predictions_KNeighborsRegressor_n2_weightdistance.csv')
