In [1]:
%matplotlib inline
from __future__ import print_function, division
import sys
sys.path.append('..')
import os
os.chdir("..")
import matplotlib.pyplot as plt
import numpy as np
import vessel_scoring.models
from vessel_scoring.models import train_model_on_data
from vessel_scoring import data, utils
import vessel_scoring.colspec
# import vessel_scoring.evaluate_model_new as evmodel
from vessel_scoring.evaluate_model import evaluate_model, compare_models
from IPython.core.display import display, display_html, HTML, Markdown, publish_display_data
from sklearn import metrics
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D
In [2]:
# Load training and test data
# Each data.load_dataset_by_vessel call below returns four values, unpacked here
# as (all, train, validation, test) subsets of one .npz measures file.
# Paths are relative to the working directory set by os.chdir("..") in the first cell.
# Data supplied by Kristina
all_lline_kristina, train_lline_kristina, valid_lline_kristina, test_lline_kristina = data.load_dataset_by_vessel(
'datasets/kristina_longliner.measures.npz')
all_trawl_kristina, train_trawl_kristina, valid_trawl_kristina, test_trawl_kristina = data.load_dataset_by_vessel(
'datasets/kristina_trawl.measures.npz')
all_pseine_kristina, train_pseine_kristina, valid_pseine_kristina, test_pseine_kristina = data.load_dataset_by_vessel(
'datasets/kristina_ps.measures.npz')
# Crowd sourced longliner data
all_lline_crowd_alex, train_lline_crowd_alex, valid_lline_crowd_alex, test_lline_crowd_alex = data.load_dataset_by_vessel(
"datasets/classified-filtered.measures.npz")
# NOTE(review): the disabled loads below use "../datasets/..." while the active loads
# use "datasets/..."; the paths would need adjusting before re-enabling them.
# # Crowd sourced data from DavidK's setup
# _, train_lline_crowd_new, valid_lline_crowd_new, test_lline_crowd_new = data.load_dataset_by_vessel(
# "../datasets/id_fishing_points_longliner_classified_tracks.measures.npz")
# test_lline_crowd = utils.concat_common_fields(test_lline_crowd_new, test_lline_crowd_alex)
# _, train_trawl_crowd_new, valid_trawl_crowd_new, test_trawl_crowd_new = data.load_dataset_by_vessel(
# "../datasets/id_fishing_points_trawler_classified_tracks.measures.npz")
# all_pseine_crowd, train_pseine_crowd_new, valid_pseine_crowd_new, test_pseine_crowd_new = data.load_dataset_by_vessel(
# "../datasets/id_fishing_points_purse_seine_classified_tracks.measures.npz")
# Slow transits (used to train models to avoid classifying slow transits as fishing)
# TRANSIT_WEIGHT is the number of times the transit data is repeated when it is
# concatenated into a training set.
TRANSIT_WEIGHT = 10
x_tran, xtrain_tran, xcross_tran, xtest_tran = data.load_dataset_by_vessel(
'datasets/slow-transits.measures.npz', even_split=False)
# train_tran = the transit train + cross-validation subsets, repeated TRANSIT_WEIGHT times.
train_tran = utils.concatenate_different_recarrays([xtrain_tran, xcross_tran] * TRANSIT_WEIGHT)
# train_base = train + validation subsets of all three Kristina gear types (no transits).
train_base = utils.concatenate_different_recarrays([train_trawl_kristina, train_lline_kristina, train_pseine_kristina,
valid_lline_kristina, valid_trawl_kristina, valid_pseine_kristina]
# train_lline_crowd_new[::2], train_lline_crowd_alex[::2], train_trawl_crowd_new, train_pseine_crowd_new,
# valid_lline_crowd_new[::2], valid_lline_crowd_alex[::2], valid_trawl_crowd_new, valid_pseine_crowd_new
)
# Per-gear training sets: that gear's train + validation subsets plus the transit data.
# NOTE(review): train_tran already holds TRANSIT_WEIGHT copies of the transit data, so
# "[train_tran] * TRANSIT_WEIGHT" yields TRANSIT_WEIGHT**2 copies in each per-gear set,
# whereas `train` at the bottom of this cell includes train_tran only once. Confirm this is intended.
train_lline = utils.concatenate_different_recarrays([train_lline_kristina, valid_lline_kristina]
# train_lline_crowd_new[::2], train_lline_crowd_alex[::2],
# valid_lline_crowd_new[::2], valid_lline_crowd_alex[::2],
+ [train_tran] * TRANSIT_WEIGHT)
train_trawl = utils.concatenate_different_recarrays([train_trawl_kristina, valid_trawl_kristina]
# train_trawl_crowd_new, valid_trawl_crowd_new,
+ [train_tran] * TRANSIT_WEIGHT)
train_pseine = utils.concatenate_different_recarrays([train_pseine_kristina, valid_pseine_kristina]
#train_pseine_kristina, valid_pseine_crowd_new,
+ [train_tran] * TRANSIT_WEIGHT)
# Combined training set: all gear types plus the (already weighted) transit data.
train = utils.concatenate_different_recarrays([train_base, train_tran])
In [3]:
# Mean of the 'measure_speedavg_21600' field over the base training set.
np.mean(train_base['measure_speedavg_21600'])
Out[3]:
In [4]:
from mpl_toolkits.mplot3d import Axes3D

# 2x2 grid with three panels in use: a 3-D view of all three features (top left)
# and two 2-D projections against average speed (top right, bottom left).
fig = plt.figure(figsize=(10, 10))
ax1 = fig.add_subplot(221, projection='3d')
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)

# Every panel has average speed on its x axis.
for ax in (ax1, ax2, ax3):
    ax.set_xlabel('average speed')
    ax.set_xlim(0, 17)

# Speed deviation is the y axis of the 3-D panel and of the bottom-left panel.
for ax in (ax1, ax3):
    ax.set_ylabel('speed deviation')
    ax.set_ylim(0, 9)

# Course deviation is the z axis in 3-D and the y axis of the top-right panel.
ax1.set_zlabel('course deviation')
ax1.set_zlim(0, 50)
ax2.set_ylabel('course deviation')
ax2.set_ylim(0, 50)

fig.subplots_adjust(hspace=.3, wspace=.3)

is_fishing = utils.is_fishy(train_base)

# Rescale the stored measures for plotting; the factor 17 matches the x-axis limit.
# presumably the measures are normalised by a 17-knot maximum speed -- TODO confirm
avg_speed = 17 * (1 - train_base['measure_speedavg_21600'])
speed_deviation = 17 * train_base['measure_speedstddev_21600']
# Only approximately the standard deviation of the course: the cos/sin trick is
# used to get around the branch cut at 2 Pi. Converted from radians to degrees.
course_deviation = train_base['measure_coursestddev_21600'] * 180 / np.pi * np.sqrt(2)

alpha = 0.05

# Each panel paired with the feature columns it plots, in axis order.
panels = [
    (ax1, (avg_speed, speed_deviation, course_deviation)),
    (ax2, (avg_speed, course_deviation)),
    (ax3, (avg_speed, speed_deviation)),
]
# Fishing points are drawn in red first, then non-fishing points in blue.
for panel, columns in panels:
    for mask, colour in ((is_fishing, '#FF0000'), (~is_fishing, '#0000FF')):
        panel.scatter(*[column[mask] for column in columns],
                      marker='.', c=colour, alpha=alpha, edgecolors='face')

plt.show()
In [5]:
# evmodel.plot_vessel_track(train_lline, 224259000)
Our initial test and training data consisted of roughly a dozen different vessels of each type classified over a multi-year period by Kristina Boerder of Dalhousie University. One-quarter of those are used for testing, so there is a relatively small number of different vessels in the test sets.
In addition, we are beginning to collect crowd-sourced data for both testing and training. Some of the early crowd-sourced data, available for longliners only, is used as an additional test set in the examples below.
In [6]:
# Print a short summary per dataset: number of distinct vessels (by MMSI), number
# of points and percentage of fishing points in the reported subset, followed by
# the vessel and point totals of the full dataset.
# The loop variable is called `subset` so that it does not overwrite the
# notebook-level `test_data` dict used by the model-comparison cells.
for name, all_data, subset in [
        ("trawlers", all_trawl_kristina, test_trawl_kristina),
        ("purse seiners", all_pseine_kristina, test_pseine_kristina),
        ("longliners", all_lline_kristina, test_lline_kristina),
        # Report the test split here, like every other "test" row (this row
        # previously passed the training split).
        ("longliners (crowd, alex)", all_lline_crowd_alex, test_lline_crowd_alex),
        ("transits", x_tran, xtest_tran),
        # Deliberately the training transits; train_tran holds TRANSIT_WEIGHT
        # copies of them, so this row's point count includes the repeats.
        ("transits (training)", x_tran, train_tran),
]:
    mmsi_count = len(set(subset['mmsi']))
    pt_count = len(subset)
    fishing_fraction = utils.is_fishy(subset).sum() / pt_count
    print("For {0} we have {1} test vessels with {2} test points; {3}% of them are fishing".format(
        name, mmsi_count, pt_count, 100 * fishing_fraction))
    print("\t {0} total vessels with {1} points".format(
        len(set(all_data['mmsi'])), len(all_data)))

# train_tran is the transit data repeated TRANSIT_WEIGHT times, so divide by the
# weight to count each underlying transit point once.
print("Total training", len(train_tran) // TRANSIT_WEIGHT + len(train_base))
print("train tran", len(train_tran) // TRANSIT_WEIGHT)
In [7]:
# TODO: Automate
# Hand-written Markdown summary table of the datasets. The vessel and point
# counts are literals in the string below; nothing in this cell recomputes
# them from the loaded data.
md = """
| |Vessels | Points | Training Points | Validation Points|
|:-------------|:------:|:------:|:---------------:|:----------------:|
| Longliner | 16 |569,504 | 15,000 | 5000 |
| Trawler | 6 |828,162 | 15,000 | 5000 |
| Purse Seine | 7 |398,897 | 15,000 | 5000 |
| Slow Transits| 2 |9,038 | 6,514 | |
"""
# Render the table as rich Markdown output.
display(Markdown(md))
In [8]:
# Hand-written Markdown table of precision, recall and false positive rate per
# gear type. The percentages are literals in the string; they are not computed
# in this cell.
md = """
| |Predicted fishing classified<br/>correctly (precision)|Fishing captured<br/>(recall)|Non–fishing classified as<br/>fishing (false positive rate)|
|:-------------------------------------------------|:-----------------------:|:-----------------------------------------------------:|:---:|
|Longliner | 97% | 78% | 9% |
|Trawler | 93% | 91% | 10% |
|Purse Seine | 11% | 73% | 22% |
"""
# Render the table as rich Markdown output.
display(Markdown(md))
In [9]:
# Extra IPython imports. The exploratory dir() calls that used to sit here were
# removed: their return values were discarded, so they displayed nothing.
import IPython.core.display as _display
import IPython.core
from IPython.core import oinspect
In [10]:
# Prepare the models
from vessel_scoring.legacy_heuristic_model import LegacyHeuristicModel
from vessel_scoring.logistic_model import LogisticModel
from vessel_scoring.random_forest_model import RandomForestModel

# Test sets keyed by the display title used in the comparison cells.
test_data = dict([
    ('Long liners', test_lline_kristina),
    ('Long liners - crowd', test_lline_crowd_alex),
    ('Trawlers', test_trawl_kristina),
    ('Purse seiners', test_pseine_kristina),
])

# (gear, title) pairs; each title is a key of test_data. The two longliner
# test sets share the same gear name.
GEARS = [
    ('ps', 'Purse seiners'),
    ('trawl', 'Trawlers'),
    ('longliner', 'Long liners'),
    ('longliner', 'Long liners - crowd'),
]
The models output a number between 0 and 1 that corresponds to how
confident they are that fishing is occurring. For
the first set of comparisons we treat predictions >0.5
as fishing and those <=0.5 as nonfishing. This allows us to use
precision, recall and f1-score as metrics. We also show Receiver
Operating Characteristic (ROC) curves, with their area under the curve (AUC), and
precision-recall plots.
In [11]:
# Evaluate the logistic model on each test set.
# Training takes only the model name, its spec and the loaded datasets -- none
# of which depend on the gear -- so the model is trained once here instead of
# being retrained on every loop iteration.
datasets = vessel_scoring.models.load_data()
spec = vessel_scoring.models.untrained_models['Logistic']
mdl = vessel_scoring.models.train_model('Logistic', spec, datasets)
for gear, title in GEARS:
    X_test = test_data[title]
    # Section heading, the comparison output, then a horizontal rule.
    display(HTML("<h2>{}</h2>".format(title)))
    compare_models([("Logistic", mdl)], X_test)
    display(HTML("<hr/>"))
In [12]:
# Display the (gear, title) pairs that the comparison cells iterate over.
GEARS
Out[12]:
In [13]:
# Precision-recall curves of the logistic model, one curve per test set.
f, a1 = plt.subplots(1, figsize=(10,5))
spec = vessel_scoring.models.untrained_models['Logistic']
mdl = vessel_scoring.models.train_model('Logistic', spec, datasets)
for gear, title in GEARS:
    td = test_data[title]
    # Column 1 of predict_proba is used as the fishing score.
    score = mdl.predict_proba(td)[:,1]
    precisions, recalls, thresholds = metrics.precision_recall_curve(utils.is_fishy(td), score)
    # Label by title rather than gear: both longliner test sets share the gear
    # name 'longliner', which would give two identical legend entries.
    a1.plot(recalls, precisions, label=title)
# Recall is plotted on x and precision on y, so label the axes accordingly.
a1.set_xlabel('Recall')
a1.set_ylabel('Precision')
a1.legend(loc="center left")
a1.set_ylim(0, 1)
Out[13]: