Imports


In [1]:
import pandas, json
from pandas import Series
from pandas.io.json import json_normalize

from os import path, listdir

from math import nan, pi, radians, floor

from skimage.io import imread, imshow
from skimage.color import rgb2gray
from skimage.filters import sobel
from skimage.measure import label
from skimage.segmentation import slic, join_segmentations
from skimage.morphology import watershed
from skimage.color import label2rgb
from skimage import data

import numpy

from matplotlib import cm, pyplot

Magic


In [2]:
%matplotlib inline

Constants


In [3]:
HOME_PATH = path.join("/home", "jovyan")
SNAPSHOT_PATH = path.join(HOME_PATH, "data", "production", "state", "snapshots.json")
BASE_IMAGE_PATH = '/home/miguel/IdeaProjects/RapBot/'

Load snapshots


In [4]:
def to_dataframe(file, snapshot_path=SNAPSHOT_PATH):
    dataframe = pandas.read_json(path.join(snapshot_path, file), lines=True)
    return json_normalize(json.loads(dataframe.to_json(orient='records')))
    
def load_snapshot_data(snapshot_path=SNAPSHOT_PATH):
    json_paths = listdir(snapshot_path)
    dataframes = [to_dataframe(file) for file in json_paths]
    
    return pandas.concat(dataframes)

Clean the snapshots


In [5]:
def remove_empty_rows(data):
    """Filter snapshots without an image"""
    return data.replace({"/dev/null": nan}).dropna(axis=0, how='any')

def localize_paths(data):
    """Replace desktop paths with container paths"""
    return data.apply(lambda p: path.join(HOME_PATH, path.relpath(p, BASE_IMAGE_PATH)))
    
def clean_data(data):
    filtered_data = remove_empty_rows(data)
    filtered_data["imagePath"] = localize_paths(filtered_data["imagePath"])
    return filtered_data

Add categories to snapshots


In [6]:
def add_quadrant(data):
    data["quadrant"] = data["drive.orientation"].apply(lambda orientation: 1 + floor(radians(orientation) / (pi / 2)))

def add_direction(data):
    data["direction"] = data["drive.orientation"].apply(lambda orientation: "forward" if orientation >= 0 and orientation <= 180 else "reverse")
    
def add_movement(data):
    data["movement"] = data["drive.throttle"].apply(lambda throttle: "stopped" if throttle == 0 else "moving")

Load the images


In [7]:
def load_image(path):
    """Loads an image given a path."""
    return imread(path, as_grey=True)
    
def transform_image(image):
    # Make segmentation using edge-detection and watershed.
    edges = sobel(image)

    # Identify some background and foreground pixels from the intensity values.
    # These pixels are used as seeds for watershed.
    foreground, background = 1, 2
    
    markers = np.zeros_like(image)
    markers[image < 30.0] = background
    markers[image > 150.0] = foreground

    return watershed(edges, markers)

def image_features(image):
    """Transforms an image into a row of features."""
    return Series(image.reshape(image.shape[0] * image.shape[1]))

def load_image_data(data):
    data["image"] = data["imagePath"].apply(lambda p: load_image(p))
    return data

Prepare


In [8]:
snapshots = load_image_data(clean_data(load_snapshot_data()))

add_quadrant(snapshots)
add_direction(snapshots)
add_movement(snapshots)

snapshots.head()


Out[8]:
drive.orientation drive.throttle imagePath start timeWindow trigger uuid vehicle.backLeft.command.value vehicle.backLeft.speed vehicle.backRight.command.value vehicle.backRight.speed vehicle.frontLeft.command.value vehicle.frontLeft.speed vehicle.frontRight.command.value vehicle.frontRight.speed image quadrant direction movement
1 90 0 /home/jovyan/data/production/images/image48725... 1525650406810 82 websocket e2ce5c18-7f39-4155-8139-e28a22e228e3 4 0 4 0 4 0 4 0 [[0.289659215686, 0.289659215686, 0.2896592156... 2 forward stopped
2 90 5 /home/jovyan/data/production/images/image51646... 1525650411211 98 driver 76f86f63-f9c9-4060-90db-6d3ef8662875 1 5 1 5 1 5 1 5 [[0.276487843137, 0.290224705882, 0.2941462745... 2 forward moving
3 30 5 /home/jovyan/data/production/images/image82544... 1525650412232 77 driver 2a1e667b-03d3-4683-bd08-8eb95578e27b 1 5 1 2 1 5 1 2 [[0.272871764706, 0.280714901961, 0.2767933333... 1 forward moving
4 30 0 /home/jovyan/data/production/images/image92102... 1525650413583 63 driver 15dab0e6-eb2e-4287-926d-fbaa875f59cc 1 0 1 0 1 0 1 0 [[0.284636470588, 0.288558039216, 0.2924796078... 1 forward stopped
5 30 5 /home/jovyan/data/production/images/image68236... 1525650414664 56 driver 66f0349b-24c1-49ce-a9a2-72f2d6783964 1 5 1 2 1 5 1 2 [[0.286303137255, 0.274538431373, 0.2902247058... 1 forward moving

Determine stratification


In [9]:
columns = [
    "movement", "trigger", "quadrant", 
    "vehicle.frontLeft.command.value", "vehicle.backLeft.command.value",
    "vehicle.frontRight.command.value", "vehicle.backRight.command.value"
]

for column in columns:
    print("*" * 80)
    print(snapshots[column].value_counts())


********************************************************************************
stopped    857
moving     159
Name: movement, dtype: int64
********************************************************************************
snapshot      712
driver        297
websocket       4
$a              1
connection      1
$b              1
Name: trigger, dtype: int64
********************************************************************************
2    692
4    197
1     94
3     33
Name: quadrant, dtype: int64
********************************************************************************
1    669
2    230
4    117
Name: vehicle.frontLeft.command.value, dtype: int64
********************************************************************************
1    669
2    230
4    117
Name: vehicle.backLeft.command.value, dtype: int64
********************************************************************************
1    669
2    230
4    117
Name: vehicle.frontRight.command.value, dtype: int64
********************************************************************************
1    669
2    230
4    117
Name: vehicle.backRight.command.value, dtype: int64

Split into train and test sets


In [10]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(snapshots, snapshots["quadrant"]):
    train_set = snapshots.iloc[train_index]
    test_set = snapshots.iloc[test_index]

In [11]:
print(len(train_set), "train + ", len(test_set), "test")


812 train +  204 test

In [12]:
snapshots = train_set

Visualize the data


In [13]:
def plot_images(instances, shape, images_per_row=10, **options):
    length, width = shape
    
    images_per_row = min(len(instances), images_per_row)
    n_rows = (len(instances) - 1) // images_per_row + 1
    n_empty = n_rows * images_per_row - len(instances)
    
    for i in range(0, n_empty):
        instances.append(numpy.zeros((length, width)))
    
    row_images = []
    for row in range(n_rows):
        rimages = instances[row * images_per_row : (row + 1) * images_per_row]
        row_images.append(numpy.concatenate(rimages, axis=1))
    
    image = numpy.concatenate(row_images, axis=0)
    
    pyplot.figure(figsize=(50, 50))
    pyplot.imshow(image, cmap = cm.gray, **options)
    pyplot.axis("off")
    pyplot.show()

In [14]:
images = snapshots["image"]
shape = images.head(n=1).values[0].shape
images_per_row = 5
rows = 5
samples = images.sample(n=5*rows).values

plot_images(samples, shape, images_per_row=images_per_row)



In [15]:
snapshots.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 812 entries, 430 to 291
Data columns (total 19 columns):
drive.orientation                   812 non-null int64
drive.throttle                      812 non-null int64
imagePath                           812 non-null object
start                               812 non-null int64
timeWindow                          812 non-null int64
trigger                             812 non-null object
uuid                                812 non-null object
vehicle.backLeft.command.value      812 non-null int64
vehicle.backLeft.speed              812 non-null int64
vehicle.backRight.command.value     812 non-null int64
vehicle.backRight.speed             812 non-null int64
vehicle.frontLeft.command.value     812 non-null int64
vehicle.frontLeft.speed             812 non-null int64
vehicle.frontRight.command.value    812 non-null int64
vehicle.frontRight.speed            812 non-null int64
image                               812 non-null object
quadrant                            812 non-null int64
direction                           812 non-null object
movement                            812 non-null object
dtypes: int64(13), object(6)
memory usage: 126.9+ KB

In [16]:
snapshots.describe()


Out[16]:
drive.orientation drive.throttle start timeWindow vehicle.backLeft.command.value vehicle.backLeft.speed vehicle.backRight.command.value vehicle.backRight.speed vehicle.frontLeft.command.value vehicle.frontLeft.speed vehicle.frontRight.command.value vehicle.frontRight.speed quadrant
count 812.000000 812.000000 8.120000e+02 812.000000 812.000000 812.000000 812.000000 812.000000 812.000000 812.000000 812.000000 812.000000 812.000000
mean 126.724138 13.657635 1.525652e+12 52.615764 1.588670 13.500000 1.588670 9.160099 1.588670 13.500000 1.588670 9.160099 2.328818
std 81.729805 49.430628 4.020661e+05 18.243798 0.983599 48.860846 0.983599 34.729378 0.983599 48.860846 0.983599 34.729378 0.891946
min 30.000000 0.000000 1.525650e+12 0.000000 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 1.000000
25% 90.000000 0.000000 1.525652e+12 46.000000 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 2.000000
50% 90.000000 0.000000 1.525652e+12 52.000000 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 2.000000
75% 105.000000 0.000000 1.525652e+12 60.000000 2.000000 0.000000 2.000000 0.000000 2.000000 0.000000 2.000000 0.000000 2.000000
max 330.000000 255.000000 1.525652e+12 116.000000 4.000000 255.000000 4.000000 255.000000 4.000000 255.000000 4.000000 255.000000 4.000000

In [17]:
snapshots.hist(bins=50, figsize=(20, 15))


Out[17]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f5311483860>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f53110a8320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f5310fa29b0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f53113bff28>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f5311110668>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f53111106a0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f531121ed68>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f53111e0080>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f53110eb6d8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f53110fee48>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f530e5f1518>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f530e544be0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f530e4c0358>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f530e4a25f8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f530e41c2b0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f531146c160>]], dtype=object)

In [18]:
corr_matrix = snapshots.corr()
corr_matrix["drive.orientation"].sort_values(ascending=False)


Out[18]:
drive.orientation                   1.000000
quadrant                            0.979938
vehicle.frontRight.command.value    0.239676
vehicle.frontLeft.command.value     0.239676
vehicle.backRight.command.value     0.239676
vehicle.backLeft.command.value      0.239676
start                               0.206758
timeWindow                         -0.060833
vehicle.frontRight.speed           -0.130391
vehicle.backRight.speed            -0.130391
drive.throttle                     -0.192336
vehicle.frontLeft.speed            -0.195499
vehicle.backLeft.speed             -0.195499
Name: drive.orientation, dtype: float64

Train a model


In [19]:
def prepare_data(data):
    X = data["image"].apply(lambda image: image_features(image)).values
    y = data["drive.orientation"].values
    
    return X, y

In [20]:
X, y = prepare_data(snapshots)
X_test, y_test = prepare_data(test_set)

In [21]:
from sklearn.base import BaseEstimator, TransformerMixin

# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [28]:
def rmse(model):
    y_predictions = tree_reg.predict(X_test)
    mse = mean_squared_error(y_test, y_predictions)
    
    return numpy.sqrt(mse)

In [23]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X, y)


Out[23]:
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=42,
           splitter='best')

In [29]:
rmse(tree_reg)


Out[29]:
58.757561048703828

In [30]:
from sklearn.externals import joblib

joblib.dump(tree_reg, "tree_reg.pkl") # DIFF

tree_reg_loaded = joblib.load("tree_reg.pkl") # DIFF

In [31]:
from sklearn.ensemble import RandomForestRegressor

rnd_reg = RandomForestRegressor(n_estimators=500, max_leaf_nodes=256, n_jobs=-1, random_state=42)
rnd_reg.fit(X, y)


Out[31]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=256,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=-1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [32]:
rmse(rnd_reg)


Out[32]:
58.757561048703828

In [33]:
from sklearn.externals import joblib

joblib.dump(tree_reg, "rnd_reg.pkl") # DIFF

rnd_reg_loaded = joblib.load("rnd_reg.pkl") # DIFF

In [34]:
def plot_importance(data):
    image = data.reshape(shape)
    pyplot.imshow(image, cmap=cm.hot, interpolation="nearest")
    pyplot.axis("off")
    
    cbar = pyplot.colorbar(ticks=[data.min(), data.max()])
    cbar.ax.set_yticklabels(['Not important', 'Very important'])

    pyplot.show()

In [35]:
plot_importance(rnd_reg.feature_importances_)



In [ ]: