In [1]:
import pandas, json
from pandas import Series
from pandas.io.json import json_normalize
from os import path, listdir
from math import nan, pi, radians, floor
from skimage.io import imread, imshow
from skimage.color import rgb2gray
from skimage.filters import sobel
from skimage.measure import label
from skimage.segmentation import slic, join_segmentations
from skimage.morphology import watershed
from skimage.color import label2rgb
from skimage import data
import numpy
from matplotlib import cm, pyplot
In [2]:
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline
In [3]:
# Root directory that the other locations in this notebook are built from.
HOME_PATH = path.join("/home", "jovyan")
# Location of the snapshot records. load_snapshot_data() passes this to listdir(),
# i.e. it is treated as a directory of files despite the ".json" suffix.
SNAPSHOT_PATH = path.join(HOME_PATH, "data", "production", "state", "snapshots.json")
# Prefix that localize_paths() strips from the recorded image paths before
# re-rooting them under HOME_PATH.
BASE_IMAGE_PATH = '/home/miguel/IdeaProjects/RapBot/'
In [4]:
def to_dataframe(file, snapshot_path=SNAPSHOT_PATH):
    """Read one JSON-lines file and return it as a flattened DataFrame.

    Args:
        file: File name, relative to ``snapshot_path``.
        snapshot_path: Directory that contains ``file``.

    Returns:
        The result of ``json_normalize`` applied to the file's records.
    """
    source = path.join(snapshot_path, file)
    nested = pandas.read_json(source, lines=True)
    # Round-trip through a JSON string so that the records reach
    # json_normalize as plain dicts/lists.
    records = json.loads(nested.to_json(orient='records'))
    return json_normalize(records)
def load_snapshot_data(snapshot_path=SNAPSHOT_PATH):
    """Load every snapshot file found in a directory into one DataFrame.

    Args:
        snapshot_path: Directory whose entries are each read with
            ``to_dataframe``.

    Returns:
        A single DataFrame with the rows of all files, indexed 0..n-1.

    Raises:
        ValueError: Propagated from ``pandas.concat`` when the directory
            contains no files.
    """
    # listdir() returns entries in arbitrary order; sorting keeps the row
    # order stable so that seeded splits/samples downstream are reproducible.
    json_paths = sorted(listdir(snapshot_path))
    # Forward snapshot_path: without it a custom directory would be listed
    # here but the files would be read from the default location.
    dataframes = [to_dataframe(file, snapshot_path) for file in json_paths]
    # Every per-file frame has its own 0-based index; renumber so the
    # combined frame does not carry repeated index labels.
    return pandas.concat(dataframes, ignore_index=True)
In [5]:
def remove_empty_rows(data):
    """Drop snapshot rows that have no usable image.

    The placeholder value "/dev/null" is turned into NaN first; afterwards
    every row that contains a NaN in any column is removed. The input frame
    is left untouched.
    """
    placeholder_as_missing = data.replace({"/dev/null": nan})
    return placeholder_as_missing.dropna(axis=0, how='any')
def localize_paths(data):
    """Re-root recorded image paths from BASE_IMAGE_PATH onto HOME_PATH.

    Args:
        data: Series of path strings.

    Returns:
        A Series in which each path is made relative to BASE_IMAGE_PATH and
        then joined onto HOME_PATH.
    """
    def relocate(original):
        relative = path.relpath(original, BASE_IMAGE_PATH)
        return path.join(HOME_PATH, relative)

    return data.apply(relocate)
def clean_data(data):
    """Remove rows without an image and rewrite the remaining image paths.

    Returns the filtered frame produced by remove_empty_rows(), with its
    "imagePath" column replaced by the output of localize_paths().
    """
    cleaned = remove_empty_rows(data)
    local_paths = localize_paths(cleaned["imagePath"])
    cleaned["imagePath"] = local_paths
    return cleaned
In [6]:
def add_quadrant(data):
    """Add a "quadrant" column (1-4) derived from "drive.orientation".

    The orientation is interpreted in degrees. Angles are wrapped into one
    full turn first, so 360 maps to quadrant 1 and negative angles map to
    the quadrant of their positive equivalent (e.g. -90 -> 4). The column is
    written into ``data`` in place; nothing is returned.
    """
    def quadrant_of(orientation):
        # Stay in degrees: converting to radians and dividing by pi/2 can
        # land a hair below an integer at exact multiples of 90.
        turns = int(floor((orientation % 360) / 90))
        # The trailing "% 4" covers the float case where a tiny negative
        # angle wraps to exactly 360.0.
        return 1 + turns % 4

    data["quadrant"] = data["drive.orientation"].apply(quadrant_of)
def add_direction(data):
    """Add a "direction" column derived from "drive.orientation".

    Orientations inside the closed range [0, 180] are labelled "forward",
    everything else "reverse". The column is written into ``data`` in place.
    """
    def direction_of(orientation):
        if 0 <= orientation <= 180:
            return "forward"
        return "reverse"

    data["direction"] = data["drive.orientation"].apply(direction_of)
def add_movement(data):
    """Add a "movement" column derived from "drive.throttle".

    A throttle of exactly 0 is labelled "stopped", any other value "moving".
    The column is written into ``data`` in place.
    """
    def movement_of(throttle):
        return "moving" if throttle != 0 else "stopped"

    data["movement"] = data["drive.throttle"].apply(movement_of)
In [7]:
def load_image(path):
    """Read the image stored at ``path`` with ``imread(..., as_grey=True)``.

    NOTE(review): ``as_grey`` is the spelling used by the scikit-image
    release this notebook was written against; confirm it against the
    pinned version before upgrading.
    """
    grayscale_image = imread(path, as_grey=True)
    return grayscale_image
def transform_image(image, background_max=30.0, foreground_min=150.0):
    """Segment an image with Sobel edges and a marker-based watershed.

    Args:
        image: 2-D intensity array.
        background_max: Pixels darker than this seed the background region.
        foreground_min: Pixels brighter than this seed the foreground region.

    Returns:
        The label array produced by ``watershed`` (1 = foreground,
        2 = background for the seeded regions).

    NOTE(review): the default thresholds look like they target a 0-255
    intensity scale. If images arrive as floats in [0, 1] every pixel would
    fall below ``background_max`` - confirm the scale of the input.
    """
    # Edge magnitude is the elevation map that watershed floods.
    edges = sobel(image)
    # Identify some background and foreground pixels from the intensity
    # values; these pixels are the seeds for the watershed.
    foreground, background = 1, 2
    # Markers are labels, so keep them integral regardless of image dtype.
    # (The module is imported as `numpy`; there is no `np` alias in scope.)
    markers = numpy.zeros_like(image, dtype=numpy.int32)
    markers[image < background_max] = background
    markers[image > foreground_min] = foreground
    return watershed(edges, markers)
def image_features(image):
    """Flatten a 2-D image into a Series with one entry per pixel.

    Pixels are laid out row after row (``rows * columns`` values in total).
    """
    n_rows, n_columns = image.shape[0], image.shape[1]
    flat_pixels = image.reshape(n_rows * n_columns)
    return Series(flat_pixels)
def load_image_data(data):
    """Attach the decoded image for every row.

    Each entry of "imagePath" is passed to load_image() and the results are
    stored in a new "image" column. ``data`` is modified in place and also
    returned so the call can be chained.
    """
    loaded_images = data["imagePath"].apply(load_image)
    data["image"] = loaded_images
    return data
In [8]:
# Load all snapshot files, drop rows without an image / rewrite the image
# paths, then attach the decoded image to every remaining row.
snapshots = load_image_data(clean_data(load_snapshot_data()))
# Derived categorical columns; each helper writes its column into
# `snapshots` in place.
add_quadrant(snapshots)
add_direction(snapshots)
add_movement(snapshots)
# Preview the first rows of the enriched frame.
snapshots.head()
Out[8]:
In [9]:
# Columns whose value distribution is printed below.
columns = [
    "movement",
    "trigger",
    "quadrant",
    "vehicle.frontLeft.command.value",
    "vehicle.backLeft.command.value",
    "vehicle.frontRight.command.value",
    "vehicle.backRight.command.value",
]
for column in columns:
    # An 80-character banner line, then the frequency table of the column.
    print("*" * 80, snapshots[column].value_counts(), sep="\n")
In [10]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 shuffle split, using "quadrant" as the stratification
# key and a fixed seed.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(snapshots, snapshots["quadrant"]))
train_set = snapshots.iloc[train_index]
test_set = snapshots.iloc[test_index]
In [11]:
# Report how many rows ended up in each partition of the split.
print(len(train_set), "train + ", len(test_set), "test")
In [12]:
# From here on `snapshots` refers to the training partition only; the
# held-out rows remain available as `test_set`.
snapshots = train_set
In [13]:
def _build_image_grid(instances, shape, images_per_row):
    """Tile equally sized 2-D images into one array, padding the last row with zeros."""
    length, width = shape
    # Work on a private list: callers pass ndarrays/tuples (which have no
    # append) and a caller-owned list must not grow as a side effect.
    tiles = list(instances)
    if not tiles:
        raise ValueError("instances must contain at least one image")
    if images_per_row < 1:
        raise ValueError("images_per_row must be at least 1")
    images_per_row = min(len(tiles), images_per_row)
    n_rows = (len(tiles) - 1) // images_per_row + 1
    # Fill the incomplete last row with blank tiles so all rows concatenate.
    n_empty = n_rows * images_per_row - len(tiles)
    tiles.extend(numpy.zeros((length, width)) for _ in range(n_empty))
    row_images = [
        numpy.concatenate(tiles[row * images_per_row:(row + 1) * images_per_row], axis=1)
        for row in range(n_rows)
    ]
    return numpy.concatenate(row_images, axis=0)


def plot_images(instances, shape, images_per_row=10, **options):
    """Show a sequence of equally sized images as one grayscale grid.

    Args:
        instances: Sequence (list, tuple or array) of 2-D image arrays.
        shape: ``(length, width)`` of a single image; used for blank padding tiles.
        images_per_row: Maximum number of images placed side by side.
        **options: Extra keyword arguments forwarded to ``pyplot.imshow``.

    Raises:
        ValueError: If ``instances`` is empty or ``images_per_row`` is below 1.
    """
    image = _build_image_grid(instances, shape, images_per_row)
    pyplot.figure(figsize=(50, 50))
    pyplot.imshow(image, cmap=cm.gray, **options)
    pyplot.axis("off")
    pyplot.show()
In [14]:
images = snapshots["image"]
# Shape of the first image; plot_images() uses it for its padding tiles.
shape = images.iloc[0].shape
images_per_row = 5
rows = 5
# Draw enough random images to fill a rows x images_per_row grid.
samples = images.sample(n=images_per_row * rows).values
plot_images(samples, shape, images_per_row=images_per_row)
In [15]:
# Column names, dtypes and non-null counts of the training frame.
snapshots.info()
In [16]:
# Summary statistics as produced by DataFrame.describe().
snapshots.describe()
Out[16]:
In [17]:
# Histograms of the frame's columns, 50 bins each, on a 20x15 inch figure.
snapshots.hist(bins=50, figsize=(20, 15))
Out[17]:
In [18]:
# Pairwise correlation between columns.
# NOTE(review): the frame also holds non-numeric columns (strings, image
# arrays); confirm the installed pandas skips them rather than raising.
corr_matrix = snapshots.corr()
# Rank the columns by their correlation with the orientation, highest first.
corr_matrix["drive.orientation"].sort_values(ascending=False)
Out[18]:
In [19]:
def prepare_data(data):
    """Split a snapshot frame into model inputs and targets.

    Returns:
        ``(X, y)`` where ``X`` holds the image_features() output for every
        entry of the "image" column and ``y`` the "drive.orientation" values.
    """
    feature_rows = data["image"].apply(image_features)
    targets = data["drive.orientation"]
    return feature_rows.values, targets.values
In [20]:
# Features/targets for the current `snapshots` frame and for the held-out
# `test_set`, both built by prepare_data().
X, y = prepare_data(snapshots)
X_test, y_test = prepare_data(test_set)
In [21]:
from sklearn.base import BaseEstimator, TransformerMixin
# Create a class to select numerical or categorical columns
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks columns out of a DataFrame and returns their values."""

    def __init__(self, attribute_names):
        # Column label(s) to select in transform().
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[attribute_names]`` as a plain array."""
        selected = X[self.attribute_names]
        return selected.values
In [28]:
def rmse(model, X=None, y=None):
    """Root-mean-squared error of ``model`` on an evaluation set.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix to predict on; defaults to the notebook's ``X_test``.
        y: True targets matching ``X``; defaults to the notebook's ``y_test``.

    Returns:
        The square root of the mean squared error between ``y`` and the
        model's predictions.
    """
    features = X_test if X is None else X
    targets = y_test if y is None else y
    # Score the estimator that was passed in, not a fixed global model;
    # otherwise every call reports the same number.
    y_predictions = model.predict(features)
    mse = mean_squared_error(targets, y_predictions)
    return numpy.sqrt(mse)
In [23]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
# Baseline model: a decision tree with a fixed seed, fitted on the
# features/targets prepared earlier as X and y.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X, y)
Out[23]:
In [29]:
# Score the fitted decision tree with rmse().
rmse(tree_reg)
Out[29]:
In [30]:
from sklearn.externals import joblib
# Persist the fitted decision tree to "tree_reg.pkl" and read it straight back.
# joblib.load unpickles the file, so only load files you produced yourself.
joblib.dump(tree_reg, "tree_reg.pkl") # DIFF
tree_reg_loaded = joblib.load("tree_reg.pkl") # DIFF
In [31]:
from sklearn.ensemble import RandomForestRegressor
# Second model: a 500-tree random forest, each tree capped at 256 leaves,
# trained with n_jobs=-1 and a fixed seed on the same X and y.
rnd_reg = RandomForestRegressor(n_estimators=500, max_leaf_nodes=256, n_jobs=-1, random_state=42)
rnd_reg.fit(X, y)
Out[31]:
In [32]:
# Score the random forest with rmse().
# NOTE(review): this number only describes the forest if rmse() actually
# predicts with its `model` argument - verify the helper's body.
rmse(rnd_reg)
Out[32]:
In [33]:
from sklearn.externals import joblib
# Persist the fitted random forest and read it straight back. The object
# written to "rnd_reg.pkl" must be rnd_reg itself; dumping the decision tree
# here would make rnd_reg_loaded a different model than its name says.
# joblib.load unpickles the file, so only load files you produced yourself.
joblib.dump(rnd_reg, "rnd_reg.pkl")
rnd_reg_loaded = joblib.load("rnd_reg.pkl")
In [34]:
def plot_importance(data):
    """Show a flat array of per-pixel scores as a heat map.

    ``data`` is reshaped to the notebook-level ``shape`` and drawn with the
    "hot" colour map; the colour bar is labelled only at the minimum and
    maximum of ``data``.
    """
    heatmap = data.reshape(shape)
    pyplot.imshow(heatmap, cmap=cm.hot, interpolation="nearest")
    pyplot.axis("off")
    lowest, highest = data.min(), data.max()
    colorbar = pyplot.colorbar(ticks=[lowest, highest])
    colorbar.ax.set_yticklabels(['Not important', 'Very important'])
    pyplot.show()
In [35]:
# Draw the random forest's feature importances as a heat map.
plot_importance(rnd_reg.feature_importances_)
In [ ]: