In [1]:
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import collections
import multibinner as mb
In [2]:
from skimage import io
# Download the sample picture and flip it upside down (np.flipud reverses
# the first axis); the figures further down are drawn with origin='lower'.
image_url = 'https://media4.giphy.com/media/S3mBspMr0r5HW/200_s.gif'
image = np.flipud(io.imread(image_url))
The initial data are read from an image; n_data samples are then extracted from it.
The image contains 200x200 = 40k pixels
We will extract 400k random points from the image and build a pandas.DataFrame
This mimics, for example, the sampling process of a spacecraft: looking at a target (Earth or another body) and getting way more data points than you need to reconstruct a coherent representation.
Moreover, visualizing 400k points x 3 columns is difficult, so we will multibin the DataFrame into 200 bins in the x direction and 200 in the y direction, calculate the average for each bin, and return a 200x200 array of data as output.
The multibinner.MultiBinnedDataFrame can handle as many dimensions as one likes; the 2D example here is for the sake of representation.
In [3]:
# Flatten the picture into a table: one row per pixel, one column per
# entry of the last axis, labelled red / green / blue.
n_channels = image.shape[-1]
pixel_table = image.reshape(-1, n_channels)
image_df = pd.DataFrame(pixel_table, columns=['red', 'green', 'blue'])
image_df.describe()
Out[3]:
In [4]:
# Seed NumPy's global generator so that "Restart Kernel -> Run All" draws
# the same random sample (and hence produces the same figures) every time.
np.random.seed(0)

# Pixel count = total number of array elements / length of the last axis.
n_pixels = image.size // image.shape[-1]
n_data = n_pixels * 10  # 10 times the original number of pixels : overkill!

# Uniform random positions over the picture:
# x spans [0, image.shape[1]) and y spans [0, image.shape[0]).
x = np.random.random_sample(n_data) * image.shape[1]
y = np.random.random_sample(n_data) * image.shape[0]
In [5]:
data = pd.DataFrame({'x': x, 'y': y})

# For every random point, look up the pixel it falls in (coordinates are
# truncated to integers) and add a small uniform noise in [0, 0.1).
channel_names = ['red', 'green', 'blue']
for index, name in zip(range(image.shape[-1]), channel_names):
    rows = data.y.astype(int)
    cols = data.x.astype(int)
    sampled = image[rows, cols, index]
    noise = np.random.rand(n_data) * .1
    data[name] = sampled + noise
In [6]:
# Summary statistics of the sampled table, transposed so that each input
# column (x, y, red, green, blue) becomes a row.
data.describe().transpose()
Out[6]:
In [7]:
# `scatter_matrix` lives in `pd.plotting` in current pandas; the old
# `pd.tools.plotting` location is kept as a fallback for old installs.
try:
    scatter_matrix = pd.plotting.scatter_matrix
except AttributeError:
    scatter_matrix = pd.tools.plotting.scatter_matrix

# Pairwise scatter plots (histograms on the diagonal) for a random subset
# of 1000 rows; the trailing ';' hides the returned array of axes.
scatter_matrix(data.sample(n=1000), alpha=0.5, lw=0, figsize=(12, 12), diagonal='hist');
In [8]:
# Let's multibinning!

# Functions applied to the data falling in a single multidimensional bin,
# keyed by input column: each colour gets its average, and 'red' also
# counts the elements in the bin.
aggregated_functions = {
    'red': {'elements': len, 'average': np.average},
    'green': {'average': np.average},
    'blue': {'average': np.average},
}

# The columns we want to have in output.
out_columns = ['red', 'green', 'blue']

# Define the bins for y and x: as many bins as the picture has entries
# along the corresponding axis. An OrderedDict keeps the (y, x) order
# fixed; a normal dict is fine too.
group_variables = collections.OrderedDict()
group_variables['y'] = mb.bingenerator(
    {'start': 0, 'stop': image.shape[0], 'n_bins': image.shape[0]})
group_variables['x'] = mb.bingenerator(
    {'start': 0, 'stop': image.shape[1], 'n_bins': image.shape[1]})

# The object collecting all the data that define the multi binning.
mbdf = mb.MultiBinnedDataFrame(
    binstocolumns=True,
    dataframe=data,
    group_variables=group_variables,
    aggregated_functions=aggregated_functions,
    out_columns=out_columns,
)
In [9]:
# Summary statistics of the binned table, transposed so that each binned
# column becomes a row.
mbdf.MBDataFrame.describe().transpose()
Out[9]:
In [10]:
# Reconstruct the multidimensional array defined by group_variables.
# `.items()` and the function-call form of print are used so that the cell
# runs unchanged on both Python 2 and Python 3.
outstring = ['{} bins ({})'.format(spec['n_bins'], axis_name)
             for axis_name, spec in mbdf.group_variables.items()]

key = 'red_average'
print('{} array = {}'.format(key, ' x '.join(outstring)))
print('')  # blank separator line
print(mbdf.col_df_to_array(key))
In [11]:
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(figsize=[16, 10], ncols=2, nrows=2)
cm = plt.get_cmap('jet')

# One panel per binned column: (axes, column to draw, panel title).
# A single loop replaces four copy-pasted stanzas that differed only in
# these three values.
panels = [
    (ax1, 'red_elements', 'elements per bin'),
    (ax2, 'red_average', 'red_average'),
    (ax3, 'green_average', 'green_average'),
    (ax4, 'blue_average', 'blue_average'),
]

for ax, key, title in panels:
    imgplot = ax.imshow(mbdf.col_df_to_array(key), cmap=cm,
                        interpolation='none', origin='lower')
    plt.colorbar(imgplot, orientation='vertical', ax=ax)
    ax.set_title(title)
    ax.grid(False)
In [12]:
# Fetch every binned column as an array, then stack the three colour
# averages along the third axis (red, green, blue order).
rgb_image_dict = mbdf.all_df_to_array()
channel_keys = ['red_average', 'green_average', 'blue_average']
rgb_image = np.dstack([rgb_image_dict[channel] for channel in channel_keys])
In [13]:
fig, (ax1, ax2) = plt.subplots(figsize=[16, 10], ncols=2)

# rgb_image holds floating-point bin averages. imshow interprets float RGB
# data on a [0, 1] scale, so rescale before drawing; the clip absorbs the
# small overshoot caused by the noise added while sampling.
# NOTE(review): assumes the source picture is 8-bit (0-255), as the
# previous `255 - rgb_image` expression did -- confirm.
rgb_display = np.clip(rgb_image / 255., 0, 1)
ax1.imshow(rgb_display, interpolation='bicubic', origin='lower')
ax1.set_title('MultiBinnedDataFrame')

ax2.imshow(image, interpolation='bicubic', origin='lower')
ax2.set_title('Original Image')
Out[13]:
In the images above, the original is on the right; on the left is the result of picking 400k random points on the image, rebinning them to 200x200 on the (x,y) columns and calculating the average in each of the resulting 40k bins.
The bins contain from 1 to 29 points (10 on average).
Thanks from me and Mario!