In [ ]:
# Multivariate clustering of root data - Sixtine
In [1]:
import csv
import numpy as np
%matplotlib inline
import matplotlib.pylab as plt
# Path and filename
csv_path = "/projects/Jonathan/VP/Sixtine/"
csv_fname = "AllLateralRootsbis.csv"
# MultiVariate Pairwise Distance matrix object filename:
mvpd_fname = "mvpd_AllLateralRootsbis.pkz"
In [2]:
# Converters to clean up the French-formatted CSV fields:
decimal_change = lambda x: x.replace(',', '.')   # decimal comma -> decimal point
replaceNA = lambda x: x.replace('NA', 'nan')     # 'NA' -> 'nan' so numpy can parse floats
strip_white = lambda x: x.replace(' ', '')       # drop stray whitespace
In [3]:
dataframe = []
head = True
with open(csv_path+csv_fname, 'rb') as f:
    reader = csv.reader(f, delimiter=';')
    for row in reader:
        if head:
            headers = row
            head = False
        else:
            dataframe.append(tuple(map(strip_white, map(replaceNA, map(decimal_change, row)))))
#dataframe[:2]
In [4]:
# -- Declaring the column types and headers:
form_types = [int,int,int,np.str,float,float,int,float,float,float]
#zip(headers,form_types)
data = np.array(dataframe[:100], dtype=zip(headers,form_types))
#data
In [5]:
# View as a recarray so columns are reachable as attributes (d.day, d.root, ...):
d = data.view(np.recarray)
d.day.shape
Out[5]:
In [6]:
d.root
Out[6]:
In [7]:
from openalea.stat_tool.multivariate_clustering import mvpd_matrix
In [8]:
# A basic initialisation of the 'mvpd_matrix' object can be done as follows:
mvpd = mvpd_matrix()
In [9]:
# ...but the object is empty and you will have to add variables using 'self.add_variable':
mvpd.add_variable(var_data=d.insertion_position, var_name="insertion_position", var_type="Numeric", var_unit="mm")
mvpd.add_variable(var_data=d.growth, var_name="growth", var_type="Numeric", var_unit=u"mm.h\u207B\u00B9")
Out[9]:
In [10]:
# ...and finally we assemble those two variables into one MVPD matrix using 'self.create_mvpd_matrix':
mvpd.create_mvpd_matrix(["insertion_position", "growth"], [1/2.,1/2.])
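Conceptually, the global MVPD matrix is a weighted combination of the per-variable pairwise distance matrices, each standardised beforehand. Below is a minimal sketch of that idea, assuming absolute-difference distances and standardisation by the mean pairwise distance (the actual 'create_mvpd_matrix' may differ in both respects):
In [ ]:
import numpy as np

def pairwise_dist(x):
    # Absolute pairwise differences for a 1-D numeric variable:
    x = np.asarray(x, dtype=float)
    return np.abs(x[:, None] - x[None, :])

def combine_mvpd(variables, weights):
    # Weighted sum of standardised per-variable distance matrices:
    global_dist = None
    for var, w in zip(variables, weights):
        dm = pairwise_dist(var)
        iu = np.triu_indices_from(dm, k=1)
        dm = dm / np.nanmean(dm[iu])  # standardise by mean pairwise distance
        global_dist = w * dm if global_dist is None else global_dist + w * dm
    return global_dist

# e.g. combine_mvpd([d.insertion_position, d.growth], [1/2., 1/2.])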
In [11]:
# Steps one and two can be done automatically using:
mvpd = mvpd_matrix([d.insertion_position, d.growth], ["insertion_position", "growth"],
                   ["Numeric", "Numeric"])
In [12]:
# All steps leading to the standardised MVPD matrix can be done AT ONCE:
mvpd = mvpd_matrix([d.insertion_position, d.growth], ["insertion_position", "growth"],
                   ["Numeric","Numeric"], [1/2.,1/2.])
In [13]:
mvpd.save_mvpd(csv_path+mvpd_fname)
Out[13]:
In [14]:
from openalea.stat_tool.multivariate_clustering import mvpd_matrix, load_mvpd
mvpd = load_mvpd(csv_path+mvpd_fname)
In [15]:
# It is still possible to add another variable using 'self.add_variable'.
mvpd.add_variable(var_data=d.parent_length, var_name="parent_length", var_type="Numeric", var_unit="mm")
# To compute a new MVPD matrix, simply do:
mvpd.create_mvpd_matrix(["parent_length", "growth"], [1/2.,1/2.])
# This method allows a faster creation of the standardised MVPD matrix by reusing the pairwise distance matrices already saved within the object!
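The speed-up comes from keeping each variable's pairwise distance matrix once it has been computed, so only the weighted combination needs to be redone. A hypothetical sketch of that caching pattern (class and attribute names invented for illustration, not the openalea implementation):
In [ ]:
import numpy as np

class DistanceCache(object):
    # Toy illustration of caching per-variable distance matrices.
    def __init__(self):
        self._dist = {}  # var_name -> cached pairwise distance matrix

    def add_variable(self, name, data):
        data = np.asarray(data, dtype=float)
        self._dist[name] = np.abs(data[:, None] - data[None, :])

    def combine(self, names, weights):
        # Reuses the cached matrices: no distance is recomputed here.
        return sum(w * self._dist[n] for n, w in zip(names, weights))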
In [16]:
mvpd.add_variable(var_data=d.length, var_name="length", var_type="Numeric", var_unit="mm")
mvpd = mvpd_matrix([d.insertion_position, d.length, d.growth], ["insertion_position", "length", "growth"],
                   ["Numeric","Numeric","Numeric"], [1/3.,1/3.,1/3.])
In [17]:
plt.figure(figsize=(6,6))
plt.imshow(mvpd._global_distance_matrix)
Out[17]:
Agglomerative clustering performs a hierarchical clustering using a bottom-up approach: each observation starts in its own cluster, and clusters are successively merged together. The linkage criterion determines the metric used for the merge strategy.
Agglomerative clustering can also scale to large numbers of samples when it is used jointly with a connectivity matrix, but it is computationally expensive when no connectivity constraints are added between samples: it considers all possible merges at each step.
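For reference, the same bottom-up strategy can be reproduced directly with scipy on the matrix computed above (a sketch; 'mvpd.cluster' does not necessarily do this internally, and Ward linkage strictly assumes Euclidean distances, so treat the result as indicative):
In [ ]:
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

# Condense the square distance matrix, build the Ward tree,
# then cut the tree to obtain 5 clusters:
condensed = squareform(mvpd._global_distance_matrix, checks=False)
tree = linkage(condensed, method='ward')
labels = fcluster(tree, t=5, criterion='maxclust')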
In [18]:
mvpd.cluster(5, method = "ward")
#mvpd._clustering
Out[18]:
SpectralClustering does a low-dimensional embedding of the affinity matrix between samples, followed by a KMeans in the low-dimensional space.
SpectralClustering requires the number of clusters to be specified. It works well for a small number of clusters but is not advised when using many clusters.
Note that if the values of your similarity matrix are not well distributed, e.g. with negative values or with a distance matrix rather than a similarity, the spectral problem will be singular and the problem not solvable.
WARNING: spectral clustering works on an affinity or similarity matrix, and we have previously computed a distance matrix!
Transforming distances into well-behaved similarities is not necessarily easy and should be done with care. By default we have used the transformation proposed by scikit-learn: similarity = np.exp(-beta * distance / distance.std()), with beta = 1.
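Under those assumptions, the distance-to-similarity step followed by spectral clustering could be written with scikit-learn as follows (a sketch; the details inside 'mvpd.cluster' may differ):
In [ ]:
import numpy as np
from sklearn.cluster import SpectralClustering

beta = 1.0
dist = mvpd._global_distance_matrix
similarity = np.exp(-beta * dist / dist.std())  # the transformation quoted above

sc = SpectralClustering(n_clusters=5, affinity='precomputed')
labels = sc.fit_predict(similarity)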
In [19]:
mvpd.cluster(5, method = "spectral")
Out[19]:
In [20]:
from openalea.stat_tool.multivariate_clustering import ClustererChecker
cc = ClustererChecker(mvpd)
In [21]:
cc.global_cluster_distances()
Out[21]:
In [22]:
cc.within_cluster_distances()
Out[22]:
In [23]:
cc.between_cluster_distances()
Out[23]:
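Working only from the distance matrix and the cluster labels, these quantities reduce to means over sub-blocks of the matrix. A hedged sketch of the computation (the exact definitions used by 'ClustererChecker' may differ):
In [ ]:
import numpy as np

def within_between(dist, labels):
    labels = np.asarray(labels)
    within, between = {}, {}
    for c in np.unique(labels):
        in_c = labels == c
        block = dist[np.ix_(in_c, in_c)]
        iu = np.triu_indices_from(block, k=1)
        # Mean over distinct pairs inside cluster c:
        within[c] = block[iu].mean() if iu[0].size else 0.0
        # Mean distance from cluster c to all other individuals:
        between[c] = dist[np.ix_(in_c, ~in_c)].mean()
    return within, between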
In [24]:
# Computation:
cc.cluster_distance_matrix()
# Display it as a heat-map (with values inside)
cc.plot_cluster_distances(print_values=True)
# Save it under 'test_ClustCheck.png':
cc.plot_cluster_distances(print_values=True,savefig=csv_path+'test_ClustCheck.png')
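An equivalent annotated heat-map can be drawn with plain matplotlib, assuming 'cluster_distance_matrix' returns the square array (a sketch of what 'plot_cluster_distances(print_values=True)' displays):
In [ ]:
cdm = cc.cluster_distance_matrix()  # assumed to return the square array
plt.figure(figsize=(5, 5))
plt.imshow(cdm, interpolation='nearest')
plt.colorbar()
for i in range(cdm.shape[0]):
    for j in range(cdm.shape[1]):
        plt.text(j, i, "{:.2f}".format(cdm[i, j]), ha='center', va='center')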
In [25]:
# Compute the distance of each individual (root) to each cluster center:
cc.vertex2clusters_distance()
Out[25]:
In [26]:
# Compute the distance of each individual (root) to ITS OWN cluster center:
cc.vertex_distance2cluster_center()
Out[26]:
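With only a distance matrix available, a natural stand-in for a "cluster center" is the mean distance to the cluster's members (a medoid-like convention; the one 'ClustererChecker' actually uses is not shown here). A sketch:
In [ ]:
import numpy as np

def vertex_to_cluster_distances(dist, labels):
    # Mean distance of every individual to the members of each cluster:
    labels = np.asarray(labels)
    clusters = np.unique(labels)
    out = np.empty((dist.shape[0], clusters.size))
    for k, c in enumerate(clusters):
        out[:, k] = dist[:, labels == c].mean(axis=1)
    return out

# Distance of each individual to its OWN cluster:
# all_d = vertex_to_cluster_distances(D, labels)
# own = all_d[np.arange(len(labels)), np.searchsorted(np.unique(labels), labels)]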
In [27]:
# Display the distances of the (sorted) individuals to their cluster center:
cc.plot_vertex_distance2cluster_center()
In [28]:
# By giving a filename (and the right path) it will be saved to disk:
cc.plot_vertex_distance2cluster_center(savefig=csv_path+'roots2cluster_center.png')
In [29]:
#cc.properties_boxplot_by_cluster()
cc.info_clustering["variables"]
cc.properties_boxplot_by_cluster()