In [10]:
%autosave 0
from __future__ import print_function


Autosave disabled

Clustering RNA structures

We start by clustering the structures obtained from the previous example "example_08_snippet.ipynb", where we extracted all fragments with sequence GNRA from the PDB of the large ribosomal subunit.

First, we calculate the g-vectors for all PDB files


In [11]:
import glob
import barnaba as bb
import numpy as np

flist = glob.glob("snippet/*.pdb")
if(len(flist)==0):
    print("# You need to run the example example8_snippet.ipynb")
    exit()
    
# calculate G-VECTORS  for all files
gvecs = []
for f in flist:
    gvec,seq = bb.dump_gvec(f)
    assert len(seq)==4
    gvecs.extend(gvec)


# Loading snippet/1S72_100_G_9_0_00146.pdb 
# Loading snippet/1S72_101_G_9_0_00147.pdb 
# Loading snippet/1S72_102_G_9_0_00148.pdb 
# Loading snippet/1S72_1037_G_0_0_00055.pdb 
# Loading snippet/1S72_1055_G_0_0_00056.pdb 
# Loading snippet/1S72_1075_G_0_0_00057.pdb 
# Loading snippet/1S72_1076_G_0_0_00058.pdb 
# Loading snippet/1S72_1087_G_0_0_00059.pdb 
# Loading snippet/1S72_1121_G_0_0_00060.pdb 
# Loading snippet/1S72_1158_G_0_0_00061.pdb 
# Loading snippet/1S72_1163_G_0_0_00062.pdb 
# Loading snippet/1S72_116_G_0_0_00003.pdb 
# Loading snippet/1S72_1190_G_0_0_00063.pdb 
# Loading snippet/1S72_1197_G_0_0_00064.pdb 
# Loading snippet/1S72_1239_G_0_0_00065.pdb 
# Loading snippet/1S72_1258_G_0_0_00066.pdb 
# Loading snippet/1S72_1284_G_0_0_00067.pdb 
# Loading snippet/1S72_1315_G_0_0_00068.pdb 
# Loading snippet/1S72_1325_G_0_0_00069.pdb 
# Loading snippet/1S72_1327_G_0_0_00070.pdb 
# Loading snippet/1S72_1349_G_0_0_00071.pdb 
# Loading snippet/1S72_1354_G_0_0_00072.pdb 
# Loading snippet/1S72_1376_G_0_0_00073.pdb 
# Loading snippet/1S72_1387_G_0_0_00074.pdb 
# Loading snippet/1S72_1389_G_0_0_00075.pdb 
# Loading snippet/1S72_142_G_0_0_00004.pdb 
# Loading snippet/1S72_1468_G_0_0_00076.pdb 
# Loading snippet/1S72_1484_G_0_0_00077.pdb 
# Loading snippet/1S72_1489_G_0_0_00078.pdb 
# Loading snippet/1S72_1490_G_0_0_00079.pdb 
# Loading snippet/1S72_1491_G_0_0_00080.pdb 
# Loading snippet/1S72_149_G_0_0_00005.pdb 
# Loading snippet/1S72_1523_G_0_0_00081.pdb 
# Loading snippet/1S72_1525_G_0_0_00082.pdb 
# Loading snippet/1S72_157_G_0_0_00006.pdb 
# Loading snippet/1S72_1588_G_0_0_00083.pdb 
# Loading snippet/1S72_1595_G_0_0_00084.pdb 
# Loading snippet/1S72_1604_G_0_0_00085.pdb 
# Loading snippet/1S72_1627_G_0_0_00086.pdb 
# Loading snippet/1S72_1628_G_0_0_00087.pdb 
# Loading snippet/1S72_1629_G_0_0_00088.pdb 
# Loading snippet/1S72_1634_G_0_0_00089.pdb 
# Loading snippet/1S72_164_G_0_0_00007.pdb 
# Loading snippet/1S72_1655_G_0_0_00090.pdb 
# Loading snippet/1S72_1681_G_0_0_00091.pdb 
# Loading snippet/1S72_1707_G_0_0_00092.pdb 
# Loading snippet/1S72_1709_G_0_0_00093.pdb 
# Loading snippet/1S72_1726_G_0_0_00094.pdb 
# Loading snippet/1S72_1730_G_0_0_00095.pdb 
# Loading snippet/1S72_1743_G_0_0_00096.pdb 
# Loading snippet/1S72_1744_G_0_0_00097.pdb 
# Loading snippet/1S72_1752_G_0_0_00098.pdb 
# Loading snippet/1S72_1773_G_0_0_00099.pdb 
# Loading snippet/1S72_1780_G_0_0_00100.pdb 
# Loading snippet/1S72_1794_G_0_0_00101.pdb 
# Loading snippet/1S72_180_G_0_0_00008.pdb 
# Loading snippet/1S72_1812_G_0_0_00102.pdb 
# Loading snippet/1S72_1819_G_0_0_00103.pdb 
# Loading snippet/1S72_1837_G_0_0_00104.pdb 
# Loading snippet/1S72_1849_G_0_0_00105.pdb 
# Loading snippet/1S72_184_G_0_0_00009.pdb 
# Loading snippet/1S72_1855_G_0_0_00106.pdb 
# Loading snippet/1S72_1863_G_0_0_00107.pdb 
# Loading snippet/1S72_190_G_0_0_00010.pdb 
# Loading snippet/1S72_196_G_0_0_00011.pdb 
# Loading snippet/1S72_201_G_0_0_00012.pdb 
# Loading snippet/1S72_2051_G_0_0_00108.pdb 
# Loading snippet/1S72_2080_G_0_0_00109.pdb 
# Loading snippet/1S72_2092_G_0_0_00110.pdb 
# Loading snippet/1S72_2093_G_0_0_00111.pdb 
# Loading snippet/1S72_2097_G_0_0_00112.pdb 
# Loading snippet/1S72_213_G_0_0_00013.pdb 
# Loading snippet/1S72_219_G_0_0_00014.pdb 
# Loading snippet/1S72_223_G_0_0_00015.pdb 
# Loading snippet/1S72_2249_G_0_0_00113.pdb 
# Loading snippet/1S72_2299_G_0_0_00114.pdb 
# Loading snippet/1S72_229_G_0_0_00016.pdb 
# Loading snippet/1S72_2337_G_0_0_00115.pdb 
# Loading snippet/1S72_2350_G_0_0_00116.pdb 
# Loading snippet/1S72_2359_G_0_0_00117.pdb 
# Loading snippet/1S72_2365_G_0_0_00118.pdb 
# Loading snippet/1S72_2399_G_0_0_00119.pdb 
# Loading snippet/1S72_2410_G_0_0_00120.pdb 
# Loading snippet/1S72_2412_G_0_0_00121.pdb 
# Loading snippet/1S72_2426_G_0_0_00122.pdb 
# Loading snippet/1S72_2453_G_0_0_00123.pdb 
# Loading snippet/1S72_2466_G_0_0_00124.pdb 
# Loading snippet/1S72_2480_G_0_0_00125.pdb 
# Loading snippet/1S72_2501_G_0_0_00126.pdb 
# Loading snippet/1S72_2574_G_0_0_00127.pdb 
# Loading snippet/1S72_2580_G_0_0_00128.pdb 
# Loading snippet/1S72_259_G_0_0_00017.pdb 
# Loading snippet/1S72_2609_G_0_0_00129.pdb 
# Loading snippet/1S72_2630_G_0_0_00130.pdb 
# Loading snippet/1S72_2632_G_0_0_00131.pdb 
# Loading snippet/1S72_2696_G_0_0_00132.pdb 
# Loading snippet/1S72_2700_G_0_0_00133.pdb 
# Loading snippet/1S72_2738_G_0_0_00134.pdb 
# Loading snippet/1S72_2740_G_0_0_00135.pdb 
# Loading snippet/1S72_2773_G_0_0_00136.pdb 
# Loading snippet/1S72_2798_G_0_0_00137.pdb 
# Loading snippet/1S72_2809_G_0_0_00138.pdb 
# Loading snippet/1S72_2810_G_0_0_00139.pdb 
# Loading snippet/1S72_2815_G_0_0_00140.pdb 
# Loading snippet/1S72_2877_G_0_0_00141.pdb 
# Loading snippet/1S72_2882_G_0_0_00142.pdb 
# Loading snippet/1S72_314_G_0_0_00018.pdb 
# Loading snippet/1S72_324_G_0_0_00019.pdb 
# Loading snippet/1S72_334_G_0_0_00020.pdb 
# Loading snippet/1S72_351_G_0_0_00021.pdb 
# Loading snippet/1S72_404_G_0_0_00022.pdb 
# Loading snippet/1S72_426_G_0_0_00023.pdb 
# Loading snippet/1S72_446_G_0_0_00024.pdb 
# Loading snippet/1S72_456_G_0_0_00025.pdb 
# Loading snippet/1S72_469_G_0_0_00026.pdb 
# Loading snippet/1S72_482_G_0_0_00027.pdb 
# Loading snippet/1S72_499_G_0_0_00028.pdb 
# Loading snippet/1S72_49_G_9_0_00143.pdb 
# Loading snippet/1S72_504_G_0_0_00029.pdb 
# Loading snippet/1S72_506_G_0_0_00030.pdb 
# Loading snippet/1S72_518_G_0_0_00031.pdb 
# Loading snippet/1S72_529_G_0_0_00032.pdb 
# Loading snippet/1S72_537_G_0_0_00033.pdb 
# Loading snippet/1S72_577_G_0_0_00034.pdb 
# Loading snippet/1S72_588_G_0_0_00035.pdb 
# Loading snippet/1S72_599_G_0_0_00036.pdb 
# Loading snippet/1S72_600_G_0_0_00037.pdb 
# Loading snippet/1S72_627_G_0_0_00038.pdb 
# Loading snippet/1S72_640_G_0_0_00039.pdb 
# Loading snippet/1S72_64_G_0_0_00000.pdb 
# Loading snippet/1S72_657_G_0_0_00040.pdb 
# Loading snippet/1S72_679_G_0_0_00041.pdb 
# Loading snippet/1S72_689_G_0_0_00042.pdb 
# Loading snippet/1S72_690_G_0_0_00043.pdb 
# Loading snippet/1S72_691_G_0_0_00044.pdb 
# Loading snippet/1S72_743_G_0_0_00045.pdb 
# Loading snippet/1S72_74_G_9_0_00144.pdb 
# Loading snippet/1S72_77_G_0_0_00001.pdb 
# Loading snippet/1S72_805_G_0_0_00046.pdb 
# Loading snippet/1S72_816_G_0_0_00047.pdb 
# Loading snippet/1S72_854_G_0_0_00048.pdb 
# Loading snippet/1S72_871_G_0_0_00049.pdb 
# Loading snippet/1S72_873_G_0_0_00050.pdb 
# Loading snippet/1S72_892_G_0_0_00051.pdb 
# Loading snippet/1S72_90_G_9_0_00145.pdb 
# Loading snippet/1S72_911_G_0_0_00052.pdb 
# Loading snippet/1S72_92_G_0_0_00002.pdb 
# Loading snippet/1S72_948_G_0_0_00053.pdb 
# Loading snippet/1S72_958_G_0_0_00054.pdb 

Then, we reshape the array so that has the dimension $(N,n \ast 4\ast 4)$, where N is the number of frames and n is the number of nucleotides


In [12]:
gvecs = np.array(gvecs)
gvecs = gvecs.reshape(149,-1)
print(gvecs.shape)


(149, 64)

C. We project the data using a simple principal component analysis on the g-vectors


In [13]:
import barnaba.cluster as cc
# calculate PCA
v,w = cc.pca(gvecs,nevecs=3)
print("# Cumulative explained variance of component: 1=%5.1f 2:=%5.1f 3=%5.1f" % (v[0]*100,v[1]*100,v[2]*100))


# Cumulative explained variance of component: 1= 26.6 2:= 48.7 3= 64.9

In [14]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("white")

plt.scatter(w[:,0],w[:,1])
plt.xlabel("PC1")
plt.ylabel("PC2")


Out[14]:
Text(0,0.5,'PC2')

D. We make use of DBSCAN in sklearn to perform clustering. The function cc.dbscan takes four arguments:
i. the list of G-vectors gvec
ii. the list of labels for each point
iii. the eps value iv. min_samples v. (optional) the weight of the samples for non-uniform clustering

The function outputs some information on the clustering: the number of clusters, the number of samples assigned to clusters (non noise), and silouetthe.

For each cluster it reports the size, the maximum eRMSD distance between samples in a cluster (IC=intra-cluster), the median intra-cluster eRMSD, the maximum and median distance from the centroid.


In [15]:
new_labels, center_idx = cc.dbscan(gvecs,range(gvecs.shape[0]),eps=0.35,min_samples=8)


# eps:0.700 min_samples:8  nclusters: 4
#  silhouette score: 0.1728
# Avg silhouette: 0.6111 
# assigned samples :71 total samples:149 
#  N size       max eRMSD (IC)       med eRMSD (IC) max eRMSD (centroid) med eRMSD (centroid) center 
# 00 0026                0.613                0.309                0.440                0.222 00 118
# 01 0022                0.553                0.337                0.372                0.280 01 11
# 02 0015                0.543                0.286                0.370                0.229 02 4
# 03 0008                0.453                0.228                0.311                0.158 03 8

We can now color the PCA according to the different cluster and display the centroid as a label:


In [16]:
cp = sns.color_palette("hls",len(center_idx)+1)
colors = [cp[j-1] if(j!=0) else (0.77,0.77,0.77) for j in new_labels]
size = [40 if(j!=0) else 10 for j in new_labels]
#do scatterplot
plt.scatter(w[:,0],w[:,1],s=size,c=colors)
for i,k in enumerate(center_idx):
    plt.text(w[k,0],w[k,1],str(i),ha='center',va='center',fontsize=25)


E. We finally visualise the 4 centroids:


In [17]:
import py3Dmol

cluster_0 = open(flist[center_idx[0]],'r').read()
cluster_1 = open(flist[center_idx[1]],'r').read()
cluster_2 = open(flist[center_idx[2]],'r').read()
cluster_3 = open(flist[center_idx[3]],'r').read()

p = py3Dmol.view(width=900,height=600,viewergrid=(2,2))
#p = py3Dmol.view(width=900,height=600)
#p.addModel(query_s,'pdb')
p.addModel(cluster_0,'pdb',viewer=(0,0))
p.addModel(cluster_1,'pdb',viewer=(0,1))
p.addModel(cluster_2,'pdb',viewer=(1,0))
p.addModel(cluster_3,'pdb',viewer=(1,1))


#p.addModel(hit_0,'pdb',viewer=(0,1))
p.setStyle({'stick':{}})
p.setBackgroundColor('0xeeeeee')
p.zoomTo()
p.show()


Out[17]:

It is interesting to observe that cluster 0 and 1 (in close proximity in the PCA projection), correspond to A-form-like structures. Cluster 2 corresponds to the classic GNRA fold.