In [1]:
from __future__ import division
%pylab inline
import pandas as pd
from Bio import SeqIO
import nwalign
import itertools
from scipy import signal
import mahotas
import sklearn
import glob
import os.path as op
from mpl_toolkits.mplot3d import Axes3D
import generate_features
from skimage.feature import local_binary_pattern,match_template,structure_tensor,structure_tensor_eigvals
from sklearn import cluster,preprocessing
from scipy.stats import linregress
def scaleMatrix(mat):
mMin = mat.min()
mMax = mat.max()
s = mat-mMin
s =255*mat/(mMax-mMin)
return s
In [ ]:
# matD = np.load('../simulate_sequence/reference2/matrices/sd_000.npz')
matD = np.load('../real_data/m130928_232712_42213_c100518541910000001823079209281310_s1_p0.1.subreads.len.gte.8.npz')
for i,mat in matD.iteritems():
# mat= generate_features.segmentMat(mat,k=10)
mat=generate_features.binarizeMatrix(mat)
plt.figure(figsize=(14,6))
plt.subplot(121)
imshow(mat,interpolation='none',cmap=cm.gray,norm=plt.Normalize())
plt.subplot(122)
result = generate_features.computeHaralick(mat,stepP=1,featureNames=['contrast'])
stem(np.array(result)[0:100])
plt.show()
In [2]:
df = generate_features.processFolder('../simulate_sequence/reference2/matrices/')
# df = generate_features.processFolder('../real_data/')
In [ ]:
matD = np.load('../simulate_sequence/simulated_matrices/matrices.npz')
idxs=[]
pds = []
for i,j in matD.iteritems():
strL= i.split("_")
idxs += [i]
pds += [strL[1]]
cols = pd.DataFrame(data=pds,index=idxs,columns=['periodicity'])
copy = df.join(cols)
pds={'2':'red','8':'yellow','10':'blue','5':'black'}
colormap=[pds[p] for p in copy['periodicity']]
In [ ]:
annot = pd.read_csv('../simulate_sequence/reference2/periodicities.txt',sep='\t',index_col=0,header=None)
annot.columns=['periodicity']
new = df.join(annot)
pds= {'2':'black','3':'red','8':'blue','2,8':'yellow','2,3':'orange'}
colormap = [pds[p] for p in new['periodicity']]
In [3]:
from sklearn import decomposition
pca = decomposition.PCA()
pcaDf = pca.fit_transform(df)
fig = plt.figure(figsize=(10,10))
scatter(pcaDf[:,0],pcaDf[:,1],s=40,c=colormap)
In [ ]:
fig = plt.figure(figsize=(10,10))
ax = Axes3D(fig)
ax.scatter(xs=pcaDf[:,0],ys=pcaDf[:,1],zs=pcaDf[:,2],c=colormap)
# ax.view_init(elev=0,azim=0)
In [ ]:
print pcaDf.shape
print pca.explained_variance_ratio_.shape
In [ ]:
bar(range(pcaDf.shape[1])[0:20],pca.explained_variance_ratio_[0:20])
In [ ]:
pd.DataFrame(abs(pca.components_),columns=df.columns).plot(kind='bar',stacked=True)
In [ ]:
from scipy.cluster.hierarchy import dendrogram, linkage,leaves_list
Z=linkage(df,'complete')
In [ ]:
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist
c, coph_dists = cophenet(Z, pdist(df))
c
In [ ]:
matD = np.load('../simulate_sequence/reference2/matrices/sd_000.npz')
mat1=matD['S1_116']
mat2=matD['S1_382']
plt.figure(figsize=(20,20))
plt.subplot(121)
imshow(mat1,interpolation='none',cmap=cm.gray)
plt.subplot(122)
imshow(mat2,interpolation='none',cmap=cm.gray)
In [ ]:
labels=[df.index[z] for z in leaves_list(Z)]
plt.figure(figsize=(120,20))
ax=dendrogram(Z,leaf_rotation=90.,leaf_font_size=8.,labels=labels)
plt.show()
In [ ]:
nameFn='../simulate_sequence/reference2/periodicities.txt'
nameH=open(nameFn)
nameD={}
for line in nameH:
lineArr = line.split()
nameD[lineArr[0]] = lineArr[1]
labels=[nameD[df.index[z]]+'_'+df.index[z] for z in leaves_list(Z)]
plt.figure(figsize=(120,20))
ax=dendrogram(Z,leaf_rotation=90.,leaf_font_size=8.,labels=labels)
plt.show()
In [ ]:
from skimage import feature,morphology,filters,measure
matD = np.load('../simulate_sequence/reference2/matrices/sd_000.npz')
mat=matD['S1_100']
mat2=matD['S1_80']
plt.figure(figsize=(20,10))
subplot(121)
imshow(mat,cmap=cm.gray)
subplot(122)
imshow(mat2,cmap=cm.gray)
print measure.compare_nrmse(mat[:500,:500],mat2[:500,:500])
In [ ]:
M[0]['centroid']