In [1]:
from __future__ import division
%pylab inline
import pandas as pd
from Bio import SeqIO
import nwalign
import itertools
from scipy import signal
import mahotas
import sklearn
import glob
import os.path as op
from mpl_toolkits.mplot3d import Axes3D
import generate_features
from skimage.feature import local_binary_pattern,match_template,structure_tensor,structure_tensor_eigvals
from sklearn import cluster,preprocessing
from scipy.stats import linregress

def scaleMatrix(mat):
    mMin = mat.min()
    mMax = mat.max()
    
    s = mat-mMin
    s =255*mat/(mMax-mMin)
    
    return s


Populating the interactive namespace from numpy and matplotlib

In [ ]:
# matD = np.load('../simulate_sequence/reference2/matrices/sd_000.npz')
matD = np.load('../real_data/m130928_232712_42213_c100518541910000001823079209281310_s1_p0.1.subreads.len.gte.8.npz')


for i,mat in matD.iteritems():
    
#     mat= generate_features.segmentMat(mat,k=10)
    mat=generate_features.binarizeMatrix(mat)
    plt.figure(figsize=(14,6))
    plt.subplot(121)
    imshow(mat,interpolation='none',cmap=cm.gray,norm=plt.Normalize())
    plt.subplot(122)
    
    result = generate_features.computeHaralick(mat,stepP=1,featureNames=['contrast'])
    stem(np.array(result)[0:100])
    plt.show()

In [2]:
df = generate_features.processFolder('../simulate_sequence/reference2/matrices/')
# df = generate_features.processFolder('../real_data/')


1 	S1_849
2 	S1_848
3 	S1_847
4 	S1_846
5 	S1_845
6 	S1_844
7 	S1_843
8 	S1_842
9 	S1_841
10 	S1_840
11 	S1_1649
12 	S1_1648
13 	S1_1399
14 	S1_1398
15 	S1_1641
16 	S1_1640
17 	S1_1643
18 	S1_1642
19 	S1_1645
20 	S1_1644
21 	S1_1647
22 	S1_1646
23 	S1_234
24 	S1_235
25 	S1_236
26 	S1_237
27 	S1_230
28 	S1_231
29 	S1_232
30 	S1_233
31 	S1_238
32 	S1_239
33 	S1_1234
34 	S1_1249
35 	S1_1248
36 	S1_1543
37 	S1_1245
38 	S1_1244
39 	S1_1247
40 	S1_1246
41 	S1_1241
42 	S1_1240
43 	S1_1243
44 	S1_1242
45 	S1_1025
46 	S1_1024
47 	S1_548
48 	S1_549
49 	S1_1021
50 	S1_1020
51 	S1_1023
52 	S1_1022
53 	S1_542
54 	S1_543
55 	S1_540
56 	S1_541
57 	S1_1029
58 	S1_1028
59 	S1_544
60 	S1_545
61 	S1_1232
62 	S1_1233
63 	S1_300
64 	S1_301
65 	S1_302
66 	S1_303
67 	S1_304
68 	S1_305
69 	S1_306
70 	S1_307
71 	S1_308
72 	S1_309
73 	S1_698
74 	S1_699
75 	S1_1489
76 	S1_838
77 	S1_839
78 	S1_832
79 	S1_833
80 	S1_830
81 	S1_831
82 	S1_836
83 	S1_837
84 	S1_834
85 	S1_835
86 	S1_946
87 	S1_947
88 	S1_944
89 	S1_945
90 	S1_942
91 	S1_943
92 	S1_940
93 	S1_941
94 	S1_408
95 	S1_948
96 	S1_949
97 	S1_401
98 	S1_400
99 	S1_1287
100 	S1_1286
101 	S1_1289
102 	S1_1288
103 	S1_1656
104 	S1_1657
105 	S1_1654
106 	S1_1655
107 	S1_1652
108 	S1_1653
109 	S1_1650
110 	S1_1651
111 	S1_1658
112 	S1_1388
113 	S1_1389
114 	S1_1380
115 	S1_1381
116 	S1_1382
117 	S1_1383
118 	S1_1384
119 	S1_1385
120 	S1_1386
121 	S1_1387
122 	S1_223
123 	S1_222
124 	S1_221
125 	S1_220
126 	S1_227
127 	S1_226
128 	S1_225
129 	S1_224
130 	S1_229
131 	S1_228
132 	S1_1258
133 	S1_1259
134 	S1_1252
135 	S1_1253
136 	S1_1250
137 	S1_1251
138 	S1_1256
139 	S1_1257
140 	S1_1254
141 	S1_1255
142 	S1_1032
143 	S1_1033
144 	S1_1030
145 	S1_1031
146 	S1_1036
147 	S1_1037
148 	S1_1034
149 	S1_1035
150 	S1_551
151 	S1_550
152 	S1_553
153 	S1_552
154 	S1_555
155 	S1_554
156 	S1_557
157 	S1_556
158 	S1_713
159 	S1_244
160 	S1_711
161 	S1_246
162 	S1_319
163 	S1_318
164 	S1_317
165 	S1_316
166 	S1_315
167 	S1_314
168 	S1_313
169 	S1_312
170 	S1_311
171 	S1_310
172 	S1_243
173 	S1_242
174 	S1_490
175 	S1_491
176 	S1_492
177 	S1_493
178 	S1_366
179 	S1_495
180 	S1_496
181 	S1_497
182 	S1_37
183 	S1_36
184 	S1_35
185 	S1_34
186 	S1_33
187 	S1_32
188 	S1_31
189 	S1_30
190 	S1_39
191 	S1_38
192 	S1_829
193 	S1_828
194 	S1_821
195 	S1_820
196 	S1_823
197 	S1_822
198 	S1_825
199 	S1_824
200 	S1_827
201 	S1_826
202 	S1_955
203 	S1_954
204 	S1_957
205 	S1_956
206 	S1_951
207 	S1_950
208 	S1_953
209 	S1_952
210 	S1_959
211 	S1_958
212 	S1_1623
213 	S1_1622
214 	S1_1621
215 	S1_1620
216 	S1_1627
217 	S1_1626
218 	S1_1625
219 	S1_1624
220 	S1_1629
221 	S1_1628
222 	S1_788
223 	S1_789
224 	S1_784
225 	S1_785
226 	S1_786
227 	S1_787
228 	S1_780
229 	S1_781
230 	S1_782
231 	S1_783
232 	S1_148
233 	S1_149
234 	S1_146
235 	S1_147
236 	S1_144
237 	S1_145
238 	S1_142
239 	S1_143
240 	S1_140
241 	S1_141
242 	S1_766
243 	S1_767
244 	S1_764
245 	S1_765
246 	S1_762
247 	S1_763
248 	S1_760
249 	S1_761
250 	S1_768

In [ ]:
matD = np.load('../simulate_sequence/simulated_matrices/matrices.npz')

idxs=[]
pds = [] 
for i,j in matD.iteritems():
    strL= i.split("_")
    idxs += [i]
    pds += [strL[1]]

cols = pd.DataFrame(data=pds,index=idxs,columns=['periodicity'])

copy = df.join(cols)
    
pds={'2':'red','8':'yellow','10':'blue','5':'black'}
colormap=[pds[p] for p in copy['periodicity']]

In [ ]:
annot = pd.read_csv('../simulate_sequence/reference2/periodicities.txt',sep='\t',index_col=0,header=None)
annot.columns=['periodicity']


new = df.join(annot)
pds= {'2':'black','3':'red','8':'blue','2,8':'yellow','2,3':'orange'}
colormap = [pds[p] for p in new['periodicity']]

In [3]:
from sklearn import decomposition
pca = decomposition.PCA()
pcaDf = pca.fit_transform(df)
fig = plt.figure(figsize=(10,10))
scatter(pcaDf[:,0],pcaDf[:,1],s=40,c=colormap)


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-3-9011ba4a7d7f> in <module>()
      3 pcaDf = pca.fit_transform(df)
      4 fig = plt.figure(figsize=(10,10))
----> 5 scatter(pcaDf[:,0],pcaDf[:,1],s=40,c=colormap)

IndexError: index 1 is out of bounds for axis 1 with size 1
<matplotlib.figure.Figure at 0x112fab1d0>

In [ ]:
fig = plt.figure(figsize=(10,10))
ax = Axes3D(fig)
ax.scatter(xs=pcaDf[:,0],ys=pcaDf[:,1],zs=pcaDf[:,2],c=colormap)
# ax.view_init(elev=0,azim=0)

In [ ]:
print pcaDf.shape
print pca.explained_variance_ratio_.shape

In [ ]:
bar(range(pcaDf.shape[1])[0:20],pca.explained_variance_ratio_[0:20])

In [ ]:
pd.DataFrame(abs(pca.components_),columns=df.columns).plot(kind='bar',stacked=True)

In [ ]:
from scipy.cluster.hierarchy import dendrogram, linkage,leaves_list
Z=linkage(df,'complete')

In [ ]:
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

c, coph_dists = cophenet(Z, pdist(df))
c

In [ ]:
matD = np.load('../simulate_sequence/reference2/matrices/sd_000.npz')
mat1=matD['S1_116']
mat2=matD['S1_382']

plt.figure(figsize=(20,20))
plt.subplot(121)
imshow(mat1,interpolation='none',cmap=cm.gray)
plt.subplot(122)
imshow(mat2,interpolation='none',cmap=cm.gray)

In [ ]:
labels=[df.index[z] for z in leaves_list(Z)]
plt.figure(figsize=(120,20))
ax=dendrogram(Z,leaf_rotation=90.,leaf_font_size=8.,labels=labels)
plt.show()

In [ ]:
nameFn='../simulate_sequence/reference2/periodicities.txt'

nameH=open(nameFn)
nameD={}
for line in nameH:
    lineArr = line.split()
    nameD[lineArr[0]] = lineArr[1]

labels=[nameD[df.index[z]]+'_'+df.index[z] for z in leaves_list(Z)]
plt.figure(figsize=(120,20))
ax=dendrogram(Z,leaf_rotation=90.,leaf_font_size=8.,labels=labels)
plt.show()

In [ ]:
from skimage import feature,morphology,filters,measure
matD = np.load('../simulate_sequence/reference2/matrices/sd_000.npz')

mat=matD['S1_100']
mat2=matD['S1_80']


plt.figure(figsize=(20,10))
subplot(121)
imshow(mat,cmap=cm.gray)
subplot(122)
imshow(mat2,cmap=cm.gray)
print measure.compare_nrmse(mat[:500,:500],mat2[:500,:500])

In [ ]:
M[0]['centroid']