In [1]:
import numpy as np
import pandas as pd
from scipy.signal import argrelmax
import sys
sys.path.append('/usr/local/Cellar/python/2.7.13/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages')
import tensorflow as tf
import tflearn
import tsne
from sklearn.manifold import TSNE
import math
from scipy import cluster
import matplotlib.pyplot as plt
from sklearn.decomposition import ProjectedGradientNMF
%matplotlib inline

np.random.seed(0)
tf.set_random_seed(0)
from sklearn.metrics import classification_report

In [2]:
#Read the CSV file and put it into pandas data frame
df = pd.read_csv('Polymers Plus Analytes_SNV_Reduced_SGSmoothed.csv', index_col=0)

In [3]:
#Take a quick look at the data
df


Out[3]:
200.1318 201.617 203.1022 204.5874 206.0726 207.5577 209.0429 210.5281 212.0133 213.4985 ... 1986.816 1988.301 1989.786 1991.272 1992.757 1994.242 1995.727 1997.212 1998.698 2000.183
A1_C1 1.708752 1.775725 1.861025 1.941629 2.011954 2.046811 2.079291 2.112890 2.114177 2.114298 ... -0.665880 -0.656008 -0.649654 -0.641902 -0.642924 -0.636885 -0.630660 -0.626576 -0.600225 -0.613804
A1_C2 2.004295 2.006743 2.108492 2.173715 2.235133 2.287523 2.328738 2.376845 2.390796 2.400700 ... -0.708420 -0.705648 -0.697500 -0.696426 -0.689204 -0.689529 -0.689332 -0.694238 -0.690780 -0.703149
A1_C3 1.641548 1.719635 1.727123 1.780289 1.824746 1.873068 1.920712 1.953743 1.972046 1.972984 ... -0.440724 -0.441731 -0.436140 -0.433233 -0.429963 -0.419810 -0.415419 -0.413493 -0.422153 -0.422688
A1_Gly1 1.659481 1.716239 1.784938 1.847613 1.904956 1.948734 1.974711 2.006418 2.017988 2.022764 ... -0.786990 -0.788303 -0.793475 -0.792389 -0.792823 -0.793748 -0.801304 -0.803293 -0.822496 -0.826604
A1_Gly2 1.167688 1.225181 1.272517 1.320838 1.362455 1.399500 1.423343 1.449302 1.459464 1.450147 ... -1.018188 -1.009754 -1.005993 -1.001317 -1.002929 -1.002543 -1.013830 -1.013618 -1.043767 -1.012335
A1_Gly3 1.667580 1.727449 1.798728 1.832720 1.872614 1.895185 1.913762 1.931283 1.953513 1.951084 ... -1.052663 -1.058615 -1.056988 -1.057154 -1.060322 -1.060348 -1.056159 -1.058370 -1.055079 -1.065565
A1_HC1 1.213005 1.246412 1.294557 1.350900 1.414119 1.480618 1.557076 1.647480 1.727396 1.797848 ... -0.651136 -0.651901 -0.649899 -0.651513 -0.654322 -0.656268 -0.657652 -0.659916 -0.659924 -0.662043
A1_HC2 1.214321 1.250366 1.306477 1.366602 1.439891 1.518616 1.600351 1.688306 1.767251 1.833345 ... -0.669614 -0.669527 -0.670756 -0.671608 -0.673269 -0.674685 -0.675474 -0.675245 -0.672897 -0.681195
A1_HC3 1.144966 1.175299 1.241173 1.298419 1.366901 1.440046 1.517917 1.606259 1.688764 1.761680 ... -0.708548 -0.709161 -0.707377 -0.707572 -0.709099 -0.710288 -0.714087 -0.714551 -0.722282 -0.711487
A1_Ile1 2.080212 2.152591 2.180875 2.231400 2.285243 2.349542 2.394868 2.431752 2.461441 2.454932 ... -0.606310 -0.601787 -0.596811 -0.591758 -0.591364 -0.595460 -0.595208 -0.585713 -0.576605 -0.549335
A1_Ile2 2.497904 2.588654 2.629728 2.693445 2.737390 2.783184 2.818947 2.844160 2.851661 2.842934 ... -0.744714 -0.747692 -0.736922 -0.736229 -0.732720 -0.727792 -0.723185 -0.728081 -0.752789 -0.709134
A1_Ile3 1.843773 1.934465 1.953264 2.011428 2.056509 2.104142 2.151395 2.205697 2.227469 2.226187 ... -0.717272 -0.717951 -0.703682 -0.694608 -0.692060 -0.684847 -0.675930 -0.676181 -0.662299 -0.662334
A1_Phe1 1.213001 1.227603 1.308443 1.364954 1.419122 1.456064 1.493865 1.526027 1.546854 1.554699 ... -0.768946 -0.765989 -0.764601 -0.764101 -0.765451 -0.766235 -0.766378 -0.766339 -0.773450 -0.766635
A1_Phe2 1.428163 1.459969 1.539671 1.598359 1.651830 1.699517 1.735192 1.757098 1.778582 1.799640 ... -0.668539 -0.673312 -0.674053 -0.676834 -0.674994 -0.668920 -0.666232 -0.658654 -0.661206 -0.650613
A1_Phe3 1.563812 1.577121 1.645150 1.701708 1.759864 1.807987 1.856362 1.896501 1.917010 1.925464 ... -0.694087 -0.698959 -0.692347 -0.691738 -0.688455 -0.685409 -0.678736 -0.677610 -0.676467 -0.668454
A1_PP11 1.867184 1.822545 1.852295 1.855265 1.855054 1.881746 1.866032 1.863640 1.842072 1.836790 ... -0.915777 -0.900929 -0.890800 -0.894191 -0.898905 -0.905990 -0.923949 -0.930902 -0.939776 -0.921416
A1_PP12 2.822372 2.800852 2.812671 2.812050 2.812125 2.803801 2.804097 2.784388 2.759950 2.743271 ... -0.707583 -0.694670 -0.705081 -0.713193 -0.725592 -0.711798 -0.716706 -0.706635 -0.697382 -0.701405
A1_PP13 1.749527 1.770681 1.784955 1.804455 1.810886 1.809557 1.794373 1.792642 1.775503 1.767651 ... -0.957337 -0.947072 -0.937875 -0.924050 -0.921651 -0.920061 -0.924609 -0.928760 -0.946713 -0.932987
A1_SA1 1.417198 1.530497 1.589156 1.663126 1.706146 1.741557 1.774377 1.796937 1.811870 1.833364 ... -0.838260 -0.836762 -0.843309 -0.847092 -0.852731 -0.858477 -0.857943 -0.850889 -0.838261 -0.840517
A1_SA2 1.489096 1.634001 1.629703 1.687616 1.722893 1.766239 1.819158 1.856762 1.853006 1.853736 ... -0.859056 -0.863588 -0.861000 -0.855377 -0.858290 -0.855009 -0.853463 -0.858522 -0.863943 -0.881616
A1_SA3 1.688645 1.737642 1.822135 1.876374 1.927970 1.961163 1.995483 2.010958 2.028715 2.028709 ... -0.724410 -0.732390 -0.730798 -0.729488 -0.726129 -0.724036 -0.715908 -0.717865 -0.717477 -0.717128
A2_C1 0.935472 0.935841 0.908567 0.899299 0.888060 0.881056 0.877037 0.883940 0.879433 0.875926 ... -0.382418 -0.378680 -0.372909 -0.369781 -0.371126 -0.369199 -0.364311 -0.361594 -0.352687 -0.351891
A2_C2 1.158512 1.144798 1.121715 1.105654 1.093350 1.085824 1.081947 1.089141 1.091322 1.092816 ... -0.377883 -0.377993 -0.372845 -0.368990 -0.366301 -0.364235 -0.357963 -0.357411 -0.346926 -0.355019
A2_C3 0.985111 0.967353 0.945907 0.932657 0.918858 0.910982 0.903130 0.905938 0.904751 0.906540 ... -0.368778 -0.366620 -0.361434 -0.358179 -0.355562 -0.352399 -0.350112 -0.350852 -0.348430 -0.350412
A2_Gly1 -0.653039 -0.644274 -0.660734 -0.675299 -0.684295 -0.693129 -0.699082 -0.705805 -0.700840 -0.701705 ... -0.940429 -0.947496 -0.948869 -0.950410 -0.954629 -0.956604 -0.955229 -0.962084 -0.954225 -0.977703
A2_Gly2 0.071428 -0.001457 0.005733 -0.022198 -0.035514 -0.053115 -0.075821 -0.082115 -0.089870 -0.093132 ... -1.079406 -1.074151 -1.077680 -1.076815 -1.076246 -1.086893 -1.095661 -1.099941 -1.089136 -1.103185
A2_Gly3 -0.551143 -0.530746 -0.561035 -0.564277 -0.572445 -0.573582 -0.572611 -0.567570 -0.572997 -0.576791 ... -1.083314 -1.088158 -1.086930 -1.091265 -1.093413 -1.101263 -1.102511 -1.105650 -1.104230 -1.109212
A2_HC1 0.987652 0.971882 0.971590 0.959360 0.952873 0.946721 0.936687 0.939264 0.954750 0.955870 ... -1.932934 -1.937400 -1.947508 -1.952767 -1.959790 -1.965035 -1.972859 -1.974715 -1.988982 -1.978283
A2_HC2 0.998670 0.963005 0.968491 0.954208 0.949328 0.940720 0.930799 0.936607 0.948770 0.947992 ... -2.105510 -2.112495 -2.121931 -2.127573 -2.132590 -2.140614 -2.148865 -2.152174 -2.163699 -2.157480
A2_HC3 0.992058 0.935908 0.947432 0.931229 0.928147 0.917086 0.907468 0.916603 0.925011 0.922230 ... -2.250275 -2.259809 -2.268358 -2.274303 -2.277097 -2.287926 -2.296480 -2.301272 -2.309672 -2.308327
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
PMM_Phe1 0.637314 0.640142 0.624634 0.619588 0.615252 0.613673 0.613741 0.611704 0.613042 0.611226 ... -0.706864 -0.705516 -0.705839 -0.705287 -0.704553 -0.704950 -0.705582 -0.705212 -0.707699 -0.703349
PMM_Phe2 0.584885 0.592174 0.583540 0.580178 0.576687 0.568505 0.563385 0.566076 0.564192 0.562503 ... -0.709305 -0.710465 -0.710345 -0.709501 -0.709551 -0.708851 -0.706896 -0.706787 -0.704610 -0.709909
PMM_Phe3 0.905588 0.905742 0.897316 0.891288 0.885952 0.881950 0.877373 0.872003 0.870598 0.865764 ... -0.698644 -0.698240 -0.698781 -0.697496 -0.696784 -0.696348 -0.696175 -0.693508 -0.695660 -0.690233
PMM_PP11 0.815977 0.817445 0.805127 0.798517 0.795372 0.794588 0.789264 0.785491 0.783747 0.776654 ... -0.704295 -0.701602 -0.706570 -0.707906 -0.703058 -0.704698 -0.707306 -0.699635 -0.709885 -0.687870
PMM_PP12 0.517483 0.537008 0.556099 0.570079 0.572403 0.570857 0.563720 0.563614 0.559505 0.562267 ... -0.741982 -0.740173 -0.736041 -0.734968 -0.732043 -0.725712 -0.724010 -0.723341 -0.721369 -0.720603
PMM_PP13 0.587443 0.592637 0.595460 0.598090 0.599156 0.597968 0.594729 0.594277 0.591078 0.586304 ... -0.764151 -0.762880 -0.762638 -0.763273 -0.760839 -0.760261 -0.760703 -0.757790 -0.760787 -0.750308
PMM_SA1 1.047520 1.051150 1.042282 1.035403 1.029662 1.028837 1.023426 1.016776 1.012471 1.005003 ... -0.702037 -0.701128 -0.701802 -0.700713 -0.700323 -0.698723 -0.698580 -0.697397 -0.699710 -0.695959
PMM_SA2 1.021809 1.023659 1.011171 1.003713 0.996024 0.987316 0.982904 0.979772 0.977410 0.971076 ... -0.709567 -0.707675 -0.706016 -0.705607 -0.703106 -0.701051 -0.701089 -0.699318 -0.700454 -0.693500
PMM_SA3 1.052146 1.051640 1.045874 1.039358 1.031724 1.020829 1.017340 1.011720 1.005107 0.998165 ... -0.698674 -0.698771 -0.699094 -0.699773 -0.698065 -0.697214 -0.693891 -0.693740 -0.685805 -0.698983
PVA_C1 1.242274 1.249807 1.175229 1.137997 1.094611 1.073810 1.050117 1.046437 1.029770 1.009965 ... -1.201391 -1.201627 -1.197553 -1.198377 -1.198825 -1.197255 -1.185654 -1.192506 -1.156314 -1.224955
PVA_C2 1.235631 1.209775 1.134774 1.092499 1.054167 1.022297 0.991339 0.970859 0.966613 0.942636 ... -1.184168 -1.172218 -1.174408 -1.174938 -1.164382 -1.154692 -1.160359 -1.157490 -1.175720 -1.149859
PVA_C3 0.963854 0.924616 0.883383 0.846688 0.802584 0.767088 0.739365 0.728153 0.698506 0.691280 ... -1.127157 -1.128212 -1.113738 -1.109259 -1.114493 -1.108826 -1.111572 -1.116705 -1.134336 -1.131070
PVA_Gly1 -0.607385 -0.620992 -0.612369 -0.613153 -0.612634 -0.616694 -0.618716 -0.617665 -0.619355 -0.621616 ... -1.006219 -1.007851 -1.013633 -1.016920 -1.020334 -1.022367 -1.023174 -1.022556 -1.019726 -1.024835
PVA_Gly2 -0.601814 -0.595968 -0.604376 -0.602021 -0.609891 -0.620202 -0.617790 -0.622033 -0.630063 -0.630754 ... -1.003927 -1.004456 -1.007720 -1.010002 -1.014490 -1.016692 -1.021017 -1.023066 -1.023837 -1.027022
PVA_Gly3 -0.452498 -0.462984 -0.473169 -0.482157 -0.485942 -0.489088 -0.489010 -0.486775 -0.482844 -0.482932 ... -0.995383 -0.997011 -0.994097 -0.997257 -0.996517 -0.997953 -0.999410 -1.004793 -1.006402 -1.006947
PVA_HC1 1.464512 1.434199 1.408893 1.380138 1.343957 1.295286 1.248822 1.233347 1.187138 1.151964 ... -0.991746 -0.988324 -0.976171 -0.976306 -0.972443 -0.980980 -0.980983 -0.994746 -0.991319 -1.021694
PVA_HC2 1.348945 1.318255 1.291623 1.270819 1.247594 1.206341 1.163339 1.152631 1.111569 1.060652 ... -1.045845 -1.036696 -1.043121 -1.036662 -1.036230 -1.039325 -1.034483 -1.035646 -1.003085 -1.039469
PVA_HC3 1.885064 1.833207 1.847309 1.812597 1.794618 1.753924 1.712049 1.695128 1.673506 1.627103 ... -0.836395 -0.842685 -0.835892 -0.839715 -0.850595 -0.875873 -0.871877 -0.881899 -0.855438 -0.890074
PVA_Ile1 -0.375345 -0.375846 -0.384960 -0.387568 -0.395991 -0.397288 -0.401640 -0.403880 -0.411983 -0.417794 ... -1.136829 -1.145010 -1.145609 -1.144196 -1.144597 -1.149959 -1.142506 -1.138764 -1.127280 -1.115665
PVA_Ile2 -0.314365 -0.341121 -0.333439 -0.338621 -0.337549 -0.332423 -0.332970 -0.338416 -0.348358 -0.362183 ... -1.102612 -1.110711 -1.105837 -1.111824 -1.116344 -1.118632 -1.118101 -1.128548 -1.122982 -1.141953
PVA_Ile3 -0.385842 -0.350116 -0.364217 -0.362592 -0.371954 -0.374341 -0.375951 -0.385336 -0.392176 -0.396795 ... -1.122817 -1.116168 -1.118268 -1.127309 -1.125500 -1.124147 -1.129069 -1.131169 -1.133502 -1.141346
PVA_Phe1 -0.272667 -0.269867 -0.264444 -0.229406 -0.180450 -0.148743 -0.142760 -0.124634 -0.136315 -0.150088 ... -1.416184 -1.411030 -1.397035 -1.382608 -1.389395 -1.378877 -1.355233 -1.312524 -1.271868 -1.194295
PVA_Phe2 2.363550 2.207461 2.317818 2.288079 2.271639 2.225172 2.195966 2.159052 2.140546 2.121082 ... -1.298287 -1.302106 -1.303517 -1.298750 -1.297036 -1.298075 -1.294786 -1.288387 -1.274496 -1.307482
PVA_Phe3 0.273473 0.234172 0.247412 0.246188 0.284616 0.301562 0.289314 0.297310 0.296798 0.244532 ... -1.225612 -1.213098 -1.215280 -1.214623 -1.194341 -1.175907 -1.180645 -1.169926 -1.157723 -1.193575
PVA_PP11 2.238753 2.080170 2.117835 2.103226 2.100747 2.073809 2.061385 2.065248 2.074209 2.062846 ... -0.492153 -0.486639 -0.463717 -0.457328 -0.435389 -0.430261 -0.407883 -0.400925 -0.382799 -0.398639
PVA_PP12 2.218166 2.295647 2.221981 2.226949 2.215553 2.228912 2.246386 2.257493 2.266332 2.258836 ... -0.527981 -0.508188 -0.489538 -0.478636 -0.486346 -0.494609 -0.484663 -0.458805 -0.383655 -0.350781
PVA_PP13 2.471706 2.403227 2.361165 2.345810 2.328945 2.311124 2.309179 2.320247 2.313693 2.298704 ... -0.436659 -0.440341 -0.434676 -0.447300 -0.454170 -0.460779 -0.461049 -0.456171 -0.434816 -0.418187
PVA_SA1 1.323295 0.683266 0.284972 -0.128811 -0.375680 -0.639727 -0.651162 -0.662526 -0.647574 -0.727282 ... -1.074545 -0.983329 -1.001665 -0.963746 -1.005820 -0.836067 -0.721222 -0.629321 -0.217148 -0.806796
PVA_SA2 -0.195450 -0.272446 -0.556901 -0.607554 -0.572610 -0.521734 -0.343334 -0.146715 0.067028 0.196839 ... -0.643898 -0.552079 -0.359430 -0.364149 -0.296613 -0.242602 -0.310640 -0.463915 -0.650311 -0.620332
PVA_SA3 0.456641 0.118976 -0.049270 -0.228825 -0.347664 -0.541209 -0.503154 -0.531126 -0.472973 -0.414213 ... -0.771920 -0.708351 -0.739497 -0.771137 -0.782259 -0.676197 -0.595573 -0.555224 -0.198120 -0.696176

270 rows × 1213 columns


In [5]:
# How many points on each side to use for the comparison to consider comparator(n, n+x) to be True.
neighborhood = 5
for neighborhood in range(1,20):
    local_maxima = []
    for i in range(df.shape[0]):
        row = np.array(df.iloc[[i]])
        a = argrelmax(row[0], order = neighborhood)
        local_maxima.append(len(a[0]))
    print("The average of local maxima with neighborsize of: %d is %.3f"%(neighborhood,np.mean(local_maxima)))


The average of local maxima with neighborsize of: 1 is 106.826
The average of local maxima with neighborsize of: 2 is 64.559
The average of local maxima with neighborsize of: 3 is 50.033
The average of local maxima with neighborsize of: 4 is 44.026
The average of local maxima with neighborsize of: 5 is 40.259
The average of local maxima with neighborsize of: 6 is 35.244
The average of local maxima with neighborsize of: 7 is 31.648
The average of local maxima with neighborsize of: 8 is 29.152
The average of local maxima with neighborsize of: 9 is 27.030
The average of local maxima with neighborsize of: 10 is 25.333
The average of local maxima with neighborsize of: 11 is 23.830
The average of local maxima with neighborsize of: 12 is 22.678
The average of local maxima with neighborsize of: 13 is 21.693
The average of local maxima with neighborsize of: 14 is 20.815
The average of local maxima with neighborsize of: 15 is 19.967
The average of local maxima with neighborsize of: 16 is 19.163
The average of local maxima with neighborsize of: 17 is 18.467
The average of local maxima with neighborsize of: 18 is 17.881
The average of local maxima with neighborsize of: 19 is 17.285

In [5]:
def xavier_init(fan_in, fan_out, constant=1): 
    """ Xavier initialization of network weights"""
    # https://stackoverflow.com/questions/33640581/how-to-do-xavier-initialization-on-tensorflow
    low = -constant*np.sqrt(6.0/(fan_in + fan_out)) 
    high = constant*np.sqrt(6.0/(fan_in + fan_out))
    return tf.random_uniform((fan_in, fan_out), 
                             minval=low, maxval=high, 
                             dtype=tf.float32)

In [ ]:


In [6]:
data_set = []
for i in range(df.shape[0]):
    row = np.array(df.iloc[[i]])
    data_set.append(row[0])

In [7]:
#Number of neurons in each layer
layer1_neurons = int(math.pow(2,10))
layer2_neurons = int(math.pow(2,8))
layer3_neurons = int(math.pow(2,4))
layer4_neurons = int(math.pow(2,1))

print("Layer1: %d, Layer2: %d, Layer3: %d, Layer4: %d"%(layer1_neurons,layer2_neurons,layer3_neurons,layer4_neurons))


Layer1: 1024, Layer2: 256, Layer3: 16, Layer4: 2

In [ ]:
#Initializing the DNN
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.8)
# Building the encoder
encoder = tflearn.input_data(shape=[None, len(data_set[0])])
INPUT = encoder
encoder = tflearn.fully_connected(encoder, layer1_neurons, name = "en_1")
encoder = tflearn.fully_connected(encoder, layer2_neurons, name = "en_2")
encoder = tflearn.fully_connected(encoder, layer3_neurons, name = "en_3")
encoder = tflearn.fully_connected(encoder, layer4_neurons, name = "en_4")
HIDDEN_STATE = encoder
# Building the decoder
decoder = tflearn.fully_connected(encoder, layer3_neurons, name = "de_1")
decoder = tflearn.fully_connected(encoder, layer2_neurons, name = "de_2")
decoder = tflearn.fully_connected(decoder, layer1_neurons, name = "de_3")
decoder = tflearn.fully_connected(decoder, len(data_set[0]), name = "de_4")
OUTPUT = decoder    


# Regression, with mean square error
net = tflearn.regression(decoder, optimizer='AdaDelta', learning_rate=0.001,loss='mean_square', metric=None)

# Training the auto encoder
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(data_set, data_set, n_epoch=10,validation_set=0.1, run_id="auto_encoder", batch_size=10)

#Encode an input X and return the middle layer of the AE
def encode (X):    
    if len (X.shape) < 2:
        X = X.reshape (1, -1)

    tflearn.is_training(False, model.session)
    res = model.session.run (HIDDEN_STATE, feed_dict={INPUT.name:X})    
    return res    

encode_decode = model.predict(data_set)
result_autoencoder = []
for i in range(len(encode_decode)):
    result_autoencoder.append(encode(i))

In [ ]:
# network parameters
input_dim = len(data_set[0]) # height data input
encoder_hidden_dim = 128
decoder_hidden_dim = 128
latent_dim = 2
batch_size = 10
n_epoch = 10

# paths
TENSORBOARD_DIR='experiment/'
CHECKPOINT_PATH='out_models/'

# encoder
def encode(input_x):
    encoder = tflearn.fully_connected(input_x, encoder_hidden_dim, activation='relu')
    mu_encoder = tflearn.fully_connected(encoder, latent_dim, activation='linear')
    logvar_encoder = tflearn.fully_connected(encoder, latent_dim, activation='linear')
    return mu_encoder, logvar_encoder

# decoder
def decode(z):
    decoder = tflearn.fully_connected(z, decoder_hidden_dim, activation='relu', restore=False)
    x_hat = tflearn.fully_connected(decoder, input_dim, activation='linear', restore=False)
    return x_hat

# sampler
def sample(mu, logvar):
    epsilon = tf.random_normal(tf.shape(logvar), dtype=tf.float32, name='epsilon')
    std_encoder = tf.exp(tf.multiply(0.5, logvar))
    z = tf.add(mu, tf.multiply(std_encoder, epsilon))
    return z

# loss function(regularization)
def calculate_regularization_loss(mu, logvar):
    kl_divergence = -0.5 * tf.reduce_sum(1 + logvar - tf.square(mu) - tf.exp(logvar), reduction_indices=1)
    return kl_divergence

# loss function(reconstruction)
def calculate_reconstruction_loss(x_hat, input_x):
    mse = tflearn.objectives.mean_square(x_hat, input_x)
    return mse

# trainer
def define_trainer(target, optimizer):
    trainop = tflearn.TrainOp(loss=target,
                              optimizer=optimizer,
                              batch_size=batch_size,
                              metric=None,
                              name='vae_trainer')

    trainer = tflearn.Trainer(train_ops=trainop,
                              tensorboard_dir=TENSORBOARD_DIR,
                              tensorboard_verbose=3,
                              checkpoint_path=CHECKPOINT_PATH,
                              max_checkpoints=1)
    return trainer



def define_evaluator(trainer, mu, logvar):
    evaluator = tflearn.Evaluator([mu, logvar], session=trainer.session)
    return evaluator

input_x = tflearn.input_data(shape=(None, len(data_set[0])), name='input_x')
mu, logvar = encode(input_x)
z = sample(mu, logvar)
x_hat = decode(z)


regularization_loss = calculate_regularization_loss(mu, logvar)
reconstruction_loss = calculate_reconstruction_loss(x_hat, input_x)

target = tf.reduce_mean(tf.add(regularization_loss, reconstruction_loss))

optimizer = tflearn.optimizers.AdaDelta()
optimizer = optimizer.get_tensor()

trainer = define_trainer(target, optimizer)

trainer.fit(feed_dicts={input_x: data_set}, val_feed_dicts={input_x: data_set},
            n_epoch=n_epoch,
            show_metric=False,
            snapshot_epoch=False,
            shuffle_all=True,
            run_id='VAE')

In [8]:
model = TSNE(n_components=2, random_state=0)

In [9]:
output = model.fit_transform(data_set)

In [10]:
col = df[df.columns[0]]

In [11]:
col = df.iloc[:,0]

In [7]:
df.head()


Out[7]:
200.1318 201.617 203.1022 204.5874 206.0726 207.5577 209.0429 210.5281 212.0133 213.4985 ... 1986.816 1988.301 1989.786 1991.272 1992.757 1994.242 1995.727 1997.212 1998.698 2000.183
A1_C1 1.708752 1.775725 1.861025 1.941629 2.011954 2.046811 2.079291 2.112890 2.114177 2.114298 ... -0.665880 -0.656008 -0.649654 -0.641902 -0.642924 -0.636885 -0.630660 -0.626576 -0.600225 -0.613804
A1_C2 2.004295 2.006743 2.108492 2.173715 2.235133 2.287523 2.328738 2.376845 2.390796 2.400700 ... -0.708420 -0.705648 -0.697500 -0.696426 -0.689204 -0.689529 -0.689332 -0.694238 -0.690780 -0.703149
A1_C3 1.641548 1.719635 1.727123 1.780289 1.824746 1.873068 1.920712 1.953743 1.972046 1.972984 ... -0.440724 -0.441731 -0.436140 -0.433233 -0.429963 -0.419810 -0.415419 -0.413493 -0.422153 -0.422688
A1_Gly1 1.659481 1.716239 1.784938 1.847613 1.904956 1.948734 1.974711 2.006418 2.017988 2.022764 ... -0.786990 -0.788303 -0.793475 -0.792389 -0.792823 -0.793748 -0.801304 -0.803293 -0.822496 -0.826604
A1_Gly2 1.167688 1.225181 1.272517 1.320838 1.362455 1.399500 1.423343 1.449302 1.459464 1.450147 ... -1.018188 -1.009754 -1.005993 -1.001317 -1.002929 -1.002543 -1.013830 -1.013618 -1.043767 -1.012335

5 rows × 1213 columns


In [13]:
tsne_result = TSNE(n_components=2, perplexity=100, verbose=2).fit_transform(data_set)


[t-SNE] Computing pairwise distances...
[t-SNE] Computing 269 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 270 / 270
[t-SNE] Mean sigma: 12.659892
[t-SNE] Iteration 25: error = 0.2273141, gradient norm = 0.0176430
[t-SNE] Iteration 50: error = 0.3000943, gradient norm = 0.0174180
[t-SNE] Iteration 75: error = 0.2897153, gradient norm = 0.0172545
[t-SNE] Iteration 100: error = 0.6682754, gradient norm = 0.0209463
[t-SNE] KL divergence after 100 iterations with early exaggeration: 0.668275
[t-SNE] Iteration 125: error = 0.9235449, gradient norm = 0.0275240
[t-SNE] Iteration 150: error = 1.5640441, gradient norm = 0.0432017
[t-SNE] Iteration 175: error = 1.5916212, gradient norm = 0.0246210
[t-SNE] Iteration 175: did not make any progress during the last 30 episodes. Finished.
[t-SNE] Error after 175 iterations: 0.668275

In [ ]:


In [8]:
#Get the labels for names
df_1 = pd.read_csv('Polymers Plus Analytes_SNV_Reduced_SGSmoothed.csv')
names = df_1.iloc[:,0]
labels = []
for name in names:
    labels.append(name)

In [9]:
colors = {}
colors['A1'] = 'red'
colors['A2'] = 'blue'
colors['A3'] = 'green'
colors['A4'] = 'orange'
colors['C1'] = 'lawngreen'
colors['C2'] = 'khaki'
colors['C3'] = 'darkviolet'
colors['C4'] = 'indigo'
colors['PC'] = 'cyan'
colors['PE'] = 'teal'
colors['PI'] = 'salmon'
colors['PM'] = 'darkcyan'
colors['PV'] = 'seagreen'
label_color = []
for i in range(df.shape[0]):
    label_color.append(colors[labels[i][:2]])

In [16]:
# plot the result
vis_x = tsne_result[:, 0]
vis_y = tsne_result[:, 1]
for i in range(df.shape[0]):
    plt.scatter(vis_x[i], vis_y[i], c = label_color[i])



In [10]:
min_overall = 0
for i in range(df.shape[0]):
    min_overall = min(min(data_set[i]) , min_overall)
min_overall *= -1
data_set_pos = []
for i in range(df.shape[0]):
    data_set_pos.append(data_set[i] + min_overall)

In [70]:
num_component = 7
nmf_model = ProjectedGradientNMF(n_components = num_component, init='random', random_state=0)
W = nmf_model.fit_transform(data_set_pos[22:42]);
H = nmf_model.components_;


/Library/Python/2.7/site-packages/sklearn/utils/deprecation.py:52: DeprecationWarning: Class ProjectedGradientNMF is deprecated; It will be removed in release 0.19. Use NMF instead.'pg' solver is still available until release 0.19.
  warnings.warn(msg, category=DeprecationWarning)
/Library/Python/2.7/site-packages/sklearn/decomposition/nmf.py:775: DeprecationWarning: 'pg' solver will be removed in release 0.19. Use 'cd' solver instead.
  " Use 'cd' solver instead.", DeprecationWarning)

In [74]:
clusters = [[] for _ in range(num_component)]
for i in range(len(W)):
    cluster_ = np.argmax(W[i])
    clusters[cluster_].append(labels[i + 21])

In [75]:
clusters


Out[75]:
[['A2_Phe3', 'A2_PP11', 'A2_PP12'],
 ['A2_C3', 'A2_Gly1', 'A2_Gly2', 'A2_PP13', 'A2_SA1', 'A2_SA2'],
 [],
 ['A2_ILe3', 'A2_Phe1', 'A2_Phe2'],
 ['A2_Gly3', 'A2_HC1', 'A2_HC2'],
 [],
 ['A2_C1', 'A2_C2', 'A2_HC3', 'A2_ILe1', 'A2_ILe2']]

In [11]:
labels[21:42]


Out[11]:
['A2_C1',
 'A2_C2',
 'A2_C3',
 'A2_Gly1',
 'A2_Gly2',
 'A2_Gly3',
 'A2_HC1',
 'A2_HC2',
 'A2_HC3',
 'A2_ILe1',
 'A2_ILe2',
 'A2_ILe3',
 'A2_Phe1',
 'A2_Phe2',
 'A2_Phe3',
 'A2_PP11',
 'A2_PP12',
 'A2_PP13',
 'A2_SA1',
 'A2_SA2',
 'A2_SA3']

In [ ]:


In [ ]:


In [12]:
colors_analyte = {}
colors_analyte['C'] = 'red'
colors_analyte['G'] = 'blue'
colors_analyte['HC'] = 'green'
colors_analyte['Il'] = 'orange'
colors_analyte['Ph'] = 'lawngreen'
colors_analyte['PP'] = 'khaki'
colors_analyte['SA'] = 'darkviolet'
label_color_analyte = []
for i in range(21):
    if labels[i][3:4] == 'C' or labels[i][3:4] == 'G':
        label_color_analyte.append(colors_analyte[labels[i][3:4]])
    else:
        label_color_analyte.append(colors_analyte[labels[i][3:5]])

In [46]:
tsne_result = TSNE(n_components=7, perplexity=50, verbose=2).fit_transform(data_set[:21])
# plot the result
vis_x = tsne_result[:21, 0]
vis_y = tsne_result[:21, 1]
for i in range(21):
    plt.scatter(vis_x[i], vis_y[i], c = label_color_analyte[i])


[t-SNE] Computing pairwise distances...
[t-SNE] Computing 20 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 21 / 21
[t-SNE] Mean sigma: 1125899906842624.000000
[t-SNE] Iteration 25: error = 6.0708952, gradient norm = 0.0000000
[t-SNE] Iteration 25: gradient norm 0.000000. Finished.
[t-SNE] Iteration 50: error = -2.8860505, gradient norm = 0.0000030
[t-SNE] Iteration 50: gradient norm 0.000003. Finished.
[t-SNE] KL divergence after 50 iterations with early exaggeration: -2.886050
[t-SNE] Iteration 75: error = -2.9841032, gradient norm = 0.0000026
[t-SNE] Iteration 75: gradient norm 0.000003. Finished.
[t-SNE] Error after 75 iterations: -2.886050

In [47]:
initial = [cluster.vq.kmeans(data_set[0:21],i) for i in range(1,10)]
plt.plot([var for (cent,var) in initial])
plt.show()



In [51]:
cent, var = initial[2]
#use vq() to get as assignment for each obs.
assignment,cdist = cluster.vq.vq(data_set[:21],cent)
plt.scatter(vis_x, vis_y, c=assignment)
plt.show()



In [13]:
def bucketing(numbers, num_bucket):
    result = [0] * num_bucket
    length = len(numbers) / num_bucket
    for i in range(num_bucket):
        begin = i * length
        end = (i + 1) * length
        result[i] = max(numbers[begin:end])
    return result

In [14]:
#Bucketing the data set into 30 buckets
data_set_new = []
for i in range(df.shape[0]):
    data_set_new.append(bucketing(data_set[i], 20))

In [15]:
group = 1
begin_group =  group*21
end_group = (group + 1) * 21
print(begin_group, end_group)


(21, 42)

In [16]:
initial = [cluster.vq.kmeans(data_set_new[begin_group:end_group],i) for i in range(1,10)]
plt.plot([var for (cent,var) in initial])
plt.title("Group: %d"%(group+1))
plt.xlabel('Number of clusters')
plt.ylabel('Average within cluster sum of squares')
plt.show()



In [17]:
num_clusters = 3
cent, var = initial[num_clusters-1]
#use vq() to get as assignment for each obs.
assignment,cdist = cluster.vq.vq(data_set_new[begin_group:end_group],cent)
#plt.scatter(vis_x, vis_y, c=assignment)
#plt.show()

In [18]:
clusters_kmeans = [[] for _ in range(num_clusters)]
for i in range(len(assignment)):
    clusters_kmeans[ assignment[i] ].append(labels[i + (group*21)])

In [19]:
clusters_kmeans


Out[19]:
[['A2_C1',
  'A2_C2',
  'A2_C3',
  'A2_Gly1',
  'A2_Gly2',
  'A2_Gly3',
  'A2_Phe1',
  'A2_Phe2',
  'A2_Phe3',
  'A2_PP11',
  'A2_PP12',
  'A2_PP13',
  'A2_SA1',
  'A2_SA2',
  'A2_SA3'],
 ['A2_HC1', 'A2_HC2', 'A2_HC3'],
 ['A2_ILe1', 'A2_ILe2', 'A2_ILe3']]

In [20]:
num_clusters_list = [3,3,2,3,5,3,4,4,2,2,3,2,3]

In [21]:
kmeans_file = open('k-means_fixed_3.txt', 'w')

for group in range(13):
    begin_group =  group*21
    end_group = (group + 1) * 21
    if group > 3:
        end_group -= 3
    if group > 4:
        begin_group -= 3
        
    print(begin_group, end_group, labels[begin_group:end_group])
    
    initial = [cluster.vq.kmeans(data_set_new[begin_group:end_group],i) for i in range(1,10)]
    plt.plot([var for (cent,var) in initial])
    plt.title("Group: %d"%(group+1))
    plt.xlabel('Number of clusters')
    plt.ylabel('Average within cluster sum of squares')

    plt.show()
    
    #num_clusters = num_clusters_list[group]
    num_clusters = 3
    cent, var = initial[num_clusters-1]
    assignment,cdist = cluster.vq.vq(data_set_new[begin_group:end_group],cent)

    clusters_kmeans = [[] for _ in range(num_clusters)]
    for i in range(len(assignment)):
        clusters_kmeans[ assignment[i] ].append(labels[i + (begin_group)])
    
    kmeans_file.writelines('Group: %d\n'%group)
    for i in range(num_clusters):
        kmeans_file.write("Cluster %d: %s "%(i+1, " , ".join(clusters_kmeans[i])))
        kmeans_file.write("\n")
kmeans_file.close()


(0, 21, ['A1_C1', 'A1_C2', 'A1_C3', 'A1_Gly1', 'A1_Gly2', 'A1_Gly3', 'A1_HC1', 'A1_HC2', 'A1_HC3', 'A1_Ile1', 'A1_Ile2', 'A1_Ile3', 'A1_Phe1', 'A1_Phe2', 'A1_Phe3', 'A1_PP11', 'A1_PP12', 'A1_PP13', 'A1_SA1', 'A1_SA2', 'A1_SA3'])
(21, 42, ['A2_C1', 'A2_C2', 'A2_C3', 'A2_Gly1', 'A2_Gly2', 'A2_Gly3', 'A2_HC1', 'A2_HC2', 'A2_HC3', 'A2_ILe1', 'A2_ILe2', 'A2_ILe3', 'A2_Phe1', 'A2_Phe2', 'A2_Phe3', 'A2_PP11', 'A2_PP12', 'A2_PP13', 'A2_SA1', 'A2_SA2', 'A2_SA3'])
(42, 63, ['A3_C1', 'A3_C2', 'A3_C3', 'A3_Gly1', 'A3_Gly2', 'A3_Gly3', 'A3_HC1', 'A3_HC2', 'A3_HC3', 'A3_ILe1', 'A3_ILe2', 'A3_ILe3', 'A3_Phe1', 'A3_Phe2', 'A3_Phe3', 'A3_PP11', 'A3_PP12', 'A3_PP13', 'A3_SA1', 'A3_SA2', 'A3_SA3'])
(63, 84, ['A4_C1', 'A4_C2', 'A4_C3', 'A4_Gly1', 'A4_Gly2', 'A4_Gly3', 'A4_HC1', 'A4_HC2', 'A4_HC3', 'A4_Ile1', 'A4_Ile2', 'A4_Ile3', 'A4_Phe1', 'A4_Phe2', 'A4_Phe3', 'A4_PP11', 'A4_PP12', 'A4_PP13', 'A4_SA1', 'A4_SA2', 'A4_SA3'])
(84, 102, ['C1_C1', 'C1_C2', 'C1_C3', 'C1_Gly1', 'C1_Gly2', 'C1_Gly3', 'C1_HC1', 'C1_HC2', 'C1_HC3', 'C1_Ile1', 'C1_Ile2', 'C1_Ile3', 'C1_PP11', 'C1_PP12', 'C1_PP13', 'C1_SA1', 'C1_SA2', 'C1_SA3'])
(102, 123, ['C2_C1', 'C2_C2', 'C2_C3', 'C2_Gly1', 'C2_Gly2', 'C2_Gly3', 'C2_HC1', 'C2_HC2', 'C2_HC3', 'C2_Ile1', 'C2_Ile2', 'C2_Ile3', 'C2_Phe1', 'C2_Phe2', 'C2_Phe3', 'C2_PP11', 'C2_PP12', 'C2_PP13', 'C2_SA1', 'C2_SA2', 'C2_SA3'])
(123, 144, ['C3_C1', 'C3_C2', 'C3_C3', 'C3_Gly1', 'C3_Gly2', 'C3_Gly3', 'C3_HC1', 'C3_HC2', 'C3_HC3', 'C3_Ile1', 'C3_Ile2', 'C3_Ile3', 'C3_Phe1', 'C3_Phe2', 'C3_Phe3', 'C3_PP11', 'C3_PP12', 'C3_PP13', 'C3_SA1', 'C3_SA2', 'C3_SA3'])
(144, 165, ['C4_C1', 'C4_C2', 'C4_C3', 'C4_Gly1', 'C4_Gly2', 'C4_Gly3', 'C4_HC1', 'C4_HC2', 'C4_HC3', 'C4_Ile1', 'C4_Ile2', 'C4_Ile3', 'C4_Phe1', 'C4_Phe2', 'C4_Phe3', 'C4_PP11', 'C4_PP12', 'C4_PP13', 'C4_SA1', 'C4_SA2', 'C4_SA3'])
(165, 186, ['PC_C1', 'PC_C2', 'PC_C3', 'PC_Gly1', 'PC_Gly2', 'PC_Gly3', 'PC_HC1', 'PC_HC2', 'PC_HC3', 'PC_Ile1', 'PC_Ile2', 'PC_Ile3', 'PC_Phe1', 'PC_Phe2', 'PC_Phe3', 'PC_PP11', 'PC_PP12', 'PC_PP13', 'PC_SA1', 'PC_SA2', 'PC_SA3'])
(186, 207, ['PEO_C1', 'PEO_C2', 'PEO_C3', 'PEO_Gly1', 'PEO_Gly2', 'PEO_Gly3', 'PEO_HC1', 'PEO_HC2', 'PEO_HC3', 'PEO_Ile1', 'PEO_Ile2', 'PEO_Ile3', 'PEO_Phe1', 'PEO_Phe2', 'PEO_Phe3', 'PEO_PP11', 'PEO_PP12', 'PEO_PP13', 'PEO_SA1', 'PEO_SA2', 'PEO_SA3'])
(207, 228, ['PI_C1', 'PI_C2', 'PI_C3', 'PI_Gly1', 'PI_Gly2', 'PI_Gly3', 'PI_HC1', 'PI_HC2', 'PI_HC3', 'PI_Ile1', 'PI_Ile2', 'PI_Ile3', 'PI_Phe1', 'PI_Phe2', 'PI_Phe3', 'PI_PP11', 'PI_PP12', 'PI_PP13', 'PI_SA1', 'PI_SA2', 'PI_SA3'])
(228, 249, ['PMM_C1', 'PMM_C2', 'PMM_C3', 'PMM_Gly1', 'PMM_Gly2', 'PMM_Gly3', 'PMM_HC1', 'PMM_HC2', 'PMM_HC3', 'PMM_Ile1', 'PMM_Ile2', 'PMM_Ile3', 'PMM_Phe1', 'PMM_Phe2', 'PMM_Phe3', 'PMM_PP11', 'PMM_PP12', 'PMM_PP13', 'PMM_SA1', 'PMM_SA2', 'PMM_SA3'])
(249, 270, ['PVA_C1', 'PVA_C2', 'PVA_C3', 'PVA_Gly1', 'PVA_Gly2', 'PVA_Gly3', 'PVA_HC1', 'PVA_HC2', 'PVA_HC3', 'PVA_Ile1', 'PVA_Ile2', 'PVA_Ile3', 'PVA_Phe1', 'PVA_Phe2', 'PVA_Phe3', 'PVA_PP11', 'PVA_PP12', 'PVA_PP13', 'PVA_SA1', 'PVA_SA2', 'PVA_SA3'])

In [22]:
for group in range(13):
    begin_group =  group*21
    end_group = (group + 1) * 21
    if group > 3:
        end_group -= 3
    if group > 4:
        begin_group -= 3
        
    print(group + 1, begin_group, end_group, labels[begin_group:end_group])


(1, 0, 21, ['A1_C1', 'A1_C2', 'A1_C3', 'A1_Gly1', 'A1_Gly2', 'A1_Gly3', 'A1_HC1', 'A1_HC2', 'A1_HC3', 'A1_Ile1', 'A1_Ile2', 'A1_Ile3', 'A1_Phe1', 'A1_Phe2', 'A1_Phe3', 'A1_PP11', 'A1_PP12', 'A1_PP13', 'A1_SA1', 'A1_SA2', 'A1_SA3'])
(2, 21, 42, ['A2_C1', 'A2_C2', 'A2_C3', 'A2_Gly1', 'A2_Gly2', 'A2_Gly3', 'A2_HC1', 'A2_HC2', 'A2_HC3', 'A2_ILe1', 'A2_ILe2', 'A2_ILe3', 'A2_Phe1', 'A2_Phe2', 'A2_Phe3', 'A2_PP11', 'A2_PP12', 'A2_PP13', 'A2_SA1', 'A2_SA2', 'A2_SA3'])
(3, 42, 63, ['A3_C1', 'A3_C2', 'A3_C3', 'A3_Gly1', 'A3_Gly2', 'A3_Gly3', 'A3_HC1', 'A3_HC2', 'A3_HC3', 'A3_ILe1', 'A3_ILe2', 'A3_ILe3', 'A3_Phe1', 'A3_Phe2', 'A3_Phe3', 'A3_PP11', 'A3_PP12', 'A3_PP13', 'A3_SA1', 'A3_SA2', 'A3_SA3'])
(4, 63, 84, ['A4_C1', 'A4_C2', 'A4_C3', 'A4_Gly1', 'A4_Gly2', 'A4_Gly3', 'A4_HC1', 'A4_HC2', 'A4_HC3', 'A4_Ile1', 'A4_Ile2', 'A4_Ile3', 'A4_Phe1', 'A4_Phe2', 'A4_Phe3', 'A4_PP11', 'A4_PP12', 'A4_PP13', 'A4_SA1', 'A4_SA2', 'A4_SA3'])
(5, 84, 102, ['C1_C1', 'C1_C2', 'C1_C3', 'C1_Gly1', 'C1_Gly2', 'C1_Gly3', 'C1_HC1', 'C1_HC2', 'C1_HC3', 'C1_Ile1', 'C1_Ile2', 'C1_Ile3', 'C1_PP11', 'C1_PP12', 'C1_PP13', 'C1_SA1', 'C1_SA2', 'C1_SA3'])
(6, 102, 123, ['C2_C1', 'C2_C2', 'C2_C3', 'C2_Gly1', 'C2_Gly2', 'C2_Gly3', 'C2_HC1', 'C2_HC2', 'C2_HC3', 'C2_Ile1', 'C2_Ile2', 'C2_Ile3', 'C2_Phe1', 'C2_Phe2', 'C2_Phe3', 'C2_PP11', 'C2_PP12', 'C2_PP13', 'C2_SA1', 'C2_SA2', 'C2_SA3'])
(7, 123, 144, ['C3_C1', 'C3_C2', 'C3_C3', 'C3_Gly1', 'C3_Gly2', 'C3_Gly3', 'C3_HC1', 'C3_HC2', 'C3_HC3', 'C3_Ile1', 'C3_Ile2', 'C3_Ile3', 'C3_Phe1', 'C3_Phe2', 'C3_Phe3', 'C3_PP11', 'C3_PP12', 'C3_PP13', 'C3_SA1', 'C3_SA2', 'C3_SA3'])
(8, 144, 165, ['C4_C1', 'C4_C2', 'C4_C3', 'C4_Gly1', 'C4_Gly2', 'C4_Gly3', 'C4_HC1', 'C4_HC2', 'C4_HC3', 'C4_Ile1', 'C4_Ile2', 'C4_Ile3', 'C4_Phe1', 'C4_Phe2', 'C4_Phe3', 'C4_PP11', 'C4_PP12', 'C4_PP13', 'C4_SA1', 'C4_SA2', 'C4_SA3'])
(9, 165, 186, ['PC_C1', 'PC_C2', 'PC_C3', 'PC_Gly1', 'PC_Gly2', 'PC_Gly3', 'PC_HC1', 'PC_HC2', 'PC_HC3', 'PC_Ile1', 'PC_Ile2', 'PC_Ile3', 'PC_Phe1', 'PC_Phe2', 'PC_Phe3', 'PC_PP11', 'PC_PP12', 'PC_PP13', 'PC_SA1', 'PC_SA2', 'PC_SA3'])
(10, 186, 207, ['PEO_C1', 'PEO_C2', 'PEO_C3', 'PEO_Gly1', 'PEO_Gly2', 'PEO_Gly3', 'PEO_HC1', 'PEO_HC2', 'PEO_HC3', 'PEO_Ile1', 'PEO_Ile2', 'PEO_Ile3', 'PEO_Phe1', 'PEO_Phe2', 'PEO_Phe3', 'PEO_PP11', 'PEO_PP12', 'PEO_PP13', 'PEO_SA1', 'PEO_SA2', 'PEO_SA3'])
(11, 207, 228, ['PI_C1', 'PI_C2', 'PI_C3', 'PI_Gly1', 'PI_Gly2', 'PI_Gly3', 'PI_HC1', 'PI_HC2', 'PI_HC3', 'PI_Ile1', 'PI_Ile2', 'PI_Ile3', 'PI_Phe1', 'PI_Phe2', 'PI_Phe3', 'PI_PP11', 'PI_PP12', 'PI_PP13', 'PI_SA1', 'PI_SA2', 'PI_SA3'])
(12, 228, 249, ['PMM_C1', 'PMM_C2', 'PMM_C3', 'PMM_Gly1', 'PMM_Gly2', 'PMM_Gly3', 'PMM_HC1', 'PMM_HC2', 'PMM_HC3', 'PMM_Ile1', 'PMM_Ile2', 'PMM_Ile3', 'PMM_Phe1', 'PMM_Phe2', 'PMM_Phe3', 'PMM_PP11', 'PMM_PP12', 'PMM_PP13', 'PMM_SA1', 'PMM_SA2', 'PMM_SA3'])
(13, 249, 270, ['PVA_C1', 'PVA_C2', 'PVA_C3', 'PVA_Gly1', 'PVA_Gly2', 'PVA_Gly3', 'PVA_HC1', 'PVA_HC2', 'PVA_HC3', 'PVA_Ile1', 'PVA_Ile2', 'PVA_Ile3', 'PVA_Phe1', 'PVA_Phe2', 'PVA_Phe3', 'PVA_PP11', 'PVA_PP12', 'PVA_PP13', 'PVA_SA1', 'PVA_SA2', 'PVA_SA3'])

In [ ]: