In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os
from small_script.myFunctions import *



%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
plt.rcParams['figure.figsize'] = [16.18033, 10]    #golden ratio
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['figure.dpi'] = 100

In [3]:
data = pd.read_csv("/Users/weilu/Research/database/membrane_training_set/proteins-2019-05-01.csv")
data.pdbid = data.pdbid.apply(lambda x: x[2:-1])

In [4]:
data.head()


Out[4]:
id ordering family_name_cache species_name_cache membrane_name_cache name description comments pdbid resolution ... species_id family_id superfamily_id classtype_id type_id secondary_representations_count structure_subunits_count citations_count created_at updated_at
0 1 2024.0 OmpA family Escherichia coli Gram-neg. outer Outer membrane protein A (OMPA), disordered loops NaN OmpA is required for the action of colicins K ... 1qjp 1.65 ... 9 34 26 2 1 3 1 2 2018-08-13 03:49:46 UTC 2018-09-21 18:14:03 UTC
1 2 2028.0 Enterobacterial Ail/Lom protein Escherichia coli Gram-neg. outer Outer membrane protein X (OMPX) NaN OmpX from Escherichia coli promotes adhesion t... 1qj8 1.90 ... 9 355 26 2 1 7 1 1 2018-08-13 03:49:46 UTC 2018-09-21 18:14:03 UTC
2 3 2033.0 Opacity porins Neisseria meningitidis Gram-neg. outer Outer membrane protein NspA NaN Pathogenic Neisseria spp. possess a repertoire... 1p4t 2.55 ... 24 337 235 2 1 0 1 0 2018-08-13 03:49:46 UTC 2018-09-21 18:14:03 UTC
3 4 1740.0 Influenza virus matrix protein 2 Influenza virus Viral M2 proton channel of Influenza A, closed state... NaN NaN 3lbw 1.65 ... 51 263 185 11 1 3 4 0 2018-08-13 03:49:46 UTC 2018-10-02 17:42:36 UTC
4 5 2045.0 OM protease omptin, OMPT Yersinia pestis Gram-neg. outer Plasminogen activator PLA (coagulase/fibrinoly... NaN NaN 2x55 1.85 ... 299 36 27 2 1 2 1 0 2018-08-13 03:49:46 UTC 2018-09-21 18:14:03 UTC

5 rows × 31 columns


In [5]:
d = data.query("classtype_id == 1")
# dd = d.groupby("superfamily_id").apply(pd.DataFrame.sample, 5, replace=True).reset_index(drop=True)
dd = d.reset_index(drop=True)

In [6]:
dd.shape


Out[6]:
(1593, 31)

In [7]:
ddd = dd.drop_duplicates().reset_index(drop=True)

In [8]:
ddd.to_csv("/Users/weilu/Research/database/membrane_training_set/chosen_large_data.csv")

In [16]:
a = pd.read_csv("/Users/weilu/Research/database/membrane_training_set/chosen_large_data.csv", index_col=0)

In [17]:
a.shape


Out[17]:
(1593, 31)

In [18]:
# print(a.pdbid.tolist())
pdb_list = a.pdbid.tolist()

In [30]:
pre = "/Users/weilu/Research/server/may_2019/four_body_helix_large_data/database/cleaned_pdbs/"
filtered_list = []
for pdb in pdb_list:
    location = pre + f"{pdb}.pdb"
    a = get_inside_or_not_table(location)
    # ratio = sum(a)/len(a)
    ratio = sum(a)/(len(a)+1e-6)
    if ratio < 0.4:
        print("not good", pdb, ratio)
    else:
        filtered_list.append(pdb)
    # print(pdb, ratio)


not good 4wis 0.3256880728965014
not good 6dmw 0.24405705191116156
not good 6mmh 0.10695876274876448
not good 5f1c 0.13994169055410585
not good 6dmy 0.2560483868386609
not good 4u5f 0.08707482981350363
not good 4i0u 0.11527377488393725
not good 6cnm 0.3899721437605233
not good 1lv7 0.003984063729147156
not good 6qp6 0.29959514129609294
not good 6nt7 0.2755102031445231
not good 5wua 0.19195046380201095
not good 6mmx 0.10939510925431775
not good 6b2z 0.0
not good 1kf6 0.0
not good 6f0k 0.091787439170109
not good 6ajg 0.3077777774358025
not good 6dlz 0.1676829266588588
not good 3j1z 0.38851351220096786
not good 6mmi 0.11734693862583298
not good 6et5 0.01501501496992488
not good 2bbj 0.11976047868335186
not good 5fgn 0.22947761151216864
not good 5ncq 0.2439024387765219
not good 4jtc 0.0
not good 5fn4 0.03458646611340381
not good 4qin 0.3467561513495388
not good 6mm9 0.11096938761355946
not good 3jcu 0.37724550785255834
not good 3uon 0.3493150676956277
not good 4ycm 0.2424547281263031
not good 4dgh 0.01562499987792969
not good 5fl7 0.0
not good 5kw2 0.3722627728168789
not good 5yhf 0.3678474109429872
not good 5ykf 0.19999999938461538
not good 6e1o 0.35871404338627066
not good 5nc5 0.2586206894074514
not good 4p6v 0.0
not good 6mhx 0.2610837434136556
not good 3wmg 0.2207130726303683
not good 6ffi 0.36097560887566926
not good 2wlk 0.23024054903697405
not good 3zdq 0.23642732007630943
not good 4jkv 0.3370044045440432
not good 6mhv 0.25774877608850116
not good 2hyd 0.22145328681409465
not good 3j8h 0.05336351874082681
not good 5x5y 0.0
not good 5wek 0.17463617445464016
not good 5xan 0.35851648302401584
not good 6co7 0.17907634290379232
not good 5but 0.0
not good 3tlw 0.2668810280807684
not good 2h88 0.0
not good 6a6m 0.2071307296992687
not good 4bbj 0.25602409599996373
not good 5wpv 0.3458149772118613
not good 5d91 0.36417910339051013
not good 6bwf 0.2801857580802078
not good 6dqj 0.08161393852287302
not good 4hg6 0.29852744270612125
not good 6ijj 0.34008097120097036
not good 3w9i 0.268436577907142
not good 5b2g 0.23428571361632652
not good 3b9r 0.23943661947742795
not good 4lmj 0.2774193539438085
not good 4f4c 0.23199999981439998
not good 2leg 0.0
not good 5wel 0.1725571723777992
not good 5uz7 0.0
not good 6d26 0.35123042427017803
not good 4gd3 0.06506849292784762
not good 5gl1 0.04279835389772336
not good 5i5h 0.0
not good 5j0z 0.26045015993424386
not good 6d3r 0.22448979571917496
not good 5wo6 0.25249169393273807
not good 1q16 0.0
not good 2vpx 0.0
not good 3j9p 0.28571428520408165
not good 6mmb 0.11212516282643395
not good 6n2y 0.0
not good 5uja 0.21538461520052596
not good 6nr4 0.2252964423909138
not good 2zxe 0.2298387094457271
not good 6ayg 0.3849658305581644
not good 3spg 0.23192771014479605
not good 5ksd 0.2665066023211205
not good 5svm 0.16408668679849323
not good 4fyg 0.00807537011026195
not good 6hco 0.28241563004899534
not good 2w5q 0.0
not good 6j20 0.38190954677912176
not good 6iok 0.0
not good 5ywd 0.21750433256715393
not good 6d7x 0.2649999995583333
not good 6btm 0.08144796343236216
not good 6e7z 0.33473684140055404
not good 4u5c 0.08969210162022477
not good 5xtc 0.0
not good 6dmb 0.26161616135190285
not good 3t9n 0.1691729316948386
not good 3lut 0.0
not good 6f2d 0.03940886680094154
not good 4dx5 0.26340996143351536
not good 4umv 0.26942148715798103
not good 5i5f 0.0
not good 5vhy 0.10983397176266414
not good 1dxr 0.0
not good 4mrs 0.20833333298611112
not good 6d9h 0.0
not good 3t56 0.259980525550165
not good 3wgu 0.2223340038004688
not good 5tgz 0.35307517003855315
not good 4hfi 0.2636655940075061
not good 2qks 0.21523178736678217
not good 6mxt 0.3674832953953601
not good 6c3o 0.22865853588823618
not good 5xap 0.35792349677879304
not good 5xaa 0.23943661947742795
not good 6mit 0.0
not good 4ev6 0.13333333291005292
not good 5svj 0.14596273246595426
not good 6gcs 0.0
not good 3fb8 0.0045662100248118264
not good 2c9m 0.2434607643425948
not good 5nf8 0.0
not good 1kju 0.2575452713706788
not good 3wme 0.21938775472893238
not good 6mma 0.10810810796897283
not good 6igz 0.3513513508765522
not good 6mu1 0.07677989759107497
not good 5osc 0.26602564017299474
not good 6j8i 0.0
not good 6miz 0.14584891537333664
not good 4mnd 0.3137254894271434
not good 1wpg 0.39703459572152455
not good 5nj3 0.27450980343224635
not good 6baa 0.2056074759949923
not good 1bcc 0.0
not good 5xhq 0.3934426221445848
not good 6idf 0.03748125931412105
not good 4phu 0.35944700378007605
not good 5u70 0.1740576494744372
not good 6hcy 0.3188073387183318
not good 5a63 0.03458646611340381
not good 5h3o 0.29411764648212224
not good 2rdd 0.26620825121197617
not good 6mhw 0.24796084788260872
not good 5c78 0.23226950313427394
not good 3zry 0.0
not good 4ntx 0.12439024359904818
not good 2oar 0.39999999680000003
not good 5v78 0.0
not good 6bo4 0.27679999955712
not good 4ymu 0.0
not good 6mml 0.10588235280276817
not good 6qpc 0.37781629050638427
not good 4ksc 0.22504230099404202
not good 5oc9 0.36902800598183194
not good 5uph 0.0
not good 5z96 0.2850746264401871
not good 5ywb 0.19692307631715977
not good 6n24 0.2679738553334188
not good 6n30 0.15671641674092227
not good 1ezv 0.006976744169821525
not good 4ayw 0.23214285672831633
not good 5zsu 0.12871287110507373
not good 5oyb 0.30640668481001854
not good 5jsz 0.0
not good 5ejz 0.3241758237305277
not good 6hiq 0.2077922072524878
not good 3j9t 0.0
not good 5an8 0.2764505114736339
/Users/weilu/Research/server/may_2019/four_body_helix_large_data/database/cleaned_pdbs/3j7q.pdb (' ', 1, ' ')
not good 3j7q 0.0
not good 5vhz 0.10983397176266414
not good 4j7c 0.0
not good 6qm9 0.3031203561617825
not good 4aq9 0.22432432371804237
not good 5kpi 0.22362869179440614
not good 3kg2 0.09884467253036627
not good 3j5q 0.28055077404380246
not good 1p49 0.0912408757459108
not good 5twv 0.1846153840473373
not good 1nek 0.0
not good 5wb1 0.3595238086678005
not good 6hxb 0.23347107413897616
not good 3iiq 0.004464285694355867
not good 6dqn 0.08737419941108225
not good 5y0b 0.23123732228069238
not good 6d1s 0.2611464959836099
not good 3ne5 0.0
not good 6g94 0.0709459457062637
not good 5wo7 0.2722689071054304
not good 5svl 0.1651651646691737
/Users/weilu/Research/server/may_2019/four_body_helix_large_data/database/cleaned_pdbs/5gae.pdb (' ', 1, ' ')
not good 5gae 0.0
not good 6dmu 0.2504118612019574
not good 4res 0.23216080378677306
not good 5ofp 0.21190893132765512
not good 5vou 0.2176039114484012
not good 3aqp 0.3616133513746685
not good 6d4j 0.2717842320832114
not good 3rfu 0.2654600297655203
not good 5uow 0.11166875770179578
not good 6bms 0.36879432493335346
not good 5ykg 0.19999999938461538
not good 5xw6 0.14687499954101563
not good 6cud 0.2457386360145758
not good 6dg8 0.2225063932928225
not good 6ijo 0.34008097120097036
not good 6c3p 0.19817073110313801
not good 6an7 0.004201680654614787
not good 5v4s 0.35142118772242587
not good 2qts 0.10071942421889826
not good 6hu9 0.0
not good 5zf0 0.16554054042868882
not good 5ylv 0.23201621050454285
not good 1kn9 0.004273504255241435
not good 5v56 0.3319587622021469
not good 4ux2 0.23464249724608005
not good 5l22 0.23333333290123456
not good 6ave 0.1297169808261392
not good 3spc 0.21712538159900496
not good 6gdi 0.2182741114904618
not good 6b85 0.19069767353163872
not good 3wu2 0.37425149588547457
not good 3lnm 0.0
not good 5ee7 0.3822115375427607
not good 4uu0 0.23943661947742795
not good 1pp9 0.0
not good 5oyg 0.3117066286153644
not good 4ksd 0.22504230099404202
not good 6irs 0.0490405116225149
not good 5aji 0.20833333254419192
not good 6bhu 0.20612582764393556
not good 5t0o 0.27202323303773157
not good 5y83 0.32748537915939946
not good 3jad 0.24629080045611038
not good 6bcq 0.19125127142293868
not good 6dmo 0.26476578384443406
not good 3sya 0.20121951158164783
not good 1fjr 0.0
not good 4c48 0.26356589121747487
not good 6cvl 0.0
not good 6n27 0.23893805239251312
not good 6drj 0.15165876765271818
not good 6huj 0.23342939413997293
not good 6o7u 0.3155680219978008
not good 4gx5 0.15999999957333333
not good 4q9i 0.22888513494181997
not good 3udc 0.15355805185933313
not good 3ar2 0.23340040217967767
not good 5tv4 0.22280701715297013
not good 6dvz 0.25972006180447893
not good 6ayf 0.34453781440223147
not good 6oeu 0.2697095432886831
not good 2rh1 0.34841628880448805
not good 1b12 0.012552301202710036
not good 6mms 0.09934640509889359
not good 6c26 0.3518518513088706
not good 5och 0.2250453716423859
not good 6niy 0.0
not good 6qma 0.3172205433274614
not good 3llo 0.0
not good 4kfm 0.20121951158164783
not good 3d9b 0.2612440188887617
not good 6eo1 0.3971830974727237
not good 2wcd 0.18947368354570637
not good 4kyt 0.24615384590138067
not good 5w3s 0.33608247353385057
not good 2zbd 0.2273641848819274
not good 6c5w 0.1473684205355494
not good 4lml 0.2677419346201873
not good 2agv 0.2434607643425948
not good 3e9j 0.0
not good 5svr 0.1529051983091584
not good 6meo 0.00251256280775738
not good 2oau 0.15748031434062867
not good 6a6n 0.21805792126395585
not good 5nmi 0.0
not good 2zup 0.010638297815753735
not good 5hi9 0.2641196008901668
not good 5z1w 0.3543123534864514
not good 3b5x 0.22552447513020196
not good 3j9v 0.0
not good 5cfb 0.2418879048911861
not good 6ds5 0.0437499997265625
not good 6d7p 0.25573770449879063
not good 6dqs 0.08188472091405091
not good 4l6v 0.32476319306527307
not good 6n23 0.2341597789692568
not good 6bcl 0.19544984467895568
not good 6bpp 0.22569444405261382
not good 5eke 0.17361111050829475
not good 5u6o 0.30497925247929614
not good 6hqb 0.3249001327231689
not good 4cof 0.238235293416955
not good 3j5p 0.28055077404380246
not good 6cnj 0.22162162102264427
not good 6be1 0.2030075182882017
not good 5jrw 0.126126125747369
not good 3sn6 0.0
not good 6giq 0.0023201856094659267
not good 6bwj 0.2819672126525128
not good 6e2g 0.25859247093520055
not good 4q9l 0.2207245153995581
not good 5wku 0.1318944840961763
not good 3din 0.0
not good 3bcc 0.0
not good 6mlu 0.07843137203639626
not good 5tin 0.2418879048911861
not good 4pl0 0.21541155829174857
not good 4b2q 0.0
not good 2xzb 0.23892893899183426
not good 3rqw 0.2671009763286613
not good 2yn9 0.23464249724608005
not good 1yew 0.10471204161070147
not good 5fn3 0.03458646611340381
not good 3jcf 0.11747850969203866
not good 6ira 0.09986684407474455
not good 4eiy 0.399999998974359
not good 3ar9 0.23843058326113623
not good 6d1w 0.33481152919110524
not good 3pcq 0.3283783779346238
not good 5mkf 0.3326403319487727
not good 6a69 0.2590559821525181
not good 4u4g 0.09895833320448134
not good 5xte 0.35802468693796685
not good 3oiz 0.010869565099243857
not good 6gyb 0.018018017855693534
not good 6fn1 0.2216582062422519
not good 5xu1 0.0
not good 3g5u 0.22758037205788462
not good 6dhe 0.368263471951307
not good 6mjp 0.0
not good 6baj 0.2607421872453689
not good 6bob 0.26865671597237695
not good 5mke 0.34526315716786704
not good 6o7t 0.3119891003923854
not good 6dqv 0.08363802555592413
not good 6fj3 0.27224199239814595
not good 3jyc 0.21183800557059812
/Users/weilu/Research/server/may_2019/four_body_helix_large_data/database/cleaned_pdbs/5xls.pdb (' ', 409, ' ')
not good 5xls 0.0
not good 4pir 0.2167101822018011
not good 6n28 0.25147928919680684
not good 3kss 0.2630541869329515
not good 6ek7 0.024725274657348147
not good 6haw 0.0
not good 5fn5 0.03609022550963876
not good 3b8n 0.0
not good 6bml 0.3972602726121224
not good 6mg8 0.2649746190203303
not good 3blc 0.018315018247930335
not good 4hw9 0.16205533532784452
not good 6qex 0.22250422993019944
not good 6mi8 0.0
not good 4dw1 0.15432098717802165
not good 2vl0 0.2581699337968303
not good 4zjc 0.3061630212601133
not good 4u2q 0.08460471555533326
not good 1t5s 0.23138832974709422
not good 5mlz 0.2873239428526086
not good 6bwm 0.2656765672183555
not good 3b8e 0.23146292561977663
not good 5yqz 0.2885304654327411
not good 5njy 0.27009646215403066
not good 4rfs 0.0
not good 6n26 0.25705329073024047
not good 5d3m 0.0
not good 5weo 0.17087967626806908
not good 3chx 0.11049723726381978
not good 5zfp 0.29333333202962963
not good 3w5a 0.24748490920776167
not good 5u76 0.17312072873220874
not good 4xk8 0.3234501343349729
not good 5fn2 0.03458646611340381
not good 6irf 0.11054637851264755
not good 6cxh 0.10994764369123654
not good 6iyc 0.03298350819642652
not good 2hi7 0.0
not good 5cxv 0.3536036028071991
not good 5uar 0.20861486468866985
not good 5hcm 0.2668810280807684
not good 6e2f 0.2512234906179062
not good 5o66 0.0
not good 5tj6 0.20561797729705844
not good 6e3y 0.2647058745674743
not good 3ar8 0.24044265569371967
not good 2gif 0.2625968989703518
not good 6mgv 0.3943217659395556
not good 6q81 0.2174280878025143
not good 6fl9 0.2677419346201873
not good 2zbg 0.24144869191001136
not good 4fz1 0.1108374381506467
not good 6bo8 0.26475548016061473
not good 5ko2 0.2227848099385782
not good 4p00 0.3104395600131325
not good 4q9j 0.22325189534688128
not good 6i53 0.24404761832128685
not good 5mpm 0.24294354814219402
not good 6g1k 0.29012345634240205
not good 3j08 0.26356589106423894
not good 6dz7 0.35017421541781496
not good 5zfu 0.30222222087901235
not good 6nzz 0.27450980302447775
not good 6g9o 0.11699164329109799
not good 6qee 0.2216582062422519
not good 3jae 0.2396449697051924
not good 6qmb 0.3131618754717672
not good 6hrc 0.21985815563854938
not good 5goa 0.051709027154043526
not good 5uj9 0.2188034186164073
not good 5u6p 0.2876984121275825
not good 6c0v 0.20970537243526396
not good 6j8e 0.0
not good 6dt0 0.18220338905846023
not good 4y7j 0.33195020609149295
not good 6bmm 0.3655172401189061
not good 6bpq 0.2005208330722385
not good 6his 0.20671834571907405
not good 3rgb 0.10994764369123654
not good 6cno 0.3999999988888889
not good 5go9 0.048495471794187714
not good 1l0l 0.0
not good 6d7s 0.2664418207985804
not good 4npq 0.2636655940075061
not good 3qf4 0.22027971989461587
not good 3jqo 0.1507537680866645
not good 4lde 0.35682819304663393
not good 5wj9 0.33820459219581506
not good 4pxz 0.3877551010516452
not good 6ajf 0.3074361816787612
not good 4lsg 0.2182741114904618
not good 5glh 0.3296460169698097
not good 3jaf 0.23391812797099962
not good 4pe5 0.11424731167439878
not good 4ayx 0.2342657338561788
not good 2wll 0.2613636353736226
not good 4fz0 0.09876543185490017
not good 2mi2 0.31730768925665687
not good 5l75 0.0
not good 5ojm 0.2484848477318641
not good 3bpp 0.0
not good 5x3x 0.0
not good 4u1w 0.09439124474091486
not good 1su4 0.24647887299146995
not good 5jeq 0.18061673929243727
not good 4tnv 0.24332344141447051
not good 5n9y 0.13761467847824257
not good 5mdx 0.3409090898022432
not good 4hqj 0.22842639570718132
not good 2iub 0.10876132897655187
not good 6eti 0.2614840984779433
not good 4mt1 0.26490713561592655
not good 5heg 0.27419354750260144
not good 6boa 0.2613827988846833
not good 3ba6 0.22960725052406117
not good 6dr2 0.08397991780740305
not good 5u1x 0.13084112108772236
not good 6g79 0.0
not good 5wie 0.0
not good 2bhv 0.010582010526021108
not good 4hea 0.0
not good 6mu2 0.07603686632440698
not good 4yuu 0.36046511523120606
not good 6e7p 0.33684210455401664
not good 2bg9 0.23513513449963477
not good 4hyt 0.22914572841291886
not good 2vpz 0.0
not good 6d6u 0.24324324251278306
not good 4umw 0.2587064672326593
not good 5w0p 0.2293178516265343
not good 5sxv 0.26298701213315906
not good 5lnk 0.0
not good 1kqf 0.0
not good 6dvy 0.25429017121015574
not good 4k0e 0.2610030703572128
not good 5nbd 0.21985815563854938
not good 5v57 0.34136546116191674
not good 2yvx 0.2737556554892406
not good 6hzm 0.24486301327934415
not good 5jef 0.20091324109172037
not good 4a97 0.25732898938980786
not good 5t4d 0.34341252625612845
not good 1mhs 0.22826086931710776
not good 4phz 0.0
not good 4tlm 0.10804020086929624
not good 4uqj 0.10187667546665324
not good 5k7l 0.20827389413940955
not good 1q90 0.06849315045036593
not good 5gli 0.35198135116088264
not good 5h1r 0.0
not good 5xtd 0.0
not good 6n52 0.18733850104995026
not good 3tlm 0.23809523785400685
not good 4twd 0.25407166041019
not good 3kds 0.0
not good 6hll 0.3432835813576043
not good 4uis 0.03571428565354713
not good 3n5k 0.2434607643425948
not good 5kuk 0.23006134898754188
not good 5c76 0.23049645349202758
not good 5ogl 0.3946251762452261
not good 5sv1 0.28251120949546543
not good 5guw 0.14788732290220197
not good 4u1y 0.09305555542631173
not good 4u1x 0.09286675626801244
not good 5wpq 0.35242290671272486
not good 6csx 0.26005888100092356
not good 3fb7 0.022831050124059133
not good 6fvq 0.26537216742598
not good 5v5s 0.0
not good 5x41 0.0
not good 5xam 0.37001375464922454
not good 6n2z 0.14925373022944977
not good 6hin 0.2567164171441301
not good 2zbe 0.23239436596338595
not good 5nik 0.0
not good 6mmt 0.10997442441179742
/Users/weilu/Research/server/may_2019/four_body_helix_large_data/database/cleaned_pdbs/3bo0.pdb (' ', 1, ' ')
not good 3bo0 0.0
not good 5sv0 0.29017857013313136
not good 5vkq 0.13075383580336633
not good 6d7t 0.26513911576900306
not good 5l7d 0.26360544172856215
not good 4n6h 0.3946078421700788
not good 6cmc 0.12801932336227217
not good 5zmw 0.253999999746
not good 2fyu 0.0
not good 4h1w 0.24873096421448634
not good 6hwh 0.17894736795013852
/Users/weilu/Research/server/may_2019/four_body_helix_large_data/database/cleaned_pdbs/3j7r.pdb (' ', 1, ' ')
not good 3j7r 0.0
not good 5kbu 0.16855087341361677
not good 6d7q 0.2617449660037836
not good 5va1 0.24821428527104591
not good 5khn 0.3290094335742813
not good 4rdq 0.2295081960942399
not good 5svk 0.1372549015763168
not good 5kuf 0.08266666655644445
not good 5do7 0.2598290593849076
not good 5iwk 0.2558922554614608
not good 5t1a 0.34606741495265747
not good 5xti 0.0
not good 6mmr 0.10923276968890407
not good 5sy1 0.0
not good 6e0h 0.35836177413248843
not good 4ayt 0.23362831817056934
not good 5heo 0.25732898938980786
not good 6cjt 0.3560606051614631
not good 4m1m 0.22118644049051994
not good 3viv 0.0045662100248118264
not good 3b9b 0.23138832974709422
not good 6dnf 0.18666666583703703
not good 6hbu 0.24657534204353537
not good 5kpj 0.2272340423598008
not good 6a96 0.0
not good 4hzu 0.0
not good 4tnw 0.23976608117027462
not good 5zbg 0.2671641787057251
not good 6dg7 0.21227621429085367
not good 1eys 0.003225806441207076
not good 6ddf 0.0
not good 6al2 0.27892561925841813
not good 6msm 0.21337849262203343
not good 5u1d 0.2281639924631658
not good 1xp5 0.24849094542405337
not good 4qim 0.3522975922269199
not good 4ry2 0.17831669018785065
not good 4u5b 0.08672086709116414
not good 4ycl 0.23943661947742795
not good 5eul 0.049597855161396975
not good 6d32 0.2827225125955977
not good 6djb 0.1260623227676171
not good 5mkk 0.21247892038367805
not good 5ywa 0.19076923018224853
not good 6bbj 0.24177631539181527
not good 5u73 0.30289532260256646
not good 4ksb 0.22758037205788462
not good 5vrh 0.39795918286130777
not good 2k37 0.050847456765297346
not good 5zty 0.3273942086249572
not good 5gup 0.3790849648425819
not good 6nzi 0.2660731945933994
not good 5l1h 0.09961190155289534
not good 6gy6 0.06575342447738788
not good 5d92 0.34795321535686197
not good 6drk 0.1415019761727257
not good 4jza 0.0
not good 5oy0 0.3302263644071553
not good 4xes 0.322717621395504
not good 6aye 0.34243697407051055
not good 6j8g 0.0
not good 4ysx 0.0
not good 5lg3 0.2605863183694257
not good 5l7i 0.2801418434749258
not good 6nzw 0.2779552706774592
not good 5syt 0.3963963955036117
not good 3j9u 0.0
not good 6cna 0.11698113192832563
not good 6mmu 0.11313394003493636
not good 6dqz 0.08409506394694011
not good 2r9r 0.0
not good 6dw0 0.21739130362076486
not good 3h9v 0.15805471076579117
not good 1t7d 0.0
not good 1jb0 0.3310810806336742
not good 3b60 0.2185314681494205
not good 5khs 0.3305978894131326
not good 4yeu 0.2756410247575608
not good 6ftj 0.0
not good 5kbt 0.16855087341361677
not good 6hlp 0.34020618486555426
not good 6d7w 0.14909090854876034
not good 3kdp 0.2254509015777045
not good 4eeb 0.12420382126049738
not good 4xwk 0.2208121825543044
not good 4dgf 0.0
not good 3k07 0.2663414631547888
not good 6bo5 0.27827648064941385
not good 6bqr 0.215189873190728
not good 6bug 0.0
not good 6ebk 0.0
not good 5gko 0.1384615382485207
not good 5gl0 0.04453551911351489
not good 6d6t 0.24324324251278306
not good 2zy9 0.29039812578361096
not good 6adq 0.13461538418392505
not good 5up2 0.1105527636802101
not good 6nt5 0.2945205469365735
not good 5lcb 0.08474576127549557
not good 6a70 0.36160714204998406
not good 5v7v 0.02120717777943364
not good 5vot 0.22303921513960978
not good 5wb2 0.35629453597079686
not good 6bqv 0.2178111585645803
not good 5zgb 0.36977057979788047
not good 6bl6 0.2295652169920605
not good 2w8d 0.0
not good 6n51 0.17780580053240125
not good 4kt0 0.32611637303637836
not good 5zfv 0.29910714152184315
not good 3puw 0.0
not good 4jbw 0.3719512187561967
not good 5gmy 0.3525943392068463
not good 4nyk 0.12801932336227217
not good 5u2h 0.127035830205095
not good 6fn4 0.21591871277229574
not good 5kxi 0.22865013711115664
not good 5vov 0.2149999994625
not good 6e1h 0.2668711653712974
not good 6fnp 0.2641509417348997
not good 5wpt 0.3458149772118613
not good 3zrs 0.2428571419897959
not good 5l4h 0.2668810280807684
not good 6ezn 0.05707762544046204
not good 5woa 0.2689075625732646
not good 5sxu 0.25974025889694724
not good 2onk 0.0
not good 6el1 0.0626780624994927
not good 4xe5 0.231155778662155
not good 5e94 0.0
not good 3fb6 0.0045662100248118264
not good 3d31 0.0
not good 2ybb 0.0
not good 5l1b 0.10769230755424063
not good 6nr2 0.2149877147236023
not good 6m96 0.008403361309229574
not good 5vbl 0.0
not good 6bwi 0.31400966133009717
not good 5kk2 0.2083333328227124
not good 6c14 0.07518796964215049
not good 5gkz 0.04453551911351489
not good 2j8d 0.08846153812130178
not good 6b5v 0.25466892995811724
not good 4p02 0.32005494461530914
not good 6nt8 0.2721088426118747
not good 4nab 0.28433734905501523
not good 3fps 0.23742454704484453
not good 5heh 0.2677419346201873
not good 3cx5 0.00696055682839778
not good 6qm5 0.29717681976645344
not good 3waj 0.36065573725011885
not good 3j09 0.22406638973158174
not good 5un1 0.22302158219898902
not good 5y5s 0.06109324739198313
not good 3puz 0.0
not good 2ear 0.24044265569371967
not good 4wd8 0.27238805868511917
not good 5zx5 0.22972972944750647
not good 6mhs 0.25700164702306155
not good 6mix 0.13388182488116543
not good 5jgp 0.18141592840081447
not good 4x5t 0.26298701213315906
not good 5er7 0.0
not good 3cxh 0.0046403712189318535
not good 1p7b 0.2480620145423953
not good 5dn6 0.0
not good 6n4b 0.0
not good 5lq3 0.2729912872478303
not good 5iji 0.18061673929243727
not good 5u74 0.2933333330074074
not good 6bpl 0.22569444405261382
not good 5yw7 0.22105263138504153
not good 6ffc 0.26223776177930463
not good 4n4w 0.36542669504283
not good 6ijz 0.3572474372514679
not good 5lil 0.1430894306616432
not good 2bs2 0.0
not good 6mhz 0.0
not good 6qpb 0.29001367949382534
not good 5yx9 0.2770370366266118
not good 4u4f 0.10156249986775716
not good 6qkc 0.2188295159826221
not good 5tji 0.20181405872810199
not good 5t04 0.32969432242424823
not good 5xjy 0.16044187261418102
not good 6nr3 0.21410891062610282
not good 4gx0 0.15733333291377777
not good 4huq 0.0
not good 3tgu 0.0
not good 5wj5 0.3566739598322233
/Users/weilu/Research/server/may_2019/four_body_helix_large_data/database/cleaned_pdbs/3qe7.pdb (' ', 408, ' ')
not good 3qe7 0.0
not good 6fkf 0.0
not good 6n25 0.25147928919680684
not good 3eml 0.3348214278240593
not good 3uq7 0.2605863183694257
not good 5d5a 0.3529411756720788
not good 5xmj 0.0
not good 6ftg 0.0
not good 5osa 0.2587859416652206
not good 4u2p 0.09403973497478181
not good 5mm1 0.26011560618463697
not good 6dr0 0.08318098716490814
not good 6mj2 0.1383694838157296
not good 3syq 0.23630136905376245
not good 6nbq 0.0
not good 4lp8 0.24028268466331207
not good 3nog 0.26086956495961505
not good 5u09 0.32008368133873705
not good 5y4o 0.18834080633031028
not good 1lgh 0.3749999933035716
not good 1zoy 0.0
not good 6dvw 0.24534161452586706
not good 3rhw 0.238235293416955
not good 3fpb 0.24044265569371967
not good 2j8c 0.09543568425130422
not good 5uak 0.218612818069699
not good 6cdu 0.2643312093492637
not good 5mm0 0.29295774565364013
not good 4y7k 0.21199999915200002
not good 6nt6 0.2877697831375188
not good 6n3t 0.3804945049718482
not good 4myc 0.2173913039842955
not good 5gky 0.04426229506987369
not good 6mho 0.2628951742713891
not good 3pjv 0.007812499938964845
not good 5lki 0.03433777152464066
not good 5w81 0.21142369973450664

In [31]:
len(filtered_list)


Out[31]:
829

In [15]:


In [21]:
pre = "/Users/weilu/Research/server/may_2019/four_body_helix_large_data/database/"
a = glob.glob(pre+"cleaned_pdbs/*")

In [24]:
b = [i.split("/")[-1][:4] for i in a]

In [25]:
pdb_list = b

In [33]:
pre = "/Users/weilu/Research/server/may_2019/four_body_helix_large_data/database/"
for pdb in pdb_list:
    toLocation = pre + f"dompdb/{pdb}.pdb"
    location = pre + f"cleaned_pdbs/{pdb}.pdb"
    try:
        extractTransmembrane(toLocation, location)
    except:
        pass

In [28]:
pre = "/Users/weilu/Research/server/may_2019/four_body_helix_large_data/database/dompdb/"
filtered_list = []
for pdb in pdb_list:
    location = pre + f"{pdb}.pdb"
    a = get_inside_or_not_table(location)
    # ratio = sum(a)/len(a)
    ratio = sum(a)/(len(a)+1e-6)
    if ratio < 0.4:
        print("not good", pdb, ratio)
    else:
        filtered_list.append(pdb)
    # print(pdb, ratio)


not good 6b2z 0.0
not good 1kf6 0.0
not good 4jtc 0.0
not good 5fl7 0.0
not good 4p6v 0.0
not good 5x5y 0.0
not good 5but 0.0
not good 2h88 0.0
not good 2leg 0.0
not good 5uz7 0.0
not good 5i5h 0.0
not good 1q16 0.0
not good 2vpx 0.0
not good 6n2y 0.0
not good 2w5q 0.0
not good 6iok 0.0
not good 5xtc 0.0
not good 3lut 0.0
not good 5i5f 0.0
not good 1dxr 0.0
not good 6d9h 0.0
not good 6mit 0.0
not good 6gcs 0.0
not good 5nf8 0.0
not good 6j8i 0.0
not good 1bcc 0.0
not good 3zry 0.0
not good 5v78 0.0
not good 4ymu 0.0
not good 5uph 0.0
not good 5jsz 0.0
not good 3j9t 0.0
not good 3j7q 0.0
not good 4j7c 0.0
not good 1nek 0.0
not good 3ne5 0.0
not good 5gae 0.0
not good 6hu9 0.0
not good 3lnm 0.0
not good 1pp9 0.0
not good 1fjr 0.0
not good 6cvl 0.0
not good 6niy 0.0
not good 3llo 0.0
not good 3e9j 0.0
not good 5nmi 0.0
not good 3j9v 0.0
not good 3sn6 0.0
not good 3din 0.0
not good 3bcc 0.0
not good 4b2q 0.0
not good 5xu1 0.0
not good 6mjp 0.0
not good 6haw 0.0
not good 3b8n 0.0
not good 6mi8 0.0
not good 4rfs 0.0
not good 5d3m 0.0
not good 2hi7 0.0
not good 5o66 0.0
not good 6j8e 0.0
not good 1l0l 0.0
not good 5l75 0.0
not good 3bpp 0.0
not good 5x3x 0.0
not good 6g79 0.0
not good 5wie 0.0
not good 4hea 0.0
not good 2vpz 0.0
not good 5lnk 0.0
not good 1kqf 0.0
not good 4phz 0.0
not good 5h1r 0.0
not good 5xtd 0.0
not good 3kds 0.0
not good 5v5s 0.0
not good 5x41 0.0
not good 5nik 0.0
not good 3bo0 0.0
not good 2fyu 0.0
not good 3j7r 0.0
not good 5xti 0.0
not good 5sy1 0.0
not good 6a96 0.0
not good 4hzu 0.0
not good 6ddf 0.0
not good 4jza 0.0
not good 6j8g 0.0
not good 4ysx 0.0
not good 3j9u 0.0
not good 2r9r 0.0
not good 1t7d 0.0
not good 6ftj 0.0
not good 4dgf 0.0
not good 6bug 0.0
not good 6ebk 0.0
not good 2w8d 0.0
not good 3puw 0.0
not good 2onk 0.0
not good 5e94 0.0
not good 3d31 0.0
not good 2ybb 0.0
not good 5vbl 0.0
not good 3puz 0.0
not good 5er7 0.0
not good 5dn6 0.0
not good 6n4b 0.0
not good 2bs2 0.0
not good 6mhz 0.0
not good 4huq 0.0
not good 3tgu 0.0
not good 6fkf 0.0
not good 5xmj 0.0
not good 6ftg 0.0
not good 6nbq 0.0
not good 1zoy 0.0

In [29]:
len(filtered_list)


Out[29]:
1471

In [32]:
pdb_list = filtered_list

In [39]:
pre = "/Users/weilu/Research/server/may_2019/four_body_helix_large_data/"

In [40]:
with open(f"{pre}/optimization/protein_list", "w") as out:
    for pdb in pdb_list:
        # print(pdb)
        out.write(pdb+"\n")

In [38]:
for pdb in pdb_list:
    location = f"{pre}/dompdb/{pdb}.pdb"
    toLocation = f"{pre}/S20_seq/{pdb}.seq"
    seq,resseqs = getSeqFromPDB(location, considerGap=False)
    with open(toLocation, "w") as out:
        out.write(seq+'\n')

In [46]:
pre = '/Users/weilu/Research/server/may_2019/four_body_helix/optimization/'
location = pre + "gammas/protein_list_phi_six_letter_four_body_helix_docking3.5_6.5_5.0_10_True_gamma"
gamma_k1000 = np.loadtxt(location)

pre = '/Users/weilu/Research/server/may_2019/four_body_helix_repeat_n_decoys2000/optimization/'
location = pre + "gammas/protein_list_phi_six_letter_four_body_helix_docking3.5_6.5_5.0_10_True_gamma"
gamma_k2000 = np.loadtxt(location)

pre = '/Users/weilu/Research/server/may_2019/four_body_helix_repeat_n_decoys4000/optimization/'
location = pre + "gammas/protein_list_phi_six_letter_four_body_helix_docking3.5_6.5_5.0_10_True_gamma"
gamma_k4000 = np.loadtxt(location)

pre = '/Users/weilu/Research/server/may_2019/four_body_helix_repeat_n_decoys8000/optimization/'
location = pre + "gammas/protein_list_phi_six_letter_four_body_helix_docking3.5_6.5_5.0_10_True_gamma"
gamma_k8000 = np.loadtxt(location)

pre = '/Users/weilu/Research/server/may_2019/four_body_helix_repeat_n_decoys8000/optimization/'
location = pre + "gammas/protein_list_phi_six_letter_four_body_helix_docking3.5_6.5_5.0_10_True_gamma_filtered"
gamma_k8000_f = np.loadtxt(location)

In [23]:


In [3]:


In [4]:
pre = '/Users/weilu/Research/server/may_2019/four_body/four_body_helix_use_cb_complete_phi/optimization/'
location = pre + "gammas/protein_list_phi_six_letter_four_body_helix_docking3.5_6.5_5.0_6_Truephi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0_gamma"
gamma_complete_phis = np.loadtxt(location)

In [5]:
name = "gammas/protein_list_phi_six_letter_four_body_helix_docking3.5_6.5_5.0_6_Truephi_pairwise_contact_well4.5_6.5_5.0_10phi_density_mediated_contact_well6.5_9.5_5.0_10_2.6_7.0phi_burial_well4.0"
A,B,B_filtered,Gamma,Gamma_filtered,Lamb,Lamb_filtered,half_B,other_half_B,std_half_B,A_prime = get_raw_optimization_data(pre, name)

In [6]:
from pyCodeLib import *

In [7]:
total_phis = len(A)
num_decoys = 1000
filtered_gamma, filtered_B, filtered_lamb, P, lamb = get_filtered_gamma_B_lamb_P_and_lamb(
            A, B, half_B, other_half_B, std_half_B, total_phis, num_decoys, setCutoff=726)


726

In [9]:
six_letter_code_combinations = ['000004', '000013', '000022', '000031', '000040', '000103', '000112', '000121', '000130', '000202', '000211', '000220', '000301', '000310', '000400', '001003', '001012', '001021', '001030', '001102', '001111', '001120', '001201', '001210', '001300', '002002', '002011', '002020', '002101', '002110', '002200', '003001', '003010', '003100', '004000', '010003', '010012', '010021', '010030', '010102', '010111', '010120', '010201', '010210', '010300', '011002', '011011', '011020', '011101', '011110', '011200', '012001', '012010', '012100', '013000', '020002', '020011', '020020', '020101', '020110', '020200', '021001', '021010', '021100', '022000', '030001', '030010', '030100', '031000', '040000', '100003', '100012', '100021', '100030', '100102', '100111', '100120', '100201', '100210', '100300', '101002', '101011', '101020', '101101', '101110', '101200', '102001', '102010', '102100', '103000', '110002', '110011', '110020', '110101', '110110', '110200', '111001', '111010', '111100', '112000', '120001', '120010', '120100', '121000', '130000', '200002', '200011', '200020', '200101', '200110', '200200', '201001', '201010', '201100', '202000', '210001', '210010', '210100', '211000', '220000', '300001', '300010', '300100', '301000', '310000', '400000']
six_letter_code_combinations = np.array(six_letter_code_combinations)

In [10]:
g = -filtered_gamma[:126]
sorted_letter = six_letter_code_combinations[np.argsort(g)]
y_g = g[np.argsort(g)]

In [47]:
from adjustText import adjust_text

In [14]:
fig, ax = plt.subplots()
ax.plot(sorted_letter, y_g)
for i, txt in enumerate(sorted_letter):
    ax.annotate(txt, (i, y_g[i]))



In [11]:
fig, ax = plt.subplots()
ax.plot(sorted_letter[:10], y_g[:10])
for i, txt in enumerate(sorted_letter[:10]):
    ax.annotate(txt, (i, y_g[i]))



In [16]:
fig, ax = plt.subplots()
ax.plot(sorted_letter[-20:], y_g[-20:])
for i, txt in enumerate(sorted_letter[-20:]):
    ax.annotate(txt, (i, y_g[i]))



In [12]:
gamma_complete_phis.shape


Out[12]:
(126,)

In [24]:
gamma_complete_phis = gamma_complete_phis[:126]
sorted_letter = six_letter_code_combinations[np.argsort(gamma_complete_phis)]

In [30]:
gamma_complete_phis.round(1)


Out[30]:
array([ -0.8,  -0.1,  -0.8,  -0.1,  -1.7,  -0.6,  -0. ,  -0.3,  -0.6,
        -0.4,  -0.3,  -0.3,  -0.4,  -0.4,  -0.7,  -0.6,  -0.5,  -1. ,
        -0.1,  -0.5,  -0.3,  -0.3,  -0.3,  -0.3,  -0.5,   0.4,  -3.6,
        -0.5,  -0.9,  -0.3,  -0. ,   0.8,   0.8,  -0.9,   7.6,  -0.4,
        -0.7,  -0.4,  -1. ,  -0.8,  -0.6,  -0.7,  -0.8,  -0.6,  -0.9,
        -0.7,  -0.5,  -0.8,  -0.9,  -0.7,  -1. ,  -1.4,   0.1,  -0.6,
        -0.4,  -1.6,  -0.8,  -2. ,  -1.1,  -0.8,  -1.3,  -1.1,  -1. ,
        -1.4,  -1.8,  -1.4,  -1.7,  -1.8,  -2. ,  -3.5,  -0.2,  -0.8,
        -0.3,  -0.7,  -1. ,  -0.9,  -1.1,  -1. ,  -0.7,  -0.8,  -0.3,
        -0.8,  -0.4,  -0.7,  -1. ,  -1. ,  -3.7,   0.2,  -1.3,  -2. ,
        -1.6,  -1.1,  -1.6,  -1.6,  -1.1,  -1.5,  -2.1,  -1.5,  -1.4,
        -0.5,  -3. ,  -2.4,  -2.1,  -1.7,  -4.6,  -2. ,  -2.5,  -2. ,
        -2.8,  -1.5,  -2.3,  -4.6,  -0.3,  -2.5,  -8.1,  -4.5,  -3.4,
        -2.7,  -3.8,  -8.7,  -7.5,  -1.3,  -2.3,   1. ,  -7.5, -43.5])

In [29]:
y.round(1)


Out[29]:
array([-43.5,  -8.7,  -8.1,  -7.5,  -7.5,  -4.6,  -4.6,  -4.5,  -3.8,
        -3.7,  -3.6,  -3.5,  -3.4,  -3. ,  -2.8,  -2.7,  -2.5,  -2.5,
        -2.4,  -2.3,  -2.3,  -2.1,  -2.1,  -2. ,  -2. ,  -2. ,  -2. ,
        -2. ,  -1.8,  -1.8,  -1.7,  -1.7,  -1.7,  -1.6,  -1.6,  -1.6,
        -1.6,  -1.5,  -1.5,  -1.5,  -1.4,  -1.4,  -1.4,  -1.4,  -1.3,
        -1.3,  -1.3,  -1.1,  -1.1,  -1.1,  -1.1,  -1.1,  -1. ,  -1. ,
        -1. ,  -1. ,  -1. ,  -1. ,  -1. ,  -1. ,  -0.9,  -0.9,  -0.9,
        -0.9,  -0.9,  -0.8,  -0.8,  -0.8,  -0.8,  -0.8,  -0.8,  -0.8,
        -0.8,  -0.8,  -0.8,  -0.7,  -0.7,  -0.7,  -0.7,  -0.7,  -0.7,
        -0.7,  -0.7,  -0.6,  -0.6,  -0.6,  -0.6,  -0.6,  -0.6,  -0.5,
        -0.5,  -0.5,  -0.5,  -0.5,  -0.5,  -0.4,  -0.4,  -0.4,  -0.4,
        -0.4,  -0.4,  -0.4,  -0.3,  -0.3,  -0.3,  -0.3,  -0.3,  -0.3,
        -0.3,  -0.3,  -0.3,  -0.3,  -0.3,  -0.2,  -0.1,  -0.1,  -0.1,
        -0. ,  -0. ,   0.1,   0.2,   0.4,   0.8,   0.8,   1. ,   7.6])

In [20]:
y = gamma_complete_phis[np.argsort(gamma_complete_phis)]

In [21]:
fig, ax = plt.subplots()
ax.plot(sorted_letter, gamma_complete_phis[np.argsort(gamma_complete_phis)])
for i, txt in enumerate(sorted_letter):
    ax.annotate(txt, (i, y[i]))



In [ ]:


In [18]:
plt.plot(sorted_letter, gamma_complete_phis[np.argsort(gamma_complete_phis)])


Out[18]:
[<matplotlib.lines.Line2D at 0x1a23a0fda0>]

In [10]:
plt.plot(gamma_complete_phis[:125])


Out[10]:
[<matplotlib.lines.Line2D at 0x1a235daac8>]

In [45]:
pre = '/Users/weilu/Research/server/may_2019/four_body_helix_large_data/optimization/'
location = pre + "gammas/protein_list_phi_six_letter_four_body_helix_docking3.5_6.5_5.0_6_True_gamma_filtered"
gamma_large = np.loadtxt(location)

In [47]:
pre = '/Users/weilu/Research/server/may_2019/four_body_helix_more_data/optimization/'
location = pre + "gammas/protein_list_phi_six_letter_four_body_helix_docking3.5_6.5_5.0_6_True_gamma"
gamma_more = np.loadtxt(location)

In [53]:


In [55]:
six_letter_code_combinations = np.array(six_letter_code_combinations)

In [52]:
np.argsort(gamma_large)


Out[52]:
array([125, 123, 124, 114, 119, 120, 111, 105, 104, 107, 118,  69,  97,
       116, 115, 110,   4, 121,  92,  87, 108, 101,  70, 100, 117,  62,
        72,  67,  57, 122,  68, 113,  96,  52,  66, 102,  80,  65,   3,
        93,  90,  27,  60,  95,  63,  26,  76,  86,  55, 103,  61,  59,
        38,  91,  94,  50,  73, 109,  58,  35,  81,  56,  47,  98,  41,
        44,  46,  14,  37,  79,  77,  49,  51,  83,  42, 106,  85,  39,
        16,  40,   8,  23,  43,   7,  48,   5, 112,  84,  75,  24,  74,
        12,  78,  36,  71,  45,  32,  17,  28,   9,  25,  15,  22,  20,
        11,  13,  10,  88,   6,  29,  21,   2,  64,  19,  33,  18,   1,
         0,  30,  53,  54,  99,  82,  31,  34,  89])

In [56]:
sorted_letter = six_letter_code_combinations[np.argsort(gamma_large)]

In [63]:
np.sum(gamma_large < -3)


Out[63]:
8

In [64]:
plt.plot(sorted_letter[:8], gamma_large[np.argsort(gamma_large)][:8])


Out[64]:
[<matplotlib.lines.Line2D at 0x1a2cdeb550>]

In [59]:
plt.plot(sorted_letter, gamma_large[np.argsort(gamma_large)])


Out[59]:
[<matplotlib.lines.Line2D at 0x1a2d5b8518>]

In [57]:
sorted_letter


Out[57]:
array(['400000', '301000', '310000', '202000', '220000', '300001',
       '201001', '200002', '130000', '200020', '211000', '040000',
       '111010', '210010', '210001', '200200', '000040', '300010',
       '110020', '102010', '200101', '120010', '100003', '120001',
       '210100', '021010', '100021', '030100', '020020', '300100',
       '031000', '201100', '111001', '012010', '030010', '120100',
       '101002', '030001', '000031', '110101', '110002', '002020',
       '020200', '110200', '021100', '002011', '100120', '102001',
       '020002', '121000', '021001', '020110', '010030', '110011',
       '110110', '011200', '100030', '200110', '020101', '010003',
       '101011', '020011', '011020', '111100', '010120', '010300',
       '011011', '000400', '010021', '100300', '100201', '011110',
       '012001', '101101', '010201', '200011', '101200', '010102',
       '001012', '010111', '000130', '001210', '010210', '000121',
       '011101', '000103', '201010', '101110', '100111', '001300',
       '100102', '000301', '100210', '010012', '100012', '011002',
       '003010', '001021', '002101', '000202', '002002', '001003',
       '001201', '001111', '000220', '000310', '000211', '102100',
       '000112', '002110', '001120', '000022', '022000', '001102',
       '003100', '001030', '000013', '000004', '002200', '012100',
       '013000', '112000', '101020', '003001', '004000', '103000'],
      dtype='<U6')

In [61]:
plt.plot(gamma_more, label="gamma_more")
plt.plot(gamma_large, label="gamma_large")
# plt.plot(gamma_k8000, label="k8000")
# plt.plot(gamma_k8000_centered, label="k8000_center")
plt.legend()
# plt.ylim(-5,5)


Out[61]:
<matplotlib.legend.Legend at 0x1a2e754ba8>

In [60]:
plt.plot(gamma_k1000, label="k1000")
plt.plot(gamma_k2000, label="k2000")
plt.plot(gamma_k4000, label="k4000")
# plt.plot(gamma_more, label="gamma_more")
# plt.plot(gamma_large, label="gamma_large")
# plt.plot(gamma_k8000, label="k8000")
# plt.plot(gamma_k8000_centered, label="k8000_center")
plt.legend()


Out[60]:
<matplotlib.legend.Legend at 0x1a23730f28>

In [48]:
plt.plot(gamma_k1000, label="k1000")
plt.plot(gamma_k2000, label="k2000")
plt.plot(gamma_k4000, label="k4000")
plt.plot(gamma_more, label="gamma_more")
# plt.plot(gamma_k8000, label="k8000")
# plt.plot(gamma_k8000_centered, label="k8000_center")
plt.legend()
plt.ylim(-5,5)


Out[48]:
(-5, 5)

In [50]:
import glob
a = glob.glob("/Users/weilu/Research/server/may_2019/four_body_helix_more_data/optimization/data/*.dat")
data_all = []
for d in a:
    name = d.split("/")[-1].split("-")[0]
    tmp = pd.read_csv(d, sep=" ", names=["res_pair1_chain", "res_pair1_index_1", "res_pair1_index_2", 
                               "res_pair2_chain", "res_pair2_index_1", "res_pair2_index_2", 
                               "res_type",  "six_letter_string", "total_phi",
                              "d00", "d11", "d01", "d10"])
    data_all.append(tmp.assign(protein=name))
data = pd.concat(data_all)
data = data.reset_index(drop=True)
data["six_letter_string"] = data["six_letter_string"].astype(str).str.pad(width=6, fillchar="0")

In [52]:
data.head()


Out[52]:
res_pair1_chain res_pair1_index_1 res_pair1_index_2 res_pair2_chain res_pair2_index_1 res_pair2_index_2 res_type six_letter_string total_phi d00 d11 d01 d10 protein
0 A 15 19 A 77 81 GAVF 110101 0.691777 8.218560 9.569242 6.418541 4.126931 2c3e
1 A 17 21 A 283 287 ASGL 120100 0.687627 10.557247 5.287187 6.421049 4.385104 2c3e
2 A 17 21 A 286 290 ASVY 020101 0.995874 8.894876 6.960624 5.734986 4.060895 2c3e
3 A 18 22 A 76 80 AKNY 010021 0.987386 8.097395 8.315829 4.241660 3.940909 2c3e
4 A 19 23 A 73 77 ATNV 020110 0.995154 9.297452 6.720541 4.126931 5.918079 2c3e

In [53]:
data["six_letter_string"].value_counts()


Out[53]:
020200    710
010300    505
030100    430
110200    427
120100    422
010201    413
020101    334
110101    295
210100    244
020110    242
010210    240
130000    210
220000    204
000400    200
120001    195
110110    176
000301    161
010111    157
010102    145
100300    144
100201    142
200200    135
011200    131
120010    128
040000    125
030001    124
021100    109
210001    108
000310    101
200101     98
         ... 
011002      8
100012      7
100030      7
101011      6
000022      6
202000      5
012100      5
000031      4
001012      4
101002      4
022000      3
012010      3
201010      2
012001      2
002101      2
001003      2
301000      2
002011      2
000004      2
000013      2
102100      2
002002      1
102010      1
002110      1
102001      1
002020      1
112000      1
000040      1
002200      1
001021      1
Name: six_letter_string, Length: 118, dtype: int64

In [ ]:
def getSeqFromPDB(location, considerGap=True):
    x = PDBParser().get_structure("x", location)
    seq = ""
    resseqs = []
    preResId = 0
    for res in x.get_residues():
        resId = res.get_id()[1]
        if considerGap and resId != preResId + 1:
            seq += " "
            resseqs.append(-1)
        seq += three_to_one(res.get_resname())
        resseqs.append(res.get_id()[1])
        preResId = resId
    return seq,resseqs