In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os
# from small_script.myFunctions import *



%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Notebook-wide matplotlib defaults: figure size, background colour and resolution.
plt.rcParams.update({
    'figure.figsize': [16.18033, 10],    # golden ratio
    'figure.facecolor': 'w',
    'figure.dpi': 100,
})

In [6]:
a = pd.read_csv("/Users/weilu/Research/server/may_2019/database/Q_original_1r69", index_col=0)

In [79]:
# Draw 200 rows with Rank < 6000. The seed is fixed so that the same rows are
# selected on every run; without it each re-execution samples a different
# subset and the outputs further down cannot be reproduced.
b = a.query("Rank < 6000").sample(200, random_state=0)

In [81]:
# Read the movie file of every run present in `b`, keyed by run id.
all_movies = {}
for i in b["Run"].unique():
    with open(f"/Users/weilu/Research/server/may_2019/database/original_1r69_{i}/movie.pdb") as f:
        all_movies[i] = movie = f.readlines()

# `size` is the 1-based position of the first "END" line in the last movie
# read above (or the full line count if there is no such line); it stays 0
# for an empty movie.
size = 0
for size, line in enumerate(movie, start=1):
    if line == "END\n":
        break
print(size)


314

In [85]:
def getStructures(x, all_movies, frame_size=None, pdb_parser=None):
    """Parse the movie frame referenced by one table row.

    Parameters
    ----------
    x : mapping or pandas.Series
        Must provide an "index" entry (frame number) and a "Run" entry
        (key into ``all_movies``); both are converted with ``int``.
    all_movies : mapping
        Run id -> list of the lines of that run's movie file.
    frame_size : int, optional
        Number of lines per frame. Defaults to the notebook-level ``size``.
    pdb_parser : object, optional
        Anything exposing ``get_structure(id, handle)``. Defaults to a new
        ``Bio.PDB.PDBParser.PDBParser()``.

    Returns
    -------
    The object returned by ``pdb_parser.get_structure``; its id is the
    shifted frame number as a string.

    Raises
    ------
    IndexError
        If the requested frame extends past the end of the movie (slicing
        would otherwise silently hand the parser truncated or empty text).
    """
    # Imported here so the function does not depend on `io` / `parser`
    # having been created by cells that appear later in the notebook.
    import io

    if frame_size is None:
        frame_size = size
    if pdb_parser is None:
        from Bio.PDB.PDBParser import PDBParser
        pdb_parser = PDBParser()

    # The frame number from the table is shifted by one before slicing
    # (kept from the original; presumably frame 0 of the movie is the
    # initial structure -- TODO confirm).
    index = int(x["index"]) + 1
    run = int(x["Run"])

    lines = all_movies[run]
    start = index * frame_size
    end = start + frame_size
    if end > len(lines):
        raise IndexError(
            f"frame {index} of run {run} needs lines {start}:{end}, "
            f"but the movie has only {len(lines)} lines"
        )

    handle = io.StringIO("".join(lines[start:end]))
    return pdb_parser.get_structure(f"{index}", handle)

In [92]:
b.iloc[0]


Out[92]:
index                                                      923
 Qw                                                   0.535499
Run                                                          7
Rank                                                    2295.5
structure    (((<Residue SER het=  resseq=1 icode= >, <Resi...
Name: 7923, dtype: object

In [91]:
xx = b.iloc[0]["structure"]

In [95]:
# Write the structure held in `xx` to a PDB file using Biopython's PDBIO writer.
from Bio.PDB.PDBIO import PDBIO
out = PDBIO()
out.set_structure(xx)  # select the structure that save() will write
out.save('/Users/weilu/Research/server/may_2019/out.pdb')

In [104]:
b.to_pickle("/Users/weilu/Research/server/may_2019/out.pkl")

In [110]:
c = pd.read_pickle("/Users/weilu/Research/server/may_2019/out.pkl")

In [ ]:
# NOTE(review): never-executed scratch cell (no execution count). It assigns
# the entire DataFrame `b` to the single column "structure"; pandas is
# expected to reject this when `b` has more than one column -- TODO confirm
# and delete. The working assignment is the `b.apply(getStructures, ...)` cell.
b["structure"] = b

In [114]:
b["structure"] = b.apply(getStructures, all_movies=all_movies, axis=1)

In [115]:
b.equals(c)


Out[115]:
True

In [13]:
314314/1001


Out[13]:
314.0

In [140]:
b


Out[140]:
index Qw Run Rank structure Qw
7923 923 0.535499 7 2295.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.535
1960 960 0.464596 1 1185.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.465
10991 991 0.646153 10 255.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.646
29946 946 0.556851 29 1605.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.557
4954 954 0.629374 4 1365.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.629
10982 982 0.644678 10 525.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.645
13885 885 0.685323 13 3435.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.685
6846 846 0.551446 6 4605.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.551
25955 955 0.714764 25 1335.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.715
21802 802 0.517790 21 5925.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.518
14839 839 0.514815 14 4815.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.515
29838 838 0.615635 29 4845.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.616
2837 837 0.741397 2 4875.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.741
15924 924 0.532348 15 2265.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.532
14822 822 0.507551 14 5325.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.508
28871 871 0.566710 28 3855.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.567
24943 943 0.733249 24 1695.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.733
17993 993 0.638778 17 195.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.639
11821 821 0.518540 11 5355.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.519
8901 901 0.602135 8 2955.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.602
20878 878 0.475997 20 3645.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.476
24911 911 0.681996 24 2655.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.682
10996 996 0.657030 10 105.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.657
4844 844 0.703470 4 4665.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.703
20801 801 0.648037 20 5955.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.648
5846 846 0.497530 5 4605.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.498
11877 877 0.522874 11 3675.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.523
9898 898 0.610686 9 3045.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.611
25897 897 0.619894 25 3075.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.620
10825 825 0.562865 10 5235.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.563
... ... ... ... ... ... ...
6899 899 0.483526 6 3015.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.484
4962 962 0.588982 4 1125.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.589
16814 814 0.505825 16 5565.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.506
17898 898 0.702807 17 3045.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.703
2880 880 0.553256 2 3585.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.553
1868 868 0.419378 1 3945.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.419
18865 865 0.513220 18 4035.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.513
15915 915 0.571546 15 2535.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.572
22844 844 0.602857 22 4665.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.603
11900 900 0.485654 11 2985.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.486
7835 835 0.735204 7 4935.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.735
21856 856 0.564427 21 4305.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.564
29822 822 0.612041 29 5325.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.612
8938 938 0.570013 8 1845.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.570
12851 851 0.468245 12 4455.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.468
18866 866 0.497601 18 4005.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.498
6854 854 0.502701 6 4365.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.503
15885 885 0.529569 15 3435.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.530
19847 847 0.696022 19 4575.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.696
934 934 0.537232 0 1965.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.537
10947 947 0.651820 10 1575.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.652
20869 869 0.470937 20 3915.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.471
25919 919 0.626449 25 2415.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.626
6838 838 0.513957 6 4845.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.514
22883 883 0.549574 22 3495.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.550
15937 937 0.520374 15 1875.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.520
9960 960 0.578464 9 1185.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.578
20803 803 0.673201 20 5895.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.673
18826 826 0.677789 18 5205.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.678
24885 885 0.558958 24 3435.5 (((<Residue SER het= resseq=1 icode= >, <Resi... 0.559

200 rows × 6 columns


In [143]:
c["structure"].tolist()


Out[143]:
[<Structure id=924>,
 <Structure id=961>,
 <Structure id=992>,
 <Structure id=947>,
 <Structure id=955>,
 <Structure id=983>,
 <Structure id=886>,
 <Structure id=847>,
 <Structure id=956>,
 <Structure id=803>,
 <Structure id=840>,
 <Structure id=839>,
 <Structure id=838>,
 <Structure id=925>,
 <Structure id=823>,
 <Structure id=872>,
 <Structure id=944>,
 <Structure id=994>,
 <Structure id=822>,
 <Structure id=902>,
 <Structure id=879>,
 <Structure id=912>,
 <Structure id=997>,
 <Structure id=845>,
 <Structure id=802>,
 <Structure id=847>,
 <Structure id=878>,
 <Structure id=899>,
 <Structure id=898>,
 <Structure id=826>,
 <Structure id=853>,
 <Structure id=896>,
 <Structure id=868>,
 <Structure id=994>,
 <Structure id=895>,
 <Structure id=907>,
 <Structure id=918>,
 <Structure id=982>,
 <Structure id=894>,
 <Structure id=984>,
 <Structure id=846>,
 <Structure id=980>,
 <Structure id=869>,
 <Structure id=986>,
 <Structure id=810>,
 <Structure id=817>,
 <Structure id=882>,
 <Structure id=973>,
 <Structure id=980>,
 <Structure id=887>,
 <Structure id=936>,
 <Structure id=974>,
 <Structure id=852>,
 <Structure id=925>,
 <Structure id=842>,
 <Structure id=906>,
 <Structure id=875>,
 <Structure id=989>,
 <Structure id=824>,
 <Structure id=893>,
 <Structure id=898>,
 <Structure id=825>,
 <Structure id=884>,
 <Structure id=890>,
 <Structure id=833>,
 <Structure id=928>,
 <Structure id=900>,
 <Structure id=993>,
 <Structure id=892>,
 <Structure id=894>,
 <Structure id=828>,
 <Structure id=953>,
 <Structure id=998>,
 <Structure id=966>,
 <Structure id=921>,
 <Structure id=974>,
 <Structure id=952>,
 <Structure id=893>,
 <Structure id=823>,
 <Structure id=915>,
 <Structure id=918>,
 <Structure id=885>,
 <Structure id=882>,
 <Structure id=943>,
 <Structure id=975>,
 <Structure id=981>,
 <Structure id=926>,
 <Structure id=968>,
 <Structure id=970>,
 <Structure id=950>,
 <Structure id=997>,
 <Structure id=956>,
 <Structure id=820>,
 <Structure id=951>,
 <Structure id=888>,
 <Structure id=940>,
 <Structure id=913>,
 <Structure id=921>,
 <Structure id=929>,
 <Structure id=810>,
 <Structure id=992>,
 <Structure id=919>,
 <Structure id=826>,
 <Structure id=987>,
 <Structure id=972>,
 <Structure id=855>,
 <Structure id=962>,
 <Structure id=883>,
 <Structure id=835>,
 <Structure id=914>,
 <Structure id=969>,
 <Structure id=891>,
 <Structure id=887>,
 <Structure id=931>,
 <Structure id=866>,
 <Structure id=996>,
 <Structure id=997>,
 <Structure id=824>,
 <Structure id=969>,
 <Structure id=883>,
 <Structure id=840>,
 <Structure id=936>,
 <Structure id=843>,
 <Structure id=862>,
 <Structure id=820>,
 <Structure id=900>,
 <Structure id=900>,
 <Structure id=947>,
 <Structure id=995>,
 <Structure id=858>,
 <Structure id=861>,
 <Structure id=865>,
 <Structure id=965>,
 <Structure id=956>,
 <Structure id=974>,
 <Structure id=923>,
 <Structure id=880>,
 <Structure id=802>,
 <Structure id=938>,
 <Structure id=959>,
 <Structure id=910>,
 <Structure id=882>,
 <Structure id=835>,
 <Structure id=991>,
 <Structure id=934>,
 <Structure id=833>,
 <Structure id=859>,
 <Structure id=868>,
 <Structure id=811>,
 <Structure id=829>,
 <Structure id=864>,
 <Structure id=808>,
 <Structure id=966>,
 <Structure id=941>,
 <Structure id=999>,
 <Structure id=837>,
 <Structure id=892>,
 <Structure id=855>,
 <Structure id=916>,
 <Structure id=930>,
 <Structure id=993>,
 <Structure id=851>,
 <Structure id=917>,
 <Structure id=851>,
 <Structure id=853>,
 <Structure id=860>,
 <Structure id=883>,
 <Structure id=919>,
 <Structure id=810>,
 <Structure id=881>,
 <Structure id=900>,
 <Structure id=963>,
 <Structure id=815>,
 <Structure id=899>,
 <Structure id=881>,
 <Structure id=869>,
 <Structure id=866>,
 <Structure id=916>,
 <Structure id=845>,
 <Structure id=901>,
 <Structure id=836>,
 <Structure id=857>,
 <Structure id=823>,
 <Structure id=939>,
 <Structure id=852>,
 <Structure id=867>,
 <Structure id=855>,
 <Structure id=886>,
 <Structure id=848>,
 <Structure id=935>,
 <Structure id=948>,
 <Structure id=870>,
 <Structure id=920>,
 <Structure id=839>,
 <Structure id=884>,
 <Structure id=938>,
 <Structure id=961>,
 <Structure id=804>,
 <Structure id=827>,
 <Structure id=886>]

In [14]:
with open("/Users/weilu/Research/server/may_2019/database/original_1r69_0/movie.pdb") as f:
    movie = f.readlines()

In [55]:


In [56]:
size


Out[56]:
314

In [53]:
movie[-1]


Out[53]:
'END\n'

In [15]:
len(movie)


Out[15]:
314314

In [33]:
f = io.StringIO("".join(movie[:314]))

In [34]:
from Bio.PDB.PDBParser import PDBParser
parser = PDBParser()
x = parser.get_structure("test", f)

In [134]:
a1, a2 = "origianl_frag__1r69_2".split("__")

In [135]:
a1


Out[135]:
'origianl_frag'

In [136]:
a2


Out[136]:
'1r69_2'

In [19]:
import io

In [17]:
import tempfile

In [ ]:


In [ ]:
tempfile.TemporaryFile

In [ ]:
# delete=False keeps the file on disk after close(); whoever uses it is
# responsible for removing it with os.remove(temp_file.name).
temp_file = tempfile.NamedTemporaryFile(delete=False)

In [ ]:
from Bio.PDB.PDBParser import PDBParser
import numpy , tempfile ,os , re

# Matches every "END" token, including the one inside "ENDMDL". PDB_parse no
# longer uses it; it stays defined in case other cells reference the name.
models_re = re.compile("END")
# One complete model record: "MODEL" through the next "ENDMDL", across newlines.
pdb_re = re.compile(r"MODEL(.*?)ENDMDL", re.DOTALL)

def PDB_parse(pdb_file_handle):
    """Split a multi-model PDB file into the text of its individual models.

    Parameters
    ----------
    pdb_file_handle : str or path-like
        Path of the PDB file (despite the name, it is passed to ``open``).

    Returns
    -------
    list of str
        One string per model, in file order, each running from "MODEL"
        through the matching "ENDMDL". Empty if the file has no such record.
    """
    # Close the file deterministically instead of leaking the handle.
    with open(pdb_file_handle, "r") as handle:
        contents = handle.read()
    # Scan the whole text once. Searching forward from each "END" position
    # skipped the first model and hit a failed match (None) at the last one.
    return [match.group() for match in pdb_re.finditer(contents)]

# One C-alpha coordinate array per parsed model, in file order.
array_all_structure = []

# NOTE(review): `pdb_file_handle` (a path -- PDB_parse passes it to open())
# and `parser` are not defined in this cell; both must already exist in the
# notebook namespace when it runs.
for model_text in PDB_parse(pdb_file_handle):
    # Each model is written to a temporary file for the parser. Text mode is
    # required because model_text is a str, and leaving the with-block
    # flushes and closes the file before the parser re-opens it by name.
    with tempfile.NamedTemporaryFile(mode="w", delete=False) as temp_file:
        temp_file.write(model_text)
    try:
        structure = parser.get_structure("1fqy", temp_file.name)
    finally:
        # delete=False means cleanup is manual; do it even if parsing fails.
        os.remove(temp_file.name)

    model = structure[0]
    chain = model["A"]
    # Fixed-size buffer kept from the original: room for 226 residues, with
    # rows beyond the chain length left as zeros.
    S1coor = numpy.zeros(shape=(226, 3))
    for row, residue1 in enumerate(chain):
        S1coor[row] = residue1['CA'].get_coord()
    # Store the coordinates of this model. The original appended the running
    # residue counter here, once per residue, and discarded S1coor.
    array_all_structure.append(S1coor)