In [1]:
import os
import sys
import random
import time
from random import seed, randint
import argparse
import platform
from datetime import datetime
import imp
import numpy as np
import fileinput
from itertools import product
import pandas as pd
from scipy.interpolate import griddata
from scipy.interpolate import interp2d
import seaborn as sns
from os import listdir
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.interpolate import griddata
import matplotlib as mpl
# sys.path.insert(0,'..')
# from notebookFunctions import *
# from .. import notebookFunctions
from Bio.PDB.Polypeptide import one_to_three
from Bio.PDB.Polypeptide import three_to_one
from Bio.PDB.PDBParser import PDBParser
from pyCodeLib import *
from small_script.myFunctions import *
%matplotlib inline
# plt.rcParams['figure.figsize'] = (10,6.180) #golden ratio
# %matplotlib notebook
%load_ext autoreload
%autoreload 2
In [2]:
# Notebook-wide matplotlib defaults: golden-ratio canvas (16.18 x 10),
# white figure background, 100 dpi.
plt.rcParams.update({
    'figure.figsize': [16.18033, 10],
    'figure.facecolor': 'w',
    'figure.dpi': 100,
})
In [3]:
# Protein identifiers grouped by benchmark set.  The "old" and "new" sets are
# written as comma-separated strings and split into lists; "test" is a list.
dataset = {
    "old": "1R69, 1UTG, 3ICB, 256BA, 4CPV, 1CCR, 2MHR, 1MBA, 2FHA".split(", "),
    "new": "1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", "),
    "test": ["t089", "t120", "t251", "top7", "1ubq", "t0766", "t0778", "t0782", "t0792", "t0803", "t0815", "t0833", "t0842", "t0844"],
}
# "combined" = old followed by new, each id lower-cased and cut to its first
# four characters.
dataset["combined"] = [
    pdb_id.lower()[:4] for pdb_id in [*dataset["old"], *dataset["new"]]
]
# Simulation folders iterated over by the loading cells.
folder_list = ["original", "multi_iter0"]
In [96]:
# Gather the "Newcontact" column of ff_energy_smaller.dat for every
# (folder, protein, repeat) run and stack the pieces into `d_span`.
dd = []
for simulation_location in folder_list:
    for p in dataset["combined"]:
        for i in range(30):
            d = pd.read_csv(
                f"/Users/weilu/Research/server/may_2019/database/{simulation_location}_{p}_{i}/ff_energy_smaller.dat",
                # Raw string: "\s" inside a plain literal is an invalid escape
                # sequence and triggers a warning on current Python versions.
                sep=r"\s+",
                names=["Name", "direct", "mediated", "burial", "water", "Newcontact"],
            )
            # Drop the first five characters of the label, parse the rest as an
            # integer and subtract one to obtain the frame index.
            d["Frame"] = d["Name"].apply(lambda x: int(x[5:]) - 1)
            d = d[["Newcontact", "Frame"]]
            # Replace the per-frame label with the run identifiers.
            d = d.assign(Name=p, Repeat=i, Folder=simulation_location)
            dd.append(d)
d_span = pd.concat(dd)
In [7]:
# Gather the "Newcontact" column of ff_energy.dat for every (protein, repeat)
# run of the "original" folder and stack the pieces into `d_last`.
# Runs whose file cannot be read or parsed are reported and skipped.
folder_list = ["original"]
dd = []
for simulation_location in folder_list:
    for p in dataset["combined"]:
        for i in range(30):
            try:
                d = pd.read_csv(
                    f"/Users/weilu/Research/server/may_2019/database/{simulation_location}_{p}_{i}/ff_energy.dat",
                    # Raw string: "\s" inside a plain literal is an invalid
                    # escape sequence.
                    sep=r"\s+",
                    names=["Name", "direct", "mediated", "burial", "water", "Newcontact"],
                )
            except (OSError, ValueError) as err:
                # OSError covers missing/unreadable files; pandas' empty-file
                # and parser errors derive from ValueError.  A bare `except:`
                # would also swallow KeyboardInterrupt.
                print(f"{simulation_location}_{p}_{i}", err)
                continue
            # Drop the first five characters of the label, parse the rest as an
            # integer and subtract one to obtain the frame index.
            d["Frame"] = d["Name"].apply(lambda x: int(x[5:]) - 1)
            d = d[["Newcontact", "Frame"]]
            d = d.assign(Name=p, Repeat=i, Folder=simulation_location)
            dd.append(d)
d_last = pd.concat(dd)
In [165]:
# NOTE(review): `pre` and `pdb_list` are not assigned in any cell above this
# one; their first visible assignment is in a later cell, so this cell appears
# to rely on existing kernel state -- confirm the intended execution order.
data = get_complete_data(
    pre, folder_list, pdb_list, run=-1, rerun=-1, formatName=True
)
data["Steps"] = data["Steps"].astype(int)
# Contact is the sum of the Water and Burial columns.
data["Contact"] = data["Water"].add(data["Burial"])
# `native` is a second name for the same DataFrame object, not a copy.
native = data
In [168]:
# First row of every (Name, Folder) group; the previous index is kept as a
# column by reset_index().  Contact is (re)computed as Water + Burial.
native_energy = (
    native
    .groupby(["Name", "Folder"])
    .head(1)
    .reset_index()
    .assign(Contact=lambda frame: frame["Water"] + frame["Burial"])
)
In [17]:
# Read newContact.dat from the native rerun of every (folder, protein) pair and
# stack the tables into `d_native`.
dd = []
for simulation_location in folder_list:
    for p in dataset["combined"]:
        location = f"/Users/weilu/Research/server/may_2019/single_memory/{simulation_location}/{p}/simulation/native/rerun/newContact.dat"
        d = pd.read_csv(
            location,
            # Raw string: "\s" inside a plain literal is an invalid escape
            # sequence and triggers a warning on current Python versions.
            sep=r"\s+",
            names=["Name", "direct", "mediated", "burial", "water", "Newcontact"],
        )
        # Overwrite the label read from the file with the protein id and record
        # which folder the row came from.
        d = d.assign(Name=p, Folder=simulation_location)
        dd.append(d)
d_native = pd.concat(dd)
In [21]:
# Persist `native` as CSV.  `index` is left at its default, so the row index is
# written as the first, unnamed column.
native.to_csv("/Users/weilu/Research/data/optimization/jun03_native.csv")
In [22]:
# Persist `d_native` as CSV.  `index` is left at its default, so the row index
# is written as the first, unnamed column.
d_native.to_csv("/Users/weilu/Research/data/optimization/jun03_d_native.csv")
In [79]:
# Disabled experiment kept for reference: read ff_energy_smaller.dat of a single
# run (original_1r69_0) twice and concatenate the two copies.
# dd = []
# for i in range(2):
# d = pd.read_csv(f"/Users/weilu/Research/server/may_2019/database/original_1r69_0/ff_energy_smaller.dat", sep="\s+", names=["Name", "direct", "mediated", "burial", "water", "Newcontact"])
# d["Frame"] = d["Name"].apply(lambda x:int(x[5:])-1)
# d = d[["Newcontact", "Frame"]]
# dd.append(d.assign(Name="1r69", Repeat=i, Folder="original"))
# d = pd.concat(dd)
# Frame index = Steps integer-divided by 4000 (presumably one saved frame every
# 4000 steps -- TODO confirm).  The column is added to `data` in place, so any
# other name bound to the same DataFrame object gains it as well.
data["Frame"] = data["Steps"] // 4000
# data = data.reset_index().rename(columns={"index":"Frame"})
# data = pd.read_csv("/Users/weilu/Research/data/optimization/may12.csv")
# Attach the `d_last` columns to the matching rows.  merge() defaults to an
# inner join, so rows without a match on all four keys are dropped.
data = data.merge(d_last, on=["Name", "Repeat", "Frame", "Folder"])
In [14]:
# Inputs for get_complete_data: root directory, folders and protein ids.
pre = "/Users/weilu/Research/server/may_2019/single_memory/"
folder_list = ["original", "multi_iter0"]
# Lower-case each id and keep its first four characters (this leaves ids that
# already have that form unchanged).
dataset["combined"] = [pdb_id.lower()[:4] for pdb_id in dataset["combined"]]
pdb_list = dataset["combined"]

data = get_complete_data(
    pre, folder_list, pdb_list, run=-1, rerun=-1, formatName=True
)
data["Steps"] = data["Steps"].astype(int)
# Contact is the sum of the Water and Burial columns.
data["Contact"] = data["Water"].add(data["Burial"])
# `native` is a second name for the same DataFrame object, not a copy.
native = data
In [15]:
# Take the first row of each (Name, Folder) group, turn the old index into a
# column, and set Contact = Water + Burial on the result.
first_rows = native.groupby(["Name", "Folder"]).head(1)
native_energy = first_rows.reset_index().assign(
    Contact=lambda frame: frame["Water"] + frame["Burial"]
)
In [9]:
# Load the cached table from may12.csv (the same path that the
# `data.to_csv(..., index=False)` cell of this notebook writes to).
data = pd.read_csv("/Users/weilu/Research/data/optimization/may12.csv")
In [12]:
# `lastFrames` is another name for the same DataFrame object as `data` (no copy).
lastFrames = data
In [32]:
# One panel per Name value, four panels per row, each with its own axis limits.
g = sns.FacetGrid(lastFrames, col="Name",col_wrap=4, sharey=False, sharex=False)
# Qw (y axis) against Contact (x axis), semi-transparent, default colour.
g.map(plt.scatter, "Contact", "Qw", alpha=0.2)
# Qw against Newcontact overlaid in red on the same panels; both energies share
# the x axis of each panel.
g.map(plt.scatter, "Newcontact", "Qw", color="red", alpha=0.2)
Out[32]:
In [18]:
# Contact (x) vs. Newcontact (y) per protein, coloured by Qw, with dashed lines
# marking the native values.
y_show = "Contact"
g = sns.FacetGrid(lastFrames, col='Name', col_wrap=4, sharey=False, sharex=False)


def facet_scatter(x, y, c, **kwargs):
    """Scatter `x` against `y`, colouring each point by the values in `c`.

    Written for `FacetGrid.map`, which passes a `color` keyword for each facet;
    that keyword is discarded because it would conflict with `c`.  Remaining
    keyword arguments are forwarded to `plt.scatter`.
    """
    # Default of None: do not fail when the caller supplies no `color`.
    kwargs.pop("color", None)
    plt.scatter(x, y, c=c, **kwargs)


# One colour scale shared by every facet and by the colorbar.
vmin, vmax = 0, 1
cmap = plt.cm.Reds
# vmin/vmax have to reach the scatter call itself: without them each facet
# autoscales its colours to its own Qw range and the 0-1 colorbar drawn below
# would not describe the plotted points.
g = g.map(facet_scatter, 'Contact', 'Newcontact', "Qw", s=10, alpha=1, vmin=vmin, vmax=vmax, cmap=cmap)
# Make space for the colorbar
g.fig.subplots_adjust(right=.92)
# Define a new Axes where the colorbar will go
cax = g.fig.add_axes([.94, .25, .02, .6])
# Empty scatter used only as a mappable carrying the same colormap and limits
points = plt.scatter([], [], c=[], vmin=vmin, vmax=vmax, cmap=cmap)
# Draw the colorbar
g.fig.colorbar(points, cax=cax)

for ax in g.axes:
    # The protein id is the last space-separated token of the facet title.
    name = ax.title.get_text().split(" ")[-1]
    # Vertical line: `y_show` value of the first native_energy row for this id.
    energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[0]
    ax.axvline(energy, ls="--", color="blue", linewidth=4)
    # Horizontal line: Newcontact of the first d_native row for this id.
    energy = d_native.query(f"Name == '{name}'")["Newcontact"].iloc[0]
    ax.axhline(energy, ls="--", color="orange", linewidth=4)
In [218]:
# Show the (rows, columns) size of `lastFrames`.
lastFrames.shape
Out[218]:
In [219]:
# Contact (x) vs. Newcontact (y) per protein, coloured by Qw, with dashed lines
# marking the native values.
y_show = "Contact"
g = sns.FacetGrid(lastFrames, col='Name', col_wrap=4, sharey=False, sharex=False)


def facet_scatter(x, y, c, **kwargs):
    """Scatter `x` against `y`, colouring each point by the values in `c`.

    Written for `FacetGrid.map`, which passes a `color` keyword for each facet;
    that keyword is discarded because it would conflict with `c`.  Remaining
    keyword arguments are forwarded to `plt.scatter`.
    """
    # Default of None: do not fail when the caller supplies no `color`.
    kwargs.pop("color", None)
    plt.scatter(x, y, c=c, **kwargs)


# One colour scale shared by every facet and by the colorbar.
vmin, vmax = 0, 1
cmap = plt.cm.Reds
# vmin/vmax have to reach the scatter call itself: without them each facet
# autoscales its colours to its own Qw range and the 0-1 colorbar drawn below
# would not describe the plotted points.
g = g.map(facet_scatter, 'Contact', 'Newcontact', "Qw", s=10, alpha=1, vmin=vmin, vmax=vmax, cmap=cmap)
# Make space for the colorbar
g.fig.subplots_adjust(right=.92)
# Define a new Axes where the colorbar will go
cax = g.fig.add_axes([.94, .25, .02, .6])
# Empty scatter used only as a mappable carrying the same colormap and limits
points = plt.scatter([], [], c=[], vmin=vmin, vmax=vmax, cmap=cmap)
# Draw the colorbar
g.fig.colorbar(points, cax=cax)

for ax in g.axes:
    # The protein id is the last space-separated token of the facet title.
    name = ax.title.get_text().split(" ")[-1]
    # Vertical line: `y_show` value of the first native_energy row for this id.
    energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[0]
    ax.axvline(energy, ls="--", color="blue", linewidth=4)
    # Horizontal line: Newcontact of the first d_native row for this id.
    energy = d_native.query(f"Name == '{name}'")["Newcontact"].iloc[0]
    ax.axhline(energy, ls="--", color="orange", linewidth=4)
In [107]:
# Seaborn's default cubehelix palette, returned as a matplotlib colormap object.
cmap = sns.cubehelix_palette(as_cmap=True)
# , c=data["Qw"]
In [120]:
# Rows of `data` whose Folder column equals 'original'.
data_folder_original = data.query("Folder == 'original'")
In [ ]:
# Display matplotlib's "Accent" colormap object (inspection only).
plt.cm.Accent
In [134]:
# Contact (x) vs. Newcontact (y) per protein for the 'original' folder only,
# coloured by Qw.
g = sns.FacetGrid(data_folder_original, col='Name', col_wrap=4, sharey=False, sharex=False)


def facet_scatter(x, y, c, **kwargs):
    """Scatter `x` against `y`, colouring each point by the values in `c`.

    Written for `FacetGrid.map`, which passes a `color` keyword for each facet;
    that keyword is discarded because it would conflict with `c`.  Remaining
    keyword arguments are forwarded to `plt.scatter`.
    """
    # Default of None: do not fail when the caller supplies no `color`.
    kwargs.pop("color", None)
    plt.scatter(x, y, c=c, **kwargs)


# One colour scale shared by every facet and by the colorbar.
vmin, vmax = 0, 1
cmap = plt.cm.Reds
# vmin/vmax have to reach the scatter call itself: without them each facet
# autoscales its colours to its own Qw range and the 0-1 colorbar drawn below
# would not describe the plotted points.
g = g.map(facet_scatter, 'Contact', 'Newcontact', "Qw", s=10, alpha=1, vmin=vmin, vmax=vmax, cmap=cmap)
# Make space for the colorbar
g.fig.subplots_adjust(right=.92)
# Define a new Axes where the colorbar will go
cax = g.fig.add_axes([.94, .25, .02, .6])
# Empty scatter used only as a mappable carrying the same colormap and limits
points = plt.scatter([], [], c=[], vmin=vmin, vmax=vmax, cmap=cmap)
# Draw the colorbar
g.fig.colorbar(points, cax=cax)
Out[134]:
In [109]:
y_show = "Contact"
g = sns.FacetGrid(data, col="Name",col_wrap=4, hue="Folder", sharey=False, sharex=False)
g = (g.map(plt.scatter, "Newcontact", y_show, alpha=0.5).add_legend())
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'multi_iter0_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="blue", linewidth=4)
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'original_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="orange", linewidth=4)
for ax in g.axes:
name= ax.title.get_text().split(" ")[-1]
# print(name)
# energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[0]
# ax.axhline(energy, ls="--", color="blue", linewidth=4)
# energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[1]
# ax.axhline(energy, ls="--", color="orange", linewidth=4)
In [104]:
y_show = "Contact"
g = sns.FacetGrid(data, col="Name",col_wrap=4, hue="Folder", sharey=False, sharex=False)
g = (g.map(plt.scatter, "Qw", y_show, alpha=0.5).add_legend())
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'multi_iter0_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="blue", linewidth=4)
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'original_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="orange", linewidth=4)
for ax in g.axes:
name= ax.title.get_text().split(" ")[-1]
# print(name)
# energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[0]
# ax.axhline(energy, ls="--", color="blue", linewidth=4)
# energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[1]
# ax.axhline(energy, ls="--", color="orange", linewidth=4)
In [115]:
y_show = "Newcontact"
g = sns.FacetGrid(data, col="Name",col_wrap=4, hue="Folder", sharey=False, sharex=False)
g = (g.map(plt.scatter, "Qw", y_show, alpha=0.5).add_legend())
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'multi_iter0_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="blue", linewidth=4)
# energy = native_energy.query("Name == 'T0759-D1' and Folder == 'original_with_minimization'")["VTotal"][0]
# g.axes[0].axhline(energy, ls="--", color="orange", linewidth=4)
for ax in g.axes:
name= ax.title.get_text().split(" ")[-1]
# print(name)
# energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[0]
# ax.axhline(energy, ls="--", color="blue", linewidth=4)
# energy = native_energy.query(f"Name == '{name}'")[y_show].iloc[1]
# ax.axhline(energy, ls="--", color="orange", linewidth=4)
In [56]:
# Every fourth id of the combined list, starting with the first one.
dataset["combined"][::4]
Out[56]:
In [131]:
# Load the run=30 / rerun=1 data for both folders and every combined protein id.
pre = "/Users/weilu/Research/server/may_2019/single_memory/"
folder_list = ["multi_iter0", "original"]
pdb_list = dataset["combined"]

data = get_complete_data(
    pre, folder_list, pdb_list, run=30, rerun=1, formatName=True
)
data["Steps"] = data["Steps"].astype(int)
# Contact is the sum of the Water and Burial columns.
data["Contact"] = data["Water"].add(data["Burial"])
# Rows whose step count is a non-zero multiple of 80000.
subset_data = data.query("Steps % 80000 == 0 and Steps != 0")
In [145]:
# Turn the current index into a regular column and rename it to "Frame" (the
# rename only applies when reset_index() names the new column "index", i.e.
# when the index is unnamed).
# NOTE(review): this rebinds `data` from itself, so running the cell a second
# time applies reset_index() again to the already-reset frame.
data = data.reset_index().rename(columns={"index":"Frame"})
In [217]:
# Cache `data` as may12.csv; index=False leaves the row index out of the file.
data.to_csv("/Users/weilu/Research/data/optimization/may12.csv", index=False)
In [153]:
# Preview the first rows of `data`.
data.head()
Out[153]:
In [154]:
# Preview the last rows of `data`.
data.tail()
Out[154]:
In [155]:
# Read may12.csv back from disk into `a`.
a = pd.read_csv("/Users/weilu/Research/data/optimization/may12.csv")
In [222]:
import xml.etree.ElementTree as ET
import pandas as pd
# do this if running in jupyter
# pd.set_option('display.max_columns', None)
# convert XML to dataframe (assumes only one layer of nesting)
def xml2df(xml_data):
    """Convert an XML document into a DataFrame, one row per child of the root.

    Every direct child of the root element becomes one row.  The tags of that
    child's own children become the column names and their text the values;
    deeper nesting is not expanded.  If a tag repeats within one record the
    last occurrence wins.

    Parameters
    ----------
    xml_data : str or bytes
        The XML document content (not a file path).

    Returns
    -------
    pandas.DataFrame
        One row per record; fields missing from a record are left empty.
    """
    root = ET.XML(xml_data)
    records = [{field.tag: field.text for field in entry} for entry in root]
    return pd.DataFrame(records)
# load XML to dataframe (gotta be small)
# The context manager closes the file handle as soon as the text has been read;
# a bare open(...).read() leaves closing to the garbage collector.
with open('/Users/weilu/Downloads/2018/1800011.xml') as xml_file:
    xml_data = xml_file.read()
df = xml2df(xml_data)
In [223]:
# Display the DataFrame built from the XML file.
df
Out[223]:
In [225]:
# `glob` is not imported explicitly in the import cell; import it here so this
# cell does not depend on a wildcard import happening to expose it.
import glob

# Paths of every .xml file in the 2018 download folder (order is not sorted).
a = glob.glob("/Users/weilu/Downloads/2018/*.xml")
In [228]:
# Parse every XML file listed in `a` into a DataFrame and collect the results.
# Files that cannot be read or parsed are reported and skipped.
all_data = []
for xml_path in a:
    # load XML to dataframe (gotta be small)
    try:
        # Context manager so each file handle is closed before the next one.
        with open(xml_path) as xml_file:
            xml_data = xml_file.read()
        df = xml2df(xml_data)
    except (OSError, UnicodeDecodeError, ET.ParseError) as err:
        # Only I/O, decoding and XML syntax failures are skipped; a bare
        # `except:` would also swallow KeyboardInterrupt.
        print(xml_path, err)
        continue
    all_data.append(df)
In [260]:
# NOTE(review): `a` has to be a DataFrame with an AwardAmount column here, but
# its last visible assignment is the list of XML paths returned by glob --
# presumably it was rebuilt in a cell that is no longer present; confirm.
# Histogram (100 bins) of AwardAmount restricted to values below 15.
a.query("AwardAmount < 15").hist("AwardAmount", bins=100)
Out[260]:
In [265]:
# Shape of the subset with AwardAmount above 10 (first entry = number of rows).
# NOTE(review): relies on `a` being a DataFrame with an AwardAmount column; no
# visible cell builds it -- confirm where it comes from.
a.query("AwardAmount > 10").shape
Out[265]:
In [262]:
# Histogram (100 bins) of AwardAmount restricted to values above 2.
# NOTE(review): relies on `a` being a DataFrame with an AwardAmount column; no
# visible cell builds it -- confirm where it comes from.
a.query("AwardAmount > 2").hist("AwardAmount", bins=100)
Out[262]:
In [ ]: