In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from small_script.myFunctions import *
import feather
import Bio.PDB as bio
d3_to_index = bio.Polypeptide.d3_to_index # we may want to adjust this in the future.
three_to_one = bio.Polypeptide.three_to_one
plt.rcParams['figure.figsize'] = [16.18033, 10]
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [4]:
data = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/feather_frag_data_jan14.feather")
In [5]:
data.head()
Out[5]:
In [15]:
data.iloc[:,3:24].max().max()
Out[15]:
In [16]:
data.iloc[:,3:24].min().min()
Out[16]:
In [101]:
np.arange(0,31,4)
Out[101]:
In [57]:
pd.interval_range(start=0, end=31, freq=5).mid.astype(int)
Out[57]:
In [102]:
labels = pd.interval_range(start=0, end=31, freq=4).mid.astype(int)
for i in range(1,22):
data[f"d{i}"] = pd.cut(data[f"dis{i}"], bins=np.arange(0,31,4), labels=labels)
In [105]:
data["dd"] = d["d1"].astype(str)
for i in range(2,22):
data["dd"] += d[f"d{i}"].astype(str)
d = data.query("DisType == 'dis_ca_ca'").reset_index(drop=True)
In [ ]:
data.to_feather("/Users/weilu/Research/optimization/fragment/feather_frag_data_bin4_jan18.feather")
In [106]:
count_data = d["dd"].value_counts()
In [108]:
# number of unique fold. 78,333
len(count_data)
Out[108]:
In [109]:
# number of total frag, 475,358
count_data.reset_index()["dd"].sum()
Out[109]:
In [110]:
count_data.reset_index().query("dd > 1").shape
Out[110]:
In [111]:
count_data.reset_index().query("dd > 2").shape
Out[111]:
In [113]:
count_data.reset_index().query("dd > 5").shape
Out[113]:
In [114]:
count_data.reset_index()["dd"].hist(bins=50, log=True)
Out[114]:
In [240]:
data["dis6"][data["dis6"] >= 30] = 29.99 # cap the large dis to 30
labels = pd.interval_range(start=0, end=30, freq=2).mid.astype(int)
for i in range(1,22):
data[f"d{i}"] = pd.cut(data[f"dis{i}"], bins=np.arange(0,32,2), labels=labels)
In [215]:
labels
Out[215]:
In [239]:
np.arange(0,32,2)
Out[239]:
In [263]:
data["dd"] = data["d1"].astype(int).astype(str)
for i in range(2,22):
data["dd"] += "," + data[f"d{i}"].astype(int).astype(str)
d = data.query("DisType == 'dis_ca_ca'").reset_index(drop=True)
In [243]:
data.to_feather("/Users/weilu/Research/optimization/fragment/feather_frag_data_bin2_jan18.feather")
In [195]:
data = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/feather_frag_data_bin2_jan18.feather")
# data = data.drop(columns="dd")
# d = data.query("DisType == 'dis_ca_ca'").reset_index(drop=True)
In [264]:
d.head()
Out[264]:
In [265]:
count_data = d["dd"].value_counts()
In [266]:
# number of unique fold. 853,589
len(count_data)
Out[266]:
In [267]:
# number of total frag, 1,901,430
count_data.reset_index()["dd"].sum()
Out[267]:
In [268]:
count_data.reset_index().query("dd > 1").shape
Out[268]:
In [269]:
count_data.reset_index().query("dd > 2").shape
Out[269]:
In [270]:
count_data.reset_index().query("dd > 5").shape
Out[270]:
In [271]:
count_data.reset_index().query("dd > 10").shape
Out[271]:
In [272]:
count_data_dd_10 = count_data.reset_index().query("dd > 10").rename(columns={"index":"dd", "dd":"count"})
count_data_dd_10 = count_data_dd_10.reset_index().rename(columns={"index":"category"})
In [273]:
save_data = d.merge(count_data_dd_10, how='left', left_index=True, on="dd")
In [275]:
save_data.reset_index(drop=True).to_csv("/Users/weilu/Research/optimization/fragment/clustered_bin2_jan18.csv")
In [80]:
count_data.reset_index().query("dd > 1").shape
Out[80]:
In [81]:
count_data.reset_index().query("dd > 2").shape
Out[81]:
In [83]:
count_data.reset_index().query("dd > 3").shape
Out[83]:
In [89]:
count_data.reset_index().query("dd > 1")["dd"].sum()
Out[89]:
In [85]:
# number of unique fold, filter less frequent, 29,117
count_data.reset_index().query("dd > 5").shape
Out[85]:
In [87]:
count_data.reset_index().query("dd > 5")["dd"].sum()
Out[87]:
In [94]:
count_data.reset_index()["dd"].hist(bins=50, log=True)
Out[94]:
In [38]:
pd.cut(data.head()["dis1"], bins=np.arange(0,31,2)).cat.
Out[38]:
In [4]:
data = pd.read_csv("/Users/weilu/Research/optimization/fragment/data_jan14.csv")
In [36]:
data["seqDis"] = data["j"]+3
data.to_feather("/Users/weilu/Research/optimization/fragment/feather_data_jan14_2.feather")
In [14]:
data = feather.read_dataframe("/Users/weilu/Research/optimization/fragment/feather_data_jan14_2.feather")
In [19]:
three_to_one("ASP")
Out[19]:
In [39]:
test = data.sample(int(1e6))
In [35]:
data.head(20)
Out[35]:
In [45]:
g = sns.FacetGrid(data, col="seqDis", col_wrap=3, height=3, aspect=1.618033)
g = g.map(plt.hist, "dis_ca_ca", bins=50)
In [44]:
g = sns.FacetGrid(test, col="seqDis", col_wrap=3, height=3, aspect=1.618033)
g = g.map(plt.hist, "dis_ca_ca", bins=50)
In [47]:
plt.rcParams['figure.figsize'] = [16.18033, 10]
test.query("j == 0")["dis_ca_ca"].hist(bins=100)
Out[47]:
In [ ]: