This code was used to generate the figures in the PNAS letter "On the role of MFD and UvrD in transcription coupled DNA repair"
The input files are bedfiles uploaded by Adebali et al. to the GEO database, Accession #GSE92734
Code is taken from process_file.py and assembled in this notebook to make more sense.
All code was written by Britney Martinez Graduate Student in Evgeny Nudler's lab
In [1]:
#Packages used in this analysis
import collections
import pandas as pd
from functools import reduce
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde
from scipy.stats import stats
In [62]:
def gen_df(sample_lis, sample):
    """Build a per-gene TS/NTS count table from a pair of bedfiles.

    sample_lis: two bedfile paths — the "plus" file first, the "minus"
    file second. Each file has the six standard BED columns followed by
    an alignment-count column. Counts are assigned to strands by gene
    orientation (bedfiles report the coding strand):
      - plus file, gene on '+'  -> NTS     - plus file, gene on '-'  -> TS
      - minus file, gene on '-' -> NTS     - minus file, gene on '+' -> TS

    Returns a DataFrame with one row per gene and the columns
    '<sample>_TS' and '<sample>_NTS'.
    """
    plus_df = pd.read_table(sample_lis[0], header=None)
    plus_df.columns = [0, 1, 2, 3, 4, 5, 'plus_count']
    minus_df = pd.read_table(sample_lis[1], header=None)
    minus_df.columns = [0, 1, 2, 3, 4, 5, 'minus_count']
    # Merge on the shared BED columns (0-5) so each gene row carries both counts.
    merge_df = pd.merge(plus_df, minus_df)
    # .copy() so the column renames below don't hit a SettingWithCopy warning.
    plusgenes_df = merge_df[merge_df[5] == '+'].copy()
    plusgenes_df.columns = [0, 1, 2, 3, 4, 5, sample + '_NTS', sample + '_TS']
    # .iloc replaces the .ix indexer, which was removed in pandas 1.0;
    # positions 6:8 are the two renamed count columns.
    plusgenes_df = plusgenes_df.iloc[:, 6:8]
    minusgenes_df = merge_df[merge_df[5] == '-'].copy()
    minusgenes_df.columns = [0, 1, 2, 3, 4, 5, sample + '_TS', sample + '_NTS']
    minusgenes_df = minusgenes_df.iloc[:, 6:8]
    # Recombine and restore the original gene order, TS column first.
    merge_df = pd.concat([plusgenes_df, minusgenes_df]).sort_index()
    return merge_df[[sample + '_TS', sample + '_NTS']]
# Build a per-sample TS/NTS count table for every strain/condition from the
# GEO bedfiles (plus-strand file first, minus-strand file second — see gen_df).
wt_df=gen_df(['WT_plus_genes.bed','WT_minus_genes.bed'],'wt')
uvrd60_df=gen_df(['uvrd60_plus_genes.bed','uvrd60_minus_genes.bed'],'uvrd60')
ctd_df=gen_df(['ctd_plus_genes.bed','ctd_minus_genes.bed'],'ctd')
gre_df=gen_df(['greab_plus_genes.bed','greab_minus_genes.bed'],'gre')
uvrd10_df=gen_df(['uvrd10_plus_genes.bed','uvrd10_minus_genes.bed'],'uvrd10')
#uvrd_df
As an example, let's look at the first five rows of the WT DF generated by gen_df: each row is a gene and each column represents that gene's count for the TS or NTS.
In [37]:
ctd_df
Out[37]:
We then normalized the data based on the number of TT's in the gene. The following function was used. You can find TT counts in the Github repo.
In [38]:
uvrd60_df
Out[38]:
In [63]:
def normdata(df, strand):
    """Normalize the counts in df[strand] by each gene's TT-dinucleotide count.

    Reads 'TTcount.txt' from the working directory: one header line, then one
    row per gene with the TS count in column 1 and the NTS count in column 2.
    Which column is used depends on whether `strand` ends in '_TS' or '_NTS'.
    Genes whose TT count is 0 are left unchanged to avoid division by zero.
    Mutates df in place and also returns it.

    (The original leaked the file handle and hard-coded 4518 data rows; this
    version closes the file and iterates over however many rows the file has.)
    """
    suffix = strand.split("_")[1]
    if suffix == "TS":
        col = 1
    elif suffix == "NTS":
        col = 2
    else:
        return df  # unknown strand suffix: leave the data untouched
    with open("TTcount.txt", "r") as countfile:
        countfile.readline()  # skip the header line
        for i, line in enumerate(countfile):
            count = int(line.split()[col])
            if count != 0 and i in df.index:
                df.loc[i, strand] = df.loc[i, strand] / count
    return df
# Normalize each sample's TS and NTS columns by the per-gene TT counts
# (normdata reads TTcount.txt; see above).
wt_df=normdata(wt_df,'wt_TS')
wt_df=normdata(wt_df,'wt_NTS')
uvrd60_df=normdata(uvrd60_df,'uvrd60_TS')
uvrd60_df=normdata(uvrd60_df,'uvrd60_NTS')
ctd_df=normdata(ctd_df,'ctd_TS')
ctd_df=normdata(ctd_df,'ctd_NTS')
gre_df=normdata(gre_df,'gre_TS')
gre_df=normdata(gre_df,'gre_NTS')
uvrd10_df=normdata(uvrd10_df,'uvrd10_TS')
uvrd10_df=normdata(uvrd10_df,'uvrd10_NTS')
You can now see the normalized DF:
In [27]:
ctd_df
Out[27]:
Then we removed all genes that had 30 or fewer TT dinucleotides on the TS:
In [64]:
# Load the per-gene TT counts and keep only genes with more than 30 TT
# dinucleotides on the transcribed strand.
TT_df = pd.read_table("TTcount.txt")
TT_df = TT_df[['TS']]
TSgreaterthan30 = TT_df['TS'] > 30
TT_filtered = TT_df[TSgreaterthan30]

# Inner-join each sample against the filtered gene set (dropping genes that
# failed the TT cutoff), then remove the helper 'TS' count column.
# drop(columns=...) replaces the positional-axis form drop('TS', 1), which
# was removed in pandas 2.0.
wt_df = TT_filtered.join(wt_df, how='inner').drop(columns='TS')
uvrd60_df = TT_filtered.join(uvrd60_df, how='inner').drop(columns='TS')
ctd_df = TT_filtered.join(ctd_df, how='inner').drop(columns='TS')
gre_df = TT_filtered.join(gre_df, how='inner').drop(columns='TS')
uvrd10_df = TT_filtered.join(uvrd10_df, how='inner').drop(columns='TS')
In [39]:
# Replace zero counts with 1 so later ratios and log2 transforms are defined
# (log2(1) = 0 instead of a divide-by-zero / -inf).
wt_df.replace(0,1,inplace=True)
wt_df
Out[39]:
Now we are ready to reproduce the histograms that they saw: It looks like we see the same trend they do (which we would expect with the same exact data)
In [41]:
def make_hists(sample_df, sample):
    """Plot a histogram of log2(TS/NTS) repair per gene for one sample.

    Side effects on the caller's DataFrame: zeros are replaced with 1 (so the
    ratio and its log are defined) and a '<sample>_TS/NTS' column is added.
    """
    sample_df.replace(0, 1, inplace=True)
    ratio_col = sample + '_TS/NTS'
    sample_df[ratio_col] = sample_df[sample + '_TS'] / sample_df[sample + '_NTS']
    logged = np.log2(sample_df[[ratio_col]])
    axes = logged[[ratio_col]].plot.hist(alpha=0.5, range=[-4.5, 4.5], bins=20,
                                         figsize=(10, 3), edgecolor='w',
                                         color='teal', legend=None)
    axes.set_yticks(np.arange(0, 700, 200))
    axes.tick_params(labelsize=16)
    axes.set_xlabel("log2 TS/NTS repair", fontsize='16')
    axes.set_ylabel("Number of Genes", fontsize='16')
    graph = axes.get_figure()
    #graph.savefig(sample+"_histogram.pdf", transparent=True)
    plt.show()
# Reproduce the per-sample log2(TS/NTS) histograms for each strain/condition.
make_hists(wt_df,'wt')
make_hists(uvrd60_df,'uvrd60')
make_hists(ctd_df,'ctd')
make_hists(gre_df,'gre')
make_hists(uvrd10_df,'uvrd10')
In [68]:
# Per-column totals for WT, and the grand total across both strands.
sums=wt_df.sum(axis=0)
total=sums.sum()
print(sums)
total
Out[68]:
In [69]:
# Scale WT so each entry is a fraction of the total signal computed above.
wt_df=wt_df/total
In [18]:
wt_df
Out[18]:
Now we make the correlation plots:
In [35]:
def correlate(first_df, second_df, sample1, sample2, strand):
    """Density-colored scatter comparing log2 per-gene counts of two samples.

    Both DataFrames receive the same transform: log2, then +/-inf (from
    log2(0)) -> 0, then 0 -> 1. The original applied these two replacements
    in the opposite order to second_df, so genes with a zero count ended up
    at 1 on one axis but 0 on the other; the order is now identical for both.
    A y = x reference line and equal axes make asymmetry visible.
    """
    first_df = np.log2(first_df)
    first_df = first_df.replace([np.inf, -np.inf], 0)
    first_df.replace(0, 1, inplace=True)
    second_df = np.log2(second_df)
    second_df = second_df.replace([np.inf, -np.inf], 0)
    second_df.replace(0, 1, inplace=True)
    x = first_df[sample1 + '_' + strand].tolist()
    y = second_df[sample2 + '_' + strand].tolist()
    xy = np.vstack([x, y])
    z = gaussian_kde(xy)(xy)  # local point density, used to color the scatter
    fig, ax = plt.subplots()
    ax.scatter(x, y, s=5, c=z, alpha=1.0)
    ax.set_xlabel(sample1 + " " + strand + " log2 counts per gene")
    ax.set_ylabel(sample2 + " " + strand + " log2 counts per gene")
    lims = [
        np.min([ax.get_xlim(), ax.get_ylim()]),  # min of both axes
        np.max([ax.get_xlim(), ax.get_ylim()]),  # max of both axes
    ]
    ax.plot(lims, lims, 'k-', alpha=0.5, zorder=0)
    ax.set_aspect('equal')
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    #fig.savefig(sample2+strand+"correlation_filtered.pdf",transparent=True)
    plt.show()
# Correlate the uvrd60 and uvrd10 samples on each strand.
correlate(uvrd60_df,uvrd10_df,'uvrd60','uvrd10','TS')
correlate(uvrd60_df,uvrd10_df,'uvrd60','uvrd10','NTS')
#correlate(wt_df,uvrd_df,'wt','uvrd','TS')
#correlate(wt_df,uvrd_df,'wt','uvrd','NTS')
Now we make the frequency distributions for TS and NTS
In [12]:
import seaborn as sns

# NOTE(review): mfd_df is not built anywhere in this notebook — presumably it
# was created with gen_df/normdata in an earlier session; confirm before rerun.
temp_ts = pd.concat([wt_df, mfd_df], axis=1)
temp_ts = np.log2(temp_ts)
temp_ts = temp_ts.replace([np.inf, -np.inf], 0)

# Smoothed (KDE-only) distributions of TS repair for WT vs mfd(-).
f, ax = plt.subplots(figsize=(12, 4))
sns.set_style("white")
#sns.despine()
ax = sns.distplot(temp_ts[['wt_TS']], hist=False, kde_kws={"color": "darkorange", "lw": 3, "label": "WT TS"}, ax=ax)
ax = sns.distplot(temp_ts[['mfd_TS']], hist=False, kde_kws={"color": "steelblue", "lw": 3, "label": "mfd(-) TS"}, ax=ax)
ax.set_xlim(-1, 4)
# The sns.plt alias was removed from seaborn; save/show through matplotlib.
plt.savefig("TS_smoothed_hist.pdf", transparent=True)
plt.show()

# Same comparison for the non-transcribed strand.
f, ax = plt.subplots(figsize=(12, 4))
ax = sns.distplot(temp_ts[['wt_NTS']], hist=False, kde_kws={"color": "darkorange", "lw": 3, "label": "WT NTS"}, ax=ax)
ax = sns.distplot(temp_ts[['mfd_NTS']], hist=False, kde_kws={"color": "steelblue", "lw": 3, "label": "mfd(-) NTS"}, ax=ax)
ax.set_xlim(-1, 4)
plt.savefig("NTS_smoothed_hist.pdf", transparent=True)
plt.show()
To filter to the top 148 high TCR genes the following code was used:
In [70]:
# Candidate filters for high-TCR genes (left disabled in the final analysis):
#wt_df.drop(wt_df[wt_df['wt_TS'] <= wt_df['wt_NTS']].index, inplace=True)
#wt_df.drop(wt_df[wt_df['wt_TS']-wt_df['wt_NTS']<5].index, inplace=True)
# Inner join keeps only genes present in both the WT and uvrD10 tables.
joined_df2=wt_df.join(uvrd10_df,how='inner')
joined_df2.head()
Out[70]:
In [7]:
# Per-gene TS/NTS repair ratio for each sample. joined_df2 comes from joining
# wt_df with uvrd10_df, so the uvrD columns are named 'uvrd10_*'; the original
# read nonexistent 'uvrd_TS'/'uvrd_NTS' columns, which raises a KeyError.
joined_df2['wt_TS/NTS'] = joined_df2['wt_TS'] / joined_df2['wt_NTS']
joined_df2['uvrd_TS/NTS'] = joined_df2['uvrd10_TS'] / joined_df2['uvrd10_NTS']
joined_df2.head()
Out[7]:
In [16]:
# Split the joined table into per-strand views. joined_df2 carries 'uvrd10_*'
# columns (from uvrd10_df), so select those and rename them to the 'uvrd_*'
# names the later filtering cell expects; the original selected nonexistent
# 'uvrd_TS'/'uvrd_NTS' columns, which raises a KeyError.
wt_df = joined_df2[['wt_TS', 'uvrd10_TS']].rename(columns={'uvrd10_TS': 'uvrd_TS'})
NTS_df = joined_df2[['wt_NTS', 'uvrd10_NTS']].rename(columns={'uvrd10_NTS': 'uvrd_NTS'})
In [71]:
def correlate(first_df, sample1, sample2, strand):
    """Density-colored scatter of log2 per-gene counts for two samples taken
    from the SAME DataFrame, with a y = x reference line and equal axes."""
    logged = np.log2(first_df).replace([np.inf, -np.inf], 0)
    xs = logged[sample1 + '_' + strand].tolist()
    ys = logged[sample2 + '_' + strand].tolist()
    points = np.vstack([xs, ys])
    density = gaussian_kde(points)(points)  # color each point by local density
    figure, axes = plt.subplots()
    axes.scatter(xs, ys, s=5, c=density, alpha=1.0)
    axes.set_xlabel(sample1 + " " + strand + " log2 counts per gene")
    axes.set_ylabel(sample2 + " " + strand + " log2 counts per gene")
    # Square axes spanning the union of both ranges, so the diagonal marks equality.
    low = np.min([axes.get_xlim(), axes.get_ylim()])
    high = np.max([axes.get_xlim(), axes.get_ylim()])
    axes.plot([low, high], [low, high], 'k-', alpha=0.5, zorder=0)
    axes.set_aspect('equal')
    axes.set_xlim([low, high])
    axes.set_ylim([low, high])
    #fig.savefig(sample2+strand+"correlation_filtered.pdf",transparent=True)
    plt.show()
In [72]:
# Compare WT vs uvrD10 TS counts per gene using the joined table.
correlate(joined_df2,'wt','uvrd10','TS')
In [17]:
# Drop (in place) every gene whose WT NTS value does not exceed the uvrD value,
# then report how many genes remain.
low_wt_rows = NTS_df[NTS_df['wt_NTS'] <= NTS_df['uvrd_NTS']].index
NTS_df.drop(low_wt_rows, inplace=True)
print(NTS_df.shape)
In [5]:
%run TT_content.py
In [ ]: