In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
In [4]:
high_tcr=pd.read_csv("1138_highTCR.csv")
mid_tcr=pd.read_csv("1182_midTCR.csv")
low_tcr=pd.read_csv("1260_lowTCR.csv")
In [32]:
'''
Function make_intial creates a dataframe from a sample ID. If using different filename, this must be changed within
the function. Standard file naming: each file should start with the sample and the rest of the filename should be the
same between other samples.
The intitial dataframe has the counts at each bp position of every gene that was in the original bedfile. These
counts are raw and are not normalized to the number of reads or TTs in the region.
'''
def make_initial(sample):
plus_df=pd.read_table("primary_TSS_bedfiles/"+sample+"_plus_TSS_xrseq_2.bed",header=None)
plus_df.columns=['chr','start','end','type','gene','strand','position','plus_count']
minus_df=pd.read_table("primary_TSS_bedfiles/"+sample+"_minus_TSS_xrseq_2.bed",header=None)
minus_df.columns=['chr','start','end','type','gene','strand','position','minus_count']
merge_df=pd.merge(plus_df,minus_df)
plusgenes_df=merge_df[merge_df['strand']=='+']
plusgenes_df.columns=['chr','start','end','type','gene','strand','position','NTS','TS']
minusgenes_df=merge_df[merge_df['strand']=='-']
minusgenes_df.columns=['chr','start','end','type','gene','strand','position','TS','NTS']
joined_df=pd.concat([plusgenes_df,minusgenes_df],sort=True)
joined_df=joined_df.sort_index()
joined_df=joined_df.loc[:,['gene','strand','position','TS','NTS']]
return joined_df
'''
Function norm_TT takes a dataframe (from make initial function) and normalizes the TS and NTS counts based on the
number of TTs on each strand whithin a specified region.
The TT_content.py script and bedfile with genomic regions used to make the sample files should be used in order to have
the correct normalization.
'''
def norm_TT(joined_df,TT_file,bedfile):
TT_counts=pd.read_csv(TT_file,sep='\t',index_col=0)
TT_counts.columns=['TT_TS','TT_NTS']
gene_names=pd.read_csv(bedfile,sep='\t',header=None,usecols=[4],names=['gene'])
TT_counts['gene']=gene_names['gene']
joined_df=joined_df.merge(TT_counts,on='gene')
joined_df['norm_TS']=joined_df['TS']/joined_df['TT_TS']
joined_df['norm_NTS']=joined_df['NTS']/joined_df['TT_NTS']
joined_df=joined_df.loc[:,['gene','strand','position','norm_TS','norm_NTS']]
return joined_df
'''
Function make_vector will take the average of all counts between genes.
It also flips the reverse genes so that they will be in the same orientation as the forward genes.
'''
def make_vector(joined_df,s):
forward_genes=joined_df[joined_df['strand']=='+']
forward_genes=forward_genes.groupby(['position']).mean()
forward_lis=forward_genes['norm_'+s].tolist()
reverse_genes=joined_df[joined_df['strand']=='-']
reverse_genes=reverse_genes.groupby(['position']).mean()
reverse_lis=reverse_genes['norm_'+s].tolist()
reverse_lis=reverse_lis[::-1]
forward_vector=np.array(forward_lis)
reverse_vector=np.array(reverse_lis)
sum_vector= forward_vector + reverse_vector
return sum_vector
'''
Function make_final makes the final DF by converting the vectors for TS and NTS into
dataframe columns. It then uses a window sizes to compute the rolling average of each
column.
'''
def make_final(joined_df,win_size):
sum_vector_TS=make_vector(joined_df,'TS')
sum_vector_NTS=make_vector(joined_df,'NTS')
sum_df=pd.DataFrame({'TS':sum_vector_TS, 'NTS':sum_vector_NTS})
final_df=sum_df.rolling(window=win_size,min_periods=1).mean()
return final_df
'''
Function make_df takes in a sample name, a TT_file, Bedfile and optional filter_df
Bedfile should have been used to make the TT_counts file with the TT_content.py script. TT_counts is just the number
of TTs on each strand for the specified regions in the bedfile. These two files are important for normalizing by the
number of TTs in each strand.
The optional filter file is a csv of genes that should be used to filter the dataframe.(csv should just be one
column of gene names and the column name MUST be gene)
'''
def make_df(sample,TT_file,bedfile,filter_genes=None):
joined_df=make_initial(sample)
if filter_genes is not None:
filter_df=pd.read_csv(filter_genes)
joined_df=joined_df.merge(filter_df,on='gene')
joined_df=norm_TT(joined_df,TT_file,bedfile)
final_df=make_final(joined_df,125)
return final_df
In [ ]:
sample_dict=
In [33]:
TT_file='TTcount.txt'
bedfile='primary_upstreamsense_TSS_2'
high_tcr="1138_highTCR.csv"
wt120_df=make_df('bm01',TT_file,bedfile,high_tcr)
In [34]:
wt120_df
Out[34]:
In [ ]: