In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [4]:
high_tcr=pd.read_csv("1138_highTCR.csv")
mid_tcr=pd.read_csv("1182_midTCR.csv")
low_tcr=pd.read_csv("1260_lowTCR.csv")

In [32]:
'''
Function make_intial creates a dataframe from a sample ID. If using different filename, this must be changed within
the function. Standard file naming: each file should start with the sample and the rest of the filename should be the 
same between other samples. 

The intitial dataframe has the counts at each bp position of every gene that was in the original bedfile. These
counts are raw and are not normalized to the number of reads or TTs in the region.
'''
def make_initial(sample):
    plus_df=pd.read_table("primary_TSS_bedfiles/"+sample+"_plus_TSS_xrseq_2.bed",header=None)
    plus_df.columns=['chr','start','end','type','gene','strand','position','plus_count']
    minus_df=pd.read_table("primary_TSS_bedfiles/"+sample+"_minus_TSS_xrseq_2.bed",header=None)
    minus_df.columns=['chr','start','end','type','gene','strand','position','minus_count']
    merge_df=pd.merge(plus_df,minus_df)
    plusgenes_df=merge_df[merge_df['strand']=='+']
    plusgenes_df.columns=['chr','start','end','type','gene','strand','position','NTS','TS']
    minusgenes_df=merge_df[merge_df['strand']=='-']
    minusgenes_df.columns=['chr','start','end','type','gene','strand','position','TS','NTS']
    joined_df=pd.concat([plusgenes_df,minusgenes_df],sort=True)
    joined_df=joined_df.sort_index()
    joined_df=joined_df.loc[:,['gene','strand','position','TS','NTS']]
    
    return joined_df

'''
Function norm_TT takes a dataframe (from make initial function) and normalizes the TS and NTS counts based on the 
number of TTs on each strand whithin a specified region.

The TT_content.py script and bedfile with genomic regions used to make the sample files should be used in order to have 
the correct normalization.
'''
def norm_TT(joined_df,TT_file,bedfile):
    TT_counts=pd.read_csv(TT_file,sep='\t',index_col=0)
    TT_counts.columns=['TT_TS','TT_NTS']
    gene_names=pd.read_csv(bedfile,sep='\t',header=None,usecols=[4],names=['gene'])
    TT_counts['gene']=gene_names['gene']
    
    joined_df=joined_df.merge(TT_counts,on='gene')
    joined_df['norm_TS']=joined_df['TS']/joined_df['TT_TS']
    joined_df['norm_NTS']=joined_df['NTS']/joined_df['TT_NTS']
    joined_df=joined_df.loc[:,['gene','strand','position','norm_TS','norm_NTS']]
    
    return joined_df
'''
Function make_vector will take the average of all counts between genes.
It also flips the reverse genes so that they will be in the same orientation as the forward genes.
'''
def make_vector(joined_df,s):
    forward_genes=joined_df[joined_df['strand']=='+']
    forward_genes=forward_genes.groupby(['position']).mean()
    forward_lis=forward_genes['norm_'+s].tolist()
    reverse_genes=joined_df[joined_df['strand']=='-']
    reverse_genes=reverse_genes.groupby(['position']).mean()
    reverse_lis=reverse_genes['norm_'+s].tolist()
    reverse_lis=reverse_lis[::-1]
    forward_vector=np.array(forward_lis)
    reverse_vector=np.array(reverse_lis)
    sum_vector= forward_vector + reverse_vector

    return sum_vector
'''
Function make_final makes the final DF by converting the vectors for TS and NTS into 
dataframe columns. It then uses a window sizes to compute the rolling average of each
column.
'''
def make_final(joined_df,win_size):
    sum_vector_TS=make_vector(joined_df,'TS')
    sum_vector_NTS=make_vector(joined_df,'NTS')
    sum_df=pd.DataFrame({'TS':sum_vector_TS, 'NTS':sum_vector_NTS})
    final_df=sum_df.rolling(window=win_size,min_periods=1).mean()
    
    return final_df
    

'''
Function make_df takes in a sample name, a TT_file, Bedfile and optional filter_df
Bedfile should have been used to make the TT_counts file with the TT_content.py script. TT_counts is just the number 
of TTs on each strand for the specified regions in the bedfile. These two files are important for normalizing by the 
number of TTs in each strand.

The optional filter file is a csv of genes that should be used to filter the dataframe.(csv should just be one
column of gene names and the column name MUST be gene)
'''

def make_df(sample,TT_file,bedfile,filter_genes=None):
    
    joined_df=make_initial(sample)
    
    if filter_genes is not None:
        filter_df=pd.read_csv(filter_genes)
        joined_df=joined_df.merge(filter_df,on='gene')
    
    
    joined_df=norm_TT(joined_df,TT_file,bedfile)
    
    final_df=make_final(joined_df,125)
    
    return final_df

In [ ]:
sample_dict=

In [33]:
TT_file='TTcount.txt'
bedfile='primary_upstreamsense_TSS_2'
high_tcr="1138_highTCR.csv"



wt120_df=make_df('bm01',TT_file,bedfile,high_tcr)

In [34]:
wt120_df


Out[34]:
TS NTS
0 0.662186 0.771330
1 0.670423 0.747816
2 0.664781 0.770311
3 0.651599 0.786940
4 0.648017 0.794297
5 0.645058 0.817144
6 0.648140 0.834525
7 0.655704 0.839794
8 0.663299 0.842595
9 0.662293 0.848476
10 0.660714 0.854608
11 0.664256 0.860717
12 0.669551 0.868098
13 0.678716 0.866565
14 0.684865 0.863260
15 0.691769 0.858660
16 0.696581 0.853784
17 0.701039 0.848877
18 0.706256 0.837904
19 0.713499 0.827235
20 0.716584 0.816898
21 0.718426 0.806442
22 0.738118 0.842591
23 0.758271 0.876004
24 0.779711 0.907349
25 0.797201 0.933334
26 0.811674 0.957355
27 0.825369 0.986242
28 0.851571 1.013064
29 0.875812 1.038352
... ... ...
970 1.362286 0.487075
971 1.364332 0.485861
972 1.366952 0.484254
973 1.370742 0.482999
974 1.374617 0.482199
975 1.379062 0.481410
976 1.383927 0.480518
977 1.386315 0.479590
978 1.390629 0.478590
979 1.395271 0.478092
980 1.399911 0.477632
981 1.403010 0.477391
982 1.407117 0.477618
983 1.410680 0.477404
984 1.414096 0.477067
985 1.416664 0.477249
986 1.418140 0.477510
987 1.419588 0.478258
988 1.420723 0.478977
989 1.421980 0.479671
990 1.421662 0.480670
991 1.420208 0.481684
992 1.417377 0.482586
993 1.416061 0.483551
994 1.414663 0.484430
995 1.414273 0.485187
996 1.414419 0.485839
997 1.414960 0.486591
998 1.414727 0.487044
999 1.413222 0.489630

1000 rows × 2 columns


In [ ]: