notebook.community

Edit and run



In [3]:

    
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np



In [4]:

    
high_tcr=pd.read_csv("1138_highTCR.csv")
mid_tcr=pd.read_csv("1182_midTCR.csv")
low_tcr=pd.read_csv("1260_lowTCR.csv")



In [32]:

    
'''
Function make_intial creates a dataframe from a sample ID. If using different filename, this must be changed within
the function. Standard file naming: each file should start with the sample and the rest of the filename should be the 
same between other samples. 

The intitial dataframe has the counts at each bp position of every gene that was in the original bedfile. These
counts are raw and are not normalized to the number of reads or TTs in the region.
'''
def make_initial(sample):
    plus_df=pd.read_table("primary_TSS_bedfiles/"+sample+"_plus_TSS_xrseq_2.bed",header=None)
    plus_df.columns=['chr','start','end','type','gene','strand','position','plus_count']
    minus_df=pd.read_table("primary_TSS_bedfiles/"+sample+"_minus_TSS_xrseq_2.bed",header=None)
    minus_df.columns=['chr','start','end','type','gene','strand','position','minus_count']
    merge_df=pd.merge(plus_df,minus_df)
    plusgenes_df=merge_df[merge_df['strand']=='+']
    plusgenes_df.columns=['chr','start','end','type','gene','strand','position','NTS','TS']
    minusgenes_df=merge_df[merge_df['strand']=='-']
    minusgenes_df.columns=['chr','start','end','type','gene','strand','position','TS','NTS']
    joined_df=pd.concat([plusgenes_df,minusgenes_df],sort=True)
    joined_df=joined_df.sort_index()
    joined_df=joined_df.loc[:,['gene','strand','position','TS','NTS']]
    
    return joined_df

'''
Function norm_TT takes a dataframe (from make initial function) and normalizes the TS and NTS counts based on the 
number of TTs on each strand whithin a specified region.

The TT_content.py script and bedfile with genomic regions used to make the sample files should be used in order to have 
the correct normalization.
'''
def norm_TT(joined_df,TT_file,bedfile):
    TT_counts=pd.read_csv(TT_file,sep='\t',index_col=0)
    TT_counts.columns=['TT_TS','TT_NTS']
    gene_names=pd.read_csv(bedfile,sep='\t',header=None,usecols=[4],names=['gene'])
    TT_counts['gene']=gene_names['gene']
    
    joined_df=joined_df.merge(TT_counts,on='gene')
    joined_df['norm_TS']=joined_df['TS']/joined_df['TT_TS']
    joined_df['norm_NTS']=joined_df['NTS']/joined_df['TT_NTS']
    joined_df=joined_df.loc[:,['gene','strand','position','norm_TS','norm_NTS']]
    
    return joined_df
'''
Function make_vector will take the average of all counts between genes.
It also flips the reverse genes so that they will be in the same orientation as the forward genes.
'''
def make_vector(joined_df,s):
    forward_genes=joined_df[joined_df['strand']=='+']
    forward_genes=forward_genes.groupby(['position']).mean()
    forward_lis=forward_genes['norm_'+s].tolist()
    reverse_genes=joined_df[joined_df['strand']=='-']
    reverse_genes=reverse_genes.groupby(['position']).mean()
    reverse_lis=reverse_genes['norm_'+s].tolist()
    reverse_lis=reverse_lis[::-1]
    forward_vector=np.array(forward_lis)
    reverse_vector=np.array(reverse_lis)
    sum_vector= forward_vector + reverse_vector

    return sum_vector
'''
Function make_final makes the final DF by converting the vectors for TS and NTS into 
dataframe columns. It then uses a window sizes to compute the rolling average of each
column.
'''
def make_final(joined_df,win_size):
    sum_vector_TS=make_vector(joined_df,'TS')
    sum_vector_NTS=make_vector(joined_df,'NTS')
    sum_df=pd.DataFrame({'TS':sum_vector_TS, 'NTS':sum_vector_NTS})
    final_df=sum_df.rolling(window=win_size,min_periods=1).mean()
    
    return final_df
    

'''
Function make_df takes in a sample name, a TT_file, Bedfile and optional filter_df
Bedfile should have been used to make the TT_counts file with the TT_content.py script. TT_counts is just the number 
of TTs on each strand for the specified regions in the bedfile. These two files are important for normalizing by the 
number of TTs in each strand.

The optional filter file is a csv of genes that should be used to filter the dataframe.(csv should just be one
column of gene names and the column name MUST be gene)
'''

def make_df(sample,TT_file,bedfile,filter_genes=None):
    
    joined_df=make_initial(sample)
    
    if filter_genes is not None:
        filter_df=pd.read_csv(filter_genes)
        joined_df=joined_df.merge(filter_df,on='gene')
    
    
    joined_df=norm_TT(joined_df,TT_file,bedfile)
    
    final_df=make_final(joined_df,125)
    
    return final_df



In [ ]:

    
sample_dict=



In [33]:

    
TT_file='TTcount.txt'
bedfile='primary_upstreamsense_TSS_2'
high_tcr="1138_highTCR.csv"



wt120_df=make_df('bm01',TT_file,bedfile,high_tcr)



In [34]:

    
wt120_df









    Out[34]:







  
    
      
      TS
      NTS
    
  
  
    
      0
      0.662186
      0.771330
    
    
      1
      0.670423
      0.747816
    
    
      2
      0.664781
      0.770311
    
    
      3
      0.651599
      0.786940
    
    
      4
      0.648017
      0.794297
    
    
      5
      0.645058
      0.817144
    
    
      6
      0.648140
      0.834525
    
    
      7
      0.655704
      0.839794
    
    
      8
      0.663299
      0.842595
    
    
      9
      0.662293
      0.848476
    
    
      10
      0.660714
      0.854608
    
    
      11
      0.664256
      0.860717
    
    
      12
      0.669551
      0.868098
    
    
      13
      0.678716
      0.866565
    
    
      14
      0.684865
      0.863260
    
    
      15
      0.691769
      0.858660
    
    
      16
      0.696581
      0.853784
    
    
      17
      0.701039
      0.848877
    
    
      18
      0.706256
      0.837904
    
    
      19
      0.713499
      0.827235
    
    
      20
      0.716584
      0.816898
    
    
      21
      0.718426
      0.806442
    
    
      22
      0.738118
      0.842591
    
    
      23
      0.758271
      0.876004
    
    
      24
      0.779711
      0.907349
    
    
      25
      0.797201
      0.933334
    
    
      26
      0.811674
      0.957355
    
    
      27
      0.825369
      0.986242
    
    
      28
      0.851571
      1.013064
    
    
      29
      0.875812
      1.038352
    
    
      ...
      ...
      ...
    
    
      970
      1.362286
      0.487075
    
    
      971
      1.364332
      0.485861
    
    
      972
      1.366952
      0.484254
    
    
      973
      1.370742
      0.482999
    
    
      974
      1.374617
      0.482199
    
    
      975
      1.379062
      0.481410
    
    
      976
      1.383927
      0.480518
    
    
      977
      1.386315
      0.479590
    
    
      978
      1.390629
      0.478590
    
    
      979
      1.395271
      0.478092
    
    
      980
      1.399911
      0.477632
    
    
      981
      1.403010
      0.477391
    
    
      982
      1.407117
      0.477618
    
    
      983
      1.410680
      0.477404
    
    
      984
      1.414096
      0.477067
    
    
      985
      1.416664
      0.477249
    
    
      986
      1.418140
      0.477510
    
    
      987
      1.419588
      0.478258
    
    
      988
      1.420723
      0.478977
    
    
      989
      1.421980
      0.479671
    
    
      990
      1.421662
      0.480670
    
    
      991
      1.420208
      0.481684
    
    
      992
      1.417377
      0.482586
    
    
      993
      1.416061
      0.483551
    
    
      994
      1.414663
      0.484430
    
    
      995
      1.414273
      0.485187
    
    
      996
      1.414419
      0.485839
    
    
      997
      1.414960
      0.486591
    
    
      998
      1.414727
      0.487044
    
    
      999
      1.413222
      0.489630
    
  

1000 rows × 2 columns



In [ ]:

	TS	NTS
0	0.662186	0.771330
1	0.670423	0.747816
2	0.664781	0.770311
3	0.651599	0.786940
4	0.648017	0.794297
5	0.645058	0.817144
6	0.648140	0.834525
7	0.655704	0.839794
8	0.663299	0.842595
9	0.662293	0.848476
10	0.660714	0.854608
11	0.664256	0.860717
12	0.669551	0.868098
13	0.678716	0.866565
14	0.684865	0.863260
15	0.691769	0.858660
16	0.696581	0.853784
17	0.701039	0.848877
18	0.706256	0.837904
19	0.713499	0.827235
20	0.716584	0.816898
21	0.718426	0.806442
22	0.738118	0.842591
23	0.758271	0.876004
24	0.779711	0.907349
25	0.797201	0.933334
26	0.811674	0.957355
27	0.825369	0.986242
28	0.851571	1.013064
29	0.875812	1.038352
...	...	...
970	1.362286	0.487075
971	1.364332	0.485861
972	1.366952	0.484254
973	1.370742	0.482999
974	1.374617	0.482199
975	1.379062	0.481410
976	1.383927	0.480518
977	1.386315	0.479590
978	1.390629	0.478590
979	1.395271	0.478092
980	1.399911	0.477632
981	1.403010	0.477391
982	1.407117	0.477618
983	1.410680	0.477404
984	1.414096	0.477067
985	1.416664	0.477249
986	1.418140	0.477510
987	1.419588	0.478258
988	1.420723	0.478977
989	1.421980	0.479671
990	1.421662	0.480670
991	1.420208	0.481684
992	1.417377	0.482586
993	1.416061	0.483551
994	1.414663	0.484430
995	1.414273	0.485187
996	1.414419	0.485839
997	1.414960	0.486591
998	1.414727	0.487044
999	1.413222	0.489630