Format of processed files for my MED4/P-HM2 data:
The processed .txt files are tab-delimited tables with the following fields: 1. Sequential gene number, 2. GenBank identifier, 3. Strand, 4. Start position, 5. End position, 6. Description, 7. Total counts, 8. Sense counts, 9. Antisense counts.
In [45]:
import pandas as pd
In [46]:
# Sample labels for the low-biomass series, in the column order used for the
# merged tables: three libraries without the "Ribodpl" treatment, six with it,
# and three low-input amounts each prepared at 17 and 15 cycles.
treatments = (
    ['%dng_NoRibo' % ng for ng in (1000, 500, 100)]
    + ['%dng_Ribodpl' % ng for ng in (1000, 500, 100, 50, 25, 10)]
    + ['%dng_Ribodpl_%dcyc' % (ng, cyc) for ng in (5, 2, 1) for cyc in (17, 15)]
)
In [47]:
# Low-biomass CDS counts: build one table with a count column per treatment.
# Each input is a headerless, tab-separated file whose first column becomes the
# row index and whose single remaining column holds that treatment's values.
# Frames are collected and concatenated once, instead of growing df_merged
# inside the loop (which re-copies the table on every pass and starts from an
# empty DataFrame that takes part in index alignment).
count_frames = []
for treatment in treatments:
    path = '/Users/luke/vibrio/bioconductor/swt_%s_R1_trim_CDS.tsv' % treatment
    df_single = pd.read_csv(path, sep='\t', header=None, index_col=0)
    # Raises if the file has more than one data column.
    df_single.columns = [treatment]
    count_frames.append(df_single)
df_merged = pd.concat(count_frames, axis=1)

# Drop the last 5 rows of the merged table.
# NOTE(review): presumably these are summary rows appended by the counting
# tool rather than real CDS entries -- confirm against the input files.
df_merged = df_merged.drop(df_merged.index[-5:])

# Expose the index as a regular column so later merges can join on it.
df_merged['CDS_number'] = df_merged.index
In [48]:
# Low-biomass rRNA counts: one column per treatment, written straight to CSV.
# Unlike the CDS table, the last 5 rows are kept and no VF numbers are added.
# Frames are collected and concatenated once, instead of growing the table
# inside the loop starting from an empty DataFrame.
rrna_frames = []
for treatment in treatments:
    path = '/Users/luke/vibrio/bioconductor/swt_%s_R1_trim_rRNA.tsv' % treatment
    df_single = pd.read_csv(path, sep='\t', header=None, index_col=0)
    # Raises if the file has more than one data column.
    df_single.columns = [treatment]
    rrna_frames.append(df_single)
df_merged_rrna = pd.concat(rrna_frames, axis=1)

# The row index is written as the first CSV column under the header 'rRNA_number'.
df_merged_rrna.to_csv('/Users/luke/vibrio/published/geo_table_LowBiomass_rRNA.csv', index_label='rRNA_number')
In [49]:
# File-name stems for the second sample set: three sample prefixes
# (plk, swt, vnt), each numbered 1-3, grouped by prefix.
treatments2 = [
    '%s_%d_hiseq' % (prefix, replicate)
    for prefix in ('plk', 'swt', 'vnt')
    for replicate in (1, 2, 3)
]
In [50]:
# PlkSwtVnt CDS counts: build one table with a count column per sample.
# Each input is a headerless, tab-separated file whose first column becomes the
# row index and whose single remaining column holds that sample's values.
# Frames are collected and concatenated once, instead of growing df_merged2
# inside the loop (which re-copies the table on every pass and starts from an
# empty DataFrame that takes part in index alignment).
sample_frames = []
for treatment in treatments2:
    path = '/Users/luke/vibrio/bioconductor/%s_R1_trim_CDS.tsv' % treatment
    df_single = pd.read_csv(path, sep='\t', header=None, index_col=0)
    # Raises if the file has more than one data column.
    df_single.columns = [treatment]
    sample_frames.append(df_single)
df_merged2 = pd.concat(sample_frames, axis=1)

# Drop the last 5 rows of the merged table.
# NOTE(review): presumably these are summary rows appended by the counting
# tool rather than real CDS entries -- confirm against the input files.
df_merged2 = df_merged2.drop(df_merged2.index[-5:])

# Expose the index as a regular column so later merges can join on it.
df_merged2['CDS_number'] = df_merged2.index

# Positional rename to short sample labels; the order must match treatments2,
# with the CDS_number column added above coming last.
df_merged2.columns = ['Plk1', 'Plk2', 'Plk3', 'Swt1', 'Swt2', 'Swt3', 'Vnt1', 'Vnt2', 'Vnt3', 'CDS_number']
In [51]:
# PlkSwtVnt rRNA counts: one column per sample, written straight to CSV.
# Unlike the CDS table, the last 5 rows are kept and no VF numbers are added.
# Frames are collected and concatenated once, instead of growing the table
# inside the loop starting from an empty DataFrame.
rrna_frames2 = []
for treatment in treatments2:
    path = '/Users/luke/vibrio/bioconductor/%s_R1_trim_rRNA.tsv' % treatment
    df_single = pd.read_csv(path, sep='\t', header=None, index_col=0)
    # Raises if the file has more than one data column.
    df_single.columns = [treatment]
    rrna_frames2.append(df_single)
df_merged2_rrna = pd.concat(rrna_frames2, axis=1)

# Positional rename to short sample labels; the order must match treatments2.
df_merged2_rrna.columns = ['Plk1', 'Plk2', 'Plk3', 'Swt1', 'Swt2', 'Swt3', 'Vnt1', 'Vnt2', 'Vnt3']

# The row index is written as the first CSV column under the header 'rRNA_number'.
df_merged2_rrna.to_csv('/Users/luke/vibrio/published/geo_table_PlkSwtVnt_rRNA.csv', index_label='rRNA_number')
In [52]:
# Load the CDS annotation table. The file is tab-separated with no header
# row, so column names are assigned after reading.
path_gff = '/Users/luke/vibrio/genome/VfES114_fixed.CDS.tsv'
df_gff = pd.read_csv(
    path_gff,
    header=None,
    sep='\t',
)
df_gff.columns = [
    'CDS_number',
    'Length_bp',
    'Description',
]
In [53]:
# Load the Plk-vs-Vnt results table (DESeq2 output, judging by the file name)
# and keep only its two identifier columns, giving a CDS_number <-> VF_number
# lookup table.
path_plk_vnt = '/Users/luke/vibrio/results/mydeseq2.all.Plk.Vnt.tsv'
deseq_plk_vnt = pd.read_csv(path_plk_vnt, sep='\t')
df_vf = deseq_plk_vnt.loc[:, ['VF_number', 'CDS_number']]
In [54]:
# Attach VF numbers to the annotation table. CDS_number is the only column the
# two frames share, so it is the join key; the outer join keeps rows that
# appear in only one of the two tables.
df_gff_vf = df_gff.merge(df_vf, how='outer', on='CDS_number')
# Put the two identifier columns first.
df_gff_vf = df_gff_vf.loc[:, ['CDS_number', 'VF_number', 'Length_bp', 'Description']]
In [55]:
# Join the annotation/VF table with the low-biomass CDS counts on their shared
# CDS_number column (outer join), then write the result without the row index.
df_LowBiomass = df_gff_vf.merge(df_merged, how='outer', on='CDS_number')
df_LowBiomass.to_csv(
    '/Users/luke/vibrio/published/geo_table_LowBiomass_CDS.csv',
    index=False,
)
In [56]:
# Join the annotation/VF table with the PlkSwtVnt CDS counts on their shared
# CDS_number column (outer join), then write the result without the row index.
df_PlkSwtVnt = df_gff_vf.merge(df_merged2, how='outer', on='CDS_number')
df_PlkSwtVnt.to_csv(
    '/Users/luke/vibrio/published/geo_table_PlkSwtVnt_CDS.csv',
    index=False,
)
In [ ]: