In [1]:
import pandas as pd
import numpy as np
import re
import random
import string
In [2]:
pd.set_option('display.max_rows', 500)
In [3]:
def make_og_table_with_description(path_table, path_descr, path_export):
with open(path_table) as f:
table = f.readlines()
num_cols = len(table[0].split(' '))-1
col_names = range(num_cols)
# read groups table as df
df_table = pd.read_csv(path_table, sep=' ', index_col=0, header=None, dtype=object, names=col_names)
df_table.index = [re.sub(':', '', x) for x in df_table.index]
# read description table as df
df_descr = pd.read_csv(path_descr, sep='\t', header=None)
df_descr.columns =['Code', 'Accession', 'Description']
# re-group table with one entry per gene and add descriptions
df_table_stack = pd.DataFrame(df_table.stack(), columns=['Accession'])
df_table_stack['Accession'] = [x.split('|')[2] for x in df_table_stack['Accession']]
df_table_stack = df_table_stack.reset_index().merge(df_descr, left_on='Accession', right_on='Accession', how='left')
df_table_stack['level_1'] = df_table_stack['level_1'] + 1
# write output to file
with open(path_export, 'a') as outfile:
outfile.write('Ortholog group\tNumber of genomes\tExample accession\tDescription\n')
for og in df_table_stack['level_0'].unique():
og_size = df_table_stack[df_table_stack['level_0'] == og]['level_1'].max()
og_example = df_table_stack[df_table_stack['level_0'] == og]['Accession'].iloc[0]
og_example = re.sub('Pelagi', 'Pelagibacter', og_example)
og_example = re.sub('Proch', 'Prochlorococcus', og_example)
descr_list = df_table_stack[df_table_stack['level_0'] == og]['Description'].tolist()
for i in range(len(descr_list) - 1):
if descr_list[i] == 'hypothetical protein':
descr_list[i] = ''
descr_list = filter(None, descr_list)
og_descr = descr_list[0]
outfile.write('%s\t%s\t%s\t%s\n' % (og, og_size, og_example, og_descr))
In [4]:
# make summary table for Pelagibacter OGs
path_descr = '/Users/luke/singlecell/clusters/orthomcl-sar4/descr_sar.tsv'
# ... RS-only OGs
path_table = '/Users/luke/singlecell/clusters/orthomcl-sar4/groups.RSonly_sar.txt'
path_export = '/Users/luke/singlecell/notebooks/table_sar_RSonly.tsv'
make_og_table_with_description(path_table, path_descr, path_export)
# ... all OGs
path_table = '/Users/luke/singlecell/clusters/orthomcl-sar4/groups.all_sar.txt'
path_export = '/Users/luke/singlecell/notebooks/table_sar_all.tsv'
make_og_table_with_description(path_table, path_descr, path_export)
In [5]:
# make summary table for Prochlorococcus OGs
path_descr = '/Users/luke/singlecell/clusters/orthomcl-pro4/descr_pro.tsv'
# ... RS-only OGs
path_table = '/Users/luke/singlecell/clusters/orthomcl-pro4/groups.RSonly_pro.txt'
path_export = '/Users/luke/singlecell/notebooks/table_pro_RSonly.tsv'
make_og_table_with_description(path_table, path_descr, path_export)
# ... all OGs
path_table = '/Users/luke/singlecell/clusters/orthomcl-pro4/groups.all_pro.txt'
path_export = '/Users/luke/singlecell/notebooks/table_pro_all.tsv'
make_og_table_with_description(path_table, path_descr, path_export)
In [ ]:
In [3]:
def make_og_count_matrix(path_table, path_genomes, path_export):
# determine number of columns
with open(path_table) as f:
table = f.readlines()
num_cols = len(table[0].split(' '))-1
col_names = range(num_cols)
# read groups table as df
df_table = pd.read_csv(path_table, sep=' ', index_col=0, header=None, dtype=object, names=col_names)
df_table.index = [re.sub(':', '', x) for x in df_table.index]
df_table.replace(np.nan,'none|none|none', regex=True, inplace=True)
# return genome code from gnl|code|accession
def get_code(series_of_gnl_code_accession):
codes = [x.split('|')[1] for x in series_of_gnl_code_accession]
return codes
df_codes = df_table.apply(get_code)
# get list of genomes
df_genomes = pd.read_csv(path_genomes, header=None)
df_none = pd.DataFrame(['none'])
df_genomes = df_genomes.append(df_none)
# get values counts for each OG
df_counts = pd.DataFrame(index=df_genomes[0])
for index, series in df_codes.iterrows():
df_counts[index] = series.value_counts()
# transpose, fill na, and delete none column
df_matrix = df_counts.transpose()
df_matrix.fillna(0, inplace=True)
df_matrix.drop('none', axis=1, inplace=True)
# write to csv
df_matrix.to_csv(path_export)
In [4]:
# make full OG count matrix for Pelagibacter OGs
# set values
path_table = '/Users/luke/singlecell/clusters/orthomcl-sar4/groups.all_sar.txt'
path_genomes = '/Users/luke/singlecell/clusters/orthomcl-sar4/genome_list_sar.txt'
path_export = '/Users/luke/singlecell/notebooks/matrix_sar_full.csv'
# run function
make_og_count_matrix(path_table, path_genomes, path_export)
In [5]:
# make full OG count matrix for Prochlorococcus OGs
# set values
path_table = '/Users/luke/singlecell/clusters/orthomcl-pro4/groups.all_pro.txt'
path_genomes = '/Users/luke/singlecell/clusters/orthomcl-pro4/genome_list_pro.txt'
path_export = '/Users/luke/singlecell/notebooks/matrix_pro_full.csv'
# run function
make_og_count_matrix(path_table, path_genomes, path_export)
In [6]:
def subsample_og_count_matrix(df_matrix, num_subsample):
# make series containing list of OGs in each genome
ser_list_of_ogs = pd.Series(data=np.empty((len(df_matrix.columns), 0)).tolist(), index=df_matrix.columns)
for og, genome_count in df_matrix.iterrows():
for genome, count in genome_count.iteritems():
ser_list_of_ogs[genome] += [og] * int(count)
# make series containing subsampled list of OGs in each genome
ser_list_of_ogs_sub = pd.Series(data=np.empty((len(df_matrix.columns), 0)).tolist(), index=df_matrix.columns)
for genome, list_of_ogs in ser_list_of_ogs.iteritems():
ser_list_of_ogs_sub[genome] = np.random.choice(np.array(ser_list_of_ogs[genome]), size=num_subsample, replace=False)
# make matrix containing counts of subsampled OGs in each genome
df_matrix_sub = pd.DataFrame(columns=df_matrix.columns, index=df_matrix.index)
for genome, list_of_ogs_sub in ser_list_of_ogs_sub.iteritems():
vc = pd.Series(list_of_ogs_sub).value_counts()
for og, count in vc.iteritems():
df_matrix_sub[genome][og] = count
return df_matrix_sub
In [99]:
def make_subsampled_og_count_matrices(path_matrix, num_resample, num_subsample, path_export):
# NB: num_resample is number of replicates, which are NOT averaged
# NB: num_subsample provided based on visual inspection of genome sizes
df_matrix = pd.read_csv(path_matrix, index_col=0)
# generate random string to label each replicate
for i in range(num_resample):
rand_str = ''.join(random.choice(string.ascii_lowercase + string.ascii_uppercase) for _ in range(4))
# for the provided minimum size genome (num_subsample)
# get subset of matrix containing genomes with at least that size
# and then subsample that
df_matrix_subset = df_matrix[df_matrix.columns[df_matrix.sum() >= num_subsample]]
# sumsample matrix
df_matrix_sub = subsample_og_count_matrix(df_matrix_subset, num_subsample)
df_matrix_sub.fillna(0, inplace=True)
df_matrix_sub.to_csv('%s_%s_%s.csv' % (path_export, num_subsample, rand_str))
In [101]:
# make replicate subsampled OG count matrices for Pelagibacter OGs
# set values
path_matrix = '/Users/luke/singlecell/notebooks/matrix_sar_full.csv'
num_resample = 5
num_subsample = 800
path_export = '/Users/luke/singlecell/notebooks/subsampled/matrix_sar_subsample_to'
# run function
make_subsampled_og_count_matrices(path_matrix, num_resample, num_subsample, path_export)
In [102]:
# make replicate subsampled OG count matrices for Prochlorococcus OGs
# set values
path_matrix = '/Users/luke/singlecell/notebooks/matrix_pro_full.csv'
num_resample = 5
num_subsample = 1400
path_export = '/Users/luke/singlecell/notebooks/subsampled/matrix_pro_subsample_to'
# run function
make_subsampled_og_count_matrices(path_matrix, num_resample, num_subsample, path_export)
In [10]:
path_matrix = '/Users/luke/singlecell/notebooks/matrix_sar_full.csv'
df_matrix = pd.read_csv(path_matrix, index_col=0)
In [11]:
# SAGs only sorted
df_matrix.iloc[:,26:].sum().sort_values()
# all sorted
df_matrix.sum().sort_values()
Out[11]:
In [12]:
# genomes with more than N genes
N = 800
df_matrix_gtN = df_matrix[df_matrix.columns[(df_matrix.sum() > N).values]]
with open('/Users/luke/singlecell/clusters/orthomcl-sar4/genomes_gt_%s_genes.grep' % N, 'w') as target:
target.write(r'\|'.join(df_matrix_gtN.columns.tolist()))
with open('/Users/luke/singlecell/clusters/orthomcl-sar4/genomes_gt_%s_genes.list' % N, 'w') as target:
target.write('\n'.join(df_matrix_gtN.columns.tolist()))
target.write('\n')
# OGs found in all genomes with more than N genes
og_list = []
for index, row in df_matrix_gtN.iterrows():
if not (df_matrix_gtN.loc[index] == 0).any():
og_list.append(index)
with open('/Users/luke/singlecell/clusters/orthomcl-sar4/ogs_in_all_genomes_gt_%s_genes.grep' % N, 'w') as target:
target.write(r'\|'.join(og_list))
with open('/Users/luke/singlecell/clusters/orthomcl-sar4/ogs_in_all_genomes_gt_%s_genes.list' % N, 'w') as target:
target.write('\n'.join(og_list))
target.write('\n')
In [6]:
path_matrix = '/Users/luke/singlecell/notebooks/matrix_pro_full.csv'
df_matrix = pd.read_csv(path_matrix, index_col=0)
In [7]:
# all sorted
df_matrix.sum().sort_values()
Out[7]:
In [8]:
# genomes with more than N genes
N = 1400
df_matrix_gtN = df_matrix[df_matrix.columns[(df_matrix.sum() > N).values]]
with open('/Users/luke/singlecell/clusters/orthomcl-pro4/genomes_gt_%s_genes.grep' % N, 'w') as target:
target.write(r'\|'.join(df_matrix_gtN.columns.tolist()))
with open('/Users/luke/singlecell/clusters/orthomcl-pro4/genomes_gt_%s_genes.list' % N, 'w') as target:
target.write('\n'.join(df_matrix_gtN.columns.tolist()))
target.write('\n')
# OGs found in all genomes with more than N genes
og_list = []
for index, row in df_matrix_gtN.iterrows():
if not (df_matrix_gtN.loc[index] == 0).any():
og_list.append(index)
with open('/Users/luke/singlecell/clusters/orthomcl-pro4/ogs_in_all_genomes_gt_%s_genes.grep' % N, 'w') as target:
target.write(r'\|'.join(og_list))
with open('/Users/luke/singlecell/clusters/orthomcl-pro4/ogs_in_all_genomes_gt_%s_genes.list' % N, 'w') as target:
target.write('\n'.join(og_list))
target.write('\n')
In [ ]:
def average_subsampled_og_count_matrices(path_matrix, num_subsample, num_resample, path_export):
# initialize sum matrix
df_matrix = pd.read_csv(path_matrix, index_col=0)
df_matrix_sub_sum = pd.DataFrame(columns=df_matrix.columns, index=df_matrix.index)
df_matrix_sub_sum.fillna(0, inplace=True)
# sum subsampled matrix n times
for i in range(num_resample):
df_matrix_sub = subsample_og_count_matrix(path_matrix, num_subsample)
df_matrix_sub.fillna(0, inplace=True)
df_matrix_sub_sum = df_matrix_sub_sum.add(df_matrix_sub)
# divide by n
df_matrix_sub_sum = df_matrix_sub_sum/num_resample
# write to csv
df_matrix_sub_sum.to_csv(path_export)
In [ ]:
# make average subsampled OG count matrix for Pelagibacter OGs
# path_matrix = '/Users/luke/singlecell/notebooks/matrix_sar_full.csv'
# num_subsample = 200
# num_resample = 1000
# path_export = '/Users/luke/singlecell/notebooks/matrix_sar_subsample%s_resample%s.csv' % (num_subsample, num_resample)
# average_subsampled_og_count_matrices(path_matrix, num_subsample, num_resample, path_export)