In [1]:

    
import pandas as pd
import numpy as np
import re
import random
import string









    



/Users/luke/anaconda/envs/qiime190/lib/python2.7/site-packages/pandas/computation/__init__.py:19: UserWarning: The installed version of numexpr 2.4.4 is not supported in pandas and will be not be used

  UserWarning)



In [2]:

    
pd.set_option('display.max_rows', 500)

Make summary table of OG, Count, Example, Description



In [3]:

    
def make_og_table_with_description(path_table, path_descr, path_export):
    with open(path_table) as f:
        table = f.readlines()
    num_cols = len(table[0].split(' '))-1
    col_names = range(num_cols)
    # read groups table as df
    df_table = pd.read_csv(path_table, sep=' ', index_col=0, header=None, dtype=object, names=col_names)
    df_table.index = [re.sub(':', '', x) for x in df_table.index]
    # read description table as df
    df_descr = pd.read_csv(path_descr, sep='\t', header=None)
    df_descr.columns =['Code', 'Accession', 'Description']
    # re-group table with one entry per gene and add descriptions
    df_table_stack = pd.DataFrame(df_table.stack(), columns=['Accession'])
    df_table_stack['Accession'] = [x.split('|')[2] for x in df_table_stack['Accession']]
    df_table_stack = df_table_stack.reset_index().merge(df_descr, left_on='Accession', right_on='Accession', how='left')
    df_table_stack['level_1'] = df_table_stack['level_1'] + 1

    # write output to file
    with open(path_export, 'a') as outfile:
        outfile.write('Ortholog group\tNumber of genomes\tExample accession\tDescription\n')
        for og in df_table_stack['level_0'].unique():
            og_size = df_table_stack[df_table_stack['level_0'] == og]['level_1'].max()
            og_example = df_table_stack[df_table_stack['level_0'] == og]['Accession'].iloc[0]
            og_example = re.sub('Pelagi', 'Pelagibacter', og_example)
            og_example = re.sub('Proch', 'Prochlorococcus', og_example)
            descr_list = df_table_stack[df_table_stack['level_0'] == og]['Description'].tolist()
            for i in range(len(descr_list) - 1):
                if descr_list[i] == 'hypothetical protein':
                    descr_list[i] = ''
            descr_list = filter(None, descr_list)
            og_descr = descr_list[0]
            outfile.write('%s\t%s\t%s\t%s\n' % (og, og_size, og_example, og_descr))



In [4]:

    
# make summary table for Pelagibacter OGs
path_descr = '/Users/luke/singlecell/clusters/orthomcl-sar4/descr_sar.tsv'
# ... RS-only OGs
path_table = '/Users/luke/singlecell/clusters/orthomcl-sar4/groups.RSonly_sar.txt'
path_export = '/Users/luke/singlecell/notebooks/table_sar_RSonly.tsv'
make_og_table_with_description(path_table, path_descr, path_export)
# ... all OGs
path_table = '/Users/luke/singlecell/clusters/orthomcl-sar4/groups.all_sar.txt'
path_export = '/Users/luke/singlecell/notebooks/table_sar_all.tsv'
make_og_table_with_description(path_table, path_descr, path_export)



In [5]:

    
# make summary table for Prochlorococcus OGs
path_descr = '/Users/luke/singlecell/clusters/orthomcl-pro4/descr_pro.tsv'
# ... RS-only OGs
path_table = '/Users/luke/singlecell/clusters/orthomcl-pro4/groups.RSonly_pro.txt'
path_export = '/Users/luke/singlecell/notebooks/table_pro_RSonly.tsv'
make_og_table_with_description(path_table, path_descr, path_export)
# ... all OGs
path_table = '/Users/luke/singlecell/clusters/orthomcl-pro4/groups.all_pro.txt'
path_export = '/Users/luke/singlecell/notebooks/table_pro_all.tsv'
make_og_table_with_description(path_table, path_descr, path_export)



In [ ]:

Make matrix of OG counts across genomes



In [3]:

    
def make_og_count_matrix(path_table, path_genomes, path_export):
    # determine number of columns
    with open(path_table) as f:
        table = f.readlines()
    num_cols = len(table[0].split(' '))-1
    col_names = range(num_cols)
    # read groups table as df
    df_table = pd.read_csv(path_table, sep=' ', index_col=0, header=None, dtype=object, names=col_names)
    df_table.index = [re.sub(':', '', x) for x in df_table.index]
    df_table.replace(np.nan,'none|none|none', regex=True, inplace=True)
    # return genome code from gnl|code|accession
    def get_code(series_of_gnl_code_accession):
        codes = [x.split('|')[1] for x in series_of_gnl_code_accession]
        return codes
    df_codes = df_table.apply(get_code)
    # get list of genomes
    df_genomes = pd.read_csv(path_genomes, header=None)
    df_none = pd.DataFrame(['none'])
    df_genomes = df_genomes.append(df_none)
    # get values counts for each OG
    df_counts = pd.DataFrame(index=df_genomes[0])
    for index, series in df_codes.iterrows():
        df_counts[index] = series.value_counts()
    # transpose, fill na, and delete none column
    df_matrix = df_counts.transpose()
    df_matrix.fillna(0, inplace=True)
    df_matrix.drop('none', axis=1, inplace=True)
    # write to csv
    df_matrix.to_csv(path_export)



In [4]:

    
# make full OG count matrix for Pelagibacter OGs
# set values
path_table = '/Users/luke/singlecell/clusters/orthomcl-sar4/groups.all_sar.txt'
path_genomes = '/Users/luke/singlecell/clusters/orthomcl-sar4/genome_list_sar.txt'
path_export = '/Users/luke/singlecell/notebooks/matrix_sar_full.csv'
# run function
make_og_count_matrix(path_table, path_genomes, path_export)



In [5]:

    
# make full OG count matrix for Prochlorococcus OGs
# set values
path_table = '/Users/luke/singlecell/clusters/orthomcl-pro4/groups.all_pro.txt'
path_genomes = '/Users/luke/singlecell/clusters/orthomcl-pro4/genome_list_pro.txt'
path_export = '/Users/luke/singlecell/notebooks/matrix_pro_full.csv'
# run function
make_og_count_matrix(path_table, path_genomes, path_export)

Make matrix of subsampled OG counts across genomes



In [6]:

    
def subsample_og_count_matrix(df_matrix, num_subsample):
    # make series containing list of OGs in each genome
    ser_list_of_ogs = pd.Series(data=np.empty((len(df_matrix.columns), 0)).tolist(), index=df_matrix.columns)
    for og, genome_count in df_matrix.iterrows():
        for genome, count in genome_count.iteritems():
            ser_list_of_ogs[genome] += [og] * int(count)
    # make series containing subsampled list of OGs in each genome
    ser_list_of_ogs_sub = pd.Series(data=np.empty((len(df_matrix.columns), 0)).tolist(), index=df_matrix.columns)
    for genome, list_of_ogs in ser_list_of_ogs.iteritems():
        ser_list_of_ogs_sub[genome] = np.random.choice(np.array(ser_list_of_ogs[genome]), size=num_subsample, replace=False)
    # make matrix containing counts of subsampled OGs in each genome
    df_matrix_sub = pd.DataFrame(columns=df_matrix.columns, index=df_matrix.index)
    for genome, list_of_ogs_sub in ser_list_of_ogs_sub.iteritems():
        vc = pd.Series(list_of_ogs_sub).value_counts()
        for og, count in vc.iteritems():
            df_matrix_sub[genome][og] = count
    return df_matrix_sub



In [99]:

    
def make_subsampled_og_count_matrices(path_matrix, num_resample, num_subsample, path_export):
    # NB: num_resample is number of replicates, which are NOT averaged
    # NB: num_subsample provided based on visual inspection of genome sizes
    df_matrix = pd.read_csv(path_matrix, index_col=0)
    # generate random string to label each replicate
    for i in range(num_resample):
        rand_str = ''.join(random.choice(string.ascii_lowercase + string.ascii_uppercase) for _ in range(4))
        # for the provided minimum size genome (num_subsample)
        # get subset of matrix containing genomes with at least that size
        # and then subsample that
        df_matrix_subset = df_matrix[df_matrix.columns[df_matrix.sum() >= num_subsample]]
        # sumsample matrix
        df_matrix_sub = subsample_og_count_matrix(df_matrix_subset, num_subsample)
        df_matrix_sub.fillna(0, inplace=True)
        df_matrix_sub.to_csv('%s_%s_%s.csv' % (path_export, num_subsample, rand_str))



In [101]:

    
# make replicate subsampled OG count matrices for Pelagibacter OGs
# set values
path_matrix = '/Users/luke/singlecell/notebooks/matrix_sar_full.csv'
num_resample = 5
num_subsample = 800
path_export = '/Users/luke/singlecell/notebooks/subsampled/matrix_sar_subsample_to'
# run function
make_subsampled_og_count_matrices(path_matrix, num_resample, num_subsample, path_export)



In [102]:

    
# make replicate subsampled OG count matrices for Prochlorococcus OGs
# set values
path_matrix = '/Users/luke/singlecell/notebooks/matrix_pro_full.csv'
num_resample = 5
num_subsample = 1400
path_export = '/Users/luke/singlecell/notebooks/subsampled/matrix_pro_subsample_to'
# run function
make_subsampled_og_count_matrices(path_matrix, num_resample, num_subsample, path_export)

List genomes with more than N genes, and OGs present in all of them

Pelagibacter



In [10]:

    
path_matrix = '/Users/luke/singlecell/notebooks/matrix_sar_full.csv'
df_matrix = pd.read_csv(path_matrix, index_col=0)



In [11]:

    
# SAGs only sorted
df_matrix.iloc[:,26:].sum().sort_values()
# all sorted
df_matrix.sum().sort_values()









    Out[11]:





S001     346
SG15     349
SA08     389
SC10     511
SN08     604
SB16     604
SN17     623
SC09     737
SE07     740
S003     756
S002     756
SK18     779
SE22     820
SO19     859
SL23     908
SM22     917
SJ21     948
S004     973
SF16    1008
S005    1020
SP11    1026
SI19    1068
SO20    1072
SM18    1073
SD22    1076
S006    1165
SA20    1202
S008    1323
S017    1337
S015    1344
S016    1349
S018    1349
S024    1353
S014    1375
S010    1387
S023    1404
S025    1420
S020    1428
S013    1430
S012    1453
S022    1460
S007    1463
S021    1497
S011    1505
S009    1514
S019    1541
S026    1821
dtype: float64



In [12]:

    
# genomes with more than N genes
N = 800
df_matrix_gtN = df_matrix[df_matrix.columns[(df_matrix.sum() > N).values]]
with open('/Users/luke/singlecell/clusters/orthomcl-sar4/genomes_gt_%s_genes.grep' % N, 'w') as target:
    target.write(r'\|'.join(df_matrix_gtN.columns.tolist()))
with open('/Users/luke/singlecell/clusters/orthomcl-sar4/genomes_gt_%s_genes.list' % N, 'w') as target:
    target.write('\n'.join(df_matrix_gtN.columns.tolist()))
    target.write('\n')
    
# OGs found in all genomes with more than N genes
og_list = []
for index, row in df_matrix_gtN.iterrows():
    if not (df_matrix_gtN.loc[index] == 0).any():
        og_list.append(index)
with open('/Users/luke/singlecell/clusters/orthomcl-sar4/ogs_in_all_genomes_gt_%s_genes.grep' % N, 'w') as target:
    target.write(r'\|'.join(og_list))
with open('/Users/luke/singlecell/clusters/orthomcl-sar4/ogs_in_all_genomes_gt_%s_genes.list' % N, 'w') as target:
    target.write('\n'.join(og_list))
    target.write('\n')

Prochlorococcus



In [6]:

    
path_matrix = '/Users/luke/singlecell/notebooks/matrix_pro_full.csv'
df_matrix = pd.read_csv(path_matrix, index_col=0)



In [7]:

    
# all sorted
df_matrix.sum().sort_values()









    Out[7]:





P011     113
P047     395
P002     433
P010     467
P096     476
P021     525
P006     542
P009     564
P058     598
P102     631
P024     655
P084     762
P061     765
P007     770
P008     812
P005     908
P103     930
P003     943
P069     948
P104     979
P051     983
P004    1022
P018    1026
P043    1027
P072    1032
P099    1079
P070    1090
P053    1102
P092    1116
P028    1118
P059    1138
P067    1179
P023    1203
P017    1228
P055    1233
P093    1253
P020    1261
P027    1269
P038    1278
P026    1281
P036    1287
P032    1294
P101    1304
P083    1305
P052    1335
P025    1336
P097    1337
P086    1370
P064    1382
P089    1390
P045    1394
P090    1396
P078    1407
PI15    1416
P077    1445
P001    1462
P022    1465
P019    1473
P044    1485
P094    1487
P087    1494
P066    1521
P079    1521
P056    1524
P080    1525
P076    1534
P037    1544
PI06    1564
P081    1567
P060    1570
P085    1570
PF05    1590
P088    1597
P071    1606
P046    1607
P091    1616
P050    1616
P054    1619
P095    1620
P057    1623
P074    1628
P039    1644
P040    1652
P034    1656
PJ16    1658
P073    1665
P030    1665
PM23    1666
P029    1670
P100    1673
P062    1675
P082    1676
P031    1689
P063    1690
P065    1720
P041    1734
P035    1737
P016    1758
P049    1769
P098    1775
P033    1798
P042    1818
P014    1832
P068    1849
P048    1849
P119    1852
P106    1864
P121    1872
P012    1889
P135    1892
P075    1893
P131    1898
P139    1900
P129    1902
P136    1902
P128    1904
P138    1905
P105    1906
P140    1909
P137    1910
P107    1912
P108    1913
P124    1913
P013    1913
P130    1916
P125    1919
P115    1921
P114    1931
P116    1937
P127    1938
P117    1957
P118    1958
P122    1969
P120    1982
P015    1989
P109    2065
P133    2108
P132    2149
P134    2162
P113    2190
P126    2551
P112    2643
P111    2659
P110    2666
P123    2731
dtype: float64



In [8]:

    
# genomes with more than N genes
N = 1400
df_matrix_gtN = df_matrix[df_matrix.columns[(df_matrix.sum() > N).values]]
with open('/Users/luke/singlecell/clusters/orthomcl-pro4/genomes_gt_%s_genes.grep' % N, 'w') as target:
    target.write(r'\|'.join(df_matrix_gtN.columns.tolist()))
with open('/Users/luke/singlecell/clusters/orthomcl-pro4/genomes_gt_%s_genes.list' % N, 'w') as target:
    target.write('\n'.join(df_matrix_gtN.columns.tolist()))
    target.write('\n')

# OGs found in all genomes with more than N genes
og_list = []
for index, row in df_matrix_gtN.iterrows():
    if not (df_matrix_gtN.loc[index] == 0).any():
        og_list.append(index)
with open('/Users/luke/singlecell/clusters/orthomcl-pro4/ogs_in_all_genomes_gt_%s_genes.grep' % N, 'w') as target:
    target.write(r'\|'.join(og_list))
with open('/Users/luke/singlecell/clusters/orthomcl-pro4/ogs_in_all_genomes_gt_%s_genes.list' % N, 'w') as target:
    target.write('\n'.join(og_list))
    target.write('\n')

Skip: Average subsampled OG count matrices



In [ ]:

    
def average_subsampled_og_count_matrices(path_matrix, num_subsample, num_resample, path_export):
    # initialize sum matrix
    df_matrix = pd.read_csv(path_matrix, index_col=0)
    df_matrix_sub_sum = pd.DataFrame(columns=df_matrix.columns, index=df_matrix.index)
    df_matrix_sub_sum.fillna(0, inplace=True)
    # sum subsampled matrix n times
    for i in range(num_resample):
        df_matrix_sub = subsample_og_count_matrix(path_matrix, num_subsample)
        df_matrix_sub.fillna(0, inplace=True)
        df_matrix_sub_sum = df_matrix_sub_sum.add(df_matrix_sub)
    # divide by n
    df_matrix_sub_sum = df_matrix_sub_sum/num_resample
    # write to csv
    df_matrix_sub_sum.to_csv(path_export)



In [ ]:

    
# make average subsampled OG count matrix for Pelagibacter OGs
# path_matrix = '/Users/luke/singlecell/notebooks/matrix_sar_full.csv'
# num_subsample = 200
# num_resample = 1000
# path_export = '/Users/luke/singlecell/notebooks/matrix_sar_subsample%s_resample%s.csv' % (num_subsample, num_resample)
# average_subsampled_og_count_matrices(path_matrix, num_subsample, num_resample, path_export)