In [1]:
import pandas as pd
import glob
import skbio
import re
In [2]:
# Build a (genome x k-mer) count table from the per-genome k-mer FASTA files.
# Each FASTA record contributes one cell: the sequence text is the column
# label and the record's header id is the value.
mer = 6
path_glob = '/Users/luke/singlecell/jellyfish/*_%smer.fa' % mer

# Glob once so the row labels and the files actually parsed cannot disagree.
paths = glob.glob(path_glob)
file_names = [p.split('/')[-1] for p in paths]

# Collect one {kmer: count} dict per file and build the frame in a single
# call; assigning cell-by-cell with df.loc re-allocates the frame every time
# a new k-mer column appears.
rows = []
for path in paths:
    counts = {}
    for record in skbio.io.read(path, format='fasta'):
        # The header id is parsed as a string; convert it so the table holds
        # numbers rather than a mix of str counts and int zeros.
        # NOTE(review): assumes every header id is an integer count
        # (jellyfish dump layout) -- int() raises ValueError otherwise.
        counts[str(record)] = int(record.metadata['id'])
    rows.append(counts)

# k-mers absent from a genome come out as NaN; they mean "seen zero times".
df = pd.DataFrame(rows, index=file_names).fillna(0).astype(int)

# Strip the file-name suffix to leave the genome code. The suffix is escaped
# and anchored so '.' is literal and only the trailing suffix is removed.
suffix_pattern = re.escape('_%smer.fa' % mer) + '$'
df.index = [re.sub(suffix_pattern, '', name) for name in df.index]
In [3]:
# Load the genome metadata table (tab-separated); the first column is used
# as the row index.
df_genome_metadata = pd.read_csv(
    '/Users/luke/singlecell/notebooks/genome_metadata.tsv',
    sep='\t',
    index_col=0,
)
In [4]:
# Subset the k-mer table to Prochlorococcus genomes and write it to CSV.
# Only metadata rows that carry a non-null 'jellyfish' code are used.
pro_rows = (
    (df_genome_metadata['genus'] == 'Prochlorococcus')
    & df_genome_metadata['jellyfish'].notnull()
)
code_pro = list(df_genome_metadata.loc[pro_rows, 'jellyfish'])

# NOTE(review): recent pandas raises KeyError from .loc if any code has no
# row in df -- confirm every listed code has a k-mer file.
df_pro = df.loc[code_pro]
df_pro.to_csv(
    '/Users/luke/singlecell/notebooks/jellyfish_proch_%smer.csv' % mer
)
In [5]:
# Subset the k-mer table to Pelagibacter genomes and write it to CSV.
# Only metadata rows that carry a non-null 'jellyfish' code are used.
pel_rows = (
    (df_genome_metadata['genus'] == 'Pelagibacter')
    & df_genome_metadata['jellyfish'].notnull()
)
code_pel = list(df_genome_metadata.loc[pel_rows, 'jellyfish'])

# NOTE(review): recent pandas raises KeyError from .loc if any code has no
# row in df -- confirm every listed code has a k-mer file.
df_pel = df.loc[code_pel]
df_pel.to_csv(
    '/Users/luke/singlecell/notebooks/jellyfish_pelag_%smer.csv' % mer
)
In [6]:
# Subset the k-mer table to the Pelagibacter and Prochlorococcus genomes
# together and write it to CSV. Only metadata rows that carry a non-null
# 'jellyfish' code are used.
combined_rows = (
    df_genome_metadata['genus'].isin(['Pelagibacter', 'Prochlorococcus'])
    & df_genome_metadata['jellyfish'].notnull()
)
code_combined = list(df_genome_metadata.loc[combined_rows, 'jellyfish'])

# NOTE(review): recent pandas raises KeyError from .loc if any code has no
# row in df -- confirm every listed code has a k-mer file.
df_combined = df.loc[code_combined]
df_combined.to_csv(
    '/Users/luke/singlecell/notebooks/jellyfish_combined_%smer.csv' % mer
)
In [ ]: