Using partis to generate synthetic datasets

This notebook explains how to use the software partis to generate synthetic datasets.

Note that you will also need the Change-O toolkit, as well as the geiger, ape, and TreeSim packages in R.


In [ ]:
import numpy as np
import pandas as pd
import os
import sys

from subprocess import call, check_output, STDOUT, check_call

from icing.externals.DbCore import parseAllele, gene_regex, junction_re
from icing.utils import io

# configuration -- edit these paths for your environment.
# Fix: the original placeholders (`partis_path = # PARTIS PATH`) left nothing
# after the `=`, which is a SyntaxError; use string placeholders instead.
partis_path = "/path/to/partis"   # PARTIS PATH
output_path = "/path/to/output"   # OUTPUT FOLDER
makedb_path = "/path/to/changeo"  # MakeDb.py PATH
n_iter = 3  # NOTE(review): not referenced in the cells below -- confirm before removing

In [ ]:
# need to install geiger, ape, TreeSim in R for this to work
# Run partis once per (n-sim-events, n-leaves) combination; each run writes
# clones_<events>.<leaves>.csv into the output folder.
cmd_template = ('{1}/bin/partis simulate '
                '--simulate-partially-from-scratch '
                '--outfname {3}/clones_{0}.{2}.csv --n-sim-events {0} --n-leaves {2} '
                '--indel-frequency 0.05 --indel-location cdr3 --mean-indel-length 6 '
                '--n-procs 16')
for n_events in [1500, 3000, 5000]:
    for n_leaves in [400]:
        cmd = cmd_template.format(n_events, partis_path, n_leaves, output_path)
        call(cmd.split())

This produces a list of RAW sequence files in the output_path folder.

To simplify the processing with IMGT/HighV-Quest, let's create a single fasta file where, in the ID string, there is also the identity of the original database name. This will allow us to recover our original, separate databases later.


In [ ]:
# 1. gather every partis CSV from the output folder into a single dataframe,
#    tagging each row with the name of the file it came from.
files = [os.path.join(output_path, name)
         for name in os.listdir(output_path) if name.endswith('.csv')]

frames = []
for csv_path in files:
    frame = pd.read_csv(csv_path, index_col=None)
    frame['db'] = csv_path.split('/')[-1]
    frames.append(frame)

df = pd.concat(frames)

In [ ]:
# 2. create fasta file(s), up to CHUNK (500k) sequences each.
# Fix: the original used `df.shape[0] / 500000`, which is float (true)
# division under Python 3, and range() raises TypeError on a float --
# use integer (floor) division instead.
CHUNK = 500000
for i in range(df.shape[0] // CHUNK + 1):
    with open(os.path.join(output_path, "all_{}.fasta".format(i)), 'w') as f:
        for index, row in df.iloc[i * CHUNK:(i + 1) * CHUNK].iterrows():
            # The ID line carries the source db name plus every field except
            # the last 8 columns (presumably the sequence columns -- TODO
            # confirm against the partis CSV layout).
            f.write(">" + "_".join([row['db']] + [str(a) for a in row.values[:-8]]))
            f.write("\n")
            f.write(row['seq'])
            f.write("\n")

Ok! Now we can use IMGT to convert our fasta file(s) into databases which we can use as input to ICING. To do so, connect to IMGT HighV-Quest software and upload the data.

When finished, an email will notify that results are ready. Now, download them and extract the "txz" files as folders to use them with Change-O MakeDb script.


In [ ]:
%%bash -s "$output_path" "$makedb_path"
# Run Change-O's MakeDb.py on each extracted IMGT/HighV-Quest result folder
# to produce tab-delimited databases. $1 = output_path, $2 = makedb_path
# (both forwarded from Python via the -s option).
# Usage: python MakeDb.py imgt -i <imgt output, zip or folder> -s <original fasta file> --scores
# NOTE(review): {0..12} assumes 13 fasta chunks were produced by the cell
# above -- adjust if the number of all_*.fasta files differs.
for i in {0..12}
  do 
     python $2/MakeDb.py imgt -i $1/imgt-pass/partis_6_$i -s $1/fasta/all_$i.fasta --scores
done

Now divide the IMGT/Change-O processed files into a final list of databases usable by our method.


In [ ]:
# Fix: the original read `os.path,join(...)` (comma instead of dot), which is
# a tuple expression calling an undefined `join` -- NameError at runtime.
path = os.path.join(output_path, 'makedb-pass')

db_s = []
for f in [os.path.join(path, x) for x in os.listdir(path) if x.endswith('db-pass.tab')]:
    db_s.append(pd.read_csv(f, dialect='excel-tab'))

df = pd.concat(db_s)

# add the mutation column (V_IDENTITY is a fraction, MUT a percentage)
df['MUT'] = (1 - df['V_IDENTITY']) * 100.

# Recover the original database name from the sequence id: splitting on
# '.csv' and taking the shortest fragment presumably isolates the db prefix
# embedded by the fasta-writing cell -- TODO confirm on a sample id.
df['DB'] = df['SEQUENCE_ID'].str.split('.csv').apply(lambda x: min(x, key=len))
for i in df.DB.unique():
    df[df.DB == i].to_csv(os.path.join(path, str(i) + '.tab'), index=False, sep='\t')

Let's produce an overview of the datasets.


In [ ]:
# Fix: the original read `os.path,join(...)` (comma instead of dot) -- NameError.
path = os.path.join(output_path, 'datasets')

# Collect one summary dict per dataset and build the frame once at the end:
# DataFrame.append() in a loop is quadratic and was removed in pandas 2.0.
rows = []
for f in sorted([os.path.join(path, x) for x in os.listdir(path) if x.startswith('clones_')]):
    df = io.load_dataframe(f)

    # clone / gene identities were encoded into the '_'-separated sequence id
    df['true_clone'] = [x[3] for x in df.sequence_id.str.split('_')]
    row = {}
    row['n_seqs'] = int(df.shape[0])

    # Drop sequences whose true V (then D) gene is an orphon ('OR').
    # The id split is deliberately recomputed after each filter so that the
    # list comprehension stays aligned with the remaining rows.
    df['true_v'] = [parseAllele(x[4], gene_regex, 'first') for x in df.sequence_id.str.split('_')]
    df = df.loc[['OR' not in x for x in df.true_v], :]

    df['true_d'] = [parseAllele(x[5], gene_regex, 'first') for x in df.sequence_id.str.split('_')]
    df = df.loc[['OR' not in x for x in df.true_d], :]

    df['true_j'] = [parseAllele(x[6], gene_regex, 'first') for x in df.sequence_id.str.split('_')]

    row['unique V genes'] = int(df.true_v.unique().size)
    row['unique D genes'] = int(df.true_d.unique().size)
    row['unique J genes'] = int(df.true_j.unique().size)

    # V and D were already filtered above, so their functional counts equal
    # the totals; J was never filtered, so count it explicitly as well.
    row['unique functional V genes'] = len([x for x in set(df.true_v) if 'OR' not in x])
    row['unique functional D genes'] = len([x for x in set(df.true_d) if 'OR' not in x])
    row['unique functional J genes'] = len([x for x in set(df.true_j) if 'OR' not in x])

    row['database'] = f.split('/')[-1]
    row['clonotypes'] = int(df.true_clone.unique().size)
    row['avg seqs/clone'] = np.mean([len(x) for x in df.groupby('true_clone').groups.values()])

    row['mean (std) of V gene mutation'] = "%.2f (%.2f)" % (df.mut.mean(), df.mut.std())
    rows.append(row)

df_all = pd.DataFrame(rows)

In [ ]:
# Sort the datasets numerically by the numbers embedded in the file name
# ("clones_<events>.<leaves>.csv") rather than lexicographically.
df_all['indexNumber'] = [int(i.split('_')[-1].split('.')[0]) + int(
    i.split('_')[-1].split('.')[1]) for i in df_all.database]
# Perform sort of the rows
df_all.sort_values(['indexNumber'], ascending=[True], inplace=True)
# Fix: `drop('indexNumber', 1)` relied on a positional `axis` argument,
# which was removed in pandas 2.0 -- use the explicit `columns=` keyword.
df_all.drop(columns='indexNumber', inplace=True)

df_all['avg seqs/clone'] = df_all['avg seqs/clone'].map('{:.2f}'.format)

count_cols = ['n_seqs', 'clonotypes', 'unique V genes', 'unique D genes', 'unique J genes',
              'unique functional V genes', 'unique functional D genes',
              'unique functional J genes']
df_all[count_cols] = df_all[count_cols].astype(int)

# Fix: the "unique functional * genes" columns are included here because the
# later cells combine them into "total (functional)" strings and then delete
# them -- the original selection omitted them, causing a KeyError downstream.
sorted_df = df_all.loc[:, ['database', 'n_seqs', 'clonotypes', 'avg seqs/clone',
                           'unique V genes', 'unique D genes', 'unique J genes',
                           'unique functional V genes', 'unique functional D genes',
                           'unique functional J genes',
                           'mean (std) of V gene mutation']]

Save the results in the current folder.


In [ ]:
sorted_df.to_latex("dataset_table.tex", index=False)

Other useful visualisations


In [ ]:
df = io.load_dataframe(sorted([os.path.join(path, x) for x in os.listdir(path) if x.startswith('clones_')])[-1])

In [ ]:
# Recover the true V/D/J gene assignments encoded in the '_'-separated
# sequence ids (fields 4, 5 and 6). No rows are filtered between the three
# assignments, so the split can safely be computed once and reused.
id_fields = df.sequence_id.str.split('_')
df['true_v'] = [parts[4] and parseAllele(parts[4], gene_regex, 'first') for parts in id_fields]
df['true_d'] = [parts[5] and parseAllele(parts[5], gene_regex, 'first') for parts in id_fields]
df['true_j'] = [parts[6] and parseAllele(parts[6], gene_regex, 'first') for parts in id_fields]

In [ ]:
# Set of V genes: display the distinct true V genes present in this dataset
# (bare last expression, so the set renders as rich cell output).
set(df.true_v)

In [ ]:
# Fold each "unique X genes" count together with its functional count into a
# single "total (functional)" string, one gene class at a time.
for gene in ('V', 'D', 'J'):
    total_col = 'unique %s genes' % gene
    functional_col = 'unique functional %s genes' % gene
    sorted_df[total_col] = ['%d (%d)' % (total, functional)
                            for total, functional in zip(sorted_df[total_col],
                                                         sorted_df[functional_col])]

In [ ]:
# Drop the helper "functional" columns now folded into the combined strings.
del sorted_df['unique functional V genes']
del sorted_df['unique functional D genes']
del sorted_df['unique functional J genes']
# Fix: removed `del sorted_df['prova']` -- no 'prova' column is ever created
# in this notebook (leftover debug line), so it raised a KeyError.

In [ ]:
# Spot-check partis' true V assignment against the IMGT call.
# NOTE: only the last expression of a cell is displayed; the original first
# line's slice was silently discarded, so use display() to show both.
display(df[['true_v', 'v_call']].iloc[0:72])
# Fix: `df[mask, ['']]` is invalid DataFrame indexing (raises TypeError/KeyError);
# use .loc with the boolean mask and the columns of interest.
df.loc[df['true_v'] == 'IGHV3-11', ['true_v', 'v_call']]