/local/repo
):git clone git@github.com:cfe-lab/MiCall.git
cd MiCall
git checkout MultiUseDocker
docker build --target dev -t micall .
If the docker
command does not work try prefixing the command with sudo
and/or ensuring the docker daemon is running (sudo systemctl start docker
or sudo service docker start
Collect data
SARS-COV-2
on NCBI SRA (https://www.ncbi.nlm.nih.gov/sra)SRR11851926_1.fastq
SRR11851926_2.fastq
/path/to/fastq_files
Run MiCall
Run docker on your data folder
docker run --mount type=bind,source=/local/repo/MiCall,destination=/opt/micall --mount type=bind,source=/path/to/fastq_files,destination=/data micall folder
Results
folder in your /path/to/fastq_files
that contains an output
folder and a scratch
folder. For this analysis we will focus on the scratch
folder which should contain a folder for every sample you processedFor each sample you downloaded, you must add an entry into the /local/repo/MiCall/micall/utils/conseq_compare.py
script in the MAPPING
variable. Add a 4-column (space-delimited) line in the format:
reference_name
e.g. SRR11578349_1.fastq RNA-Seq 2020-05-04 EPI_ISL_427024
.
You must also add the reference sequences to /local/repo/MiCall/micall/tests/working/fetched_accessions.fasta
, most of them can be found by searching https://www.gisaid.org/ using the named reference in the SRA entry or other keywords from the SRA page (NRW-???).
>ABC
) match column 4 of at least one entry in the MAPPING
variable in step 4 (e.g. X X X ABC
)conseq_path
to be the path to your conseq_all.csv
file which will be in /path/to/fastq_files/Results/output
Run /local/repo/MiCall/micall/utils/conseq_compare.py
. For this step I used the script from the dan_latest
branch: https://github.com/cfe-lab/MiCall/blob/dan_latest/micall/utils/conseq_compare.py
docker run --mount type=bind,source=/local/repo/MiCall,destination=/opt/micall -it --rm -w /opt/micall --entrypoint python micall -m micall.utils.conseq_compare
/path/to/fastq_files/Results/scratch
for data.csv
and data_extended.csv
)
In [2]:
from pathlib import Path
import os
import csv
import pandas as pd
import yaml
import numpy as np
import statistics
from operator import itemgetter
In [1]:
def get_mixtures(row):
total = row['A'] + row['T'] + row['C'] + row['G']
thresh = 0.05 * total
alleles = {
'qpos': int(row['query.nuc.pos']),
'mixtures': {}
}
if row['coverage'] > 100:
for aa in ('A', 'C', 'T', 'G'):
if row[aa] > thresh:
alleles['mixtures'][aa] = round((row[aa] / row['coverage'] * 100), 2)
if len(alleles['mixtures']) > 1:
return alleles
else:
return None
In [ ]:
BASE = Path('/path/to/fastq_files')
ROOT = BASE / 'Results' / 'scratch'
samples = os.listdir(ROOT)
In [ ]:
results = {}
for sample in samples:
results[sample] = {}
data = ROOT / sample / 'data.csv'
data_extended = ROOT / sample / 'data_extended.csv'
nucs = ROOT / sample / 'nuc.csv'
conseq_all = ROOT / sample / 'conseq_all.csv'
# Get length of conseq
conseqs = pd.read_csv(conseq_all)
conseq_len = len(conseqs[conseqs['region'].isnull()]['sequence'].iloc[0])
results[sample]['conseq_len'] = conseq_len
# Get concordances
with open(data_extended) as f:
reader = csv.DictReader(f)
for row in reader:
results[sample]['concordance'] = row['concordance']
# Get coverages
datafile = pd.read_csv(data)
for _type in ('mismatch', 'deletion', 'addition'):
info = datafile[
(datafile['type'] == _type)
]
results[sample][_type] = info
# Get mixtures
nucsfile = pd.read_csv(nucs)
nucsfile = nucsfile[
~nucsfile['region'].str.contains('nsp')
]
nucsfile['mixtures'] = nucsfile.apply(compute, axis=1)
nucsfile = nucsfile[
~nucsfile['mixtures'].isnull()
]['mixtures']
results[sample]['mixtures'] = nucsfile
Get the number of samples N
In [ ]:
N = len(results)
print(f'We tested {N} samples')
Get the total number of nucleotides across all samples
In [ ]:
total_nucleotides = sum([results[x]['conseq_len'] for x in results])
Get a dataframe of the mismatches only
In [ ]:
mismatches = pd.concat(
[results[sample]['mismatch'] for sample in results]
)
Get the number of mismatches
In [ ]:
n_mismatches = len(mismatches)
Get the number of unique mismatches
In [ ]:
unique_mismacthes = len(mismatches['sample'].unique())
Match mixtures with mismatch positions
In [ ]:
mismatch_data = []
for i,row in mismatches.iterrows():
thing = row.to_dict()
mixtures = results[row['sample']]['mixtures']
matching_mixture = None
for mix in mixtures:
if mix['qpos'] == row.pos:
matching_mixture = mix
break
if matching_mixture:
thing['match'] = True
else:
thing['match'] = False
mismatch_data.append(thing)
Print information on the mismatches per sample
In [ ]:
mismatches_per_sample = [len(results[x]['mismatch']) for x in results]
median = statistics.median(mismatches_per_sample)
mean = round(statistics.mean(mismatches_per_sample), 3)
q75, q25 = np.percentile(mismatches_per_sample, [75 ,25], interpolation='midpoint')
iqr = q75 - q25
print(f'median number of mismatches per sample: {median}')
print(f'mean number of mismatches per sample: {mean}')
print(f'25th percentile of mismatches: {q25}')
print(f'75th percentile of mismatches: {q75}')
print(f'iqr number of mismatches per sample: {iqr}')
Print information on the concordances per sample
In [ ]:
concord = [float(results[x]['concordance']) for x in results]
median = statistics.median(concord)
mean = round(statistics.mean(concord), 3)
q75, q25 = np.percentile(concord, [75 ,25], interpolation='midpoint')
iqr = round(q75 - q25, 4)
print(f'median concordance: {median}')
print(f'mean concordance: {mean}')
print(f'25th percentile: {q25}')
print(f'75th percentile: {q75}')
print(f'iqr concordance: {iqr}')
Print information on the number of mixtures per sample
In [ ]:
mixnums = [len(results[s]['mixtures']) for s in results]
median = statistics.median(mixnums)
mean = round(statistics.mean(mixnums), 3)
q75, q25 = np.percentile(mixnums, [75 ,25], interpolation='midpoint')
iqr = q75 - q25
print(f'total number of mixtures: {sum(mixnums)}')
print(f'median number of mixtures: {median}')
print(f'mean number of mixtures: {mean}')
print(f'25th percentile: {q25}')
print(f'75th percentile: {q75}')
print(f'iqr number of mixtures: {iqr}')
Print information on the minor frequency alleles (If a position has 60% A and 40% T, the minor frequency allele is T)
In [ ]:
minor_freqs = []
for sample in results:
_min = None
for mix in results[sample]['mixtures']:
if not _min:
_min = min(mix['mixtures'].items(), key=itemgetter(1))
continue
mymin = min(mix['mixtures'].items(), key=itemgetter(1))
if mymin < _min:
_min = mymin
if _min:
minor_freqs.append(_min)
minor_freqs = [x[1] for x in minor_freqs]
median = round(statistics.median(minor_freqs), 2)
mean = round(statistics.mean(minor_freqs), 2)
q75, q25 = np.percentile(minor_freqs, [75 ,25], interpolation='midpoint')
iqr = round(q75 - q25, 4)
print(f'median minor allele frequency: {median}')
print(f'mean minor allele frequency: {mean}')
print(f'25th percentile: {q25}')
print(f'75th percentile: {q75}')
print(f'iqr minor allele frequency: {iqr}')