In [1]:
import pysam
In [39]:
# Scan the BAM for long reads whose alignment covers less than 90% of the read,
# write each one to a FASTA file and print a one-line summary for it.
BAM_PATH = '../outputs/moleculo/LR6000017-DNA_A01-LRAAA-AllReads.unmasked.sorted.bam'
FASTA_PATH = '../outputs/moleculo/reads_lt_90_2.fasta'
MIN_READ_LENGTH = 9000        # a read must be strictly longer than this to be kept
MAX_ALIGNED_FRACTION = .9     # keep reads with fewer aligned pairs than this fraction of their length
COUNTER_LIMIT = 50            # stop once `counter` has passed this value

# NOTE(review): the cell's indentation was lost in the export. This assumes the
# counter tracks *written* reads (check/increment nested under the filter) --
# confirm against the original notebook.
counter = 0
with pysam.Samfile(BAM_PATH, "rb") as samfile, open(FASTA_PATH, 'w') as f:
    for record in samfile:
        seq = record.seq
        if seq is None:
            # No sequence stored for this record; len() would raise TypeError.
            continue
        reclen = len(seq)
        pairs = record.aligned_pairs
        if reclen <= MIN_READ_LENGTH or len(pairs) >= MAX_ALIGNED_FRACTION * reclen:
            continue

        f.write('>' + record.qname + '\n')
        f.write(seq)
        f.write('\n')

        # Span between the first and last aligned query positions. Pairs can
        # carry None on the query side, and a read with no alignment has no
        # pairs at all, so guard both cases instead of indexing blindly.
        query_positions = [qpos for qpos, _ in pairs if qpos is not None]
        query_span = query_positions[-1] - query_positions[0] if query_positions else 0
        print(record.qname, reclen, len(pairs), query_span)

        # Check-then-increment order kept from the original so the final value
        # of `counter` is unchanged.
        if counter > COUNTER_LIMIT:
            break
        counter += 1
In [34]:
# Display the current value of `counter` (set by the BAM scan loop).
counter
Out[34]:
In [19]:
import screed
In [25]:
# Open the screed database stored at the path below.
screed_db_path = '../outputs/moleculo/LR6000017-DNA_A01-LRAAA-AllReads.fastq_screed'
db = screed.ScreedDB(screed_db_path)
In [40]:
# Preview the first 20 keys of the screed database.
db.keys()[:20]
Out[40]:
In [27]:
len('GTGGAGGAGAAGGACCTGGGGGTGTTGGTTGGCAGCTGAACACGGGCCTCAAGTTGTGCCAGGAAAGGTTTAAATTGGTCATTAGGAAGAGTGCTTCTGTGGACAGGGTGGTGAGAGGCTGTGATGGGCTGCGCCAGGCAGCAGTGCTGTCACCACCCCTGCAGGTGTTGGAGAGATGGTGGATGTGGCACTGAGGGATGTGGCTTAGGGGAGCTTGGGGGTCTGACAGTATCAGCTGGTGGTTGGACTTGATTTTAGAGGTCTTTTCCAACCTGAATGGCTCGATGGTGCTAATGGCTAAATAGGAATTGCAAGCACCGGGCTGTTTCTGCCCCAAGTCAGTGAGACCAATGCGTGTGTGCTGACCTGGCTTTGGGACAGTAATTCTGCTGTTCCTGAGTTCAGCCTCTGGGGTGGTGCAAAGGTGATGGAGCCGAGCTGCAGTGGTGCCCGTGGGTGGGATGGGGCAGTGGGCACAAACTGGAATGCAGGAAAACACCTCTGAACACCGTGAACTGCTCATCTGCTGTGTGGTGACGGAGCCTCAGCACAACCCAGAGCGGCTGTGGAGTTCCCCTCCTTGGGAAGCTGTCTGGAACTGATCTGGAGTAGAGGTGGTGCGCATTTGTGCCTCTGAGAAGCTGTGGCCTCGTGTTGAGTGGGAAGAACAGATTCACCCTCCTCAAACACTGCTGATGGTCACAGAGGCATTTTCATTTTCCCTGTCTTTAAATGAGTCTTTTCAGTTGCCGGCTTGGTGAACAGTGAATGGTGAAAGAAGAAATTTCCCTGTAAGCCTGTGCTACTTTGGAGCTCCACGGCGGCTGCAGGCCAGCTAAGATGCCTGTGAAATACTGGCACTATGTTTGCTGGCTGTTAAGCTGATACAGAACACGTGCACTCCTAGCTTTGTTGCAGTGTCCTTCACCAGAAGTGTTATCTCTAGAACAGGAGTCCAAGGCCTGCCTTCTTGCTTCACTGATTGCTTCCCAGGAATTCTGTTCTGCAGGCTGCAGAGGCTGAGATTGTGCCATAAAGGCATTAGAAATCAGTGTGTGTGTTTGCAGGGACGAGAGAAATCGTGTTTTCCAAACGAGGAAGACTTTTCACTTTCTATGCTTATTGCAGATGAGTGATTACGATAGACCCAGAGCGTGCTGTGTAGGAAGGAGAGAGGTTTTTTATCTCCTTACTGTCAAATAAATTGTTTTTCTTCTCTTCTGCCTCCCAGAACCCACGAGATAAACAGGAATGGTGTGAAGGAGCTGTGTGTGAGCAGCAGCACTGCCCTGGAAACCTCCCCCCTACTGAGCAAGCAGACCTATGAGCTCCTCAGTCCATGAAGGAGGTTGGTGTTCTCGTGGCCTTCTTGCTTCCTGATAGAGAAACGCAATTAGCACACTTAGTTTAAAGCCAGGTAGTTGGAAGCTGTGACTGCCACGCAGAAGTGATGCAGTGGTATTCTCTTCCTTGTTGGTGTTTCCATGCTTTACGTGGCAATGTGAGAGAGGAAGGGTTTGTTTGCAGAGATAAAGGAGATGGAGGATGAGCGTATGGCCAACTTGAGAAGTCTCCTTCTGGCTAAACACATTAAAACTTGATGTAAGCTTGAGTTTGTATCTAGCTTAATACTTGCGTGTCCTGCTGCAGTCAGGCTACAGCAGGGAGAACTGTAAGCAAACTAAAAGCTCCTTCTGCCTCCTCCCTCCGAGCGGTGTGGGGGAATGGGAGTCACGTGCAGTTTGTAATGCTTCACCTCTGCACTCCTTCATCCTCGCTCTGCTGTGAGGTCGTTCCTATGGGATGCCACCCTTCCCAAACTGATCCTGCCCAGGCTTCCCACAGGAATGGATTTGCTCCAGTGCGAGGCCCCTCAGGCTGCAGCTCCTGCCCAATGATGTGCTCCAATGTAGACTTAACTGCATGGACTGCACTCAGGCCTTGGGCCAACTCCTGCAGGGCCTTTCCGTGGGCTGCAGCCTCTTTAGGCCACAT
CTGCCTGCTCTGCAGCGGGCTCCTCTCCACAGGCTGCAGGGAACTTCTGTTCCATGCCTGGAGCACCTCTTGCCCTCCTCCTGCACCCACCTTGCTGTTCTCTCAACATTCTCTCCCTTCTGTCTCTGAGCGGTGCAGCATTTCTTCCCCTTTCTTCAGTCTGCTCTCCCAGAGGTGCGCTCAGCATCCGGAGCTGTCTGTGCCATGAAGCAGCTCCTGCACTCTGACAGAGGCCACCCCAGCAGCCCAAACCTTGTCACACAAACCCGATGCAATTCCTGAGGTTGTTCTGTTCTTTTGTGGTTTAACCCTGGCAGGCAGCTCGGCACCACACAGCTGCTTGCTCTCCGTCCCTAGGTGGGATGGGAAAGAGGATTGGAAAGGTAAAAGTGCGAGAACTCAATGAGCTGAGGTAAACCACTGTGTTCCCTTGGCAGAGGAGTTGTATTCTCCGTAAGCCTGATCAGTGTGGAGGGATGCCTTTTTTTCCCCCAGCAGTTTTGGCTGTATCCTGACTCTTTGCTCTGTTTCAGGCTGAAGATGCTGAGCCGTTTGTCAGGTTTGGCGAATACTGTTCTACAAGAGCTGTCTGGTGATGGTGGAGATGCAGTTACTGACTCTGAGTCCTCTGTCACTGTAAGTACGCTGTAGTGGAAAGACTTCCTGGCCTGACGTGGAATTCGTTTTGGGAGTATAAACCATCTGGCTGCAGTGAGAACTGTTTCCTCCTTCACTGCTGAGTGCTGAAGTGAACTACTGAGCCAATGTTCCAATCTGCTTCTGTATTCCCAACATCTTAAGTCAGCGCTTACGTGGGGCAAACCAGAATCTACTTCAGTTTCTAAATCCCAAAGTGATGCTAGAAAAAGCATTAGAATGAGAATGAAGGAAGATTTTCAAGTTTCAGTGGTTCTGTACAGCAGGAAGGGGAGCTCTGCACAGTTGGGTGCCAGGAGAAGACTTTGAGCTGCATGCTGCCCTTTAGCACCTGGGGCAGTGCAGAGCACATGAGTGGTGTCATCTCCTGAGAGCCACTCACTCCAGCTTTCCTGTGTTCATTGAAGATGCTTGCATTCTTTGTTTGTTTGTTTTGTTTTGTTTTCCCCCTCCTTATACATAGTATCATAGAATGGCCTGGGTTGAAAAGGACCACAATGCTCATCTCGTTCCAATCCCCCTGCTATGTGCAGGGTCGCCAACCAGCAGACCAGGCTGCCCAGAGCCACATCCAGCCTGGCCTTGAATGCCTGCAGGAATGGGGCATCCACAACATCTTTTTTCCCAAGGATCTTTTCTCACTGTTTCACGTAATCGTATGTAGTTCTGTAGGCTGCATGAAATGTTTGCAAGGAACAAGAAGGCAGCGTTTAGGAGCGGACCTGAAGGTGACATCTGGATCTGCGCTGCGGTCTGTTTGGACTTTCCTCCTCCAGAGCACTCATCTCACTGTTTCAGGTAGAGCCTTCAGAGCTGGGAGGAGGAGGCATGGAGGAGGCACCGGAGGACGTGTTGGAACGGTTGGCACACACTGAAAAGCTGGTTGTGCAGCTGAAGGATTTGATCCGAGAGAAGGATGCCCTGCTCCAGCAAAAGGAAACTGTGCTCAAGGTATCATCTCTTTAACTGAGGGCCTAACAGCAAGTGTGAAATGCTGTGTCACCACCTTTCTTTGCAGGCCACTTTGTGCTTTTTTTTCTGCCACTGAGATGCCTGCACTACTGAAGGCATCGGTTTCGCACGGTGTTGGGATTCCAAAGAGTGGGCCCCAACCTAAGAGATGTGGGGTTGCTTCTGTGCCCTTTGTTTAGGTGTAGCTGCCTCTAAAGTAAAGACACGGCATCCACGTGCCGTGCATGTTTGGAGTTGTCATTGCTCTAATAAGTGAGAATATCTCTTATGTGAGTGGCTTTTTTTTTTCCTCCAGTTCAGAGCATTTGAGACCTACCAGGCTGTGCTGCAGCCTCCCAGCAGTGCAGTGGTTTATGGCATTTAAACAAATG
TATGTACTGCTTATCCTGAGGCATCGCAGGAGAGGAAGTGCTCAGGGACTGCATATTGTTGAGGAAGTCAAATTTAGCTTCTTCTGACTGCTAGGAGATGTCCCTTGAGTGGTCCATCCCCTGGAGAGAAACTGGCATGCAACTATTTTGCTGTCTGTAGCGCTTCAGGGCAGGTTTGTTTCCTGAATGTTTTAATTAGCATTGGTAGTGAGAGCGGTTTGGGCAAAGAAACAGACTGACAAAACTGCAGCTTTTATGAGAGGTGCTTTCAGGGTGATGGACAGCACTGGGATGTTAAGCACAGCCCGTTGGTGTTCTTCTCTTCCAGGAGGAACGCGAAGCTGCGGATGCAAAGCTGATGAAACTGAAACTTCAGGCCAAAGCCAAGCTGGCATCTCTGAACAAACGCATCGAGGAGCTGACAGAGAAGGGCCCTCTGCTGCCTGCACAGGCCGTGCCAGAAGAGCAAGTGTGTGCTAAGGTTGGGAGAGCAGGGCTACTTCAAAGGTCTGGGGTTGCCCTATATTGTAGGTCCCGTCTCTCTGTTGTTAGCAGGACATTCATAGACCCAGCAAGGATATCTGCTGCAGGAGGCTGATCTGTCCAGCTTCTCTGTACTGCAGACCTCCTTGGGGTGCACAGCTTGATCTCCTCAAGGGTGTGGGTCAGTTGCATGTTGTGTATCTTCAGTGCTCTCTTACAGTTAAGCACTTGCAGAAATCACATCTGTGTGTGAGCTGTAGGTTTCCAAAGGTTACCTGTTCTGGTAGCACAGTGAAAGAAAGGGGAGGTAGGCTTGAATACAAGACAGAAGGAGTAATTGAAAGGCAGTAACTGAAACGTAAGATGGTATTTCTGAGCCATGTATCTTCCTGTGCCTCTGCAGTGTGTCGTGGTAAGCAGGAACTGTTTATGTGAACCTGGGTAAGAGTAGGGGCAGGAATGTCCCATACCAGGACGTTAAAGAAGCTGCTGACTGCTGGTTAGGAGCTACCACAGGCATTGCTGAGTTTATGAGAACTTTGGAAATTGTTCTGCTGACCACATCATCTTTTTGGGAAAAACTTTGTATGAGTGGGTAAGAGGAGTGAGATAGGAGCCCAGAGAGGTCACAGCACGTTGGGTGATCTTCAGTCAACCAGGGAGGATAAAAGTGATTAGAAACACTGGGGAGAGAAAAGAAGAAGTTAAGAATTATAGGGAAAGTCAACAGGTGAGTGTGAGTGTATGAAAATGTGAGCTTTTAGGGTAGGGGTGGTCACTGATTGAAGAGCACCTGAGAGGCAATCCTGAAAAGACGTTGATCAAAGGTTTGTGGGTTCACCAGAGGCAAGCCCTGCTTCACAGACTGCATCTTCTTCTAAGATGAGATCCCCACCCAGCTGACTGAGGGTTGGTGTCTGTCTTTTGGATTCCAGTATAACTTTGAGGCTGTTTCTCACGGTATCCTTTGGACAAAGCATCCAGCATACAGGTAGCCAAAACCACAGCGCAGTGGGTGAGCAGTTGGCTGACAGGTGGGGTTCCGTCAGGCTGCAGGGGTTCTGCAGGGCTCTGTTTCAGGGCCACGTCTCTTGGATGCTTTCGTCAATGACTTAAATGTCTTGAAGGTGTGCTGAGTAGGTTTGTGACAGCGCTGAGCCAAGCTTAGAAGAAACACAGTTTCCTCTTAGGGCTGTGACAGGAAGGTCGGCACCAAGAAGGTGGCCCAGCTGTTGAAGTGGGAGGGCTTTACTGGCAAAGAAATGAAGTGAGAAAGAAAAAAAGAGGTTCTTTTTTTTTTAAAGTAGTAAAAAATAAGCATATGGTTTGAAGTCAAATGCAGTAGTGTTTGGTTTTTGGTTTTTGTTTTAAATGGACAGTAATAATAATAGTTTTTTCTTATGTTTCTTCCTGATTCTTTCTTGCAGTGTCAGAATAGCCAAAATGCAGGAGAAGAGCACAGGGACAAAGCTGAGGGTCTGGAAGAGCAGCTCAGGGAGCAAGAAGAGGCTGTTAAG
GATCTGAAGGAGCAGCTGGCTTTAGCCAAGATGAATCTGAAAGATGCTGAAGTCAAATATGCATCACAGGTACATGGAAAAAAAAAAGGCTGTTTTTTCTCTAATTGTGTGTTTCTTACCCCACCACTTACACTGATGTAATTCATGAGAGAGGGTTATTCGTGGTGACGCTGCTGCTGCTGCTGGTGGGGATGTGCCCAAATCCAGCAGTCTTGGCAGAAACCAGACTACCAGGGCTCCAGCAACCCAGGTCCATCCTGAGGGCATCGTTCCTGGCAGCTGTGTGTGCACAGACAGCCATACAGCACGGCACTTGTATGAGAGCACACAAAGTACCCATTTTTCTCTCTGTCAGCTCAGATGAGAGAACAGTGAGAATATGCACAGTAGCAGGTTTGGATCCTCCCAACAGCTGTGCTCAGGCACTCAGTCTGCCCAGTAACCAGCCCTTCAACCCCAGTCTCCCCAGTTCCTGGCATGTGGATGTGCAGAACCACAGGGCTTTTGGTCCTGTTCCACCTGTAGGCGCAGACCTGCTGGTCTTGCACACGTGCACAGAGCTGACTGGGATTCCCCTGGTCCCTCTAACAGCTAGCCCTCTGTGCTGCTCTCCTTGGAGAGGTGCAGATACACACAGACCCTCATAATACACAGCTGCACACCCTTCTCTCTGCTAACTGGCTCCCTGCTAACTGGCTCCCTGCTGTCTCCCCCTCAGAGATGTGGAAATACACAGAAGCAGGCCTTGCTTTTAGAGCCCCCCGAGGTGCTGCGGTCCCTCTGGTATAGCTGGAGCTCCCTTGGCCATCCAGCAGCCAAGCCTGCTTGTGATATTGACCTTGGGTGCTATTCCCCATCTTCCCCTTTTGAGGGAGGGGGTCTGCCACTCTGCTCTGTGCTGGGACACCTCACCTGGAACACTGCATCCAGATGGGGAGTCCTCAGCACGGGAGAGACACAGACCTGTTGGAGTGCATGCAGAGGAGGGCCACAAAAGTGATCCAAGGGATGGAACACCTCTCCTATGAGGACAGGCTGAGAGCTGGAGCCGTGCAGCATGGAGAGGAGAAGGCTGCGAGGGAACCTGAGAGTGGCTTGTCAGTATCTACAGGGGAGCTGCAGGAAGGAAGGGGACAGACTATTGAGCAGGGTCTGTGGTGATAGAACAAGGGGAAGAGGCTTCAAGCTCAAGGAAGGGAGGTTTAGGCGGGATATAAGGAAAAGGTCTCACACAGTGAGGCTGGTGAGGCACTGGCACAGATTGCCCAGAGATGCGGTGAGTGCCCCATCCCTGGAGACTTTCAAGGCCGGGCTGGATCAGGCCCTGGGCAGCCTGATGGAGCTGTGGTGTCCCTGTGCGTTGCAGGGGAGTTGGGCTAGGTGGCCTTTAAAGGTCCCTTCCAACTCCTAAGGATTCTGTGATTGTGCTTGCCACGTCGTCTGGCTTGGAGTTAGCTGGTGGCGATGCATGCAGCTGCATGCTCTCACTTGCTCCAGTTGCCAGCACCACAGGCACGTGAGCCCTTTGGCCCCCAGCCTGACTCTGCTGGGATTTCCCCTCGGAGGACAGAAAGTGTTTCCCACAGAAAAGAGAAAGAATTAGAGGTAGAATTCAGCGAGACACAGCGCTGATCAGATGCAGGGTATGGCCAGGCAGCCGTGTCCGCCAGCTGCTGTTACACACAGCTTGCTTTTTTAATACCATTGCCATCTATTTCGTGCTTGCTCCTCCCCAAACATCCTAGATCCACGCACTTCCATGTTTTTGGTTCTTCCTCCAAACACCCCACAGTCAGTCTGTGCAGTCCCAGAGCATTCCTTGGCTGCTTCCCACTCTGTATCCCTCAGCAGTATCCACCACTTCGTGCAAGAGGTGCTCAGCAGTCCTCTTACTGACTTTCCCTCTTGGATAATGAGAGAGGTGGCTCTCTGAGCAGGAGGTGAGGAAGCAGCCCCTGGATGGGGCTGGACAGGGAAGCCCTGGCTCTGAGTTGGGATTT
GGGCTCCTGCCTGCTGTTGTGCTTTGGTGGATGGGTTTTGGCATCTTATCTCTGTCCTGGGGTGAATTTTGGAGCTTCTCCCATCCCCACCATTGCCCCTCTCTGCTTTGCTGGGGATGAGCAGATGAGATTGTTATCTCATAGCAGCTGCTGCTGTTGTGGTCATTTATGGAGCGGTTGATGTTAGTGCTCTGGATGTCAGTTTTAGCTTCTGATCCTGGATCCTGTGCAAATACAAAACCAGAAGCTTTACCAGCTCGTAGTCACATAGCCATTATGTTTTATCATTCTGCATATTTATAAATGAAAATCATACGTTCTTTGCTATGTGGGATCTGAACTATCACCAGTTAATGTCTGTGTGTTCATAACTGATGAAGTTATTGTCTGACCAAAGCCACACAGAGGGTTGCCAGGCTGCAGACATCTCCATACACGTGTTGTGTGCTGATGCTGAAAGCACTCCTGCGCGTCTGTCAGCTCATAACCCTCTTGGGTTATGTTTGCTTGTTTCTTTTTCCTCTCTGGGTCCTGTATTGTGGTTTCTGTTTTCAGCTGAGCTCGCTGCAGGAAGTAATTCAGGAGAAGGAAGCTCTCCTCCAGGAGCAGGCTCACCAGCACCAAGCTGAACTGCTGAGAACAGCAGCCAAGGCAGATCAGGAAGCAGAGGTGCAGCAGGTATGTTTGTTATAACCACCTTGTTATGAAGAGGAGAAAAGAAAATGCGTTTTATGAGTCAGAGGATGCGCTGGGAAAGCTGTTCTCAGCAAAGGGAGCAGGGCTGAGAAGAGGTGGGAGCTGATGGCTTGGGGCTGTTCGGTTGCTTTTAGAAGGATATTTCACGCTTCCAGAGTTCCCCCTTCAGTCTGCTGAGCTGCTGTAAGACTTCCCTTGCTGCAGGTAGGAATGTGGTTATTGTTGAATGCAGTGGCAATGCTGCCATTCCATGTTTGTGAAACAGCGGTGAGCTCTCCTTGCTGCACTTTGGTTCCCTGGTAAACATCAATAGGAGCAAAGTTTTTTGGGCCATTCAAGTAACTGTTGCAAGAAGTGGGTGGGTGGGTGAAACTTAGGGGATGTGGTCCTTTTGCAGTAGCTTCTCAGGGACTCTTACAGACCCATCTCCCACAGAACCTGCGTACACTGCAGAGGAAGCTGGAGGAGCGAGAAGAAGCTCTGCTGGGACAAACGCAGGCAGTGGAACTGCTGCAGCAGGAGCTACGTGAGGCTGAGCAACAAAACCAGGTGCCCTTTTGTGTGCTTGTTGCTTTCTCCCATGTGTCTGAATGGCTGGAGGAGCTTTGTACTGACAAAGCATGGAAGTTCCCAGATGGGCTTATTGCTGGCACAAGGATTGCAGCACGGAATCAGCCAGCAGGTTCATTAACATACTCCTTCCTTTCAAAGAAGAGCACAGATTTAATTGCAGCAGAAAAGGAAATAAAATGAATACACTACGCAGAGAATGTTTCAGTAATAATATTATTGCCCACATCTTCTGATGTCTGCCTCTATTTCTGGCCTTATGGTCCCCCCCACAACATTCATGGCTTCCATCTGTTTGAGGATGAGGGATACAGCGAGCTGTCAGCACCGCAGACCTCCAAACAGGGGGGATTTGGTGTTTGTGGCTCATTCCTCGACCCAGGATCCACTTGGAGCCATTTTTATGATTGCTGAGATGGTTTTACACCTCTTTGTTAGTCCACAAGTTCATGCTGTGTCTGATT')
Out[27]:
In [19]:
# List the unmapped-reads BAM (size, permissions, timestamp) to check it exists.
!ls -lah ../outputs/moleculo/unmapped_reads.bam
In [18]:
%%bash
# Extract reads with SAM flag 0x4 (read unmapped) set into their own BAM.
module load samtools

in_bam=../outputs/moleculo/LR6000017-DNA_A01-LRAAA-AllReads.sorted.bam
out_bam=../outputs/moleculo/unmapped_reads.bam
tmp_bam="${out_bam}.tmp"

# Write to a temporary file and rename only on success. A plain `>` redirect
# would create/truncate the final path even if samtools is missing or fails,
# leaving an empty or partial BAM that looks like a valid result.
if samtools view -b -f 4 -o "${tmp_bam}" "${in_bam}"; then
    mv "${tmp_bam}" "${out_bam}"
else
    rm -f "${tmp_bam}"
    exit 1
fi