Dynamically typed language created by Guido van Rossum (BDFL), first released in the early 1990s.
A data structure that holds (encapsulates) both data and actions that can be performed on these data
A pointer to specific memory location that holds the actual value (object)
Functions (reusable blocks of code that compute something) can be manipulated (assigned and passed around) as any other object
Python's standard library (things that come with language) and docs are tremendous resources
To save later aggravation, set your editors to use 4 spaces indents and NEVER EVER mix tabs and spaces.
In [1]:
import this
Always code as if the guy who ends up maintaining your code will be a violent psychopath who knows where you live.
John F Woods
Python is an interpreted language. When you launch it you find yourself in a Python shell (REPL), which operates much like an interactive shell such as bash.
In [2]:
a = 1
a
Out[2]:
In [1]:
from __future__ import division, print_function
In [10]:
b = 2
a / b
Out[10]:
In [4]:
a / float(b)
Out[4]:
In [5]:
float(a) / b
Out[5]:
In [6]:
# Float can be written in two ways
f = 1.23456
g = 1e-8
f + g
Out[6]:
In [7]:
c = 1j # same as (0 + 1j)
i = 1.092 - 2.98003j
c - i
Out[7]:
In [2]:
s = 'I am a Python string. I can be "manipulated" in many ways.'
s
Out[2]:
In [3]:
seq = '>RNA\tAGUUCGAGGCUUAAACGGGCCUUAU'
seq
Out[3]:
In [4]:
print(seq)
In [5]:
len(seq)
Out[5]:
In [6]:
print(s[0])
print(s[5])
In [7]:
# Strings are immutable!!!
s[3] = 'n'
In [8]:
# Let's have a look inside:
print(dir(s))
In [9]:
s.split()
Out[9]:
In [10]:
s.split('p')
Out[10]:
In [11]:
s.upper()
Out[11]:
In [12]:
seq.split()
Out[12]:
In [13]:
seq
Out[13]:
In [14]:
# Unpacking
name, seq = seq.split()
print('Name: {}\tSequence: {}'.format(name,seq))
In [15]:
s = ' i am a really stupid long string '.strip()
s.strip('i')
Out[15]:
In [16]:
sl = s.split()
print(' '.join(sl))
In [17]:
sl, len(sl)
Out[17]:
In [21]:
sl[2:]
Out[21]:
In [22]:
print(dir(sl))
In [24]:
sl.insert(2, 'another')
sl
Out[24]:
In [25]:
sl.extend(['another', 'item'])
sl
Out[25]:
In [32]:
# Lists are mutable!!!
lst = ['item', 'another item']
# Build the upper-cased copy with an explicit loop.
lst1 = []
for entry in lst:
    lst1.append(entry.upper())
print(lst1)
# Mutating lst afterwards does not change lst1 — it is a separate list.
lst.append('add to lst')
print(lst1)
In [30]:
# Appending to lst leaves lst1 untouched — they are independent objects.
lst.append('YAY')
print(lst)
print(lst1)
In [28]:
# Lists are ordered and iterable:
# copy every element of lst into a fresh list, preserving order
lst1 = [element for element in lst]
print(lst1)
In [34]:
# Strings are iterable: this prints the sequence one character per line.
for c in seq:
print(c)
In [33]:
# Don't do this:
for idx in range(len(lst)):
    print(lst[idx])
# If you need index of each element:
for idx, element in enumerate(lst):
    print(idx, element)
In [35]:
# To check that an element is in list:
has_item = 'item' in lst
print(has_item)
print('Item' in lst)  # membership is case-sensitive
if has_item:
    print('Item in list!')
else:
    print('Not in list')
In [36]:
# Tuples can be taken apart element by element.
t = ('name', 'position')
name = t[0]
position = t[1]
print(name, position)
In [38]:
# From the earlier example: zip walks two sequences in lockstep,
# stopping at the shorter one.
r1 = range(11)
r2 = range(0, 100, 10)
for pair in zip(r1, r2):
    print(pair[0], pair[1])
In [39]:
sl
Out[39]:
In [40]:
set(sl)
Out[40]:
In [41]:
# Nice optimization trick: membership tests on a set are O(1) on average,
# while membership tests on a list scan it linearly.
huge_list = range(1000000)
huge_set = set(huge_list)

def check_list(elem, lst=None):
    """Return True if elem is contained in lst.

    lst defaults to the module-level huge_list.  (The original body had an
    `if lst:` whose only content was a comment — a syntax error — and
    ignored the lst argument entirely.)
    """
    if lst is None:
        lst = huge_list
    return elem in lst

def check_set(elem):
    """Return True if elem is in the precomputed huge_set (O(1) average)."""
    return elem in huge_set
# And let's refactor this into one function and talk about variable scopes
In [44]:
%time check_list(10000)
Out[44]:
In [45]:
%time check_set(10000)
Out[45]:
AKA hashtable, associative array. In general, mapping type.
In [48]:
# A small mapping from string keys to integers.
d = dict(key1=1, key2=20, key3=345)
d
Out[48]:
A key can be any hashable type. In practical terms this means it must be immutable.
In [47]:
print(dir(d))
In [49]:
d.pop('key1')
print(d)
# Note also that dict is unordered
In [50]:
# Assignment
d['key1'] = 101
d['key2'] = 202
d
Out[50]:
In [51]:
# Membership test operates on keys
print('key3' in d)
print('key4' in d)
In [52]:
# Looping over key:value pairs
for key,val in d.items():
print(key,val)
In [ ]:
# This works but is not good: if an exception is raised inside the loop,
# fh.close() is never reached and the file handle leaks.
# (The original loop body was only a comment, which is a syntax error.)
fh = open('myfile.txt', 'r')
for line in fh:
    pass  # Do something
fh.close()
In [ ]:
# Better way (context manager): the file is closed automatically when the
# with-block exits, even if the loop raises.
# (The original loop body was only a comment, which is a syntax error.)
import os.path
directory = 'dir1'
filename = 'file.txt'
with open(os.path.join(directory, filename), 'r') as fh:
    for line in fh:
        pass  # Do something
# Do something else. At this point fh.close() will be called automatically
In [ ]:
# We can also work with gzipped files.
# (The original loop body was only a comment, which is a syntax error.)
import gzip
with gzip.open('myfile.gz', 'rb') as fh:  # Note 'rb' as file opening mode
    for line in fh:
        pass  # Do something
In [55]:
# Chained comparisons and integer truthiness (0 is falsy).
a = 1
b = 0
if a > 0 and a < 2:
    print('a is True!')
if b == 0:
    print('b is False!')
In [56]:
s1 = 'Fox'
s2 = 'Dog'
# Equality check
if s1 != s2:
print('s1 and s2 are different')
# Identity check
if s1 is s2:
pass
else:
print('s1 and s2 are not the same thing!')
In [57]:
s2 = s1
# Identity check
if s1 is s2:
print('s1 and s2 POINT to the same thing')
else:
print('s1 and s2 are not the same thing!')
In [58]:
# This however doesn't work for integers
b = 1
if a == b:
print('a and b are equal!')
if a is b:
print('a and b are not the same thing!')
# Implementation detail: integers from -5 to ~256 are cached (singletons)
In [59]:
# Empty sequence types evaluate to False, non-empty ones to True
lst1 = []
lst2 = ['a']
if lst1 and lst2:
print('Both lists are non-empty.')
else:
print('One of the lists is empty.')
In [61]:
if lst1:
pass
else:
print('Empty')
In [ ]:
# Another singleton
a = None
In [62]:
!ls -lah ../data
In [70]:
import os
import sys
import csv
import gzip
import pandas as pd
csv.field_size_limit(sys.maxsize)
def parse_pileup(barcode, dirname='../results', track='minus', sample_id='minus'):
    """Parse a gzipped samtools pileup file into a pandas DataFrame.

    Parameters
    ----------
    barcode : str
        Combined with sample_id to build the file name
        '<barcode>_<sample_id>.pileup.gz' inside dirname.
    dirname : str
        Directory holding the pileup file.
    track : str
        Name of the output column receiving the read-start count
        (the number of '^' characters in the pileup 'details' field).
    sample_id : str
        Second component of the file name.

    Returns
    -------
    pandas.DataFrame with columns 'pos', 'base' and *track*.  Positions are
    shifted down by one (pileup is 1-based, output is 0-based); position 0
    gets a placeholder base '*'.
    """
    filename = os.path.join(dirname, '{0}_{1}.pileup.gz'.format(barcode, sample_id))
    print(filename)
    # 'rt' (text mode) so csv receives str lines on Python 3 as well;
    # the original 'rb' hands bytes to csv, which fails under Python 3.
    with gzip.open(filename, 'rt') as pileup:
        reader = csv.DictReader(pileup,
                                delimiter='\t',
                                fieldnames=['seqname', 'pos', 'base', 'coverage', 'details', 'qual'])
        data = []
        for rec in reader:
            pos = int(rec['pos'])
            # '^' marks the start of a read segment in the pileup details.
            starts = rec['details'].count('^')
            if pos == 1:
                data.append({'pos': 0, 'base': '*', track: starts})
            else:
                # The original kept a `last = rec` alias here, but it always
                # referenced the current record, so rec['base'] is used directly.
                data.append({'pos': pos - 1, 'base': rec['base'], track: starts})
    return pd.DataFrame.from_records(data)
In [71]:
df = parse_pileup('AAGCTA', dirname='../data', sample_id='R2')
In [72]:
df
Out[72]:
Summarize the E. coli .gff file in Python
Find all occurrences of the sequence
GGGGCGGGGG in the E. coli genome
Find all almost exact occurrences of the same sequence in the E. coli genome (ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz)
In [1]:
import gzip
from collections import Counter
foo = 'bar'
def column_stat(filename, col=2, sep='\t'):
    """Count the distinct values of column *col* in a gzipped, *sep*-separated file.

    Empty lines and comment lines (starting with '#') are skipped.
    Returns a collections.Counter mapping value -> occurrence count.
    """
    counter = Counter()
    # Scope demo: this local `foo` shadows the module-level one.
    foo = 'BAZ'
    print(foo)
    # 'rt' so we iterate over str lines on Python 3 (the original default
    # binary mode yields bytes, breaking startswith('#') / split(sep)).
    with gzip.open(filename, 'rt') as fi:
        for line in fi:
            line_norm = line.strip()
            if line and (not line_norm.startswith('#')):
                fields = line_norm.split(sep)
                counter[fields[col]] += 1
    return counter
def column_stat1(filename, **kwargs):
    """Same as column_stat, but configured through **kwargs.

    Recognized keys: 'col' (column index, default 2) and 'sep'
    (field separator, default '\\t').  Returns a collections.Counter.
    """
    counter = Counter()
    # .get with defaults: the original kwargs['col'] / kwargs['sep'] raised
    # KeyError when omitted, inconsistent with column_stat's defaults.
    col = kwargs.get('col', 2)
    sep = kwargs.get('sep', '\t')
    # 'rt' so we iterate over str lines on Python 3.
    with gzip.open(filename, 'rt') as fi:
        for line in fi:
            line_norm = line.strip()
            if line and (not line_norm.startswith('#')):
                fields = line_norm.split(sep)
                counter[fields[col]] += 1
    return counter
In [2]:
#column_stat('../data/GCF_000005845.2_ASM584v2_genomic.gff.gz', col=3, sep=',')
kwargs = {'col': 2, 'sep': '\t'}
column_stat('../data/GCF_000005845.2_ASM584v2_genomic.gff.gz', **kwargs)
Out[2]:
In [3]:
!ls -lah ../data
In [4]:
pattern = 'GGGGCGGGGG'
# Watson-Crick complement lookup for DNA bases.
nuc = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

def rc(seq):
    """Return the reverse complement of DNA sequence *seq*."""
    return ''.join(nuc[base] for base in reversed(seq))
# Load the genome: concatenate all non-header FASTA lines, upper-cased.
# 'rt' so lines are str (not bytes) on Python 3 — the original binary-mode
# read makes startswith('>') and ''.join fail there.
with gzip.open('../data/GCF_000005845.2_ASM584v2_genomic.fna.gz', 'rt') as fi:
    glines = []
    for line in fi:
        line_norm = line.strip().upper()
        if not line_norm.startswith('>'):
            glines.append(line_norm)
    seq = ''.join(glines)
def find_kmer(seq, pattern):
    """Map every k-mer of *seq* (k = len(pattern)) to a list of its start positions.

    Returns a dict {kmer: [positions...]}; look up *pattern* in the result
    to get its occurrences.
    """
    seq_map = {}
    k = len(pattern)
    # range(len(seq) - k + 1): the original used len(seq) - k, which skipped
    # the final k-mer (the sibling code at the mismatch search uses the
    # correct +1 bound), and xrange, which does not exist on Python 3.
    for i in range(0, len(seq) - k + 1):
        kmer = seq[i:i + k]
        seq_map.setdefault(kmer, []).append(i)
    return seq_map
# Occurrences of the pattern on the forward strand, then on the reverse
# strand (via the reverse complement of the pattern).
forward = find_kmer(seq, pattern)
print(' '.join(str(p) for p in forward[pattern]))
rc_pattern = rc(pattern)
reverse = find_kmer(seq, rc_pattern)
print(' '.join(str(p) for p in reverse[rc_pattern]))
In [5]:
nuc = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
# Reverse-complement the pattern by hand: complement each base via the
# lookup dict, then reverse the result.
res = [nuc[base] for base in pattern]
print(''.join(res)[::-1])
In [6]:
def rc(seq):
    """Return the reverse complement of *seq* using the module-level `nuc` map."""
    complement = [nuc[base] for base in seq]
    return ''.join(complement)[::-1]
In [7]:
D = 1  # number of mismatches allowed in the approximate search below

def hamming(s1, s2):
    """Compute the Hamming distance between equal-length strings s1 and s2.

    Raises ValueError if the lengths differ.
    """
    if len(s1) != len(s2):
        raise ValueError('s1 and s2 must be the same length to compute Hamming distance!')
    distance = 0
    for ch1, ch2 in zip(s1, s2):
        if ch1 != ch2:
            distance += 1
    return distance
# Collect every k-mer of seq within D mismatches of the pattern, then print
# all their start positions in ascending order.
seq_map = {}
k = len(pattern)
for i in range(0, len(seq) - k + 1):  # range: xrange is Python-2 only
    kmer = seq[i:i + k]
    if hamming(kmer, pattern) <= D:
        if kmer in seq_map:
            seq_map[kmer].append(i)
        else:
            seq_map[kmer] = [i, ]
# Flatten the position lists, then sort numerically for display.
res = ' '.join(' '.join(str(x) for x in positions) for positions in seq_map.values())
print(' '.join(str(x) for x in sorted(int(x) for x in res.split())))
In [8]:
# zip pairs up the two strings position by position.
s1 = 'AAATTTTAAA'
s2 = 'AAATTATAAT'
for pair in zip(s1, s2):
    print(pair[0], pair[1])
In [9]:
def hamming(s1, s2):
    """Compute the Hamming distance between equal-length strings s1 and s2.

    Raises
    ------
    ValueError if s1 and s2 differ in length.
    """
    if len(s1) != len(s2):
        raise ValueError('s1 and s2 must be the same length to compute Hamming distance!')
    return sum(ch1 != ch2 for ch1, ch2 in zip(s1, s2))

def find_with_mismatch(seq, pattern, D=1):
    """Find every k-mer of *seq* within *D* mismatches of *pattern*.

    Returns a dict {kmer: [start positions...]} containing only k-mers whose
    Hamming distance to *pattern* is <= D.
    """
    seq_map = {}
    k = len(pattern)
    # range(): xrange is Python-2 only and raises NameError on Python 3.
    for i in range(0, len(seq) - k + 1):
        kmer = seq[i:i + k]
        if hamming(kmer, pattern) <= D:
            seq_map.setdefault(kmer, []).append(i)
    return seq_map
# Exact matches only (zero mismatches allowed).
print(find_with_mismatch(seq, pattern, D=0))
In [10]:
# Allow up to one mismatch.
print(find_with_mismatch(seq, pattern, D=1))
In [ ]: