In [ ]:
import glob
glob.glob("inflammation-??.csv")
In [ ]:
filenames = glob.glob('inflammation*.csv')
for f in filenames:
data = numpy.loadtxt(fname=f, delimiter=',')
print("file",f[13:15],
", # rows and columns: ", data.shape, sep="")
In [ ]:
import numpy
import matplotlib.pyplot
filenames = sorted(glob.glob('inflammation*.csv'))
filenames = filenames[0:3]
for f in filenames:
print(f)
data = numpy.loadtxt(fname=f, delimiter=',')
fig = matplotlib.pyplot.figure(figsize=(10.0, 3.0))
axes1 = fig.add_subplot(1, 3, 1)
axes2 = fig.add_subplot(1, 3, 2)
axes3 = fig.add_subplot(1, 3, 3)
axes1.set_ylabel('average')
axes1.plot(numpy.mean(data, axis=0))
axes2.set_ylabel('max')
axes2.plot(numpy.max(data, axis=0))
axes3.set_ylabel('min')
axes3.plot(numpy.min(data, axis=0))
fig.tight_layout()
matplotlib.pyplot.show()
In [ ]:
num = -3
print("num is",num)
print("first try: ")
if num > 0:
print(num, "is positive")
print("second try: ", end="")
if num > 0:
print(num, "is positive")
else:
print(num, "is 0 or negative")
print("third try: ", end="")
if num > 0:
print(num, "is positive")
elif num == 0:
print(num, "is zero")
else:
print(num, "is negative")
In [ ]:
f = 'inflammation-01.csv'
data = numpy.loadtxt(fname=f, delimiter=',')
print("file",f,": ", sep="", end="")
if numpy.max(data, axis=0)[0] == 0 and numpy.max(data, axis=0)[20] == 20:
print('Suspicious looking maxima!')
elif numpy.sum(numpy.min(data, axis=0)) == 0:
print('Minima add up to zero!')
else:
print('Seems OK!')
In [ ]:
a = None # try again with a = "abc", "", [1,2,3], [], True, False, 0, 1, 18
if a:
print("a was true")
else:
print("a was false")
useful tools for strings: in
, startswith
In [ ]:
a = "hello world"
print(a.startswith("h"))
print(a.startswith("he"))
print("h" in a)
print("low" in a)
print("lo w" in a)
We have already seen easy ways to search/replace things in strings, with the string methods .split
, .replace
, .strip
. See dir("")
for string methods. To get access to manipulations using regular expressions:
re
libraryr''
to write the regular expression pattern. for raw
strings: to read a \n
as slash and an n
, not as a newline character.*
, +
, ?
. Add ?
to make them non-greedyre.search
, re.findall
, re.sub
.group
, .start
, .end
In [ ]:
print(type(""))
print(dir(""))
print("aha".find("a"))
print("hohoho".find("oh"))
mylist = ["AA","BB","CC"]
"coolsep".join(mylist)
In [ ]:
import re
filenames = glob.glob('*.csv')
print(filenames)
In [ ]:
mo = re.search(r'i.*n',filenames[0]) # multiplier * is greedy
print(mo) # match object, stores much info. search: first instance only.
print(mo.group()) # what matched
print(mo.start()) # where match starts: indexing start at 0
print(mo.end()) # where it ends: index *after*!
In [ ]:
mo = re.search(r'i.*?n',filenames[0])
print(mo)
print(mo.group())
print(mo.start())
print(mo.end())
When there is no match, the matched object is None and interpreted as False in boolean context:
In [ ]:
sequences = ["ATCGGGGATCGAAGTTGAG", "ACGGCCAGUGUACN"]
for dna in sequences:
mo = re.search(r'[^ATCG]', dna)
if mo:
print("non-ACGT found in sequence",dna,": found", mo.group())
by the way: compare with the less efficient code:
In [ ]:
for dna in sequences:
if re.search(r'[^ATCG]', dna):
mo = re.search(r'[^ATCG]', dna)
print("non-ACGT found in sequence",dna,": found", mo.group())
Finding all instances:
In [ ]:
print(re.findall(r'i.*n',filenames[0])) # greedy. non-overlapping matches
mo = re.findall(r'i.*?n',filenames[0]) # non-greedy
print(mo)
mo
In [ ]:
for f in filenames:
if not re.search(r'^i', f): # if no match: search object interpreted as False
print("file name",f,"does not start with i")
In [ ]:
re.sub(r'^(\w)\w+-(\d+)\.csv', r'\1\2.csv', filenames[0])
In [ ]:
for i in range(0,len(filenames)):
filenames[i] = re.sub(r'^(\w)\w+-(\d+)\.csv', r'\1\2.csv', filenames[i])
print(filenames)
In [ ]:
taxa = ["Drosophila melanogaster", "Homo sapiens"]
for taxon in taxa:
mo = re.search(r'^([^\s]+) ([^\s]+)$', taxon)
if mo:
genus = mo.group(1)
species = mo.group(2)
print("genus=" + genus + ", species=" + species)
print(taxon)
print(mo)
print(mo.start(1))
print(mo.start(2))
next: abbreviate genus name to its first letter, and replace space by underscore:
In [ ]:
taxa_abbrev = []
for taxon in taxa:
taxa_abbrev.append(
re.sub(r'^(\S).* ([^\s]+)$', r'\1_\2', taxon)
)
print(taxa_abbrev)
In [ ]:
coolstring = "Homo sapiens is pretty super"
re.split(r's.p', coolstring)
In [ ]:
def startswithi(str):
"""returns True if the input string starts with 'i'.
Let me continue this very long explanation
and also give an example:
startswithi("hohoho")
note 2 things:
- the double and single quotes inside my tripe double-quoted docstring
- in my text here the indentation adds 4 spaces on each line.
Those are ignored because it's a triple set of quotes."""
return(bool(re.search(r'^i', str)))
In [ ]:
print(startswithi("ihihi"))
print(startswithi("hohoho"))
In [ ]:
?startswithi
In [ ]:
def fahr_to_kelvin(temp):
return ((temp - 32) * (5/9)) + 273.15
print('freezing point of water:', fahr_to_kelvin(32))
print('boiling point of water:', fahr_to_kelvin(212))
def kelvin_to_celsius(temp_k):
return temp_k - 273.15
print('absolute zero in Celsius:', kelvin_to_celsius(0.0))
def fahr_to_celsius(temp_f):
temp_k = fahr_to_kelvin(temp_f)
result = kelvin_to_celsius(temp_k)
return result
print('freezing point of water in Celsius:', fahr_to_celsius(32.0))
In [ ]:
import numpy
import glob
def analyze(filename):
data = numpy.loadtxt(fname=filename, delimiter=',')
fig = matplotlib.pyplot.figure(figsize=(10.0, 3.0))
axes1 = fig.add_subplot(1, 3, 1)
axes2 = fig.add_subplot(1, 3, 2)
axes3 = fig.add_subplot(1, 3, 3)
axes1.set_ylabel('average')
axes1.plot(numpy.mean(data, axis=0))
axes2.set_ylabel('max')
axes2.plot(numpy.max(data, axis=0))
axes3.set_ylabel('min')
axes3.plot(numpy.min(data, axis=0))
fig.tight_layout()
matplotlib.pyplot.show()
def detect_problems(filename):
data = numpy.loadtxt(fname=filename, delimiter=',')
if (numpy.max(data, axis=0)[0] == 0 and
numpy.max(data, axis=0)[20] == 20):
print('Suspicious looking maxima!')
elif numpy.sum(numpy.min(data, axis=0)) == 0:
print('Minima add up to zero!')
else:
print('Seems OK!')
filenames = glob.glob('inflammation*.csv')
for f in filenames[:3]:
print(f)
analyze(f)
detect_problems(f)
In [ ]:
numpy.loadtxt('inflammation-01.csv', delimiter=',')
In [ ]:
numpy.loadtxt('inflammation-01.csv', ',')
In [ ]:
help(numpy.loadtxt)
Calculating binomial coefficients is not easy numerically. The number of ways to choose k elements among n is
choose(n,k) = n! / (k! (n-k)!)
where factorial n: n! = 1*2*...*n
becomes very big very fast.
But many terms cancel each other in choose(n,k)
, and it is a lot easier numerically to calculate the log of factorial numbers: log(n!)=log(1)+ ... + log(n)
.
log(n!)
for any integer n>0
.n
k
to calculate log(n!/k!)=log((k+1)*...*n)
, with default k=0
.
Add an associated test.log(choose(n,k))
for any integers n>=0
and 0 <= k <= n
. Start with the docstring and with a test.choose
function, to return the binomial coefficient itself or its log. Make the function return the binomial coefficient by default, not its log.
In [ ]:
import math
def logfactorial(n):
"""calculate the log of factorial n: log(1) + ... + log(n).
Examples:
>>> round(math.exp(logfactorial(5)),5)
120.0
"""
assert type(n)==int, "argument to logfactorial should be an integer"
res = 0
for i in range(0,n):
res += math.log(i+1)
return res
In [ ]:
help(logfactorial)
?logfactorial
Save the function in a file binomial.py
.
Add this at the beginning of the file:
#!/usr/bin/env python
and add this at the end:
if __name__ == '__main__':
import doctest
doctest.testmod()
and change the permission to let you execute the file. Then you can run the script like this:
./binomial.py
(equivalent to python binomial.py
).
Or you can import the functions in your file as a module,
inside an python session:
import binomial