In [1]:
# enable imports; must be run from the notebook's directory
import sys
sys.path.append('.')        # get access to all our code
sys.path.append('../figs')  # get access to all our figs
sys.path.append('..')       # stuff doing relative imports needs to be imported relative to here

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datasets import synthetic as synth  # needs the sys.path entries above
from utils import arrays as ar

%matplotlib inline
%load_ext autoreload
%autoreload 2
A = np.arange(12).reshape((4,3))
x = np.arange(9)
l = [1,3,2,4]
d = {'a': 1, 'b': 2, 'c': 3}
In [61]:
# FLANN stuff
from pyflann import *
from numpy.random import randn
length = 128
dataset = randn(10000, length)
testset = randn(1000, length)
# make them randwalks
dataset = np.cumsum(dataset, axis=1)
testset = np.cumsum(testset, axis=1)
# znorm
dataset = ar.zNormalizeRows(dataset)
testset = ar.zNormalizeRows(testset)
# set_distance_type('euclidean')
flann = FLANN()
# result, dists = flann.nn(dataset,testset,5,algorithm="kmeans", branching=32, iterations=7, checks=16)
k = 1
result, dists = flann.nn(dataset,testset,k)
print "neighbor idxs and dists shapes"
print result.shape, dists.shape
print
print "min and max neighbor idxs", np.min(result), np.max(result)
print
print "avg squared dist", np.mean(dists / length)
nnIdx = result[0]
q, x = testset[0], dataset[nnIdx]
print "dist between test[0] and its nn:"
dist = np.sum((q-x)**2)
print dist / length
print dists[0] / length # same as prev line; dists returned must be squared Euclidean
print "----- radius search:"
# params = flann.build_index(dataset, algorithm="autotuned", target_precision=0.9, log_level="info", memory_weight=.01)
# params = flann.build_index(dataset, checks=500, log_level="info", memory_weight=.01)
# params = flann.build_index(dataset, checks=100, log_level="info", memory_weight=.01, algorithm="kmeans")
params = flann.build_index(dataset, log_level="info", memory_weight=.01, algorithm="kdtree", trees=32) # 16 -> still inconsistent
print params
print "stuff within radius:"
results, dists = flann.nn_radius(testset[0], radius=.25 * length) # whoa; super nondeterministic, often with terrible recall
print results, dists / length
print "number of neighbors in radius:", len(results)
In [62]:
Objs = np.empty((4,3), dtype=np.object)
print Objs
Objs[0,0] = np.ones(2)
print Objs
In [66]:
from scipy import sparse
x = np.zeros(10)
x[1] = 1
x[5] = 5
xs = sparse.csr_matrix(x)
print xs
print xs.shape
print
print sparse.hstack([xs, xs])
print
print sparse.hstack(np.array([xs, xs]))
print
xs[1:5] = 2
print xs
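In [ ]:
# the slice assignment above mutates csr's sparsity structure, which is
# slow (scipy emits a SparseEfficiencyWarning); a sketch of doing the
# writes in lil_matrix, the format meant for incremental construction
xl = sparse.lil_matrix((1, 10))
xl[0, 1:5] = 2
print xl.tocsr()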
In [44]:
x = np.arange(9)
y = np.arange(9)
x += 3
print x
print y
print
y[:] = x
# y = x
x += 3
print x
print y
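In [ ]:
# what the cell above shows: y[:] = x copies values into y's existing
# buffer, while y = x would just rebind the name to the same buffer;
# np.may_share_memory makes the distinction explicit
y2 = x          # alias--same buffer
y3 = x.copy()   # independent buffer
print np.may_share_memory(x, y2), np.may_share_memory(x, y3)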
In [60]:
i = np.arange(3)
j = np.arange(3)
# idxPairs = zip(i, j) # doesn't work--gives a list of pairs, not a pair of lists
idxPairs = [i, j]
print idxPairs
print A
# print A[([0,1], [0,1])] # works
# print A[((0,1), (0,1))] # works
print A[idxPairs] # legacy list-of-arrays indexing; newer numpy wants A[tuple(idxPairs)]
In [59]:
from scipy import signal as sig
s = [0,1,1,1,0]
In [68]:
a = [0, -2, 11]
np.sign(a)
Out[68]:
In [135]:
# how well does generating random data with the same frequency
# magnitude spectrum, but different phase, work out?
# EDIT: pretty well--I'd say this is from roughly the same
# manifold; except maybe that stuff near endpoints tends to
# be extreme (and not just last point--it's a few of them)
from datasets import synthetic as synth
x = synth.notSoRandomWalk((2, 100))
noise = synth.randWithFreqMagMatching(x)
plt.plot(x.T)
plt.plot(noise.T)
# print np.std(x) # these 2 are the same, meaning
# print np.std(noise) # we did it right
mags = np.abs(np.fft.fft(x))  # abs() so we plot magnitudes, not complex values
noiseMags = np.abs(np.fft.fft(noise))
whiteNoiseMags = np.abs(np.fft.fft(np.random.randn(*x.shape)))
plt.figure()
plt.plot(mags.T)
plt.plot(noiseMags.T)
plt.plot(whiteNoiseMags.T)
Out[135]:
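In [ ]:
# a minimal sketch of what randWithFreqMagMatching presumably does (the
# real implementation lives in datasets/synthetic.py): keep each row's
# FFT magnitudes, randomize the phases, and invert; taking the real part
# is a shortcut--a careful version would enforce conjugate symmetry
def randPhaseNoise(X):
    mags = np.abs(np.fft.fft(X, axis=1))
    phases = np.random.uniform(0, 2 * np.pi, X.shape)
    return np.real(np.fft.ifft(mags * np.exp(1j * phases), axis=1))
plt.figure()
plt.plot(x.T)
plt.plot(randPhaseNoise(x).T)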
In [141]:
np.var(np.ones(5))
np.where(A)
print ar.nonzeroRows(A)
zeros = ar.nonzeroRows(np.zeros((4,3)))
In [14]:
Ts = [1,2,4,3,5]
Te = [4,3,2,1,0]
sortThese = np.c_[Te, Ts]
sortIdxs = np.argsort(sortThese, axis=0)
print sortThese
print "sortIdxs:"
print sortIdxs
print "sorted:"
print sortThese[sortIdxs]
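In [ ]:
# to actually apply each column's argsort to that column, pair the row
# idxs with explicit column idxs (broadcasts to one idx pair per element)
print sortThese[sortIdxs, np.arange(sortThese.shape[1])]
print np.sort(sortThese, axis=0)  # same result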
In [20]:
x = [1,1,3,3,2,4,5,4,6]
futureMins = np.minimum.accumulate(x[::-1])[::-1]
print futureMins
print
x = [1,1,3,3,0,4,5,4,6]
futureMins = np.minimum.accumulate(x[::-1])[::-1]
print futureMins
In [24]:
# np.c_[x, x]
np.minimum.accumulate([0][::-1])[::-1]
Out[24]:
In [28]:
x1, x2 = np.vstack((x, x)).T
print x1, x2
In [5]:
if x[-1:-1]:  # empty slice -> empty sequence -> falsy
    print "true"
else:
    print "false"
In [7]:
# is argsort ascending or descending? EDIT: ascending
print x
print np.argsort(x)
print np.argsort(x[::-1])
In [8]:
np.argmin([1, 1])
Out[8]:
In [14]:
x.astype(np.int)
A
print np.mean(A, axis=0)
print np.mean(A, axis=0).astype(np.int)
In [27]:
s = set()
s |= set([2, 4, 6])
s |= set(range(3))
s
print 2 in s
print 71 in s
In [24]:
print A
print A[[1, 2]] # select multiple rows
print A[(1, 2)] # select row, col
In [28]:
2**8
Out[28]:
In [32]:
z = [-1.5, -.5, 0, .5, 1.5]
bins = [-1, 0, 1]
print np.digitize(z, bins)
print np.digitize(z, bins, right=True) # <= vs <
In [37]:
from scipy.stats import norm
# yep, so ppf is inverse cdf
print norm.ppf(.5)
print norm.ppf(0)
print norm.ppf(1.)
print norm.ppf(.975)
print norm.ppf([0, .5, .975, 1])
In [41]:
quantiles = np.linspace(0, 1, 4 + 1) # 4+1 pts -> 4 intervals
print quantiles
print quantiles[1:-1]
In [45]:
quantilesNoInf = quantiles[1:-1]
breakPoints = norm.ppf(quantilesNoInf)
print breakPoints
print
print np.digitize(z, breakPoints)
In [68]:
def saxMat(origLen, wordLen):
    # note that this just ignores the last few points
    # if origLen isn't a multiple of wordLen
    symbolLen = origLen // wordLen
    startIdxs = np.arange(0, origLen, symbolLen)
    endIdxs = np.r_[startIdxs[1:], origLen]
    # filterMat = np.zeros((origLen, wordLen))
    # for i in range(wordLen):
    #     start, end = startIdxs[i], endIdxs[i]
    #     filterMat[start:end, i] = 1. / (end - start)
    # return filterMat
    filterMat = np.zeros((wordLen, origLen))
    for i in range(wordLen):
        start, end = startIdxs[i], endIdxs[i]
        filterMat[i, start:end] = 1. / (end - start)
    return filterMat.T
print saxMat(8, 4)
print saxMat(6, 4)
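In [ ]:
# usage sketch: right-multiplying a (z-normalized) signal by saxMat()
# gives its piecewise aggregate approximation (PAA); feeding that to
# digitize() with the gaussian breakPoints from above yields SAX symbols
signal = np.random.randn(8)
paa = np.dot(signal, saxMat(len(signal), 4))
print paa
print np.digitize(paa, breakPoints)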
In [70]:
2 ** x
Out[70]:
In [74]:
[[x, (x * (x+1))/2] for x in range(1,8)]
Out[74]:
In [79]:
age = np.arange(100)
prob = np.exp(-.003 * np.exp((age - 25.)/10))
plt.plot(age, prob)
print prob[[23, 50, 75, 80]]
In [90]:
print A
print A[1, 1:3]
print A[1, [1, 2]]
print A[1, (1, 2)]
print A[1, np.array((1, 2))]
print
print A[:, [0, 2, 1]] # not useful
print A[np.arange(len(A)), [0, 2, 1, 0]] # elements [0,2,1,0] from rows
In [99]:
# how does logspace work?
print np.logspace(np.log2(1), np.log2(100), num=5, base=2.)
print np.logspace(np.log10(1), np.log10(100), num=5, base=10.)
In [113]:
T = np.empty((2, A.shape[0], A.shape[1]))
T[0] = A
T[1] = -A
print T
# print T.reshape((T.shape[1], -1)) # nope, not what I want...
# print T.T.reshape((T.shape[1], -1)) # garbage
def vstack3Tensor(X):
    # assumes time progresses along columns
    return X.reshape((-1, X.shape[2]))  # add rows for each leading dim

def hstack3Tensor(X):
    nDims = X.shape[0]
    transposed = np.empty((nDims, X.shape[2], X.shape[1]))
    for dim in range(nDims):
        transposed[dim] = X[dim].T
    return vstack3Tensor(transposed).T
print hstack3Tensor(T)
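In [ ]:
# turns out numpy's stack funcs already do this when handed a 3-tensor--
# they treat it as a sequence of 2D slices along axis 0
print np.allclose(np.vstack(T), vstack3Tensor(T))
print np.allclose(np.hstack(T), hstack3Tensor(T))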
In [114]:
A[[1,3]]
Out[114]:
In [116]:
4 * (1. - 1.)
Out[116]:
In [5]:
print A[:, 1:3]
print A[:, [1,2]]
print A[:, (1,2)]
In [3]:
d = {'a': 1, 'b': 2, 'c': 3}
print d
print isinstance(d, dict)
In [8]:
a = [1, 5, 5, 4, 2, 2]
print a
print
uniq, idxs, inverse = np.unique(a, return_index=True, return_inverse=True)
print uniq
print idxs
print inverse
In [9]:
ord('a')
Out[9]:
In [11]:
d
len(d)
Out[11]:
In [12]:
ord('abc')  # throws TypeError--ord() wants a single character
In [22]:
rowNorms = np.linalg.norm(A, axis=1)
print rowNorms
print A / rowNorms.reshape((-1, 1))
In [29]:
Amips = A / 20.
print Amips
newNorms = np.linalg.norm(Amips, axis=1).reshape((-1,1))
Aaugment = np.hstack((Amips, np.sqrt(1. - newNorms**2)))
print Aaugment
print np.linalg.norm(Aaugment, axis=1)
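In [ ]:
# why the extra column: if a unit-norm query q gets a 0 appended, then
# ||qAug - xAug||^2 = 2 - 2*q.x for every augmented row (all unit norm
# now), so the euclidean nearest neighbor is exactly the row with max
# inner product; quick check with a random query
q = np.random.randn(A.shape[1])
qAug = np.append(q / np.linalg.norm(q), 0)
print np.argmax(Amips.dot(q))
print np.argmin(np.sum((Aaugment - qAug) ** 2, axis=1))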
In [41]:
print np.hstack([x[0], x[0], x])
print [3] * 20
print x[:50]
In [47]:
print repr(np.array(['a'] * 3))
print repr(np.array(['a'] * 3, dtype=np.object))
In [48]:
'abc'.replace('a', 'A')
Out[48]:
In [56]:
l
l2 = []
l2.append(l[2:4])
l2.append(l[2:10])
print l2
l3 = []
print l3 + l2
arr = np.array(l2)
print arr
print arr[-1]
In [58]:
import os
os.path.join("foo", "", "bar")
Out[58]:
In [60]:
x
x[::3]
Out[60]:
In [65]:
print l
print l + l
In [76]:
print x
print np.append(x, np.random.choice(x))
xl = list(x)
print xl[:2] + xl[3:]
In [79]:
l2 = [l, l, l]
print l2
np.array(l2)
Out[79]:
In [85]:
d
d2 = {'c': 3}
import copy
d3 = copy.copy(d)
print d3
d3.update(d2)
print d3
In [86]:
np.std([3])
Out[86]:
In [98]:
print l
print np.std(l)
np.std([])
print np.sort(set((3,1,2)))  # doesn't sort; numpy wraps the set in a 0-d object array
print np.sort(list(set((3,1,2))))
print np.sort(list(set((3,1,2)))).shape
In [103]:
np.arange(2, 4 + .5, .5)
np.arange(1.6, 2.3, .15)
Out[103]:
In [7]:
B = np.copy(A)
B[0] = (3,3,3)
B
Out[7]:
In [4]:
print l
print np.copy(l)
In [12]:
x = np.array([[1, 1], [2, 2]])
X = np.arange(12).reshape((2, -1))
from scipy import signal as sig
print sig.correlate2d(X, x, mode='valid').flatten()
for i in range(len(X.T) - 1):
    print np.sum(x * X[:, i:i+2])
In [15]:
x = np.arange(5)
print np.append(x, [-1])
print np.append(x, -1)
In [19]:
gains = np.random.randn(5,5)
print (gains > 0)
gains * (gains > 0)
Out[19]:
In [22]:
# fig, axes = plt.subplots(2, 5, figsize=(10,8))
# print axes
# print axes.flatten()
In [28]:
mask = A > 0
print mask
idxs = np.where(A > 0)
print idxs
print
print A[mask]
print A[idxs]
In [31]:
B = np.zeros((4,3)) + 7
print B
B[:] = 0
print B
In [36]:
np.maximum([np.log(0)], [0])
Out[36]:
In [38]:
X = np.array([0,1,1,1,1,0,1,0,0,1,1,1,0,1,0])
print np.max(X > 0)
In [40]:
np.random.randn(4,3)
Out[40]:
In [43]:
min(37 // 2, 75)
max(18, 8)
Out[43]:
In [44]:
np.sqrt(np.inf)
Out[44]:
In [50]:
uniqs, counts = np.unique(['a','b','a'], return_counts=True)
print uniqs, counts
uniqs, counts = np.unique([], return_counts=True)
print uniqs, counts
# print np.max(counts) # throws
uniqs, counts = np.unique(['a'], return_counts=True)
print uniqs, counts
In [52]:
None > 0
max(1, None)
Out[52]:
In [54]:
import os
os.path.join('foo', 'bar', 'baz')
Out[54]:
In [2]:
print np.random.choice(np.arange(10, 20), 2)
print list(np.random.choice(np.arange(10, 20), 2))
In [4]:
if np.nan:
    print "nan is true"
In [23]:
# load dataframes to play with; src =
# http://pandas.pydata.org/pandas-docs/stable/merging.html
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3'],
                    'C': ['C0', 'C1', 'C2', 'C3'],
                    'D': ['D0', 'D1', 'D2', 'D3']},
                   index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                    'B': ['B4', 'B5', 'B6', 'B7'],
                    'C': ['C4', 'C5', 'C6', 'C7'],
                    'D': ['D4', 'D5', 'D6', 'D7']},
                   index=[4, 5, 6, 7])
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                    'B': ['B8', 'B9', 'B10', 'B11'],
                    'C': ['C8', 'C9', 'C10', 'C11'],
                    'D': ['D8', 'D9', 'D10', 'D11']},
                   index=[8, 9, 10, 11])
print df1
print df2
print df3
In [33]:
df4 = df1[['A', 'C', 'D']] # select the "relevant" columns
df5 = df4.set_index('A')
print df5
print len(df5)
In [39]:
df5.ix['A0']['C']  # note: .ix is deprecated in newer pandas in favor of .loc
Out[39]:
In [33]:
print df1
print
print pd.melt(df1, id_vars='C') # melts every non-id column
print
print pd.melt(df1, id_vars='C', value_vars=['A']) # melts only A; B and D are dropped
print
print pd.melt(df1, id_vars='C', value_vars=['A', 'B']) # melts A and B; D is dropped
print
print pd.melt(df1, id_vars=['C', 'D'], value_vars=['A']) # two id cols; melts only A
# So here's what melt() does:
#  -keep the values in id_vars
#  -for all other cols:
#    -throw them away if value_vars is specified and
#     they aren't in the given list
#    -if not discarded, treat each col *name* as the value of a
#     categorical variable that's basically a key, and record the
#     value in that col (where the id cols have whatever values)
#     as the value associated with that key
#
# basically, split cols into id cols, which become like an index, and
# other cols, then flatten the other cols into a bunch of kv pairs, where
# the key is the col name and the value is whatever the value is when the
# id cols take on those values; this means replicating each set of values
# in the id cols once per melted col
#
# or, even more briefly, it takes one record with a bunch of fields and
# turns it into a bunch of records with one field each
#  -and the id vars let you uniquely identify which records refer
#   to the same underlying thing
#
# I don't actually know why you would use this
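In [ ]:
# one concrete use: long format is what plotting libs (e.g. seaborn)
# often want as input; also, pivot() undoes melt(), so the two round-trip
melted = pd.melt(df1, id_vars='A')
print melted.pivot(index='A', columns='variable', values='value')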
In [34]:
# DataFrame.pivot(index=None, columns=None, values=None)
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.pivot.html#pandas.DataFrame.pivot
print df1
print
# df5 = df1.drop('D', axis=1)
df5 = df1
# print df1.pivot(index=['A']) # throws
print df5.pivot(index='A', columns='B')
print
print df5.pivot(index='A', columns='B', values='C')
# note that pivot() actually *throws* if there are multiple values of
# C or D for a given (A, B) pair (pivot_table() is the variant that
# aggregates duplicates); since each (A, B) pair here is unique, there's
# only one non-nan element per row/col.
#
# the idea is that it creates another dataframe with an index
# given by the values in the "index" columns, and creates a column
# for each value in the "columns" column(s). It populates this df using
# the data in the remaining columns (creating a hierarchical column
# index if there's more than one remaining column, apparently...)
#
# more concisely, it takes a bunch of records with one variable (but
# that refer to the "same thing" according to the index vars) and
# turns them into one record with a bunch of variables
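In [ ]:
# when there *are* duplicate (index, columns) pairs, pivot() throws;
# pivot_table() is the variant that aggregates duplicates instead
# (mean by default)--a small made-up example
df6 = pd.DataFrame({'A': ['a', 'a', 'b'], 'B': ['x', 'x', 'y'],
                    'C': [1., 3., 5.]})
print df6.pivot_table(index='A', columns='B', values='C')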
In [11]:
s = '[1, 2, 3]'
splitStr = s.strip('[').strip(']').split(',')
numList = [int(s) for s in splitStr]
print np.array(numList)
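In [ ]:
# a less fragile version of the above (handles nesting, whitespace,
# negative numbers, etc.): let the python parser do the work
import ast
print np.array(ast.literal_eval('[1, 2, 3]'))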
In [19]:
import seaborn as sb
sb.set_context('talk')
gammas = sb.load_dataset('gammas')
print gammas.columns.values
ax = sb.tsplot(time="timepoint", value="BOLD signal", unit="subject", condition="ROI", data=gammas)
# ax = sb.tsplot(time="timepoint", value="BOLD signal", condition="ROI", data=gammas, n_boot=0)
# ^ "cannot label index with a null key" when we don't tell it the unit
In [21]:
import os
os.path.join(None, 'foo')  # throws
In [23]:
# print [i for i in range(3) for j in range(2)]
print [(i, j) for i in range(3) for j in range(2)]
In [8]:
print d
t = True
t &= False
print t
In [18]:
a = np.arange(5)
list(a)
# list(None) # throws
list([])
import os
print os.path.join(None)
print os.path.join([])
In [23]:
d = {'a': 1, 'c': 3, 'b': 2}
print d.setdefault('a', 5)
print d
print
print d.setdefault('d', 4)
print d
In [26]:
from pprint import pprint
pprint(d)
In [39]:
# figure out how to make ts 1D using PCA
from sklearn.decomposition import TruncatedSVD
X = []
for i in range(5):
    v = np.sin(np.linspace(0, 6, 100))
    X.append(v)
    X.append(v + np.random.randn(*v.shape) / 2.)
X = np.vstack(X).T  # each col of X is 1 dimension
plt.plot(X)
vect = TruncatedSVD(n_components=1).fit_transform(X)
plt.figure()
plt.plot(vect)
Out[39]:
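In [ ]:
# caveat: TruncatedSVD doesn't center the data, so its first component
# can mostly capture the mean; sklearn's PCA centers first, which is
# usually what "make it 1D with PCA" means
from sklearn.decomposition import PCA
vect2 = PCA(n_components=1).fit_transform(X)
plt.figure()
plt.plot(vect2)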
In [46]:
print x
print np.append(x, np.zeros(0))
In [53]:
print x
idxs = np.array([1,2,4], dtype=np.int)
np.vstack([x[i:i+2] for i in idxs])
Out[53]:
In [55]:
np.mean(A, axis=1, keepdims=True)
Out[55]:
In [58]:
np.ceil(1045 / 16)  # gotcha: Py2 integer division happens first, so this is ceil(65), not ceil(65.3)
Out[58]:
In [67]:
l = [1, 3, 2, 4]
l.insert(2, 99)
print l
In [70]:
ll = [(1, 2), (3, 4), (5, 6)]
print zip(*ll)
ll2 = [np.array(el) for el in ll]
print zip(*ll2) # works with arrays too (as it should)
In [64]:
idxs = np.arange(3)
print A
print A[[idxs, idxs]] # the diagonal; legacy form of A[idxs, idxs]
In [67]:
print x
print np.hstack((x, x))
In [68]:
[None] * 5
Out[68]:
In [70]:
print np.logspace(-4, 4, 3)
In [7]:
print d
print zip(*d.items())
print list(d)
print dict([(v, k) for k, v in d.items()])
In [9]:
np.random.randn(3,4)
Out[9]:
In [14]:
a = np.random.randn(5)
print a
print a.astype(np.int32)
print np.floor(a).astype(np.int32)
In [15]:
# from collections import defaultdict
# dd = defaultdict(defaultdict)
In [18]:
x[::-1][:-1]
Out[18]:
In [22]:
B = A.astype(np.float)
print B
print B * (B > 3.5)
In [28]:
x
2 <= x <= 7  # throws ValueError--chained comparisons call bool() on the array
In [37]:
# is numpy pad func worth using?
# -> not really; super awkward to only pad 1 dimension
ax0Pad = 0
ax1Pad = 2
print np.lib.pad(A, [(ax0Pad, ax0Pad),(ax1Pad, ax1Pad)], mode='constant')
print np.lib.pad(A, [(ax0Pad, ax0Pad),(ax1Pad, ax1Pad)], mode='edge')
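In [ ]:
# a tiny (hypothetical) wrapper that pads just one axis, since building
# the full pad-width list by hand is the awkward part
def padAxis(X, npad, axis, **kwargs):
    widths = [(0, 0)] * X.ndim
    widths[axis] = (npad, npad)
    return np.lib.pad(X, widths, **kwargs)
print padAxis(A, 2, axis=1, mode='constant')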
In [40]:
s = 'bletchKey=bletchVal'
print s.split('=')
print len(s.split('='))
In [42]:
range(5, 0-1, -1)
Out[42]:
In [52]:
from collections import namedtuple
_fields = [
'children',
'is_leaf'
]
Node = namedtuple('Node', _fields)
Node.__new__.__defaults__ = ([], True)
Node()
Out[52]:
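In [ ]:
# careful: that [] default is one shared list (same gotcha as the
# [[]] * 3 example later in this notebook)--every Node() created
# without explicit children gets the same object
n1, n2 = Node(), Node()
n1.children.append('kid')
print n2.children  # also contains 'kid'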
In [71]:
# bonferroni correction vs sidak correction
#
# Hmm...it appears that there's effectively no
# difference between them whatsoever unless
# alpha is huge
k = np.linspace(1, 10, 100)
for alpha in (.01, .05, .1, .5):
    y1 = 1 - (1. - alpha) ** (1./k) # sidak
    y2 = alpha / k # bonferroni
    plt.plot(k, y1, 'b')
    plt.plot(k, y2, 'r')
    plt.xlim([1, np.max(k)])
    plt.ylim([0, alpha])
    plt.show()
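In [ ]:
# why they coincide: by the taylor expansion,
# 1 - (1 - alpha)^(1/k) = alpha/k + O(alpha^2), so sidak and bonferroni
# only separate when alpha is large
alpha, kk = .05, 10.
print 1 - (1. - alpha) ** (1. / kk)
print alpha / kk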
In [6]:
l
l2 = l[:]
l3 = l[:]
print l2
print l2[1:-1]
l2[1:-1] = ''
print l2
l3[1] = ''
l3[2] = ''
print l3
In [9]:
print d
print {k: d[k] for k in ('a', 'c')}
In [14]:
y = [3,2,-1,2,1]
print np.unique(y, return_inverse=True)
In [15]:
5.0 // 3.0
Out[15]:
In [18]:
lll = [[]] * 3
print lll
lll[0].append(3)
print lll # ugh, bad! copied same list pointer 3 times...
print
l3 = [[] for i in range(3)]
print l3
l3[0].append(3)
print l3
In [23]:
# am I calling setattr right?
class C(object):
    pass
o = C()
print o
setattr(o, 'foo', 7)
print o.__dict__['foo']
print
print d
o.__dict__.update(d)
print [o.__dict__[k] for k in d]
In [26]:
lll = [[0,1],[2,3],[4,6]]
print zip(*lll)
print zip(*lll)[1]
In [ ]: