In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from datasets import synthetic as synth
from utils import arrays as ar

# enable imports; must be run from notebook's directory
import sys
sys.path.append('.') # get access to all our code
sys.path.append('../figs') # get access to all our figs
sys.path.append('..') # stuff doing relative imports needs to be imported relative to here

%matplotlib inline
%load_ext autoreload
%autoreload 2

A = np.arange(12).reshape((4,3))
x = np.arange(9)
l = [1,3,2,4]
d = {'a': 1, 'b': 2, 'c': 3}

In [61]:
# FLANN stuff
from pyflann import *
from numpy.random import randn
length = 128
dataset = randn(10000, length)
testset = randn(1000, length)
# make them randwalks
dataset = np.cumsum(dataset, axis=1)
testset = np.cumsum(testset, axis=1)
# znorm
dataset = ar.zNormalizeRows(dataset)
testset = ar.zNormalizeRows(testset)
# set_distance_type('euclidean')
flann = FLANN()
# result, dists = flann.nn(dataset,testset,5,algorithm="kmeans", branching=32, iterations=7, checks=16)
k = 1
result, dists = flann.nn(dataset,testset,k)
print "neighbor idxs and dists shapes"
print result.shape, dists.shape
print
print "min and max neighbor idxs", np.min(result), np.max(result)
print
print "avg squared dist", np.mean(dists / length)
nnIdx = result[0]
q, x = testset[0], dataset[nnIdx]
print "dist between test[0] and its nn:"
dist = np.sum((q-x)**2)
print dist / length
print dists[0] / length # same as prev line; dists returned must be squared Euclidean

print "----- radius search:"
# params = flann.build_index(dataset, algorithm="autotuned", target_precision=0.9, log_level="info", memory_weight=.01)
# params = flann.build_index(dataset, checks=500, log_level="info", memory_weight=.01)
# params = flann.build_index(dataset, checks=100, log_level="info", memory_weight=.01, algorithm="kmeans")
params = flann.build_index(dataset, log_level="info", memory_weight=.01, algorithm="kdtree", trees=32) # 16 -> still inconsistent
print params
print "stuff within radius:"
results, dists = flann.nn_radius(testset[0], radius=.25 * length) # whoa; super nondeterministic, often with terrible recall
print results, dists / length
print "number of neighbors in radius:", len(results)


neighbor idxs and dists shapes
(1000,) (1000,)

min and max neighbor idxs 12 9976

avg squared dist 0.267363239904
dist between test[0] and its nn:
0.221258532299
0.221258532299
----- radius search:
{'speedup': 0.0, 'iterations': 5, 'multi_probe_level_': 2L, 'cb_index': 0.5, 'centers_init': 'default', 'log_level': 'info', 'build_weight': 0.009999999776482582, 'leaf_max_size': 4, 'memory_weight': 0.009999999776482582, 'sample_fraction': 0.10000000149011612, 'checks': 32, 'max_neighbors': -1, 'random_seed': 673234748, 'trees': 32, 'target_precision': 0.8999999761581421, 'table_number_': 12L, 'sorted': 1, 'branching': 32, 'algorithm': 'default', 'key_size_': 20L, 'eps': 0.0, 'cores': 0}
stuff within radius:
[3105 4850 1174 1048 6287 6927 3884 5003] [ 0.17796051  0.21241208  0.21455668  0.21862764  0.22068061  0.22392558
  0.22568223  0.22749204]
number of neighbors in radius: 8

In [62]:
Objs = np.empty((4,3), dtype=np.object)
print Objs
Objs[0,0] = np.ones(2)
print Objs


[[None None None]
 [None None None]
 [None None None]
 [None None None]]
[[array([ 1.,  1.]) None None]
 [None None None]
 [None None None]
 [None None None]]

In [66]:
from scipy import sparse
x = np.zeros(10)
x[1] = 1
x[5] = 5
xs = sparse.csr_matrix(x)
print xs
print xs.shape
print
print sparse.hstack([xs, xs])
print
print sparse.hstack(np.array([xs, xs]))
print
xs[1:5] = 2
print xs


  (0, 1)	1.0
  (0, 5)	5.0
(1, 10)

  (0, 1)	1.0
  (0, 5)	5.0
  (0, 11)	1.0
  (0, 15)	5.0

  (0, 1)	1.0
  (0, 5)	5.0
  (0, 11)	1.0
  (0, 15)	5.0

  (0, 1)	1.0
  (0, 5)	5.0

In [44]:
x = np.arange(9)
y = np.arange(9)
x += 3
print x
print y
print
y[:] = x
# y = x
x += 3
print x
print y


[ 3  4  5  6  7  8  9 10 11]
[0 1 2 3 4 5 6 7 8]

[ 6  7  8  9 10 11 12 13 14]
[ 3  4  5  6  7  8  9 10 11]

In [60]:
i = np.arange(3)
j = np.arange(3)
# idxPairs = zip(i, j) # doesn't work--want 2 lists, not 2 lists of pairs
idxPairs = [i, j]
print idxPairs
print A
# print A[([0,1], [0,1])] # works
# print A[((0,1), (0,1))] # works
print A[idxPairs]


[array([0, 1, 2]), array([0, 1, 2])]
[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
[0 4 8]

In [59]:
from scipy import signal as sig
s = [0,1,1,1,0]

In [68]:
a = [0, -2, 11]
np.sign(a)


Out[68]:
array([ 0, -1,  1])

In [135]:
# how well does generating random data with the same frequency
# magnitude spectrum, but different phase, work out?
# EDIT: pretty well--I'd say this is from roughly the same 
# manifold; except maybe that stuff near endpoints tends to
# be extreme (and not just last point--it's a few of them)
from datasets import synthetic as synth
x = synth.notSoRandomWalk((2, 100))
noise = synth.randWithFreqMagMatching(x)
plt.plot(x.T)
plt.plot(noise.T)
# print np.std(x)     # these 2 are the same, meaning
# print np.std(noise) # we did it right

mags = np.fft.fft(x)
noiseMags = np.fft.fft(noise)
whiteNoiseMags = np.fft.fft(np.random.randn(*x.shape))
plt.figure()
plt.plot(mags.T)
plt.plot(noiseMags.T)
plt.plot(whiteNoiseMags.T)


Out[135]:
[<matplotlib.lines.Line2D at 0x127232450>,
 <matplotlib.lines.Line2D at 0x127212a90>]

In [141]:
np.var(np.ones(5))
np.where(A)
print ar.nonzeroRows(A)
zeros = ar.nonzeroRows(np.zeros((4,3)))


[0 1 2 3]

In [14]:
Ts = [1,2,4,3,5]
Te = [4,3,2,1,0]
sortThese = np.c_[Te, Ts]
sortIdxs = np.argsort(sortThese, axis=0)
print sortThese
print "sortIdxs:"
print sortIdxs
print "sorted:"
print sortThese[sortIdxs]


[[4 1]
 [3 2]
 [2 4]
 [1 3]
 [0 5]]
sortIdxs:
[[4 0]
 [3 1]
 [2 3]
 [1 2]
 [0 4]]
sorted:
[[[0 5]
  [4 1]]

 [[1 3]
  [3 2]]

 [[2 4]
  [1 3]]

 [[3 2]
  [2 4]]

 [[4 1]
  [0 5]]]

In [20]:
x = [1,1,3,3,2,4,5,4,6]
futureMins = np.minimum.accumulate(x[::-1])[::-1]
print futureMins
print
x = [1,1,3,3,0,4,5,4,6]
futureMins = np.minimum.accumulate(x[::-1])[::-1]
print futureMins


[1 1 2 2 2 4 4 4 6]

[0 0 0 0 0 4 4 4 6]

In [24]:
# np.c_[x, x]
np.minimum.accumulate([0][::-1])[::-1]


Out[24]:
array([0])

In [28]:
x1, x2 = np.vstack((x, x)).T
print x1, x2


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-28-7fceca1dad26> in <module>()
----> 1 x1, x2 = np.vstack((x, x)).T
      2 print x1, x2

ValueError: too many values to unpack

In [5]:
if x[-1:-1]:
    print "true"
else:
    print "false"


false

In [7]:
# is argsort ascending or descending? EDIT: ascending
print x
print np.argsort(x)
print np.argsort(x[::-1])


[0 1 2 3 4 5 6 7 8]
[0 1 2 3 4 5 6 7 8]
[8 7 6 5 4 3 2 1 0]

In [8]:
np.argmin([1, 1])


Out[8]:
0

In [14]:
x.astype(np.int)
A
print np.mean(A, axis=0)
print np.mean(A, axis=0).astype(np.int)


[ 4.5  5.5  6.5]
[4 5 6]

In [27]:
s = set()
s |= set([2, 4, 6])
s |= set(range(3))
s
print 2 in s
print 71 in s


True
False

In [24]:
print A
print A[[1, 2]] # select multiple rows
print A[(1, 2)] # select row, col


[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
[[3 4 5]
 [6 7 8]]
5

In [28]:
2**8


Out[28]:
256

In [32]:
z = [-1.5, -.5, 0, .5, 1.5]
bins = [-1, 0, 1]
print np.digitize(z, bins)
print np.digitize(z, bins, right=True) # <= vs <


[0 1 2 2 3]
[0 1 1 2 3]

In [37]:
from scipy.stats import norm
# yep, so ppf is inverse cdf
print norm.ppf(.5)
print norm.ppf(0)
print norm.ppf(1.)
print norm.ppf(.975)
print norm.ppf([0, .5, .975, 1])


0.0
-inf
inf
1.95996398454
[       -inf  0.          1.95996398         inf]

In [41]:
quantiles = np.linspace(0, 1, 4 + 1) # 4+1 pts -> 4 intervals
print quantiles
print quantiles[1:-1]


[ 0.    0.25  0.5   0.75  1.  ]
[ 0.25  0.5   0.75]

In [45]:
quantilesNoInf = quantiles[1:-1]
breakPoints = norm.ppf(quantilesNoInf)
print breakPoints
print
print np.digitize(z, breakPoints)


[-0.67448975  0.          0.67448975]

[0 1 2 2 3]

In [68]:
def saxMat(origLen, wordLen):
    # note that this just ignores the last few points
    # if origLen isn't a multiple of wordLen
	symbolLen = origLen // wordLen
	startIdxs = np.arange(0, origLen, symbolLen)
	endIdxs = np.r_[startIdxs[1:], origLen]

# 	filterMat = np.zeros((origLen, wordLen))
# 	for i in range(wordLen):
# 		start, end = startIdxs[i], endIdxs[i]
# 		filterMat[start:end, i] = 1. / (end - start)

# 	return filterMat

	filterMat = np.zeros((wordLen, origLen))
	for i in range(wordLen):
		start, end = startIdxs[i], endIdxs[i]
		filterMat[i, start:end] = 1. / (end - start)

	return filterMat.T

print saxMat(8, 4)
print saxMat(6, 4)


[[ 0.5  0.   0.   0. ]
 [ 0.5  0.   0.   0. ]
 [ 0.   0.5  0.   0. ]
 [ 0.   0.5  0.   0. ]
 [ 0.   0.   0.5  0. ]
 [ 0.   0.   0.5  0. ]
 [ 0.   0.   0.   0.5]
 [ 0.   0.   0.   0.5]]
[[ 1.  0.  0.  0.]
 [ 0.  1.  0.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  0.  1.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]]

In [70]:
2 ** x


Out[70]:
array([  1,   2,   4,   8,  16,  32,  64, 128, 256])

In [74]:
[[x, (x * (x+1))/2] for x in range(1,8)]


Out[74]:
[[1, 1], [2, 3], [3, 6], [4, 10], [5, 15], [6, 21], [7, 28]]

In [79]:
age = np.arange(100)
prob = np.exp(-.003 * np.exp((age - 25.)/10))
plt.plot(age, pct)
print prob[[23, 50, 75, 80]]


[ 0.99754682  0.96411231  0.64067083  0.47994882]

In [90]:
print A
print A[1, 1:3]
print A[1, [1, 2]]
print A[1, (1, 2)]
print A[1, np.array((1, 2))]
print
print A[:, [0, 2, 1]] # not useful
print A[np.arange(len(A)), [0, 2, 1, 0]] # elements [0,2,1,0] from rows


[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
[4 5]
[4 5]
[4 5]
[4 5]

[[ 0  2  1]
 [ 3  5  4]
 [ 6  8  7]
 [ 9 11 10]]
[0 5 7 9]

In [99]:
# how does logspace work?
print np.logspace(np.log2(1), np.log2(100), num=5, base=2.)
print np.logspace(np.log10(1), np.log10(100), num=5, base=10.)


[  1.           3.16227766  10.          31.6227766  100.        ]
[   1.            3.16227766   10.           31.6227766   100.        ]

In [113]:
T = np.empty((2, A.shape[0], A.shape[1]))
T[0] = A
T[1] = -A
print T
# print T.reshape((T.shape[1], -1)) # nope, not what I want...
# print T.T.reshape((T.shape[1], -1)) # garbage

def vstack3Tensor(X):
	# assumes time progresses along columns
	return X.reshape((-1, X.shape[2])) # add rows for each leading dim

def hstack3Tensor(X):
	nDims = X.shape[0]
	transposed = np.empty((nDims, X.shape[2], X.shape[1]))
	for dim in range(nDims):
		transposed[dim] = X[dim].T
	return vstack3Tensor(transposed).T

print hstack3Tensor(T)


[[[  0.   1.   2.]
  [  3.   4.   5.]
  [  6.   7.   8.]
  [  9.  10.  11.]]

 [[  0.  -1.  -2.]
  [ -3.  -4.  -5.]
  [ -6.  -7.  -8.]
  [ -9. -10. -11.]]]
[[  0.   1.   2.   0.  -1.  -2.]
 [  3.   4.   5.  -3.  -4.  -5.]
 [  6.   7.   8.  -6.  -7.  -8.]
 [  9.  10.  11.  -9. -10. -11.]]

In [114]:
A[[1,3]]


Out[114]:
array([[ 3,  4,  5],
       [ 9, 10, 11]])

In [116]:
4 * (1. - 1.)


Out[116]:
0.0

In [5]:
print A[:, 1:3]
print A[:, [1,2]]
print A[:, (1,2)]


[[ 1  2]
 [ 4  5]
 [ 7  8]
 [10 11]]
[[ 1  2]
 [ 4  5]
 [ 7  8]
 [10 11]]
[[ 1  2]
 [ 4  5]
 [ 7  8]
 [10 11]]

In [3]:
d = {'a': 1, 'b': 2, 'c': 3}
print d
print isinstance(d, dict)


{'a': 1, 'b': 2}
True

In [8]:
a = [1, 5, 5, 4, 2, 2]
print a
print
uniq, idxs, inverse = np.unique(a, return_index=True, return_inverse=True)
print uniq
print idxs
print inverse


[1, 5, 5, 4, 2, 2]

[1 2 4 5]
[0 4 3 1]
[0 3 3 2 1 1]

In [9]:
ord('a')


Out[9]:
97

In [11]:
d
len(d)


Out[11]:
2

In [12]:
ord('abc')


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-12-6930e82a821d> in <module>()
----> 1 ord('abc')

TypeError: ord() expected a character, but string of length 3 found

In [22]:
rowNorms = np.linalg.norm(A, axis=1)
print rowNorms
print A / rowNorms.reshape((-1, 1))


[  2.23606798   7.07106781  12.20655562  17.3781472 ]
[[ 0.          0.4472136   0.89442719]
 [ 0.42426407  0.56568542  0.70710678]
 [ 0.49153915  0.57346234  0.65538554]
 [ 0.5178918   0.57543534  0.63297887]]

In [29]:
Amips = A / 20.
print Amips
newNorms = np.linalg.norm(Amips, axis=1).reshape((-1,1))
Aaugment = np.hstack((Amips, np.sqrt(1. - newNorms**2)))
print Aaugment
print np.linalg.norm(Aaugment, axis=1)


[[ 0.    0.05  0.1 ]
 [ 0.15  0.2   0.25]
 [ 0.3   0.35  0.4 ]
 [ 0.45  0.5   0.55]]
[[ 0.          0.05        0.1         0.99373035]
 [ 0.15        0.2         0.25        0.93541435]
 [ 0.3         0.35        0.4         0.79214898]
 [ 0.45        0.5         0.55        0.49497475]]
[ 1.  1.  1.  1.]

In [41]:
print np.hstack([x[0], x[0], x])
print [3] * 20
print x[:50]


[0 0 0 1 2 3 4 5 6 7 8]
[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
[0 1 2 3 4 5 6 7 8]

In [47]:
print repr(np.array(['a'] * 3))
print repr(np.array(['a'] * 3, dtype=np.object))


array(['a', 'a', 'a'], 
      dtype='|S1')
array(['a', 'a', 'a'], dtype=object)

In [48]:
'abc'.replace('a', 'A')


Out[48]:
'Abc'

In [56]:
l
l2 = []
l2.append(l[2:4])
l2.append(l[2:10])
print l2
l3 = []
print l3 + l2
arr = np.array(l2)
print arr
print arr[-1]


[[2, 4], [2, 4]]
[[2, 4], [2, 4]]
[[2 4]
 [2 4]]
[2 4]

In [58]:
import os
os.path.join("foo", "", "bar")


Out[58]:
'foo/bar'

In [60]:
x
x[::3]


Out[60]:
array([0, 3, 6])

In [65]:
print l
print l + l


[1, 3, 2, 4]
[1, 3, 2, 4, 1, 3, 2, 4]

In [76]:
print x
print np.append(x, np.random.choice(x))
xl = list(x)
print xl[:2] + xl[3:]


[0 1 2 3 4 5 6 7 8]
[0 1 2 3 4 5 6 7 8 8]
[0, 1, 3, 4, 5, 6, 7, 8]

In [79]:
l2 = [l, l, l]
print l2
np.array(l2)


[[1, 3, 2, 4], [1, 3, 2, 4], [1, 3, 2, 4]]
Out[79]:
array([[1, 3, 2, 4],
       [1, 3, 2, 4],
       [1, 3, 2, 4]])

In [85]:
d
d2 = {'c': 3}
import copy
d3 = copy.copy(d)
print d3
d3.update(d2)
print d3


{'a': 1, 'b': 2}
{'a': 1, 'c': 3, 'b': 2}

In [86]:
np.std([3])


Out[86]:
0.0

In [98]:
print l
print np.std(l)
np.std([])
print np.sort(set((3,1,2)))
print np.sort(list(set((3,1,2))))
print np.sort(list(set((3,1,2)))).shape


[1, 3, 2, 4]
1.11803398875
set([1, 2, 3])
[1 2 3]
(3,)
[1 2 3]

In [103]:
np.arange(2, 4 + .5, .5)
np.arange(1.6, 2.3, .15)


Out[103]:
array([ 1.6 ,  1.75,  1.9 ,  2.05,  2.2 ])

In [7]:
B = np.copy(A)
B[0] = (3,3,3)
B


Out[7]:
array([[ 3,  3,  3],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [4]:
print l
print np.copy(l)


[1, 3, 2, 4]
[1 3 2 4]

In [12]:
x = np.array([[1, 1], [2, 2]])
X = np.arange(12).reshape((2, -1))
from scipy import signal as sig

print sig.correlate2d(X, x, mode='valid').flatten()

for i in range(len(X.T) - 1):
    print np.sum(x * X[:,i:i+2])


[27 33 39 45 51]
27
33
39
45
51

In [15]:
x = np.arange(5)
print np.append(x, [-1])
print np.append(x, -1)


[ 0  1  2  3  4 -1]
[ 0  1  2  3  4 -1]

In [19]:
gains = np.random.randn(5,5)
print (gains > 0)
gains * (gains > 0)


[[ True False False False  True]
 [False False False False  True]
 [False  True  True False  True]
 [ True  True  True  True False]
 [ True  True False  True False]]
Out[19]:
array([[ 1.00293799, -0.        , -0.        , -0.        ,  1.26178301],
       [-0.        , -0.        , -0.        , -0.        ,  1.11865304],
       [-0.        ,  0.52674634,  0.56035196, -0.        ,  0.9082665 ],
       [ 0.14635371,  1.74640125,  0.35616502,  0.83678395, -0.        ],
       [ 0.08436306,  1.48986375, -0.        ,  1.85017025, -0.        ]])

In [22]:
# fig, axes = plt.subplots(2, 5, figsize=(10,8))
# print axes
# print axes.flatten()

In [28]:
mask = A > 0
print mask
idxs = np.where(A > 0)
print idxs
print
print A[mask]
print A[idxs]


[[False  True  True]
 [ True  True  True]
 [ True  True  True]
 [ True  True  True]]
(array([0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]), array([1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]))

[ 1  2  3  4  5  6  7  8  9 10 11]
[ 1  2  3  4  5  6  7  8  9 10 11]

In [31]:
B = np.zeros((4,3)) + 7
print B
B[:] = 0
print B


[[ 7.  7.  7.]
 [ 7.  7.  7.]
 [ 7.  7.  7.]
 [ 7.  7.  7.]]
[[ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]

In [36]:
np.maximum([np.log(0)], [0])


Out[36]:
array([ 0.])

In [38]:
X = np.array([0,1,1,1,1,0,1,0,0,1,1,1,0,1,0])
print np.max(X > 0)


True

In [40]:
np.random.randn(4,3)


Out[40]:
array([[ 0.61668957,  1.42071182, -0.40607884],
       [ 1.47055673,  1.5408724 , -0.05598736],
       [-1.14912962, -0.90028814, -1.6297542 ],
       [-0.59258562, -0.79848367,  0.8873844 ]])

In [43]:
min(37 // 2, 75)
max(18, 8)


Out[43]:
18

In [44]:
np.sqrt(np.inf)


Out[44]:
inf

In [50]:
uniqs, counts = np.unique(['a','b','a'], return_counts=True)
print uniqs, counts
uniqs, counts = np.unique([], return_counts=True)
print uniqs, counts
# print np.max(counts) # throws
uniqs, counts = np.unique(['a'], return_counts=True)
print uniqs, counts


['a' 'b'] [2 1]
[] []
['a'] [1]

In [52]:
None > 0
max(1, None)


Out[52]:
1

In [54]:
import os
os.path.join('foo', 'bar', 'baz')


Out[54]:
'foo/bar/baz'

In [2]:
print np.random.choice(np.arange(10, 20), 2)
print list(np.random.choice(np.arange(10, 20), 2))


[15 10]
[16, 18]

In [4]:
if np.nan:
    print "nan is true"


nan is true

In [23]:
# load dataframes to play with; src = 
# http://pandas.pydata.org/pandas-docs/stable/merging.html
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3'],
                     'C': ['C0', 'C1', 'C2', 'C3'],
                     'D': ['D0', 'D1', 'D2', 'D3']},
                     index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                     'B': ['B4', 'B5', 'B6', 'B7'],
                     'C': ['C4', 'C5', 'C6', 'C7'],
                     'D': ['D4', 'D5', 'D6', 'D7']},
                      index=[4, 5, 6, 7])
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                     'B': ['B8', 'B9', 'B10', 'B11'],
                     'C': ['C8', 'C9', 'C10', 'C11'],
                     'D': ['D8', 'D9', 'D10', 'D11']},
                     index=[8, 9, 10, 11])
print df1
print df2
print df3


    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
2  A2  B2  C2  D2
3  A3  B3  C3  D3
    A   B   C   D
4  A4  B4  C4  D4
5  A5  B5  C5  D5
6  A6  B6  C6  D6
7  A7  B7  C7  D7
      A    B    C    D
8    A8   B8   C8   D8
9    A9   B9   C9   D9
10  A10  B10  C10  D10
11  A11  B11  C11  D11

In [33]:
df4 = df1[['A', 'C', 'D']] # select the "relevant" columns
df5 = df4.set_index('A')
print df5
print len(df5)


     C   D
A         
A0  C0  D0
A1  C1  D1
A2  C2  D2
A3  C3  D3
4

In [39]:
df5.ix['A0']['C']


Out[39]:
'C0'

In [33]:
print df1
print
print pd.melt(df1, id_vars='C') # melts *everything*
print
print pd.melt(df1, id_vars='C', value_vars=['A']) # melts *everything*, but only keeps A....
print
print pd.melt(df1, id_vars='C', value_vars=['A', 'B']) # melts *everything*, but only keeps A....
print
print pd.melt(df1, id_vars=['C', 'D'], value_vars=['A']) # melts *everything*, but only keeps A....


# So here's what melt() does:
# -keep the values in id_vars
# -for all other cols:
#   -throw them away if value_vars is specified and
#   they aren't in the list give
#   -if not discarded, treat each col *name* as the value of a 
#   categorical variable that's basically a key, and record the
#   value in that col (where the index cols have whatever values)
#   as the value associated with that key
#
# basically, split cols into id cols, which become like an index, and
# other cols, then flatten other cols into a bunch of kv pairs, where
# the key is the col name and value is whatever the value is when
# id cols take on those values; this means replicating sets of values in
# the id cols multiple times if there are multiple values
# 
# or, even more briefly, it takes one record with a bunch of fields and
# turns it into a bunch of records with one field
#  -and the "index" vars let you uniquely identify which records refer 
#  to the same underlying thing
#
# I don't actually know why you would use this


    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
2  A2  B2  C2  D2
3  A3  B3  C3  D3

     C variable value
0   C0        A    A0
1   C1        A    A1
2   C2        A    A2
3   C3        A    A3
4   C0        B    B0
5   C1        B    B1
6   C2        B    B2
7   C3        B    B3
8   C0        D    D0
9   C1        D    D1
10  C2        D    D2
11  C3        D    D3

    C variable value
0  C0        A    A0
1  C1        A    A1
2  C2        A    A2
3  C3        A    A3

    C variable value
0  C0        A    A0
1  C1        A    A1
2  C2        A    A2
3  C3        A    A3
4  C0        B    B0
5  C1        B    B1
6  C2        B    B2
7  C3        B    B3

    C   D variable value
0  C0  D0        A    A0
1  C1  D1        A    A1
2  C2  D2        A    A2
3  C3  D3        A    A3

In [34]:
# DataFrame.pivot(index=None, columns=None, values=None)
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.pivot.html#pandas.DataFrame.pivot
print df1
print
# df5 = df1.drop('D', axis=1)
df5 = df1
# print df1.pivot(index=['A']) # throws
print df5.pivot(index='A', columns='B')
print
print df5.pivot(index='A', columns='B', values='C')

# so this would do something useful if there were multiple values of 
# C or D for a given value of A and B, but since there's only 1, there's
# only one non-nan element per row/col.
#
# the idea is that it creates another dataframe with an index
# given by the values in the "index" columns, and creates a column
# for each value in the "columns" column(s). It populates this df using
# the data in the remaining columns (creating a hierarchical column
# index if there's more than one remaining column, apparently...)
#
# more concisely, it takes a bunch of records with one variable (but
# that refer to the "same thing" according to the index vars) and
# turns them into one record with a bunch of variables


    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1
2  A2  B2  C2  D2
3  A3  B3  C3  D3

      C                   D               
B    B0   B1   B2   B3   B0   B1   B2   B3
A                                         
A0   C0  NaN  NaN  NaN   D0  NaN  NaN  NaN
A1  NaN   C1  NaN  NaN  NaN   D1  NaN  NaN
A2  NaN  NaN   C2  NaN  NaN  NaN   D2  NaN
A3  NaN  NaN  NaN   C3  NaN  NaN  NaN   D3

B    B0   B1   B2   B3
A                     
A0   C0  NaN  NaN  NaN
A1  NaN   C1  NaN  NaN
A2  NaN  NaN   C2  NaN
A3  NaN  NaN  NaN   C3

In [11]:
s = '[1, 2, 3]'
splitStr = s.strip('[').strip(']').split(',')
numList = [int(s) for s in splitStr]
print np.array(numList)


[1 2 3]

In [19]:
import seaborn as sb
sb.set_context('talk')
gammas = sb.load_dataset('gammas')
print gammas.columns.values
ax = sb.tsplot(time="timepoint", value="BOLD signal", unit="subject", condition="ROI", data=gammas)
# ax = sb.tsplot(time="timepoint", value="BOLD signal", condition="ROI", data=gammas, n_boot=0)
# ^ "cannot label index with a null key" when we don't tell it the unit


['timepoint' 'ROI' 'subject' 'BOLD signal']

In [21]:
import os
os.path.join(None, 'foo')


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-21-a311833522d9> in <module>()
      1 import os
----> 2 os.path.join(None, 'foo')

//anaconda/python.app/Contents/lib/python2.7/posixpath.pyc in join(a, *p)
     68         if b.startswith('/'):
     69             path = b
---> 70         elif path == '' or path.endswith('/'):
     71             path +=  b
     72         else:

AttributeError: 'NoneType' object has no attribute 'endswith'

In [23]:
# print [i for i in range(3) for j in range(2)]
print [(i, j) for i in range(3) for j in range(2)]


[(0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1)]

In [8]:
print d
t = True
t &= False
print t


{'a': 1, 'c': 3, 'b': 2}
False

In [18]:
a = np.arange(5)
list(a)
# list(None) # throws
list([])
import os
print os.path.join(None)
print os.path.join([])


None
[]

In [23]:
d = {'a': 1, 'c': 3, 'b': 2}
print d.setdefault('a', 5)
print d
print
print d.setdefault('d', 4)
print d


1
{'a': 1, 'c': 3, 'b': 2}

4
{'a': 1, 'c': 3, 'b': 2, 'd': 4}

In [26]:
from pprint import pprint
pprint(d)


{'a': 1, 'b': 2, 'c': 3, 'd': 4}

In [39]:
# figure out how to make ts 1D using PCA
from sklearn.decomposition import TruncatedSVD
X = []
for i in range(5):
    v = np.sin(np.linspace(0, 6, 100))
    X.append(v)
    X.append(v + np.random.randn(*v.shape) / 2.)
X = np.vstack(X).T # each col of x is 1 dimension
plt.plot(X)

vect = TruncatedSVD(n_components=1).fit_transform(X)
plt.figure()
plt.plot(vect)


Out[39]:
[<matplotlib.lines.Line2D at 0x1170de090>]

In [46]:
print x
print np.append(x, np.zeros(0))


[0 1 2 3 4 5 6 7 8]
[ 0.  1.  2.  3.  4.  5.  6.  7.  8.]

In [53]:
print x
idxs = np.array([1,2,4], dtype=np.int)
np.vstack([x[i:i+2] for i in idxs])


[0 1 2 3 4 5 6 7 8]
Out[53]:
array([[1, 2],
       [2, 3],
       [4, 5]])

In [55]:
np.mean(A, axis=1, keepdims=True)


Out[55]:
array([[  1.],
       [  4.],
       [  7.],
       [ 10.]])

In [58]:
np.ceil(1045 / 16)


Out[58]:
65.0

In [67]:
l = [1, 3, 2, 4]
l.insert(2, 99)
print l


[1, 3, 99, 2, 4]

In [70]:
ll = [(1, 2), (3, 4), (5, 6)]
print zip(*ll)
ll2 = [np.array(el) for el in ll]
print zip(*ll2) # works with arrays too (as it should)


[(1, 3, 5), (2, 4, 6)]
[(1, 3, 5), (2, 4, 6)]

In [64]:
idxs = np.arange(3)
print A
print A[[idxs, idxs]]


[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
[0 4 8]

In [67]:
print x
print np.hstack((x, x))


[0 1 2 3 4 5 6 7 8]
[0 1 2 3 4 5 6 7 8 0 1 2 3 4 5 6 7 8]

In [68]:
[None] * 5


Out[68]:
[None, None, None, None, None]

In [70]:
print np.logspace(-4, 4, 3)


[  1.00000000e-04   1.00000000e+00   1.00000000e+04]

In [7]:
print d
print zip(*d.items())
print list(d)
print dict([(v, k) for k, v in d.items()])


{'a': 1, 'c': 3, 'b': 2}
[('a', 'c', 'b'), (1, 3, 2)]
['a', 'c', 'b']
{1: 'a', 2: 'b', 3: 'c'}

In [9]:
np.random.randn(3,4)


Out[9]:
array([[-0.11030662,  0.59889602, -0.76816246,  0.06651416],
       [-0.73167015, -0.38723073,  0.03111508, -1.0512609 ],
       [-0.00806305,  0.7402428 ,  2.22282908, -0.05659071]])

In [14]:
a = np.random.randn(5)
print a
print a.astype(np.int32)
print np.floor(a).astype(np.int32)


[-1.04913536  1.18888735 -0.84217456  1.05444047  0.99907314]
[-1  1  0  1  0]
[-2  1 -1  1  0]

In [15]:
# from collections import defaultdict
# dd = defaultdict(defaultdict)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-15-b064a020db80> in <module>()
----> 1 dd

NameError: name 'dd' is not defined

In [18]:
x[::-1][:-1]


Out[18]:
array([8, 7, 6, 5, 4, 3, 2, 1])

In [22]:
B = A.astype(np.float)
print B
print B * (B > 3.5)


[[  0.   1.   2.]
 [  3.   4.   5.]
 [  6.   7.   8.]
 [  9.  10.  11.]]
[[  0.   0.   0.]
 [  0.   4.   5.]
 [  6.   7.   8.]
 [  9.  10.  11.]]

In [28]:
x
2 <= x <= 7


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-28-3b3f3555fba7> in <module>()
      1 x
----> 2 2 <= x <= 7

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [37]:
# is numpy pad func worth using?
# -> not really; super awkward to only pad 1 dimension
ax0Pad = 0
ax1Pad = 2
print np.lib.pad(A, [(ax0Pad, ax0Pad),(ax1Pad, ax1Pad)], mode='constant')
print np.lib.pad(A, [(ax0Pad, ax0Pad),(ax1Pad, ax1Pad)], mode='edge')


[[ 0  0  0  1  2  0  0]
 [ 0  0  3  4  5  0  0]
 [ 0  0  6  7  8  0  0]
 [ 0  0  9 10 11  0  0]]
[[ 0  0  0  1  2  2  2]
 [ 3  3  3  4  5  5  5]
 [ 6  6  6  7  8  8  8]
 [ 9  9  9 10 11 11 11]]

In [40]:
s = 'bletchKey=bletchVal'
print s.split('=')
print len(s.split('='))


['bletchKey', 'bletchVal']
2

In [42]:
range(5, 0-1, -1)


Out[42]:
[5, 4, 3, 2, 1, 0]

In [52]:
from collections import namedtuple
_fields = [
	'children',
	'is_leaf'
]

Node = namedtuple('Node', _fields)
Node.__new__.__defaults__ = ([], True)

Node()


Out[52]:
Node(children=[], is_leaf=True)

In [71]:
# bonferroni correction vs sidak correction
#
# Hmm...it appears that there's effectively no 
# difference between them whatsoever unless
# alpha is huge
k = np.linspace(1, 10, 100)
for alpha in (.01, .05, .1, .5):
    y1 = 1 - (1. - alpha) ** (1./k) # sidak
    y2 = alpha / k # bonferroni
    plt.plot(k, y1, 'b')
    plt.plot(k, y2, 'r')
    plt.xlim([1, np.max(k)])
    plt.ylim([0, alpha])
    plt.show()



In [6]:
l
l2 = l[:]
l3 = l[:]
print l2
print l2[1:-1]
l2[1:-1] = ''
print l2
l3[1] = ''
l3[2] = ''
print l3


[1, 3, 2, 4]
[3, 2]
[1, 4]
[1, '', '', 4]

In [9]:
print d
print {k: d[k] for k in ('a', 'c')}


{'a': 1, 'c': 3, 'b': 2}
{'a': 1, 'c': 3}

In [14]:
y = [3,2,-1,2,1]
print np.unique(y, return_inverse=True)


(array([-1,  1,  2,  3]), array([3, 2, 0, 2, 1]))

In [15]:
5.0 // 3.0


Out[15]:
1.0

In [18]:
lll = [[]] * 3
print lll
lll[0].append(3)
print lll # ugh, bad! copied same list pointer 3 times...
print
l3 = [[] for i in range(3)]
print l3
l3[0].append(3)
print l3


[[], [], []]
[[3], [3], [3]]

[[], [], []]
[[3], [], []]

In [23]:
# am I calling setattr right?
class C(object):
    pass

o = C()
print o
setattr(o, 'foo', 7)
print o.__dict__['foo']

print
print d
o.__dict__.update(d)
print [o.__dict__[k] for k in d]


<__main__.C object at 0x1147ca950>
7

{'a': 1, 'c': 3, 'b': 2}
[1, 3, 2]

In [26]:
lll = [[0,1],[2,3],[4,6]]
print zip(*lll)
print zip(*lll)[1]


[(0, 2, 4), (1, 3, 6)]
(1, 3, 6)

In [ ]: