NUMPY


In [1]:
%matplotlib inline

In [2]:
import numpy as np

高效的Numpy


In [4]:
import numpy as np
arr = np.arange(1000)
def get_sum(arr):
    acc = 0
    for item in arr:
        acc += item
    return acc
%timeit get_sum(arr)


10000 loops, best of 3: 118 µs per loop

In [5]:
%timeit sum(arr)


10000 loops, best of 3: 89.9 µs per loop

In [6]:
%timeit np.sum(arr)


The slowest run took 43.63 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 3.09 µs per loop

创建Ndarray


In [11]:
# List Tuple
a = np.array([1, 2, 3, 4])
b = np.array((5, 6, 7, 8))
c = np.array([[1, 2, 3, 4],[4, 5, 6, 7], [7, 8, 9, 10]], dtype=int)
b


Out[11]:
array([5, 6, 7, 8])

In [12]:
c


Out[12]:
array([[ 1,  2,  3,  4],
       [ 4,  5,  6,  7],
       [ 7,  8,  9, 10]])

In [13]:
c.dtype


Out[13]:
dtype('int64')

In [14]:
# shape属性
print a.shape
print c.shape


(4,)
(3, 4)

In [15]:
a.itemsize


Out[15]:
8

In [16]:
a.ndim


Out[16]:
1

In [17]:
a.size


Out[17]:
4

DTypes

改变类型 numpy中的数据类型转换,不能直接改原数据的dtype! 只能用函数astype()。


In [18]:
a = np.random.random(4)

In [19]:
a.dtype


Out[19]:
dtype('float64')

In [20]:
print a


[ 0.60821019  0.93451057  0.15028573  0.71730223]

In [21]:
a.dtype = 'float32'

In [22]:
print a


[  4.75686619e-07   1.77705252e+00  -1.29937671e-05   1.85862756e+00
   1.57849608e-21   1.52528572e+00  -7.68008346e+09   1.80432546e+00]

In [23]:
c = a.astype(np.float32)
c


Out[23]:
array([  4.75686619e-07,   1.77705252e+00,  -1.29937671e-05,
         1.85862756e+00,   1.57849608e-21,   1.52528572e+00,
        -7.68008346e+09,   1.80432546e+00], dtype=float32)

In [24]:
c = a.astype(np.int32)
c


Out[24]:
array([          0,           1,           0,           1,           0,
                 1, -2147483648,           1], dtype=int32)

创建特定形状的多维数组并进行填充


In [25]:
# 开始值、终值和步长来创建一维数组
np.arange(0,1,0.1)


Out[25]:
array([ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9])

In [28]:
np.linspace(0, 1, 12)


Out[28]:
array([ 0.        ,  0.09090909,  0.18181818,  0.27272727,  0.36363636,
        0.45454545,  0.54545455,  0.63636364,  0.72727273,  0.81818182,
        0.90909091,  1.        ])

In [29]:
np.logspace(0, 2, 20)


Out[29]:
array([   1.        ,    1.27427499,    1.62377674,    2.06913808,
          2.6366509 ,    3.35981829,    4.2813324 ,    5.45559478,
          6.95192796,    8.8586679 ,   11.28837892,   14.38449888,
         18.32980711,   23.35721469,   29.76351442,   37.92690191,
         48.32930239,   61.58482111,   78.47599704,  100.        ])

In [30]:
np.empty([2, 2])


Out[30]:
array([[ 0.,  0.],
       [ 0.,  0.]])

In [31]:
np.empty([2, 2], dtype=int)


Out[31]:
array([[0, 0],
       [0, 0]])

In [32]:
a = np.array([[1., 2., 3.],[4.,5.,6.]])
print np.empty_like(a)


[[  0.00000000e+000   0.00000000e+000   2.13553025e-314]
 [  2.16293988e-314   0.00000000e+000   0.00000000e+000]]

In [33]:
np.eye(2, dtype=int)


Out[33]:
array([[1, 0],
       [0, 1]])

In [34]:
np.identity(3)


Out[34]:
array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.]])

In [35]:
np.ones(5)


Out[35]:
array([ 1.,  1.,  1.,  1.,  1.])

In [36]:
np.ones((5,), dtype=np.int)


Out[36]:
array([1, 1, 1, 1, 1])

In [37]:
np.ones((2, 1))


Out[37]:
array([[ 1.],
       [ 1.]])

In [38]:
np.ones((2,2))


Out[38]:
array([[ 1.,  1.],
       [ 1.,  1.]])

广播


In [39]:
a = np.array([1.0,2.0,3.0])
b = np.array([2.0,2.0,2.0])
a * b


Out[39]:
array([ 2.,  4.,  6.])

In [40]:
## 当不相等时,则会采用规则对其:
a = np.array([1.0,2.0,3.0])
b = 2
a * b


Out[40]:
array([ 2.,  4.,  6.])

In [42]:
a = np.arange(0, 6).reshape(6, 1)
a.shape
a


Out[42]:
array([[0],
       [1],
       [2],
       [3],
       [4],
       [5]])

In [44]:
b = np.arange(0, 5)
b.shape
b


Out[44]:
array([0, 1, 2, 3, 4])

In [45]:
c = a + b

In [46]:
print c


[[0 1 2 3 4]
 [1 2 3 4 5]
 [2 3 4 5 6]
 [3 4 5 6 7]
 [4 5 6 7 8]
 [5 6 7 8 9]]

变换,索引和切片


In [48]:
a = np.array([1, 2, 3, 4])
d = a.reshape((2,2))
d


Out[48]:
array([[1, 2],
       [3, 4]])

In [49]:
a = np.arange(6).reshape((3, 2))
a


Out[49]:
array([[0, 1],
       [2, 3],
       [4, 5]])

In [50]:
a=np.array([[0,1],[2,3]])
np.resize(a,(2,3))


Out[50]:
array([[0, 1, 2],
       [3, 0, 1]])

In [51]:
np.resize(a,(1,4))


Out[51]:
array([[0, 1, 2, 3]])

In [52]:
np.resize(a,(2,4))


Out[52]:
array([[0, 1, 2, 3],
       [0, 1, 2, 3]])

In [55]:
arr = np.arange(10) 
arr[5]


Out[55]:
5

In [56]:
arr[5:8]


Out[56]:
array([5, 6, 7])

In [57]:
arr_slice = arr[5:8]  
arr_slice[1] = 12345

In [58]:
arr


Out[58]:
array([    0,     1,     2,     3,     4,     5, 12345,     7,     8,     9])

In [59]:
arr_slice[:] = 64

In [60]:
arr


Out[60]:
array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])

In [61]:
arr_copy = arr[5:8].copy()

In [62]:
arr2d = np.array([[1,2,3],[4,5,6],[7,8,9]])

In [63]:
arr2d[2]


Out[63]:
array([7, 8, 9])

In [64]:
arr2d[0][2]


Out[64]:
3

In [65]:
arr2d[0,2]


Out[65]:
3

In [66]:
arr3d = np.array([[[1,2,3],[4,5,6]],[[7,8,9],[10,11,12]]])  
arr3d


Out[66]:
array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [67]:
arr3d.shape


Out[67]:
(2, 2, 3)

In [68]:
arr3d[0]


Out[68]:
array([[1, 2, 3],
       [4, 5, 6]])

In [69]:
old_values = arr3d[0].copy()  
arr3d[0]= 42  
arr3d


Out[69]:
array([[[42, 42, 42],
        [42, 42, 42]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [70]:
arr2d


Out[70]:
array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [71]:
arr2d[:2]


Out[71]:
array([[1, 2, 3],
       [4, 5, 6]])

In [72]:
arr2d[:2,1:]


Out[72]:
array([[2, 3],
       [5, 6]])

In [73]:
arr2d[1,:2]


Out[73]:
array([4, 5])

In [74]:
arr2d[2,:1]


Out[74]:
array([7])

In [75]:
arr2d[:,:1]


Out[75]:
array([[1],
       [4],
       [7]])

In [76]:
arr2d[:2,1:] = 0  
arr2d


Out[76]:
array([[1, 0, 0],
       [4, 0, 0],
       [7, 8, 9]])

In [77]:
names = np.array(['Bob','Joe','Will','Bob','Will','Joe','Joe'])

In [78]:
import numpy.random  
data = numpy.random.randn(7,4)  
data


Out[78]:
array([[ 0.18288168, -0.94369605, -0.11244431,  0.40186004],
       [ 0.06163719,  1.64483343,  0.33081673,  0.80889786],
       [-0.52448079,  0.36007662,  0.32996677, -0.20352982],
       [ 0.57113387,  1.29816816,  0.29646005, -0.1379642 ],
       [ 0.36725083,  0.48369201,  2.58987207,  0.66315957],
       [ 1.60421507,  0.57683305,  1.03288016,  0.08122319],
       [-0.12621481, -0.69359545,  0.67847661, -0.06644251]])

In [79]:
names


Out[79]:
array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'], 
      dtype='|S4')

In [80]:
names == 'Bob'


Out[80]:
array([ True, False, False,  True, False, False, False], dtype=bool)

In [81]:
data[names == 'Bob']


Out[81]:
array([[ 0.18288168, -0.94369605, -0.11244431,  0.40186004],
       [ 0.57113387,  1.29816816,  0.29646005, -0.1379642 ]])

ufunc运算


In [82]:
x = np.linspace(0, 2*np.pi, 10)
x


Out[82]:
array([ 0.        ,  0.6981317 ,  1.3962634 ,  2.0943951 ,  2.7925268 ,
        3.4906585 ,  4.1887902 ,  4.88692191,  5.58505361,  6.28318531])

In [83]:
y = np.sin(x)

In [84]:
print y


[  0.00000000e+00   6.42787610e-01   9.84807753e-01   8.66025404e-01
   3.42020143e-01  -3.42020143e-01  -8.66025404e-01  -9.84807753e-01
  -6.42787610e-01  -2.44929360e-16]

In [85]:
t = np.sin(x,x)

In [86]:
id(t) == id(x)


Out[86]:
True

In [87]:
import time
import math
import numpy as np

x = [i * 0.001 for i in xrange(1000000)]
start = time.clock()
for i, t in enumerate(x):
    x[i] = math.sin(t)
print "math.sin:", time.clock() - start

x = [i * 0.001 for i in xrange(1000000)]
x = np.array(x)
start = time.clock()
np.sin(x,x)
print "numpy.sin:", time.clock() - start


math.sin: 0.263345
numpy.sin: 0.011054

In [88]:
a = np.arange(0,4)
print a
b = np.arange(1,5)
print b

np.add(a,b)


[0 1 2 3]
[1 2 3 4]
Out[88]:
array([1, 3, 5, 7])

封装ufuncs


In [90]:
def triangle_wave(x, c, c0, hc):
    x = x - int(x) # 三角波
    if x >= c: r = 0.0
    elif x < c0: r = x / c0 * hc
    else: r = (c-x) / (c-c0) * hc
    return r

In [91]:
x = np.linspace(0, 2, 1000)
y = np.array([triangle_wave(t, 0.6, 0.4, 1.0) for t in x])

In [92]:
triangle_ufunc = np.frompyfunc( lambda x: triangle_wave(x, 0.6, 0.4, 1.0), 1, 1)
y2 = triangle_ufunc(x)

In [93]:
data = np.array([
... [1,2,1],
... [0,3,1],
... [2,1,4],
... [1,3,1]])

In [94]:
data


Out[94]:
array([[1, 2, 1],
       [0, 3, 1],
       [2, 1, 4],
       [1, 3, 1]])

In [95]:
np.sum(data, axis=1)


Out[95]:
array([4, 4, 7, 5])

In [96]:
np.min(data, axis=0)


Out[96]:
array([0, 1, 1])

In [97]:
np.average(data)


Out[97]:
1.6666666666666667

In [98]:
data = np.random.randint(0, 5, [4,3,2,3])

In [99]:
data


Out[99]:
array([[[[3, 1, 2],
         [0, 0, 2]],

        [[2, 2, 4],
         [1, 4, 1]],

        [[2, 0, 0],
         [3, 0, 2]]],


       [[[2, 0, 0],
         [0, 1, 3]],

        [[4, 3, 4],
         [0, 0, 1]],

        [[4, 3, 0],
         [2, 3, 2]]],


       [[[1, 0, 2],
         [3, 4, 0]],

        [[4, 0, 4],
         [0, 1, 2]],

        [[0, 0, 2],
         [4, 1, 4]]],


       [[[2, 3, 1],
         [2, 4, 3]],

        [[1, 4, 1],
         [1, 2, 3]],

        [[2, 1, 2],
         [4, 1, 1]]]])

In [100]:
data.sum(axis=0)


Out[100]:
array([[[ 8,  4,  5],
        [ 5,  9,  8]],

       [[11,  9, 13],
        [ 2,  7,  7]],

       [[ 8,  4,  4],
        [13,  5,  9]]])

axis sort


In [101]:
data = np.random.randint(0, 5, [3,2,3])

In [102]:
data


Out[102]:
array([[[1, 3, 2],
        [1, 2, 0]],

       [[1, 2, 1],
        [4, 1, 0]],

       [[3, 1, 0],
        [1, 1, 2]]])

In [103]:
np.sort(data)  ## 默认对最大的axis进行排序,这里即是axis=2


Out[103]:
array([[[1, 2, 3],
        [0, 1, 2]],

       [[1, 1, 2],
        [0, 1, 4]],

       [[0, 1, 3],
        [1, 1, 2]]])

In [104]:
np.sort(data, axis=0)


Out[104]:
array([[[1, 1, 0],
        [1, 1, 0]],

       [[1, 2, 1],
        [1, 1, 0]],

       [[3, 3, 2],
        [4, 2, 2]]])

In [105]:
np.sort(data, axis=1)


Out[105]:
array([[[1, 2, 0],
        [1, 3, 2]],

       [[1, 1, 0],
        [4, 2, 1]],

       [[1, 1, 0],
        [3, 1, 2]]])

In [106]:
np.sort(data, axis=2)


Out[106]:
array([[[1, 2, 3],
        [0, 1, 2]],

       [[1, 1, 2],
        [0, 1, 4]],

       [[0, 1, 3],
        [1, 1, 2]]])

In [107]:
np.sort(data, axis=None)


Out[107]:
array([0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4])

prod(即product,乘积)

其他函数


In [108]:
import numpy as np
from tempfile import TemporaryFile

origin_array = np.array([1, 2, 3, 4])
np.save('/tmp/array', origin_array)

array_from_file = np.load('/tmp/array.npy')
print array_from_file


[1 2 3 4]

In [109]:
import numpy as np

origin_array = np.array([1, 2, 3, 4])
np.savetxt('array.txt', origin_array)

array_from_file = np.loadtxt('array.txt')
print array_from_file


[ 1.  2.  3.  4.]

In [110]:
array = np.fromstring('1 2 3 4', dtype=float, sep=' ')
print array


[ 1.  2.  3.  4.]

In [111]:
# 悲剧了
array = np.array([1, 2, 3, 4], dtype=int)
print np.fromstring(array.tostring())


[  4.94065646e-324   9.88131292e-324   1.48219694e-323   1.97626258e-323]

In [112]:
print np.fromstring(array.tostring(), dtype=int)


[1 2 3 4]

Pandas

from ndarray


In [113]:
import pandas as pd
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])

In [114]:
s


Out[114]:
a   -0.336826
b    0.952793
c    0.333174
d    1.586247
e    0.717200
dtype: float64

In [115]:
s.index


Out[115]:
Index([u'a', u'b', u'c', u'd', u'e'], dtype='object')

In [116]:
pd.Series(np.random.randn(5))


Out[116]:
0    0.556203
1   -0.288984
2   -0.291610
3    0.960958
4   -0.375165
dtype: float64

In [ ]:
### for dict

In [118]:
d = {'a' : 0., 'b' : 1., 'c' : 2.}

In [119]:
pd.Series(d)


Out[119]:
a    0.0
b    1.0
c    2.0
dtype: float64

In [120]:
pd.Series(d, index=['b', 'c', 'd', 'a'])


Out[120]:
b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64

In [ ]:
## 常量构造

In [121]:
pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])


Out[121]:
a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

In [122]:
s[0]


Out[122]:
-0.3368258988445077

In [123]:
s[:3]


Out[123]:
a   -0.336826
b    0.952793
c    0.333174
dtype: float64

In [124]:
s[s > s.median()]


Out[124]:
b    0.952793
d    1.586247
dtype: float64

In [125]:
s[[4, 3, 1]]


Out[125]:
e    0.717200
d    1.586247
b    0.952793
dtype: float64

In [126]:
np.exp(s)


Out[126]:
a    0.714033
b    2.592941
c    1.395390
d    4.885381
e    2.048690
dtype: float64

In [ ]:
### Index Label

In [127]:
s['a']


Out[127]:
-0.3368258988445077

In [128]:
s['e'] = 12.

In [129]:
s


Out[129]:
a    -0.336826
b     0.952793
c     0.333174
d     1.586247
e    12.000000
dtype: float64

In [130]:
'e' in s


Out[130]:
True

In [131]:
't' in s


Out[131]:
False

In [ ]:
### 向量化操作

In [132]:
s + s


Out[132]:
a    -0.673652
b     1.905586
c     0.666348
d     3.172495
e    24.000000
dtype: float64

In [133]:
s * 2


Out[133]:
a    -0.673652
b     1.905586
c     0.666348
d     3.172495
e    24.000000
dtype: float64

In [134]:
np.exp(s)


Out[134]:
a         0.714033
b         2.592941
c         1.395390
d         4.885381
e    162754.791419
dtype: float64

In [135]:
s[1:] + s[:-1]


Out[135]:
a         NaN
b    1.905586
c    0.666348
d    3.172495
e         NaN
dtype: float64

In [ ]:
## DataFrame

In [136]:
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
    'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}

In [137]:
df = pd.DataFrame(d)

In [138]:
df


Out[138]:
one two
a 1.0 1.0
b 2.0 2.0
c 3.0 3.0
d NaN 4.0

In [139]:
pd.DataFrame(d, index=['d', 'b', 'a'])


Out[139]:
one two
d NaN 4.0
b 2.0 2.0
a 1.0 1.0

In [140]:
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])


Out[140]:
two three
d 4.0 NaN
b 2.0 NaN
a 1.0 NaN

In [141]:
### 字典构造
d = {'one' : [1., 2., 3., 4.],
    'two' : [4., 3., 2., 1.]}

In [142]:
pd.DataFrame(d)


Out[142]:
one two
0 1.0 4.0
1 2.0 3.0
2 3.0 2.0
3 4.0 1.0

In [143]:
pd.DataFrame(d, index=['a', 'b', 'c', 'd'])


Out[143]:
one two
a 1.0 4.0
b 2.0 3.0
c 3.0 2.0
d 4.0 1.0

In [ ]:
## 结构化数据

In [144]:
data = np.zeros((2,), dtype=[('A', 'i4'),('B', 'f4'),('C', 'a10')])

In [145]:
data


Out[145]:
array([(0,  0., ''), (0,  0., '')], 
      dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])

In [146]:
data[:] = [(1,2.,'Hello'), (2,3.,"World")]

In [147]:
data


Out[147]:
array([(1,  2., 'Hello'), (2,  3., 'World')], 
      dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])

In [148]:
pd.DataFrame(data)


Out[148]:
A B C
0 1 2.0 Hello
1 2 3.0 World

In [149]:
pd.DataFrame(data, index=['first', 'second'])


Out[149]:
A B C
first 1 2.0 Hello
second 2 3.0 World

In [150]:
pd.DataFrame(data, columns=['C', 'A', 'B'])


Out[150]:
C A B
0 Hello 1 2.0
1 World 2 3.0

In [151]:
data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]

In [152]:
pd.DataFrame(data2)


Out[152]:
a b c
0 1 2 NaN
1 5 10 20.0

In [153]:
pd.DataFrame(data2, index=['first', 'second'])


Out[153]:
a b c
first 1 2 NaN
second 5 10 20.0

In [154]:
pd.DataFrame(data2, columns=['a', 'b'])


Out[154]:
a b
0 1 2
1 5 10

In [ ]:
#### records

In [155]:
pd.DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])],
orient='index', columns=['one', 'two', 'three'])


Out[155]:
one two three
A 1 2 3
B 4 5 6

In [156]:
df['one']


Out[156]:
a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [157]:
df['three'] = df['one'] * df['two']

In [158]:
df['flag'] = df['one'] > 2

In [159]:
df


Out[159]:
one two three flag
a 1.0 1.0 1.0 False
b 2.0 2.0 4.0 False
c 3.0 3.0 9.0 True
d NaN 4.0 NaN False

In [160]:
del df['two']

In [161]:
three = df.pop('three')

In [162]:
df


Out[162]:
one flag
a 1.0 False
b 2.0 False
c 3.0 True
d NaN False

In [163]:
df['foo'] = 'bar'

In [164]:
df['one_trunc'] = df['one'][:2]

In [165]:
df


Out[165]:
one flag foo one_trunc
a 1.0 False bar 1.0
b 2.0 False bar 2.0
c 3.0 True bar NaN
d NaN False bar NaN

In [166]:
df.insert(1, 'bar', df['one'])

In [167]:
df


Out[167]:
one bar flag foo one_trunc
a 1.0 1.0 False bar 1.0
b 2.0 2.0 False bar 2.0
c 3.0 3.0 True bar NaN
d NaN NaN False bar NaN

索引和选择


In [168]:
df.loc['b']


Out[168]:
one              2
bar              2
flag         False
foo            bar
one_trunc        2
Name: b, dtype: object

In [169]:
df.iloc[2]


Out[169]:
one             3
bar             3
flag         True
foo           bar
one_trunc     NaN
Name: c, dtype: object

数据清洗


In [170]:
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f', 'h'],
   columns=['one', 'two', 'three'])

In [171]:
df


Out[171]:
one two three
a 0.389612 1.024433 -0.813130
c 0.352398 0.808131 -0.305175
e -1.017042 -0.857841 -0.580242
f 0.179291 -1.993233 0.664604
h 0.456743 -1.174220 0.304638

In [172]:
df['four'] = 'bar'

In [173]:
df['five'] = df['one'] > 0

In [174]:
df


Out[174]:
one two three four five
a 0.389612 1.024433 -0.813130 bar True
c 0.352398 0.808131 -0.305175 bar True
e -1.017042 -0.857841 -0.580242 bar False
f 0.179291 -1.993233 0.664604 bar True
h 0.456743 -1.174220 0.304638 bar True

In [175]:
df2 = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

In [176]:
df2


Out[176]:
one two three four five
a 0.389612 1.024433 -0.813130 bar True
b NaN NaN NaN NaN NaN
c 0.352398 0.808131 -0.305175 bar True
d NaN NaN NaN NaN NaN
e -1.017042 -0.857841 -0.580242 bar False
f 0.179291 -1.993233 0.664604 bar True
g NaN NaN NaN NaN NaN
h 0.456743 -1.174220 0.304638 bar True

In [177]:
df2['one']


Out[177]:
a    0.389612
b         NaN
c    0.352398
d         NaN
e   -1.017042
f    0.179291
g         NaN
h    0.456743
Name: one, dtype: float64

In [178]:
pd.isnull(df2['one'])


Out[178]:
a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [179]:
df2['four'].notnull()


Out[179]:
a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: four, dtype: bool

In [180]:
df2.isnull()


Out[180]:
one two three four five
a False False False False False
b True True True True True
c False False False False False
d True True True True True
e False False False False False
f False False False False False
g True True True True True
h False False False False False

In [181]:
s = pd.Series([1, 2, 3])
s.loc[0] = None

In [182]:
s


Out[182]:
0    NaN
1    2.0
2    3.0
dtype: float64

In [183]:
df2


Out[183]:
one two three four five
a 0.389612 1.024433 -0.813130 bar True
b NaN NaN NaN NaN NaN
c 0.352398 0.808131 -0.305175 bar True
d NaN NaN NaN NaN NaN
e -1.017042 -0.857841 -0.580242 bar False
f 0.179291 -1.993233 0.664604 bar True
g NaN NaN NaN NaN NaN
h 0.456743 -1.174220 0.304638 bar True

In [184]:
df2.fillna(0)


Out[184]:
one two three four five
a 0.389612 1.024433 -0.813130 bar True
b 0.000000 0.000000 0.000000 0 0
c 0.352398 0.808131 -0.305175 bar True
d 0.000000 0.000000 0.000000 0 0
e -1.017042 -0.857841 -0.580242 bar False
f 0.179291 -1.993233 0.664604 bar True
g 0.000000 0.000000 0.000000 0 0
h 0.456743 -1.174220 0.304638 bar True

In [185]:
df2['four'].fillna('missing')


Out[185]:
a        bar
b    missing
c        bar
d    missing
e        bar
f        bar
g    missing
h        bar
Name: four, dtype: object

In [186]:
df.fillna(method='pad')


Out[186]:
one two three four five
a 0.389612 1.024433 -0.813130 bar True
c 0.352398 0.808131 -0.305175 bar True
e -1.017042 -0.857841 -0.580242 bar False
f 0.179291 -1.993233 0.664604 bar True
h 0.456743 -1.174220 0.304638 bar True

In [187]:
df.fillna(method='pad', limit=1)


Out[187]:
one two three four five
a 0.389612 1.024433 -0.813130 bar True
c 0.352398 0.808131 -0.305175 bar True
e -1.017042 -0.857841 -0.580242 bar False
f 0.179291 -1.993233 0.664604 bar True
h 0.456743 -1.174220 0.304638 bar True

Merge


In [188]:
df1=pd.DataFrame({'key':['a','b','b'],'data1':range(3)})

In [189]:
df2=pd.DataFrame({'key':['a','b','c'],'data2':range(3)})

In [190]:
df1


Out[190]:
data1 key
0 0 a
1 1 b
2 2 b

In [191]:
df2


Out[191]:
data2 key
0 0 a
1 1 b
2 2 c

In [192]:
df1.merge(df2)


Out[192]:
data1 key data2
0 0 a 0
1 1 b 1
2 2 b 1

In [193]:
pd.merge(df2,df1,how='left')


Out[193]:
data2 key data1
0 0 a 0.0
1 1 b 1.0
2 1 b 2.0
3 2 c NaN

In [195]:
left=pd.DataFrame({'key1':['foo','foo','bar'],  
'key2':['one','two','one'],  
'lval':[1,2,3]})  
  
right=pd.DataFrame({'key1':['foo','foo','bar','bar'],  
'key2':['one','one','one','two'],  
'lval':[4,5,6,7]})  
  
pd.merge(left,right,on=['key1','key2'],how='outer')


Out[195]:
key1 key2 lval_x lval_y
0 foo one 1.0 4.0
1 foo one 1.0 5.0
2 foo two 2.0 NaN
3 bar one 3.0 6.0
4 bar two NaN 7.0

In [196]:
df3=pd.DataFrame({'key3':['foo','foo','bar','bar'], #将上面的right的key 改了名字  如果两个对象的列名不同,可以分别指定
'key4':['one','one','one','two'],  
'lval':[4,5,6,7]})

In [197]:
pd.merge(left,df3,left_on='key1',right_on='key3')


Out[197]:
key1 key2 lval_x key3 key4 lval_y
0 foo one 1 foo one 4
1 foo one 1 foo one 5
2 foo two 2 foo one 4
3 foo two 2 foo one 5
4 bar one 3 bar one 6
5 bar one 3 bar two 7

In [198]:
df1=pd.DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])

In [199]:
df2=pd.DataFrame(np.random.randn(2,3),columns=['b','d','a'])

In [200]:
pd.concat([df1,df2])


Out[200]:
a b c d
0 -0.968729 -0.323539 -0.519171 1.090456
1 0.081356 -0.440603 0.003373 -1.567866
2 1.155166 0.252753 0.339318 -0.010611
0 0.770177 -1.357404 NaN -0.067142
1 -0.576567 1.171635 NaN -0.068974

In [201]:
pd.concat([df1,df2],ignore_index=True)


Out[201]:
a b c d
0 -0.968729 -0.323539 -0.519171 1.090456
1 0.081356 -0.440603 0.003373 -1.567866
2 1.155166 0.252753 0.339318 -0.010611
3 0.770177 -1.357404 NaN -0.067142
4 -0.576567 1.171635 NaN -0.068974

Group By


In [202]:
df = pd.DataFrame({'key1':['a','a','b','b','a'],
                   'key2':['one','two','one','two','one'],
                   'data1':np.random.randn(5),
                   'data2':np.random.randn(5)})

In [203]:
df


Out[203]:
data1 data2 key1 key2
0 -1.132667 0.432118 a one
1 1.344343 -1.410204 a two
2 -2.579932 -0.464842 b one
3 0.770310 0.332682 b two
4 0.044376 0.530382 a one

In [204]:
grouped = df.groupby(df['key1'])

In [205]:
grouped


Out[205]:
<pandas.core.groupby.DataFrameGroupBy object at 0x10aff4190>

In [206]:
grouped.mean()


Out[206]:
data1 data2
key1
a 0.085351 -0.149235
b -0.904811 -0.066080

In [207]:
df['data1'].groupby(df['key1']).mean()


Out[207]:
key1
a    0.085351
b   -0.904811
Name: data1, dtype: float64

In [208]:
df.groupby(df['key2'])['data2'].mean()


Out[208]:
key2
one    0.165886
two   -0.538761
Name: data2, dtype: float64

In [209]:
df.groupby('key1')['data1','data2'].agg(lambda arr:arr.max()-arr.min())


Out[209]:
data1 data2
key1
a 2.477010 1.940585
b 3.350242 0.797524

In [210]:
df.groupby('key1')['data1','data2'].agg(['min','max'])


Out[210]:
data1 data2
min max min max
key1
a -1.132667 1.344343 -1.410204 0.530382
b -2.579932 0.770310 -0.464842 0.332682

绘图


In [211]:
np.random.seed(2)

In [212]:
ser = pd.Series(np.arange(1, 10.1, .25)**2 + np.random.randn(37))

In [213]:
bad = np.array([4, 13, 14, 15, 16, 17, 18, 20, 29])

In [214]:
ser[bad] = np.nan

In [215]:
methods = ['linear', 'quadratic', 'cubic']

In [216]:
df = pd.DataFrame({m: ser.interpolate(method=m) for m in methods})

In [217]:
df.plot()


Out[217]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c754c50>

Matplotlib


In [218]:
import numpy as np
import matplotlib.pyplot as plt

In [219]:
X = np.linspace(-np.pi, np.pi, 256,endpoint=True)
C,S = np.cos(X), np.sin(X)

plt.plot(X,C)
plt.plot(X,S)

# plt.show()


Out[219]:
[<matplotlib.lines.Line2D at 0x10c821190>]

In [ ]:
plt.figure(figsize=(8,6), dpi=80)

# 创建一个新的 1 * 1 的子图,接下来的图样绘制在其中的第 1 块(也是唯一的一块)
plt.subplot(1,1,1)

X = np.linspace(-np.pi, np.pi, 256,endpoint=True)
C,S = np.cos(X), np.sin(X)

# 绘制余弦曲线,使用蓝色的、连续的、宽度为 1 (像素)的线条
plt.plot(X, C, color="blue", linewidth=1.0, linestyle="-")

# 绘制正弦曲线,使用绿色的、连续的、宽度为 1 (像素)的线条
plt.plot(X, S, color="green", linewidth=1.0, linestyle="-")

# 设置横轴的上下限
plt.xlim(-4.0,4.0)

# 设置横轴记号
plt.xticks(np.linspace(-4,4,9,endpoint=True))

# 设置纵轴的上下限
plt.ylim(-1.0,1.0)

# 设置纵轴记号
plt.yticks(np.linspace(-1,1,5,endpoint=True))

# 以分辨率 72 来保存图片
# savefig("exercice_2.png",dpi=72)

# 在屏幕上显示
# plt.show()

In [220]:
plt.figure(figsize=(10,6), dpi=80)
plt.plot(X, C, color="blue", linewidth=2.5, linestyle="-")
plt.plot(X, S, color="red",  linewidth=2.5, linestyle="-")


Out[220]:
[<matplotlib.lines.Line2D at 0x10c9d37d0>]

In [221]:
plt.xlim(X.min()*1.1, X.max()*1.1)
plt.ylim(C.min()*1.1, C.max()*1.1)
plt.plot(X, C, color="blue", linewidth=2.5, linestyle="-")
plt.plot(X, S, color="red",  linewidth=2.5, linestyle="-")


Out[221]:
[<matplotlib.lines.Line2D at 0x10c8f0610>]

In [222]:
plt.xticks( [-np.pi, -np.pi/2, 0, np.pi/2, np.pi])
plt.yticks([-1, 0, +1])
plt.plot(X, C, color="blue", linewidth=2.5, linestyle="-")
plt.plot(X, S, color="red",  linewidth=2.5, linestyle="-")


Out[222]:
[<matplotlib.lines.Line2D at 0x10cef3e90>]

In [223]:
plt.xticks([-np.pi, -np.pi/2, 0, np.pi/2, np.pi],
       [r'$-\pi$', r'$-\pi/2$', r'$0$', r'$+\pi/2$', r'$+\pi$'])

plt.yticks([-1, 0, +1],
       [r'$-1$', r'$0$', r'$+1$'])
plt.plot(X, C, color="blue", linewidth=2.5, linestyle="-")
plt.plot(X, S, color="red",  linewidth=2.5, linestyle="-")


Out[223]:
[<matplotlib.lines.Line2D at 0x10d077910>]

In [224]:
ax = plt.gca()
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
ax.xaxis.set_ticks_position('bottom')
ax.spines['bottom'].set_position(('data',0))
ax.yaxis.set_ticks_position('left')
ax.spines['left'].set_position(('data',0))
plt.plot(X, C, color="blue", linewidth=2.5, linestyle="-")
plt.plot(X, S, color="red",  linewidth=2.5, linestyle="-")


Out[224]:
[<matplotlib.lines.Line2D at 0x10d1ac8d0>]

In [225]:
plt.plot(X, C, color="blue", linewidth=2.5, linestyle="-", label="cosine")
plt.plot(X, S, color="red",  linewidth=2.5, linestyle="-", label="sine")

plt.legend(loc='upper left')


Out[225]:
<matplotlib.legend.Legend at 0x10d3928d0>

In [226]:
t = 2*np.pi/3
plt.plot([t,t],[0,np.cos(t)], color ='blue', linewidth=2.5, linestyle="--")
plt.scatter([t,],[np.cos(t),], 50, color ='blue')

plt.annotate(r'$\sin(\frac{2\pi}{3})=\frac{\sqrt{3}}{2}$',
         xy=(t, np.sin(t)), xycoords='data',
         xytext=(+10, +30), textcoords='offset points', fontsize=16,
         arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=.2"))

plt.plot([t,t],[0,np.sin(t)], color ='red', linewidth=2.5, linestyle="--")
plt.scatter([t,],[np.sin(t),], 50, color ='red')

plt.annotate(r'$\cos(\frac{2\pi}{3})=-\frac{1}{2}$',
         xy=(t, np.cos(t)), xycoords='data',
         xytext=(-90, -50), textcoords='offset points', fontsize=16,
         arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=.2"))

plt.plot(X, C, color="blue", linewidth=2.5, linestyle="-", label="cosine")
plt.plot(X, S, color="red",  linewidth=2.5, linestyle="-", label="sine")


Out[226]:
[<matplotlib.lines.Line2D at 0x10d4f7210>]

In [227]:
plt.axes([.1,.1,1,1])
plt.axes([.2,.2,.3,.3],axisbg='green')


/Users/heming03/python-env/lib/python2.7/site-packages/matplotlib-2.0.0-py2.7-macosx-10.6-intel.egg/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)
Out[227]:
<matplotlib.axes._axes.Axes at 0x10d636190>

In [228]:
plt.subplot(2,1,1,axisbg='y')
plt.subplot(2,1,2)


Out[228]:
<matplotlib.axes._subplots.AxesSubplot at 0x10d6fb8d0>

In [229]:
for idx, color in enumerate('rgbyck'):
    plt.subplot(3,2,1+idx,axisbg=color)



In [ ]:


In [230]:
n = 1024
X = np.random.normal(0,1,n)
Y = np.random.normal(0,1,n)
T = np.arctan2(Y,X)
plt.scatter(X,Y,s=75,c=T,alpha=.5)


Out[230]:
<matplotlib.collections.PathCollection at 0x10dd3e690>

In [231]:
n = 12
X = np.arange(n)
Y1 = (1-X/float(n)) * np.random.uniform(0.5,1.0,n)
Y2 = (1-X/float(n)) * np.random.uniform(0.5,1.0,n)

plt.bar(X, +Y1, facecolor='#9999ff', edgecolor='white')
plt.bar(X, -Y2, facecolor='#ff9999', edgecolor='white')

for x,y in zip(X,Y1):
    plt.text(x+0.4, y+0.05, '%.2f' % y, ha='center', va= 'bottom')

plt.ylim(-1.25,+1.25)


Out[231]:
(-1.25, 1.25)

In [232]:
def f(x,y): return (1-x/2+x**5+y**3)*np.exp(-x**2-y**2)

n = 256
x = np.linspace(-3,3,n)
y = np.linspace(-3,3,n)
X,Y = np.meshgrid(x,y)

plt.contourf(X, Y, f(X,Y), 8, alpha=.75, cmap='jet')
C = plt.contour(X, Y, f(X,Y), 8, colors='black', linewidth=.5)



In [ ]: