NUMPY



In [1]:

    
%matplotlib inline



In [2]:

    
import numpy as np

高效的Numpy



In [4]:

    
import numpy as np
arr = np.arange(1000)
def get_sum(arr):
    acc = 0
    for item in arr:
        acc += item
    return acc
%timeit get_sum(arr)









    



10000 loops, best of 3: 118 µs per loop



In [5]:

    
%timeit sum(arr)









    



10000 loops, best of 3: 89.9 µs per loop



In [6]:

    
%timeit np.sum(arr)









    



The slowest run took 43.63 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 3.09 µs per loop

创建Ndarray



In [11]:

    
# List Tuple
a = np.array([1, 2, 3, 4])
b = np.array((5, 6, 7, 8))
c = np.array([[1, 2, 3, 4],[4, 5, 6, 7], [7, 8, 9, 10]], dtype=int)
b









    Out[11]:





array([5, 6, 7, 8])



In [12]:

    
c









    Out[12]:





array([[ 1,  2,  3,  4],
       [ 4,  5,  6,  7],
       [ 7,  8,  9, 10]])



In [13]:

    
c.dtype









    Out[13]:





dtype('int64')



In [14]:

    
# shape属性
print a.shape
print c.shape









    



(4,)
(3, 4)



In [15]:

    
a.itemsize









    Out[15]:





8



In [16]:

    
a.ndim









    Out[16]:





1



In [17]:

    
a.size









    Out[17]:





4

DTypes

改变类型 numpy中的数据类型转换，不能直接改原数据的dtype! 只能用函数astype()。



In [18]:

    
a = np.random.random(4)



In [19]:

    
a.dtype









    Out[19]:





dtype('float64')



In [20]:

    
print a









    



[ 0.60821019  0.93451057  0.15028573  0.71730223]



In [21]:

    
a.dtype = 'float32'



In [22]:

    
print a









    



[  4.75686619e-07   1.77705252e+00  -1.29937671e-05   1.85862756e+00
   1.57849608e-21   1.52528572e+00  -7.68008346e+09   1.80432546e+00]



In [23]:

    
c = a.astype(np.float32)
c









    Out[23]:





array([  4.75686619e-07,   1.77705252e+00,  -1.29937671e-05,
         1.85862756e+00,   1.57849608e-21,   1.52528572e+00,
        -7.68008346e+09,   1.80432546e+00], dtype=float32)



In [24]:

    
c = a.astype(np.int32)
c









    Out[24]:





array([          0,           1,           0,           1,           0,
                 1, -2147483648,           1], dtype=int32)

创建特定形状的多维数组并进行填充



In [25]:

    
# 开始值、终值和步长来创建一维数组
np.arange(0,1,0.1)









    Out[25]:





array([ 0. ,  0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9])



In [28]:

    
np.linspace(0, 1, 12)









    Out[28]:





array([ 0.        ,  0.09090909,  0.18181818,  0.27272727,  0.36363636,
        0.45454545,  0.54545455,  0.63636364,  0.72727273,  0.81818182,
        0.90909091,  1.        ])



In [29]:

    
np.logspace(0, 2, 20)









    Out[29]:





array([   1.        ,    1.27427499,    1.62377674,    2.06913808,
          2.6366509 ,    3.35981829,    4.2813324 ,    5.45559478,
          6.95192796,    8.8586679 ,   11.28837892,   14.38449888,
         18.32980711,   23.35721469,   29.76351442,   37.92690191,
         48.32930239,   61.58482111,   78.47599704,  100.        ])



In [30]:

    
np.empty([2, 2])









    Out[30]:





array([[ 0.,  0.],
       [ 0.,  0.]])



In [31]:

    
np.empty([2, 2], dtype=int)









    Out[31]:





array([[0, 0],
       [0, 0]])



In [32]:

    
a = np.array([[1., 2., 3.],[4.,5.,6.]])
print np.empty_like(a)









    



[[  0.00000000e+000   0.00000000e+000   2.13553025e-314]
 [  2.16293988e-314   0.00000000e+000   0.00000000e+000]]



In [33]:

    
np.eye(2, dtype=int)









    Out[33]:





array([[1, 0],
       [0, 1]])



In [34]:

    
np.identity(3)









    Out[34]:





array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.]])



In [35]:

    
np.ones(5)









    Out[35]:





array([ 1.,  1.,  1.,  1.,  1.])



In [36]:

    
np.ones((5,), dtype=np.int)









    Out[36]:





array([1, 1, 1, 1, 1])



In [37]:

    
np.ones((2, 1))









    Out[37]:





array([[ 1.],
       [ 1.]])



In [38]:

    
np.ones((2,2))









    Out[38]:





array([[ 1.,  1.],
       [ 1.,  1.]])

广播



In [39]:

    
a = np.array([1.0,2.0,3.0])
b = np.array([2.0,2.0,2.0])
a * b









    Out[39]:





array([ 2.,  4.,  6.])



In [40]:

    
## 当不相等时，则会采用规则对其：
a = np.array([1.0,2.0,3.0])
b = 2
a * b









    Out[40]:





array([ 2.,  4.,  6.])



In [42]:

    
a = np.arange(0, 6).reshape(6, 1)
a.shape
a









    Out[42]:





array([[0],
       [1],
       [2],
       [3],
       [4],
       [5]])



In [44]:

    
b = np.arange(0, 5)
b.shape
b









    Out[44]:





array([0, 1, 2, 3, 4])



In [45]:

    
c = a + b



In [46]:

    
print c









    



[[0 1 2 3 4]
 [1 2 3 4 5]
 [2 3 4 5 6]
 [3 4 5 6 7]
 [4 5 6 7 8]
 [5 6 7 8 9]]

变换，索引和切片



In [48]:

    
a = np.array([1, 2, 3, 4])
d = a.reshape((2,2))
d









    Out[48]:





array([[1, 2],
       [3, 4]])



In [49]:

    
a = np.arange(6).reshape((3, 2))
a









    Out[49]:





array([[0, 1],
       [2, 3],
       [4, 5]])



In [50]:

    
a=np.array([[0,1],[2,3]])
np.resize(a,(2,3))









    Out[50]:





array([[0, 1, 2],
       [3, 0, 1]])



In [51]:

    
np.resize(a,(1,4))









    Out[51]:





array([[0, 1, 2, 3]])



In [52]:

    
np.resize(a,(2,4))









    Out[52]:





array([[0, 1, 2, 3],
       [0, 1, 2, 3]])



In [55]:

    
arr = np.arange(10) 
arr[5]









    Out[55]:





5



In [56]:

    
arr[5:8]









    Out[56]:





array([5, 6, 7])



In [57]:

    
arr_slice = arr[5:8]  
arr_slice[1] = 12345



In [58]:

    
arr









    Out[58]:





array([    0,     1,     2,     3,     4,     5, 12345,     7,     8,     9])



In [59]:

    
arr_slice[:] = 64



In [60]:

    
arr









    Out[60]:





array([ 0,  1,  2,  3,  4, 64, 64, 64,  8,  9])



In [61]:

    
arr_copy = arr[5:8].copy()



In [62]:

    
arr2d = np.array([[1,2,3],[4,5,6],[7,8,9]])



In [63]:

    
arr2d[2]









    Out[63]:





array([7, 8, 9])



In [64]:

    
arr2d[0][2]









    Out[64]:





3



In [65]:

    
arr2d[0,2]









    Out[65]:





3



In [66]:

    
arr3d = np.array([[[1,2,3],[4,5,6]],[[7,8,9],[10,11,12]]])  
arr3d









    Out[66]:





array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])



In [67]:

    
arr3d.shape









    Out[67]:





(2, 2, 3)



In [68]:

    
arr3d[0]









    Out[68]:





array([[1, 2, 3],
       [4, 5, 6]])



In [69]:

    
old_values = arr3d[0].copy()  
arr3d[0]= 42  
arr3d









    Out[69]:





array([[[42, 42, 42],
        [42, 42, 42]],

       [[ 7,  8,  9],
        [10, 11, 12]]])



In [70]:

    
arr2d









    Out[70]:





array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])



In [71]:

    
arr2d[:2]









    Out[71]:





array([[1, 2, 3],
       [4, 5, 6]])



In [72]:

    
arr2d[:2,1:]









    Out[72]:





array([[2, 3],
       [5, 6]])



In [73]:

    
arr2d[1,:2]









    Out[73]:





array([4, 5])



In [74]:

    
arr2d[2,:1]









    Out[74]:





array([7])



In [75]:

    
arr2d[:,:1]









    Out[75]:





array([[1],
       [4],
       [7]])



In [76]:

    
arr2d[:2,1:] = 0  
arr2d









    Out[76]:





array([[1, 0, 0],
       [4, 0, 0],
       [7, 8, 9]])



In [77]:

    
names = np.array(['Bob','Joe','Will','Bob','Will','Joe','Joe'])



In [78]:

    
import numpy.random  
data = numpy.random.randn(7,4)  
data









    Out[78]:





array([[ 0.18288168, -0.94369605, -0.11244431,  0.40186004],
       [ 0.06163719,  1.64483343,  0.33081673,  0.80889786],
       [-0.52448079,  0.36007662,  0.32996677, -0.20352982],
       [ 0.57113387,  1.29816816,  0.29646005, -0.1379642 ],
       [ 0.36725083,  0.48369201,  2.58987207,  0.66315957],
       [ 1.60421507,  0.57683305,  1.03288016,  0.08122319],
       [-0.12621481, -0.69359545,  0.67847661, -0.06644251]])



In [79]:

    
names









    Out[79]:





array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'], 
      dtype='|S4')



In [80]:

    
names == 'Bob'









    Out[80]:





array([ True, False, False,  True, False, False, False], dtype=bool)



In [81]:

    
data[names == 'Bob']









    Out[81]:





array([[ 0.18288168, -0.94369605, -0.11244431,  0.40186004],
       [ 0.57113387,  1.29816816,  0.29646005, -0.1379642 ]])

ufunc运算



In [82]:

    
x = np.linspace(0, 2*np.pi, 10)
x









    Out[82]:





array([ 0.        ,  0.6981317 ,  1.3962634 ,  2.0943951 ,  2.7925268 ,
        3.4906585 ,  4.1887902 ,  4.88692191,  5.58505361,  6.28318531])



In [83]:

    
y = np.sin(x)



In [84]:

    
print y









    



[  0.00000000e+00   6.42787610e-01   9.84807753e-01   8.66025404e-01
   3.42020143e-01  -3.42020143e-01  -8.66025404e-01  -9.84807753e-01
  -6.42787610e-01  -2.44929360e-16]



In [85]:

    
t = np.sin(x,x)



In [86]:

    
id(t) == id(x)









    Out[86]:





True



In [87]:

    
import time
import math
import numpy as np

x = [i * 0.001 for i in xrange(1000000)]
start = time.clock()
for i, t in enumerate(x):
    x[i] = math.sin(t)
print "math.sin:", time.clock() - start

x = [i * 0.001 for i in xrange(1000000)]
x = np.array(x)
start = time.clock()
np.sin(x,x)
print "numpy.sin:", time.clock() - start









    



math.sin: 0.263345
numpy.sin: 0.011054



In [88]:

    
a = np.arange(0,4)
print a
b = np.arange(1,5)
print b

np.add(a,b)









    



[0 1 2 3]
[1 2 3 4]






    Out[88]:





array([1, 3, 5, 7])

封装ufuncs



In [90]:

    
def triangle_wave(x, c, c0, hc):
    x = x - int(x) # 三角波
    if x >= c: r = 0.0
    elif x < c0: r = x / c0 * hc
    else: r = (c-x) / (c-c0) * hc
    return r



In [91]:

    
x = np.linspace(0, 2, 1000)
y = np.array([triangle_wave(t, 0.6, 0.4, 1.0) for t in x])



In [92]:

    
triangle_ufunc = np.frompyfunc( lambda x: triangle_wave(x, 0.6, 0.4, 1.0), 1, 1)
y2 = triangle_ufunc(x)



In [93]:

    
data = np.array([
... [1,2,1],
... [0,3,1],
... [2,1,4],
... [1,3,1]])



In [94]:

    
data









    Out[94]:





array([[1, 2, 1],
       [0, 3, 1],
       [2, 1, 4],
       [1, 3, 1]])



In [95]:

    
np.sum(data, axis=1)









    Out[95]:





array([4, 4, 7, 5])



In [96]:

    
np.min(data, axis=0)









    Out[96]:





array([0, 1, 1])



In [97]:

    
np.average(data)









    Out[97]:





1.6666666666666667



In [98]:

    
data = np.random.randint(0, 5, [4,3,2,3])



In [99]:

    
data









    Out[99]:





array([[[[3, 1, 2],
         [0, 0, 2]],

        [[2, 2, 4],
         [1, 4, 1]],

        [[2, 0, 0],
         [3, 0, 2]]],


       [[[2, 0, 0],
         [0, 1, 3]],

        [[4, 3, 4],
         [0, 0, 1]],

        [[4, 3, 0],
         [2, 3, 2]]],


       [[[1, 0, 2],
         [3, 4, 0]],

        [[4, 0, 4],
         [0, 1, 2]],

        [[0, 0, 2],
         [4, 1, 4]]],


       [[[2, 3, 1],
         [2, 4, 3]],

        [[1, 4, 1],
         [1, 2, 3]],

        [[2, 1, 2],
         [4, 1, 1]]]])



In [100]:

    
data.sum(axis=0)









    Out[100]:





array([[[ 8,  4,  5],
        [ 5,  9,  8]],

       [[11,  9, 13],
        [ 2,  7,  7]],

       [[ 8,  4,  4],
        [13,  5,  9]]])

axis sort



In [101]:

    
data = np.random.randint(0, 5, [3,2,3])



In [102]:

    
data









    Out[102]:





array([[[1, 3, 2],
        [1, 2, 0]],

       [[1, 2, 1],
        [4, 1, 0]],

       [[3, 1, 0],
        [1, 1, 2]]])



In [103]:

    
np.sort(data)  ## 默认对最大的axis进行排序，这里即是axis=2









    Out[103]:





array([[[1, 2, 3],
        [0, 1, 2]],

       [[1, 1, 2],
        [0, 1, 4]],

       [[0, 1, 3],
        [1, 1, 2]]])



In [104]:

    
np.sort(data, axis=0)









    Out[104]:





array([[[1, 1, 0],
        [1, 1, 0]],

       [[1, 2, 1],
        [1, 1, 0]],

       [[3, 3, 2],
        [4, 2, 2]]])



In [105]:

    
np.sort(data, axis=1)









    Out[105]:





array([[[1, 2, 0],
        [1, 3, 2]],

       [[1, 1, 0],
        [4, 2, 1]],

       [[1, 1, 0],
        [3, 1, 2]]])



In [106]:

    
np.sort(data, axis=2)









    Out[106]:





array([[[1, 2, 3],
        [0, 1, 2]],

       [[1, 1, 2],
        [0, 1, 4]],

       [[0, 1, 3],
        [1, 1, 2]]])



In [107]:

    
np.sort(data, axis=None)









    Out[107]:





array([0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4])

prod(即product，乘积)

其他函数



In [108]:

    
import numpy as np
from tempfile import TemporaryFile

origin_array = np.array([1, 2, 3, 4])
np.save('/tmp/array', origin_array)

array_from_file = np.load('/tmp/array.npy')
print array_from_file



In [109]:

    
import numpy as np

origin_array = np.array([1, 2, 3, 4])
np.savetxt('array.txt', origin_array)

array_from_file = np.loadtxt('array.txt')
print array_from_file









    



[ 1.  2.  3.  4.]



In [110]:

    
array = np.fromstring('1 2 3 4', dtype=float, sep=' ')
print array









    



[ 1.  2.  3.  4.]



In [111]:

    
# 悲剧了
array = np.array([1, 2, 3, 4], dtype=int)
print np.fromstring(array.tostring())









    



[  4.94065646e-324   9.88131292e-324   1.48219694e-323   1.97626258e-323]



In [112]:

    
print np.fromstring(array.tostring(), dtype=int)

Pandas

from ndarray



In [113]:

    
import pandas as pd
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])



In [114]:

    
s









    Out[114]:





a   -0.336826
b    0.952793
c    0.333174
d    1.586247
e    0.717200
dtype: float64



In [115]:

    
s.index









    Out[115]:





Index([u'a', u'b', u'c', u'd', u'e'], dtype='object')



In [116]:

    
pd.Series(np.random.randn(5))









    Out[116]:





0    0.556203
1   -0.288984
2   -0.291610
3    0.960958
4   -0.375165
dtype: float64



In [ ]:

    
### for dict



In [118]:

    
d = {'a' : 0., 'b' : 1., 'c' : 2.}



In [119]:

    
pd.Series(d)









    Out[119]:





a    0.0
b    1.0
c    2.0
dtype: float64



In [120]:

    
pd.Series(d, index=['b', 'c', 'd', 'a'])









    Out[120]:





b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64



In [ ]:

    
## 常量构造



In [121]:

    
pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])









    Out[121]:





a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64



In [122]:

    
s[0]









    Out[122]:





-0.3368258988445077



In [123]:

    
s[:3]









    Out[123]:





a   -0.336826
b    0.952793
c    0.333174
dtype: float64



In [124]:

    
s[s > s.median()]









    Out[124]:





b    0.952793
d    1.586247
dtype: float64



In [125]:

    
s[[4, 3, 1]]









    Out[125]:





e    0.717200
d    1.586247
b    0.952793
dtype: float64



In [126]:

    
np.exp(s)









    Out[126]:





a    0.714033
b    2.592941
c    1.395390
d    4.885381
e    2.048690
dtype: float64



In [ ]:

    
### Index Label



In [127]:

    
s['a']









    Out[127]:





-0.3368258988445077



In [128]:

    
s['e'] = 12.



In [129]:

    
s









    Out[129]:





a    -0.336826
b     0.952793
c     0.333174
d     1.586247
e    12.000000
dtype: float64



In [130]:

    
'e' in s









    Out[130]:





True



In [131]:

    
't' in s









    Out[131]:





False



In [ ]:

    
### 向量化操作



In [132]:

    
s + s









    Out[132]:





a    -0.673652
b     1.905586
c     0.666348
d     3.172495
e    24.000000
dtype: float64



In [133]:

    
s * 2









    Out[133]:





a    -0.673652
b     1.905586
c     0.666348
d     3.172495
e    24.000000
dtype: float64



In [134]:

    
np.exp(s)









    Out[134]:





a         0.714033
b         2.592941
c         1.395390
d         4.885381
e    162754.791419
dtype: float64



In [135]:

    
s[1:] + s[:-1]









    Out[135]:





a         NaN
b    1.905586
c    0.666348
d    3.172495
e         NaN
dtype: float64



In [ ]:

    
## DataFrame



In [136]:

    
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
    'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}



In [137]:

    
df = pd.DataFrame(d)



In [138]:

    
df



In [139]:

    
pd.DataFrame(d, index=['d', 'b', 'a'])



In [140]:

    
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])



In [141]:

    
### 字典构造
d = {'one' : [1., 2., 3., 4.],
    'two' : [4., 3., 2., 1.]}



In [142]:

    
pd.DataFrame(d)



In [143]:

    
pd.DataFrame(d, index=['a', 'b', 'c', 'd'])



In [ ]:

    
## 结构化数据



In [144]:

    
data = np.zeros((2,), dtype=[('A', 'i4'),('B', 'f4'),('C', 'a10')])



In [145]:

    
data









    Out[145]:





array([(0,  0., ''), (0,  0., '')], 
      dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])



In [146]:

    
data[:] = [(1,2.,'Hello'), (2,3.,"World")]



In [147]:

    
data









    Out[147]:





array([(1,  2., 'Hello'), (2,  3., 'World')], 
      dtype=[('A', '<i4'), ('B', '<f4'), ('C', 'S10')])



In [148]:

    
pd.DataFrame(data)



In [149]:

    
pd.DataFrame(data, index=['first', 'second'])



In [150]:

    
pd.DataFrame(data, columns=['C', 'A', 'B'])



In [151]:

    
data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]



In [152]:

    
pd.DataFrame(data2)



In [153]:

    
pd.DataFrame(data2, index=['first', 'second'])



In [154]:

    
pd.DataFrame(data2, columns=['a', 'b'])



In [ ]:

    
#### records



In [155]:

    
pd.DataFrame.from_items([('A', [1, 2, 3]), ('B', [4, 5, 6])],
orient='index', columns=['one', 'two', 'three'])



In [156]:

    
df['one']









    Out[156]:





a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64



In [157]:

    
df['three'] = df['one'] * df['two']



In [158]:

    
df['flag'] = df['one'] > 2



In [159]:

    
df



In [160]:

    
del df['two']



In [161]:

    
three = df.pop('three')



In [162]:

    
df



In [163]:

    
df['foo'] = 'bar'



In [164]:

    
df['one_trunc'] = df['one'][:2]



In [165]:

    
df









    Out[165]:






  
    
      
      one
      flag
      foo
      one_trunc
    
  
  
    
      a
      1.0
      False
      bar
      1.0
    
    
      b
      2.0
      False
      bar
      2.0
    
    
      c
      3.0
      True
      bar
      NaN
    
    
      d
      NaN
      False
      bar
      NaN



In [166]:

    
df.insert(1, 'bar', df['one'])



In [167]:

    
df









    Out[167]:






  
    
      
      one
      bar
      flag
      foo
      one_trunc
    
  
  
    
      a
      1.0
      1.0
      False
      bar
      1.0
    
    
      b
      2.0
      2.0
      False
      bar
      2.0
    
    
      c
      3.0
      3.0
      True
      bar
      NaN
    
    
      d
      NaN
      NaN
      False
      bar
      NaN

索引和选择



In [168]:

    
df.loc['b']









    Out[168]:





one              2
bar              2
flag         False
foo            bar
one_trunc        2
Name: b, dtype: object



In [169]:

    
df.iloc[2]









    Out[169]:





one             3
bar             3
flag         True
foo           bar
one_trunc     NaN
Name: c, dtype: object

数据清洗



In [170]:

    
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f', 'h'],
   columns=['one', 'two', 'three'])



In [171]:

    
df



In [172]:

    
df['four'] = 'bar'



In [173]:

    
df['five'] = df['one'] > 0



In [174]:

    
df



In [175]:

    
df2 = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])



In [176]:

    
df2



In [177]:

    
df2['one']









    Out[177]:





a    0.389612
b         NaN
c    0.352398
d         NaN
e   -1.017042
f    0.179291
g         NaN
h    0.456743
Name: one, dtype: float64



In [178]:

    
pd.isnull(df2['one'])









    Out[178]:





a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool



In [179]:

    
df2['four'].notnull()









    Out[179]:





a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: four, dtype: bool



In [180]:

    
df2.isnull()









    Out[180]:






  
    
      
      one
      two
      three
      four
      five
    
  
  
    
      a
      False
      False
      False
      False
      False
    
    
      b
      True
      True
      True
      True
      True
    
    
      c
      False
      False
      False
      False
      False
    
    
      d
      True
      True
      True
      True
      True
    
    
      e
      False
      False
      False
      False
      False
    
    
      f
      False
      False
      False
      False
      False
    
    
      g
      True
      True
      True
      True
      True
    
    
      h
      False
      False
      False
      False
      False



In [181]:

    
s = pd.Series([1, 2, 3])
s.loc[0] = None



In [182]:

    
s









    Out[182]:





0    NaN
1    2.0
2    3.0
dtype: float64



In [183]:

    
df2



In [184]:

    
df2.fillna(0)



In [185]:

    
df2['four'].fillna('missing')









    Out[185]:





a        bar
b    missing
c        bar
d    missing
e        bar
f        bar
g    missing
h        bar
Name: four, dtype: object



In [186]:

    
df.fillna(method='pad')



In [187]:

    
df.fillna(method='pad', limit=1)

Merge



In [188]:

    
df1=pd.DataFrame({'key':['a','b','b'],'data1':range(3)})



In [189]:

    
df2=pd.DataFrame({'key':['a','b','c'],'data2':range(3)})



In [190]:

    
df1



In [191]:

    
df2



In [192]:

    
df1.merge(df2)



In [193]:

    
pd.merge(df2,df1,how='left')



In [195]:

    
left=pd.DataFrame({'key1':['foo','foo','bar'],  
'key2':['one','two','one'],  
'lval':[1,2,3]})  
  
right=pd.DataFrame({'key1':['foo','foo','bar','bar'],  
'key2':['one','one','one','two'],  
'lval':[4,5,6,7]})  
  
pd.merge(left,right,on=['key1','key2'],how='outer')



In [196]:

    
df3=pd.DataFrame({'key3':['foo','foo','bar','bar'], #将上面的right的key 改了名字  如果两个对象的列名不同，可以分别指定
'key4':['one','one','one','two'],  
'lval':[4,5,6,7]})



In [197]:

    
pd.merge(left,df3,left_on='key1',right_on='key3')



In [198]:

    
df1=pd.DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])



In [199]:

    
df2=pd.DataFrame(np.random.randn(2,3),columns=['b','d','a'])



In [200]:

    
pd.concat([df1,df2])



In [201]:

    
pd.concat([df1,df2],ignore_index=True)

Group By



In [202]:

    
df = pd.DataFrame({'key1':['a','a','b','b','a'],
                   'key2':['one','two','one','two','one'],
                   'data1':np.random.randn(5),
                   'data2':np.random.randn(5)})



In [203]:

    
df



In [204]:

    
grouped = df.groupby(df['key1'])



In [205]:

    
grouped









    Out[205]:





<pandas.core.groupby.DataFrameGroupBy object at 0x10aff4190>



In [206]:

    
grouped.mean()



In [207]:

    
df['data1'].groupby(df['key1']).mean()









    Out[207]:





key1
a    0.085351
b   -0.904811
Name: data1, dtype: float64



In [208]:

    
df.groupby(df['key2'])['data2'].mean()









    Out[208]:





key2
one    0.165886
two   -0.538761
Name: data2, dtype: float64



In [209]:

    
df.groupby('key1')['data1','data2'].agg(lambda arr:arr.max()-arr.min())



In [210]:

    
df.groupby('key1')['data1','data2'].agg(['min','max'])

绘图



In [211]:

    
np.random.seed(2)



In [212]:

    
ser = pd.Series(np.arange(1, 10.1, .25)**2 + np.random.randn(37))



In [213]:

    
bad = np.array([4, 13, 14, 15, 16, 17, 18, 20, 29])



In [214]:

    
ser[bad] = np.nan



In [215]:

    
methods = ['linear', 'quadratic', 'cubic']



In [216]:

    
df = pd.DataFrame({m: ser.interpolate(method=m) for m in methods})



In [217]:

    
df.plot()









    Out[217]:





<matplotlib.axes._subplots.AxesSubplot at 0x10c754c50>

Matplotlib



In [218]:

    
import numpy as np
import matplotlib.pyplot as plt



In [219]:

    
X = np.linspace(-np.pi, np.pi, 256,endpoint=True)
C,S = np.cos(X), np.sin(X)

plt.plot(X,C)
plt.plot(X,S)

# plt.show()









    Out[219]:





[<matplotlib.lines.Line2D at 0x10c821190>]



In [ ]:

    
plt.figure(figsize=(8,6), dpi=80)

# 创建一个新的 1 * 1 的子图，接下来的图样绘制在其中的第 1 块（也是唯一的一块）
plt.subplot(1,1,1)

X = np.linspace(-np.pi, np.pi, 256,endpoint=True)
C,S = np.cos(X), np.sin(X)

# 绘制余弦曲线，使用蓝色的、连续的、宽度为 1 （像素）的线条
plt.plot(X, C, color="blue", linewidth=1.0, linestyle="-")

# 绘制正弦曲线，使用绿色的、连续的、宽度为 1 （像素）的线条
plt.plot(X, S, color="green", linewidth=1.0, linestyle="-")

# 设置横轴的上下限
plt.xlim(-4.0,4.0)

# 设置横轴记号
plt.xticks(np.linspace(-4,4,9,endpoint=True))

# 设置纵轴的上下限
plt.ylim(-1.0,1.0)

# 设置纵轴记号
plt.yticks(np.linspace(-1,1,5,endpoint=True))

# 以分辨率 72 来保存图片
# savefig("exercice_2.png",dpi=72)

# 在屏幕上显示
# plt.show()



In [220]:

    
plt.figure(figsize=(10,6), dpi=80)
plt.plot(X, C, color="blue", linewidth=2.5, linestyle="-")
plt.plot(X, S, color="red",  linewidth=2.5, linestyle="-")









    Out[220]:





[<matplotlib.lines.Line2D at 0x10c9d37d0>]



In [221]:

    
plt.xlim(X.min()*1.1, X.max()*1.1)
plt.ylim(C.min()*1.1, C.max()*1.1)
plt.plot(X, C, color="blue", linewidth=2.5, linestyle="-")
plt.plot(X, S, color="red",  linewidth=2.5, linestyle="-")









    Out[221]:





[<matplotlib.lines.Line2D at 0x10c8f0610>]



In [222]:

    
plt.xticks( [-np.pi, -np.pi/2, 0, np.pi/2, np.pi])
plt.yticks([-1, 0, +1])
plt.plot(X, C, color="blue", linewidth=2.5, linestyle="-")
plt.plot(X, S, color="red",  linewidth=2.5, linestyle="-")









    Out[222]:





[<matplotlib.lines.Line2D at 0x10cef3e90>]



In [223]:

    
plt.xticks([-np.pi, -np.pi/2, 0, np.pi/2, np.pi],
       [r'$-\pi$', r'$-\pi/2$', r'$0$', r'$+\pi/2$', r'$+\pi$'])

plt.yticks([-1, 0, +1],
       [r'$-1$', r'$0$', r'$+1$'])
plt.plot(X, C, color="blue", linewidth=2.5, linestyle="-")
plt.plot(X, S, color="red",  linewidth=2.5, linestyle="-")









    Out[223]:





[<matplotlib.lines.Line2D at 0x10d077910>]



In [224]:

    
ax = plt.gca()
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
ax.xaxis.set_ticks_position('bottom')
ax.spines['bottom'].set_position(('data',0))
ax.yaxis.set_ticks_position('left')
ax.spines['left'].set_position(('data',0))
plt.plot(X, C, color="blue", linewidth=2.5, linestyle="-")
plt.plot(X, S, color="red",  linewidth=2.5, linestyle="-")









    Out[224]:





[<matplotlib.lines.Line2D at 0x10d1ac8d0>]



In [225]:

    
plt.plot(X, C, color="blue", linewidth=2.5, linestyle="-", label="cosine")
plt.plot(X, S, color="red",  linewidth=2.5, linestyle="-", label="sine")

plt.legend(loc='upper left')









    Out[225]:





<matplotlib.legend.Legend at 0x10d3928d0>



In [226]:

    
t = 2*np.pi/3
plt.plot([t,t],[0,np.cos(t)], color ='blue', linewidth=2.5, linestyle="--")
plt.scatter([t,],[np.cos(t),], 50, color ='blue')

plt.annotate(r'$\sin(\frac{2\pi}{3})=\frac{\sqrt{3}}{2}$',
         xy=(t, np.sin(t)), xycoords='data',
         xytext=(+10, +30), textcoords='offset points', fontsize=16,
         arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=.2"))

plt.plot([t,t],[0,np.sin(t)], color ='red', linewidth=2.5, linestyle="--")
plt.scatter([t,],[np.sin(t),], 50, color ='red')

plt.annotate(r'$\cos(\frac{2\pi}{3})=-\frac{1}{2}$',
         xy=(t, np.cos(t)), xycoords='data',
         xytext=(-90, -50), textcoords='offset points', fontsize=16,
         arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=.2"))

plt.plot(X, C, color="blue", linewidth=2.5, linestyle="-", label="cosine")
plt.plot(X, S, color="red",  linewidth=2.5, linestyle="-", label="sine")









    Out[226]:





[<matplotlib.lines.Line2D at 0x10d4f7210>]



In [227]:

    
plt.axes([.1,.1,1,1])
plt.axes([.2,.2,.3,.3],axisbg='green')









    



/Users/heming03/python-env/lib/python2.7/site-packages/matplotlib-2.0.0-py2.7-macosx-10.6-intel.egg/matplotlib/cbook.py:136: MatplotlibDeprecationWarning: The axisbg attribute was deprecated in version 2.0. Use facecolor instead.
  warnings.warn(message, mplDeprecation, stacklevel=1)






    Out[227]:





<matplotlib.axes._axes.Axes at 0x10d636190>



In [228]:

    
plt.subplot(2,1,1,axisbg='y')
plt.subplot(2,1,2)









    Out[228]:





<matplotlib.axes._subplots.AxesSubplot at 0x10d6fb8d0>



In [229]:

    
for idx, color in enumerate('rgbyck'):
    plt.subplot(3,2,1+idx,axisbg=color)



In [ ]:



In [230]:

    
n = 1024
X = np.random.normal(0,1,n)
Y = np.random.normal(0,1,n)
T = np.arctan2(Y,X)
plt.scatter(X,Y,s=75,c=T,alpha=.5)









    Out[230]:





<matplotlib.collections.PathCollection at 0x10dd3e690>



In [231]:

    
n = 12
X = np.arange(n)
Y1 = (1-X/float(n)) * np.random.uniform(0.5,1.0,n)
Y2 = (1-X/float(n)) * np.random.uniform(0.5,1.0,n)

plt.bar(X, +Y1, facecolor='#9999ff', edgecolor='white')
plt.bar(X, -Y2, facecolor='#ff9999', edgecolor='white')

for x,y in zip(X,Y1):
    plt.text(x+0.4, y+0.05, '%.2f' % y, ha='center', va= 'bottom')

plt.ylim(-1.25,+1.25)









    Out[231]:





(-1.25, 1.25)



In [232]:

    
def f(x,y): return (1-x/2+x**5+y**3)*np.exp(-x**2-y**2)

n = 256
x = np.linspace(-3,3,n)
y = np.linspace(-3,3,n)
X,Y = np.meshgrid(x,y)

plt.contourf(X, Y, f(X,Y), 8, alpha=.75, cmap='jet')
C = plt.contour(X, Y, f(X,Y), 8, colors='black', linewidth=.5)



In [ ]:

	one	two	three
a	0.389612	1.024433	-0.813130
c	0.352398	0.808131	-0.305175
e	-1.017042	-0.857841	-0.580242
f	0.179291	-1.993233	0.664604
h	0.456743	-1.174220	0.304638

	a	b	c	d
0	-0.968729	-0.323539	-0.519171	1.090456
1	0.081356	-0.440603	0.003373	-1.567866
2	1.155166	0.252753	0.339318	-0.010611
0	0.770177	-1.357404	NaN	-0.067142
1	-0.576567	1.171635	NaN	-0.068974

	a	b	c	d
0	-0.968729	-0.323539	-0.519171	1.090456
1	0.081356	-0.440603	0.003373	-1.567866
2	1.155166	0.252753	0.339318	-0.010611
3	0.770177	-1.357404	NaN	-0.067142
4	-0.576567	1.171635	NaN	-0.068974

	data1	data2	key1	key2
0	-1.132667	0.432118	a	one
1	1.344343	-1.410204	a	two
2	-2.579932	-0.464842	b	one
3	0.770310	0.332682	b	two
4	0.044376	0.530382	a	one

	data1	data2
key1
a	2.477010	1.940585
b	3.350242	0.797524

	one	bar	flag	foo	one_trunc
a	1.0	1.0	False	bar	1.0
b	2.0	2.0	False	bar	2.0
c	3.0	3.0	True	bar	NaN
d	NaN	NaN	False	bar	NaN

	key1	key2	lval_x	lval_y
0	foo	one	1.0	4.0
1	foo	one	1.0	5.0
2	foo	two	2.0	NaN
3	bar	one	3.0	6.0
4	bar	two	NaN	7.0

	key1	key2	lval_x	key3	key4	lval_y
0	foo	one	1	foo	one	4
1	foo	one	1	foo	one	5
2	foo	two	2	foo	one	4
3	foo	two	2	foo	one	5
4	bar	one	3	bar	one	6
5	bar	one	3	bar	two	7

	key1	key2	lval_x	lval_y
0	foo	one	1.0	4.0
1	foo	one	1.0	5.0
2	foo	two	2.0	NaN
3	bar	one	3.0	6.0
4	bar	two	NaN	7.0

	key1	key2	lval_x	key3	key4	lval_y
0	foo	one	1	foo	one	4
1	foo	one	1	foo	one	5
2	foo	two	2	foo	one	4
3	foo	two	2	foo	one	5
4	bar	one	3	bar	one	6
5	bar	one	3	bar	two	7

	key1	key2	lval_x	lval_y
0	foo	one	1.0	4.0
1	foo	one	1.0	5.0
2	foo	two	2.0	NaN
3	bar	one	3.0	6.0
4	bar	two	NaN	7.0

	key1	key2	lval_x	key3	key4	lval_y
0	foo	one	1	foo	one	4
1	foo	one	1	foo	one	5
2	foo	two	2	foo	one	4
3	foo	two	2	foo	one	5
4	bar	one	3	bar	one	6
5	bar	one	3	bar	two	7