このノートブックは、pandasのドキュメントの10 Minutes to pandasを写経したものです。


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Object Creation


In [2]:
# NaN in the input promotes the Series dtype to float64.
values = [1, 3, 5, np.nan, 6, 8]
s = pd.Series(values)

s


Out[2]:
0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
# Six consecutive daily timestamps starting 2013-01-01 (default freq='D').
dates = pd.date_range(start='20130101', periods=6)

dates


Out[3]:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# Random 6x4 frame indexed by the dates defined above.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])

df


Out[4]:
A B C D
2013-01-01 0.618253 0.502765 -0.067995 -0.066270
2013-01-02 -0.995121 0.919320 -1.221516 -1.016053
2013-01-03 1.379699 -0.936756 -1.276197 -0.020153
2013-01-04 0.007697 -0.088263 -0.987374 -0.188286
2013-01-05 -0.046063 -0.592812 -0.544709 -0.744876
2013-01-06 -0.033579 -0.937799 -2.322357 0.766281

In [5]:
# Columns may carry different dtypes; scalar values are broadcast to the
# frame's length (4 rows, set by the Series/array inputs).
df2 = pd.DataFrame({
    'A': 1.0,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
})

df2


Out[5]:
A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo

In [6]:
df2.dtypes


Out[6]:
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

Viewing Data


In [7]:
df.head()


Out[7]:
A B C D
2013-01-01 0.618253 0.502765 -0.067995 -0.066270
2013-01-02 -0.995121 0.919320 -1.221516 -1.016053
2013-01-03 1.379699 -0.936756 -1.276197 -0.020153
2013-01-04 0.007697 -0.088263 -0.987374 -0.188286
2013-01-05 -0.046063 -0.592812 -0.544709 -0.744876

In [8]:
df.tail(3)


Out[8]:
A B C D
2013-01-04 0.007697 -0.088263 -0.987374 -0.188286
2013-01-05 -0.046063 -0.592812 -0.544709 -0.744876
2013-01-06 -0.033579 -0.937799 -2.322357 0.766281

In [9]:
df.index


Out[9]:
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [10]:
df.columns


Out[10]:
Index(['A', 'B', 'C', 'D'], dtype='object')

In [11]:
# DataFrame.to_numpy() is the recommended replacement for the older .values
# attribute (.values has surprising behavior for mixed-dtype frames per the
# pandas documentation); for this all-float frame the result is identical.
df.to_numpy()


Out[11]:
array([[ 0.61825274,  0.50276504, -0.06799545, -0.06627048],
       [-0.99512073,  0.91932031, -1.22151646, -1.01605336],
       [ 1.37969906, -0.93675579, -1.27619681, -0.02015276],
       [ 0.00769677, -0.08826275, -0.98737373, -0.18828636],
       [-0.04606315, -0.5928116 , -0.54470861, -0.74487555],
       [-0.03357903, -0.93779933, -2.32235739,  0.76628084]])

In [12]:
df.describe()


Out[12]:
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.155148 -0.188924 -1.070025 -0.211560
std 0.791828 0.774592 0.764283 0.623874
min -0.995121 -0.937799 -2.322357 -1.016053
25% -0.042942 -0.850770 -1.262527 -0.605728
50% -0.012941 -0.340537 -1.104445 -0.127278
75% 0.465614 0.355008 -0.655375 -0.031682
max 1.379699 0.919320 -0.067995 0.766281

In [13]:
df.T


Out[13]:
2013-01-01 00:00:00 2013-01-02 00:00:00 2013-01-03 00:00:00 2013-01-04 00:00:00 2013-01-05 00:00:00 2013-01-06 00:00:00
A 0.618253 -0.995121 1.379699 0.007697 -0.046063 -0.033579
B 0.502765 0.919320 -0.936756 -0.088263 -0.592812 -0.937799
C -0.067995 -1.221516 -1.276197 -0.987374 -0.544709 -2.322357
D -0.066270 -1.016053 -0.020153 -0.188286 -0.744876 0.766281

In [14]:
df.sort_values(by='B')


Out[14]:
A B C D
2013-01-06 -0.033579 -0.937799 -2.322357 0.766281
2013-01-03 1.379699 -0.936756 -1.276197 -0.020153
2013-01-05 -0.046063 -0.592812 -0.544709 -0.744876
2013-01-04 0.007697 -0.088263 -0.987374 -0.188286
2013-01-01 0.618253 0.502765 -0.067995 -0.066270
2013-01-02 -0.995121 0.919320 -1.221516 -1.016053

Selection

Getting


In [15]:
df['A']


Out[15]:
2013-01-01    0.618253
2013-01-02   -0.995121
2013-01-03    1.379699
2013-01-04    0.007697
2013-01-05   -0.046063
2013-01-06   -0.033579
Freq: D, Name: A, dtype: float64

In [16]:
df[0:3]


Out[16]:
A B C D
2013-01-01 0.618253 0.502765 -0.067995 -0.066270
2013-01-02 -0.995121 0.919320 -1.221516 -1.016053
2013-01-03 1.379699 -0.936756 -1.276197 -0.020153

Selection by Label


In [17]:
df.loc[dates[0]]


Out[17]:
A    0.618253
B    0.502765
C   -0.067995
D   -0.066270
Name: 2013-01-01 00:00:00, dtype: float64

In [18]:
df.loc[:, ['A', 'B']]


Out[18]:
A B
2013-01-01 0.618253 0.502765
2013-01-02 -0.995121 0.919320
2013-01-03 1.379699 -0.936756
2013-01-04 0.007697 -0.088263
2013-01-05 -0.046063 -0.592812
2013-01-06 -0.033579 -0.937799

In [19]:
df.loc['20130102': '20130104', ['A', 'B']]


Out[19]:
A B
2013-01-02 -0.995121 0.919320
2013-01-03 1.379699 -0.936756
2013-01-04 0.007697 -0.088263

In [20]:
df.loc['20130102', ['A', 'B']]


Out[20]:
A   -0.995121
B    0.919320
Name: 2013-01-02 00:00:00, dtype: float64

In [21]:
df.loc[dates[0], 'A']


Out[21]:
0.61825274455811385

In [22]:
df.at[dates[0], 'A']


Out[22]:
0.61825274455811385

Selection by Position


In [23]:
df.iloc[3]


Out[23]:
A    0.007697
B   -0.088263
C   -0.987374
D   -0.188286
Name: 2013-01-04 00:00:00, dtype: float64

In [24]:
df.iloc[3:5, 0:2]


Out[24]:
A B
2013-01-04 0.007697 -0.088263
2013-01-05 -0.046063 -0.592812

In [25]:
df.iloc[[1, 2, 4], [0, 2]]


Out[25]:
A C
2013-01-02 -0.995121 -1.221516
2013-01-03 1.379699 -1.276197
2013-01-05 -0.046063 -0.544709

In [26]:
df.iloc[1:3, :]


Out[26]:
A B C D
2013-01-02 -0.995121 0.919320 -1.221516 -1.016053
2013-01-03 1.379699 -0.936756 -1.276197 -0.020153

In [27]:
df.iloc[:, 1:3]


Out[27]:
B C
2013-01-01 0.502765 -0.067995
2013-01-02 0.919320 -1.221516
2013-01-03 -0.936756 -1.276197
2013-01-04 -0.088263 -0.987374
2013-01-05 -0.592812 -0.544709
2013-01-06 -0.937799 -2.322357

In [28]:
df.iloc[1, 1]


Out[28]:
0.91932030913998264

In [29]:
df.iat[1, 1]


Out[29]:
0.91932030913998264

Boolean Indexing


In [30]:
df[df.A > 0]


Out[30]:
A B C D
2013-01-01 0.618253 0.502765 -0.067995 -0.066270
2013-01-03 1.379699 -0.936756 -1.276197 -0.020153
2013-01-04 0.007697 -0.088263 -0.987374 -0.188286

In [31]:
df[df > 0]


Out[31]:
A B C D
2013-01-01 0.618253 0.502765 NaN NaN
2013-01-02 NaN 0.919320 NaN NaN
2013-01-03 1.379699 NaN NaN NaN
2013-01-04 0.007697 NaN NaN NaN
2013-01-05 NaN NaN NaN NaN
2013-01-06 NaN NaN NaN 0.766281

In [32]:
df2 = df.copy()

df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']

df2


Out[32]:
A B C D E
2013-01-01 0.618253 0.502765 -0.067995 -0.066270 one
2013-01-02 -0.995121 0.919320 -1.221516 -1.016053 one
2013-01-03 1.379699 -0.936756 -1.276197 -0.020153 two
2013-01-04 0.007697 -0.088263 -0.987374 -0.188286 three
2013-01-05 -0.046063 -0.592812 -0.544709 -0.744876 four
2013-01-06 -0.033579 -0.937799 -2.322357 0.766281 three

In [33]:
df2[df2['E'].isin(['two', 'four'])]


Out[33]:
A B C D E
2013-01-03 1.379699 -0.936756 -1.276197 -0.020153 two
2013-01-05 -0.046063 -0.592812 -0.544709 -0.744876 four

Setting

Missing Data


In [34]:
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])

df1.loc[dates[0]: dates[1], 'E'] = 1

df1


Out[34]:
A B C D E
2013-01-01 0.618253 0.502765 -0.067995 -0.066270 1.0
2013-01-02 -0.995121 0.919320 -1.221516 -1.016053 1.0
2013-01-03 1.379699 -0.936756 -1.276197 -0.020153 NaN
2013-01-04 0.007697 -0.088263 -0.987374 -0.188286 NaN

In [44]:
df1.dropna(how='any')


Out[44]:
A B C D E
2013-01-01 0.618253 0.502765 -0.067995 -0.066270 1.0
2013-01-02 -0.995121 0.919320 -1.221516 -1.016053 1.0

In [45]:
df1.fillna(value=5)


Out[45]:
A B C D E
2013-01-01 0.618253 0.502765 -0.067995 -0.066270 1.0
2013-01-02 -0.995121 0.919320 -1.221516 -1.016053 1.0
2013-01-03 1.379699 -0.936756 -1.276197 -0.020153 5.0
2013-01-04 0.007697 -0.088263 -0.987374 -0.188286 5.0

In [46]:
pd.isnull(df1)


Out[46]:
A B C D E
2013-01-01 False False False False False
2013-01-02 False False False False False
2013-01-03 False False False False True
2013-01-04 False False False False True

Operations

Stats

(Reviewer note: the outputs shown below for In [47], In [48], In [50] and In [51] have 1000+ rows indexed from 2000-01-01 — they were captured after `df` had been reassigned by a later 1000-row example and the cells re-run out of order (the execution counts are also non-sequential). Under Restart & Run All, these cells would instead show results for the 6-row `df` defined above.)


In [47]:
df.mean()


Out[47]:
A    -2.646609
B     8.000440
C    14.582645
D    15.084920
dtype: float64

In [48]:
# Name the axis explicitly: axis=1 averages across columns for each row.
# (Positional arguments to reduction methods are deprecated in modern pandas.)
df.mean(axis=1)


Out[48]:
2000-01-01     0.005063
2000-01-02    -0.043278
2000-01-03     0.056917
2000-01-04     0.299292
2000-01-05     0.584717
2000-01-06    -0.060725
2000-01-07    -0.276199
2000-01-08    -0.134982
2000-01-09     0.157134
2000-01-10    -0.550964
2000-01-11    -0.039723
2000-01-12    -0.083385
2000-01-13    -0.785855
2000-01-14    -0.649820
2000-01-15    -0.844099
2000-01-16    -1.414139
2000-01-17    -0.704719
2000-01-18    -0.672511
2000-01-19     0.218788
2000-01-20     1.159569
2000-01-21     1.100745
2000-01-22     1.213534
2000-01-23     1.410487
2000-01-24     2.531551
2000-01-25     2.960139
2000-01-26     3.262218
2000-01-27     3.322619
2000-01-28     3.291990
2000-01-29     3.562352
2000-01-30     3.752605
                ...    
2002-08-28    28.105632
2002-08-29    28.663800
2002-08-30    29.268673
2002-08-31    29.475474
2002-09-01    30.079525
2002-09-02    30.271997
2002-09-03    30.510502
2002-09-04    30.214186
2002-09-05    30.829085
2002-09-06    30.984539
2002-09-07    31.545869
2002-09-08    30.869414
2002-09-09    30.678260
2002-09-10    31.002729
2002-09-11    31.528121
2002-09-12    31.617298
2002-09-13    32.468719
2002-09-14    31.402521
2002-09-15    32.388622
2002-09-16    32.634057
2002-09-17    32.790612
2002-09-18    33.104956
2002-09-19    32.698720
2002-09-20    32.362241
2002-09-21    32.570436
2002-09-22    32.169981
2002-09-23    32.528613
2002-09-24    31.987734
2002-09-25    32.282687
2002-09-26    31.805595
Freq: D, Length: 1000, dtype: float64

In [49]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)

s


Out[49]:
2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [50]:
df.sub(s, axis='index')


Out[50]:
A B C D
2000-01-01 NaN NaN NaN NaN
2000-01-02 NaN NaN NaN NaN
2000-01-03 NaN NaN NaN NaN
2000-01-04 NaN NaN NaN NaN
2000-01-05 NaN NaN NaN NaN
2000-01-06 NaN NaN NaN NaN
2000-01-07 NaN NaN NaN NaN
2000-01-08 NaN NaN NaN NaN
2000-01-09 NaN NaN NaN NaN
2000-01-10 NaN NaN NaN NaN
2000-01-11 NaN NaN NaN NaN
2000-01-12 NaN NaN NaN NaN
2000-01-13 NaN NaN NaN NaN
2000-01-14 NaN NaN NaN NaN
2000-01-15 NaN NaN NaN NaN
2000-01-16 NaN NaN NaN NaN
2000-01-17 NaN NaN NaN NaN
2000-01-18 NaN NaN NaN NaN
2000-01-19 NaN NaN NaN NaN
2000-01-20 NaN NaN NaN NaN
2000-01-21 NaN NaN NaN NaN
2000-01-22 NaN NaN NaN NaN
2000-01-23 NaN NaN NaN NaN
2000-01-24 NaN NaN NaN NaN
2000-01-25 NaN NaN NaN NaN
2000-01-26 NaN NaN NaN NaN
2000-01-27 NaN NaN NaN NaN
2000-01-28 NaN NaN NaN NaN
2000-01-29 NaN NaN NaN NaN
2000-01-30 NaN NaN NaN NaN
... ... ... ... ...
2002-09-03 NaN NaN NaN NaN
2002-09-04 NaN NaN NaN NaN
2002-09-05 NaN NaN NaN NaN
2002-09-06 NaN NaN NaN NaN
2002-09-07 NaN NaN NaN NaN
2002-09-08 NaN NaN NaN NaN
2002-09-09 NaN NaN NaN NaN
2002-09-10 NaN NaN NaN NaN
2002-09-11 NaN NaN NaN NaN
2002-09-12 NaN NaN NaN NaN
2002-09-13 NaN NaN NaN NaN
2002-09-14 NaN NaN NaN NaN
2002-09-15 NaN NaN NaN NaN
2002-09-16 NaN NaN NaN NaN
2002-09-17 NaN NaN NaN NaN
2002-09-18 NaN NaN NaN NaN
2002-09-19 NaN NaN NaN NaN
2002-09-20 NaN NaN NaN NaN
2002-09-21 NaN NaN NaN NaN
2002-09-22 NaN NaN NaN NaN
2002-09-23 NaN NaN NaN NaN
2002-09-24 NaN NaN NaN NaN
2002-09-25 NaN NaN NaN NaN
2002-09-26 NaN NaN NaN NaN
2013-01-01 NaN NaN NaN NaN
2013-01-02 NaN NaN NaN NaN
2013-01-03 NaN NaN NaN NaN
2013-01-04 NaN NaN NaN NaN
2013-01-05 NaN NaN NaN NaN
2013-01-06 NaN NaN NaN NaN

1006 rows × 4 columns

Apply


In [51]:
df.apply(np.cumsum)


Out[51]:
A B C D
2000-01-01 -0.832751 0.868059 -0.301560 0.286505
2000-01-02 -0.901755 1.216812 -1.244315 0.776400
2000-01-03 0.092781 1.200232 -2.613368 1.395166
2000-01-04 1.668834 1.266200 -3.896847 2.233795
2000-01-05 3.619437 1.374661 -4.296806 2.913557
2000-01-06 4.394162 1.436736 -4.906988 2.444037
2000-01-07 5.469487 0.550584 -5.727608 1.970690
2000-01-08 7.317802 -1.023033 -8.385486 3.813942
2000-01-09 9.121496 -2.423652 -11.713516 7.367431
2000-01-10 11.493081 -3.722708 -16.267937 8.645467
2000-01-11 14.604035 -5.398061 -19.387465 10.170504
2000-01-12 17.574793 -8.060002 -22.297502 12.438186
2000-01-13 20.176255 -12.464648 -25.771082 14.571529
2000-01-14 23.131906 -17.103087 -28.595645 16.479600
2000-01-15 28.084441 -23.383633 -31.043158 16.878727
2000-01-16 32.723050 -30.699453 -34.409929 17.266155
2000-01-17 37.504343 -37.520318 -37.499614 19.576536
2000-01-18 44.323879 -44.907395 -42.307611 22.262028
2000-01-19 50.712152 -49.418754 -45.664866 24.617520
2000-01-20 56.048450 -50.806740 -49.057163 28.699780
2000-01-21 61.004151 -52.114429 -52.365757 32.763343
2000-01-22 65.634774 -53.853400 -53.956996 36.317064
2000-01-23 70.765192 -55.611716 -56.252967 40.882884
2000-01-24 77.680972 -57.885157 -56.375395 46.489176
2000-01-25 85.629536 -59.615416 -56.098205 51.834236
2000-01-26 93.993709 -60.217129 -56.045743 57.068186
2000-01-27 102.601749 -61.489320 -55.264530 62.241602
2000-01-28 110.699293 -62.810002 -53.891056 67.259228
2000-01-29 120.125364 -64.286849 -53.954188 73.622547
2000-01-30 130.346283 -65.995252 -53.777664 79.943926
... ... ... ... ...
2002-08-28 -2750.014748 6798.776003 13513.498671 13816.196392
2002-08-29 -2744.906713 6835.660705 13548.653659 13853.703868
2002-08-30 -2738.864155 6873.798210 13582.436657 13892.815497
2002-08-31 -2732.605255 6912.349135 13616.731759 13931.612466
2002-09-01 -2724.937717 6951.841135 13650.682327 13970.820458
2002-09-02 -2717.817443 6992.564634 13685.044583 14009.702418
2002-09-03 -2710.960580 7033.140516 13721.198635 14048.157627
2002-09-04 -2705.334518 7072.701812 13756.643657 14088.381993
2002-09-05 -2699.124539 7113.062310 13792.695599 14129.075913
2002-09-06 -2693.809292 7154.297129 13829.058597 14170.101004
2002-09-07 -2689.874133 7196.745975 13866.976960 14211.982110
2002-09-08 -2687.266328 7237.315569 13904.364187 14254.895140
2002-09-09 -2685.041762 7278.458676 13941.405859 14297.198836
2002-09-10 -2682.509622 7319.975541 13977.714929 14340.851677
2002-09-11 -2679.241926 7363.194476 14014.037104 14384.155356
2002-09-12 -2676.968340 7407.446864 14050.158886 14427.976793
2002-09-13 -2673.983015 7452.253796 14086.495029 14473.723269
2002-09-14 -2671.661746 7495.044147 14121.903244 14518.813517
2002-09-15 -2668.278537 7537.264505 14159.531264 14565.136417
2002-09-16 -2665.246356 7580.466614 14197.782158 14611.187461
2002-09-17 -2661.610429 7623.238284 14236.280176 14657.444292
2002-09-18 -2658.514934 7665.214743 14276.732242 14704.340099
2002-09-19 -2656.062347 7705.757092 14315.966111 14752.906172
2002-09-20 -2654.625917 7748.167631 14354.350355 14800.123922
2002-09-21 -2653.104650 7789.579894 14394.023088 14847.799403
2002-09-22 -2652.577133 7831.742507 14432.316971 14895.495314
2002-09-23 -2651.147482 7874.825718 14469.990931 14943.422943
2002-09-24 -2650.277139 7917.546875 14507.176645 14990.596667
2002-09-25 -2648.458831 7960.045668 14545.104888 15037.482067
2002-09-26 -2646.608734 8000.439699 14582.645073 15084.920137

1000 rows × 4 columns


In [52]:
df.apply(lambda x: x.max() - x.min())


Out[52]:
A    28.918161
B    58.627017
C    57.080590
D    49.180040
dtype: float64

Histogramming


In [53]:
# Ten random draws from {0, ..., 6} (high bound is exclusive).
s = pd.Series(np.random.randint(low=0, high=7, size=10))

s


Out[53]:
0    4
1    5
2    4
3    3
4    1
5    0
6    4
7    5
8    1
9    6
dtype: int64

In [54]:
s.value_counts()


Out[54]:
4    3
5    2
1    2
6    1
3    1
0    1
dtype: int64

String Methods


In [55]:
# Series.str methods skip missing values, so the NaN passes through unchanged.
words = ['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']
s = pd.Series(words)

s.str.lower()


Out[55]:
0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

Merge

Concat


In [56]:
df = pd.DataFrame(np.random.randn(10, 4))

df


Out[56]:
0 1 2 3
0 0.321902 1.205070 0.330259 0.693629
1 0.597168 0.261625 1.299816 0.657233
2 0.633787 2.421194 0.397054 -0.137348
3 -0.398237 1.582425 -1.448677 -1.714734
4 -0.183996 1.236898 -0.153148 -0.172407
5 1.392010 -1.034959 0.346614 -0.370872
6 -0.292955 -1.016325 -0.215982 -0.576879
7 -0.408416 0.048120 0.231508 0.110683
8 -0.893417 1.454152 -0.874285 -0.681164
9 0.533998 1.208251 -1.159619 0.159926

In [58]:
pieces = [df[:3], df[3:7], df[7:]]

pd.concat(pieces)


Out[58]:
0 1 2 3
0 0.321902 1.205070 0.330259 0.693629
1 0.597168 0.261625 1.299816 0.657233
2 0.633787 2.421194 0.397054 -0.137348
3 -0.398237 1.582425 -1.448677 -1.714734
4 -0.183996 1.236898 -0.153148 -0.172407
5 1.392010 -1.034959 0.346614 -0.370872
6 -0.292955 -1.016325 -0.215982 -0.576879
7 -0.408416 0.048120 0.231508 0.110683
8 -0.893417 1.454152 -0.874285 -0.681164
9 0.533998 1.208251 -1.159619 0.159926

Join


In [59]:
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})

right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})

left


Out[59]:
key lval
0 foo 1
1 foo 2

In [60]:
right


Out[60]:
key rval
0 foo 4
1 foo 5

In [61]:
pd.merge(left, right, on='key')


Out[61]:
key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5

In [62]:
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})

right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})

left


Out[62]:
key lval
0 foo 1
1 bar 2

In [63]:
right


Out[63]:
key rval
0 foo 4
1 bar 5

In [64]:
pd.merge(left, right, on='key')


Out[64]:
key lval rval
0 foo 1 4
1 bar 2 5

Append


In [65]:
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])

df


Out[65]:
A B C D
0 -0.487518 1.274625 -0.485657 -0.016763
1 -0.441470 -2.051882 -0.632473 -1.595755
2 -0.029285 0.353873 -0.376998 0.842352
3 0.027088 0.507215 -1.145043 -0.639576
4 0.948615 0.321521 -0.306540 -0.837198
5 -0.523604 -0.135446 1.478583 -0.150725
6 2.326930 -1.069170 1.302856 2.774401
7 0.492611 0.554848 0.013007 0.100322

In [66]:
# Take the fourth row as a Series to re-append at the bottom.
s = df.iloc[3]

# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to append a row. Converting the Series to a
# one-row frame (to_frame().T) and concatenating with ignore_index=True
# reproduces append(s, ignore_index=True) exactly.
pd.concat([df, s.to_frame().T], ignore_index=True)


Out[66]:
A B C D
0 -0.487518 1.274625 -0.485657 -0.016763
1 -0.441470 -2.051882 -0.632473 -1.595755
2 -0.029285 0.353873 -0.376998 0.842352
3 0.027088 0.507215 -1.145043 -0.639576
4 0.948615 0.321521 -0.306540 -0.837198
5 -0.523604 -0.135446 1.478583 -0.150725
6 2.326930 -1.069170 1.302856 2.774401
7 0.492611 0.554848 0.013007 0.100322
8 0.027088 0.507215 -1.145043 -0.639576

Grouping


In [67]:
# Two key columns (A, B) plus two numeric columns to aggregate.
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})

df


Out[67]:
A B C D
0 foo one -0.651992 0.106746
1 bar one -0.552285 -0.392568
2 foo two -1.254844 0.000028
3 bar three -1.115552 0.566673
4 foo two -0.746000 -0.923376
5 bar two -0.502222 1.426331
6 foo one 0.420041 0.866670
7 foo three 0.675060 -0.423116

In [68]:
df.groupby('A').sum()


Out[68]:
C D
A
bar -2.170058 1.600436
foo -1.557735 -0.373048

In [69]:
df.groupby(['A', 'B']).sum()


Out[69]:
C D
A B
bar one -0.552285 -0.392568
three -1.115552 0.566673
two -0.502222 1.426331
foo one -0.231951 0.973416
three 0.675060 -0.423116
two -2.000844 -0.923348

Reshaping

Stack


In [70]:
# Pair up the two level lists to build (first, second) label tuples.
firsts = ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']
seconds = ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']
tuples = list(zip(firsts, seconds))

index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])

# Keep the first four rows for the stack/unstack demonstration.
df2 = df[:4]

df2


Out[70]:
A B
first second
bar one 0.944384 -1.027474
two -1.285602 -1.459610
baz one 0.959814 -2.647089
two -2.352397 0.297181

In [71]:
stacked = df2.stack()

stacked


Out[71]:
first  second   
bar    one     A    0.944384
               B   -1.027474
       two     A   -1.285602
               B   -1.459610
baz    one     A    0.959814
               B   -2.647089
       two     A   -2.352397
               B    0.297181
dtype: float64

In [72]:
stacked.unstack()


Out[72]:
A B
first second
bar one 0.944384 -1.027474
two -1.285602 -1.459610
baz one 0.959814 -2.647089
two -2.352397 0.297181

In [73]:
stacked.unstack(1)


Out[73]:
second one two
first
bar A 0.944384 -1.285602
B -1.027474 -1.459610
baz A 0.959814 -2.352397
B -2.647089 0.297181

In [74]:
stacked.unstack(0)


Out[74]:
first bar baz
second
one A 0.944384 0.959814
B -1.027474 -2.647089
two A -1.285602 -2.352397
B -1.459610 0.297181

Pivot Tables


In [76]:
df = pd.DataFrame({ 'A': ['one', 'one', 'two', 'three'] * 3,
                    'B': ['A', 'B', 'C'] * 4,
                    'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                    'D' : np.random.randn(12),
                    'E' : np.random.randn(12) })

In [77]:
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])


Out[77]:
C bar foo
A B
one A 0.475295 -1.371719
B 0.248364 0.556322
C -0.536821 0.310601
three A -0.503781 NaN
B NaN 0.338301
C -0.536639 NaN
two A NaN 0.186360
B 1.284389 NaN
C NaN -1.132025

Time Series


In [ ]:
# 100 timestamps spaced one second apart starting 2012-01-01.
# NOTE(review): pandas >= 2.2 deprecates the uppercase 'S' alias in favor of
# 's' — confirm the target pandas version before changing.
rng = pd.date_range('1/1/2012', periods=100, freq='S')

Categoricals


In [35]:
df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})

In [36]:
df["grade"] = df["raw_grade"].astype("category")

df["grade"]


Out[36]:
0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]

In [37]:
df["grade"].cat.categories = ["very good", "good", "very bad"]

In [38]:
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium",
                                              "good", "very good"])

df["grade"]


Out[38]:
0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]

In [39]:
df.sort_values(by="grade")


Out[39]:
id raw_grade grade
5 6 e very bad
1 2 b good
2 3 b good
0 1 a very good
3 4 a very good
4 5 a very good

In [40]:
df.groupby("grade").size()


Out[40]:
grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64

Plotting


In [41]:
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))

ts = ts.cumsum()

ts.plot()


Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fba3f7f6240>

In [42]:
# Four random walks sharing the time index from ts above.
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index, columns=['A', 'B', 'C', 'D'])

df = df.cumsum()

# df.plot() creates its own figure, so a bare plt.figure() beforehand would
# leave an empty extra figure behind. Plot and attach the legend to the
# returned Axes; the trailing ';' suppresses the Legend repr in the output.
ax = df.plot()
ax.legend(loc='best');


Out[42]:
<matplotlib.legend.Legend at 0x7fba3d767898>

Getting Data In/Out

CSV

HDF5

Excel

Gotchas


In [43]:
# Intentional demonstration: truth-testing a Series raises ValueError because
# its boolean value is ambiguous — use .empty, .bool(), .item(), .any() or
# .all() instead. The traceback below is the expected output of this cell.
if pd.Series([False, True, False]):
    print("I was true")


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-43-5c782b38cd2f> in <module>()
----> 1 if pd.Series([False, True, False]):
      2     print("I was true")

/usr/local/lib/python3.5/dist-packages/pandas/core/generic.py in __nonzero__(self)
    953         raise ValueError("The truth value of a {0} is ambiguous. "
    954                          "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
--> 955                          .format(self.__class__.__name__))
    956 
    957     __bool__ = __nonzero__

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().