You are currently looking at version 1.0 of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the Jupyter Notebook FAQ course resource.


The Series Data Structure


In [1]:
import pandas as pd
pd.Series?


Vendor:  Continuum Analytics, Inc.
Package: mkl
Message: trial mode expires in 30 days

In [2]:
animals = ['Tiger', 'Bear', 'Moose']
pd.Series(animals)


Out[2]:
0    Tiger
1     Bear
2    Moose
dtype: object

In [3]:
numbers = [1, 2, 3]
pd.Series(numbers)


Out[3]:
0    1
1    2
2    3
dtype: int64

In [4]:
animals = ['Tiger', 'Bear', None]
pd.Series(animals)


Out[4]:
0    Tiger
1     Bear
2     None
dtype: object

In [5]:
numbers = [1, 2, None]
pd.Series(numbers)


Out[5]:
0     1
1     2
2   NaN
dtype: float64

In [6]:
import numpy as np
np.nan == None


Out[6]:
False

In [7]:
np.nan == np.nan


Out[7]:
False

In [8]:
np.isnan(np.nan)


Out[8]:
True

In [9]:
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s


Out[9]:
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [10]:
s.index


Out[10]:
Index([u'Archery', u'Golf', u'Sumo', u'Taekwondo'], dtype='object')

In [11]:
s = pd.Series(['Tiger', 'Bear', 'Moose'], index=['India', 'America', 'Canada'])
s


Out[11]:
India      Tiger
America     Bear
Canada     Moose
dtype: object

In [12]:
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports, index=['Golf', 'Sumo', 'Hockey'])
s


Out[12]:
Golf      Scotland
Sumo         Japan
Hockey         NaN
dtype: object

Querying a Series


In [13]:
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s


Out[13]:
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [14]:
s.iloc[3]


Out[14]:
'South Korea'

In [15]:
s.loc['Golf']


Out[15]:
'Scotland'

In [16]:
s[3]


Out[16]:
'South Korea'

In [17]:
s['Golf']


Out[17]:
'Scotland'

In [18]:
sports = {99: 'Bhutan',
          100: 'Scotland',
          101: 'Japan',
          102: 'South Korea'}
s = pd.Series(sports)

In [19]:
s[0] #This won't call s.iloc[0] as one might expect, it generates an error instead


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-19-a5f43d492595> in <module>()
----> 1 s[0] #This won't call s.iloc[0] as one might expect, it generates an error instead

/Users/fuyangliu/anaconda/lib/python2.7/site-packages/pandas/core/series.pyc in __getitem__(self, key)
    555     def __getitem__(self, key):
    556         try:
--> 557             result = self.index.get_value(self, key)
    558 
    559             if not np.isscalar(result):

/Users/fuyangliu/anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in get_value(self, series, key)
   1788 
   1789         try:
-> 1790             return self._engine.get_value(s, k)
   1791         except KeyError as e1:
   1792             if len(self) > 0 and self.inferred_type in ['integer','boolean']:

pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3204)()

pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:2903)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3843)()

pandas/hashtable.pyx in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:6525)()

pandas/hashtable.pyx in pandas.hashtable.Int64HashTable.get_item (pandas/hashtable.c:6463)()

KeyError: 0

In [20]:
s.iloc[0]


Out[20]:
'Bhutan'

In [21]:
s = pd.Series([100.00, 120.00, 101.00, 3.00])
s


Out[21]:
0    100
1    120
2    101
3      3
dtype: float64

In [22]:
total = 0
for item in s:
    total+=item
print(total)


324.0

In [23]:
import numpy as np

total = np.sum(s)
print(total)


324.0

In [24]:
#this creates a big series of random numbers
s = pd.Series(np.random.randint(0,1000,10000))
s.head()


Out[24]:
0    858
1    133
2     33
3    866
4    945
dtype: int64

In [25]:
len(s)


Out[25]:
10000

Timing code with timeit


In [26]:
%%timeit -n 100
summary = 0
for item in s:
    summary+=item


100 loops, best of 3: 1.03 ms per loop

In [27]:
%%timeit -n 100
summary = np.sum(s)


100 loops, best of 3: 22.2 µs per loop

In [28]:
s+=2 #adds two to each item in s using broadcasting
s.head()


Out[28]:
0    860
1    135
2     35
3    868
4    947
dtype: int64

In [29]:
for label, value in s.iteritems():
    s.set_value(label, value+2)
s.head()


Out[29]:
0    862
1    137
2     37
3    870
4    949
dtype: int64

In [30]:
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
for label, value in s.iteritems():
    s.loc[label]= value+2


10 loops, best of 3: 559 ms per loop

In [31]:
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
s+=2


10 loops, best of 3: 259 µs per loop

More about .loc[] function

It also allows to manipulate the data or add data


In [32]:
s = pd.Series([1, 2, 3])
s.loc['Animal'] = 'Bears'
s


Out[32]:
0             1
1             2
2             3
Animal    Bears
dtype: object

In [33]:
original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})
cricket_loving_countries = pd.Series(['Australia',
                                      'Barbados',
                                      'Pakistan',
                                      'England'], 
                                   index=['Cricket',
                                          'Cricket',
                                          'Cricket',
                                          'Cricket'])
all_countries = original_sports.append(cricket_loving_countries)

In [34]:
original_sports


Out[34]:
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [35]:
cricket_loving_countries


Out[35]:
Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object

In [36]:
all_countries


Out[36]:
Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
Cricket        Australia
Cricket         Barbados
Cricket         Pakistan
Cricket          England
dtype: object

In [37]:
all_countries.loc['Cricket']


Out[37]:
Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object

The DataFrame Data Structure


In [38]:
import pandas as pd
purchase_1 = pd.Series({'Name': 'Chris',
                        'Item Purchased': 'Dog Food',
                        'Cost': 22.50})
purchase_2 = pd.Series({'Name': 'Kevyn',
                        'Item Purchased': 'Kitty Litter',
                        'Cost': 2.50})
purchase_3 = pd.Series({'Name': 'Vinod',
                        'Item Purchased': 'Bird Seed',
                        'Cost': 5.00})
df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=['Store 1', 'Store 1', 'Store 2'])
df.head()


Out[38]:
Cost Item Purchased Name
Store 1 22.5 Dog Food Chris
Store 1 2.5 Kitty Litter Kevyn
Store 2 5.0 Bird Seed Vinod

In [39]:
df.loc['Store 2']


Out[39]:
Cost                      5
Item Purchased    Bird Seed
Name                  Vinod
Name: Store 2, dtype: object

In [40]:
type(df.loc['Store 2'])


Out[40]:
pandas.core.series.Series

In [41]:
df.loc['Store 1']


Out[41]:
Cost Item Purchased Name
Store 1 22.5 Dog Food Chris
Store 1 2.5 Kitty Litter Kevyn

In [42]:
df.loc['Store 1', 'Cost']


Out[42]:
Store 1    22.5
Store 1     2.5
Name: Cost, dtype: float64

In [43]:
df.T


Out[43]:
Store 1 Store 1 Store 2
Cost 22.5 2.5 5
Item Purchased Dog Food Kitty Litter Bird Seed
Name Chris Kevyn Vinod

In [47]:
df.T.loc['Cost']


Out[47]:
Store 1    22.5
Store 1     2.5
Store 2       5
Name: Cost, dtype: object

In [45]:
df['Cost'] # the right and simple way to select a column


Out[45]:
Store 1    22.5
Store 1     2.5
Store 2     5.0
Name: Cost, dtype: float64

In [50]:
df.loc['Store 1']['Cost'] # Chaining will return copy!! Do not use chaining!!! Use .loc to slice (see below)


Out[50]:
Store 1    22.5
Store 1     2.5
Name: Cost, dtype: float64

In [49]:
df.loc[:,['Name', 'Cost']]


Out[49]:
Name Cost
Store 1 Chris 22.5
Store 1 Kevyn 2.5
Store 2 Vinod 5.0

About dropping/deleting data


In [51]:
df.drop('Store 1') # returns a copy


Out[51]:
Cost Item Purchased Name
Store 2 5 Bird Seed Vinod

In [52]:
df


Out[52]:
Cost Item Purchased Name
Store 1 22.5 Dog Food Chris
Store 1 2.5 Kitty Litter Kevyn
Store 2 5.0 Bird Seed Vinod

In [58]:
copy_df = df.copy()
copy_df1 = copy_df.drop('Store 1', inplace=True)

copy_df1

In [59]:
copy_df


Out[59]:
Cost Item Purchased Name
Store 2 5 Bird Seed Vinod

In [61]:
df


Out[61]:
Cost Item Purchased Name
Store 1 22.5 Dog Food Chris
Store 1 2.5 Kitty Litter Kevyn
Store 2 5.0 Bird Seed Vinod

In [60]:
copy_df.drop?

In [62]:
del copy_df['Name']
copy_df


Out[62]:
Cost Item Purchased
Store 2 5 Bird Seed

In [63]:
df['Location'] = None
df


Out[63]:
Cost Item Purchased Name Location
Store 1 22.5 Dog Food Chris None
Store 1 2.5 Kitty Litter Kevyn None
Store 2 5.0 Bird Seed Vinod None

Dataframe Indexing and Loading


In [65]:
costs = df['Cost']
costs


Out[65]:
Store 1    22.5
Store 1     2.5
Store 2     5.0
Name: Cost, dtype: float64

In [66]:
costs+=2
costs


Out[66]:
Store 1    24.5
Store 1     4.5
Store 2     7.0
Name: Cost, dtype: float64

In [67]:
df


Out[67]:
Cost Item Purchased Name Location
Store 1 24.5 Dog Food Chris None
Store 1 4.5 Kitty Litter Kevyn None
Store 2 7.0 Bird Seed Vinod None

In [68]:
!cat olympics.csv


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
,№ Summer,01 !,02 !,03 !,Total,№ Winter,01 !,02 !,03 !,Total,№ Games,01 !,02 !,03 !,Combined total
Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12
Australasia (ANZ) [ANZ],2,3,4,5,12,0,0,0,0,0,2,3,4,5,12
Australia (AUS) [AUS] [Z],25,139,152,177,468,18,5,3,4,12,43,144,155,181,480
Austria (AUT),26,18,33,35,86,22,59,78,81,218,48,77,111,116,304
Azerbaijan (AZE),5,6,5,15,26,5,0,0,0,0,10,6,5,15,26
Bahamas (BAH),15,5,2,5,12,0,0,0,0,0,15,5,2,5,12
Bahrain (BRN),8,0,0,1,1,0,0,0,0,0,8,0,0,1,1
Barbados (BAR) [BAR],11,0,0,1,1,0,0,0,0,0,11,0,0,1,1
Belarus (BLR),5,12,24,39,75,6,6,4,5,15,11,18,28,44,90
Belgium (BEL),25,37,52,53,142,20,1,1,3,5,45,38,53,56,147
Bermuda (BER),17,0,0,1,1,7,0,0,0,0,24,0,0,1,1
Bohemia (BOH) [BOH] [Z],3,0,1,3,4,0,0,0,0,0,3,0,1,3,4
Botswana (BOT),9,0,1,0,1,0,0,0,0,0,9,0,1,0,1
Brazil (BRA),21,23,30,55,108,7,0,0,0,0,28,23,30,55,108
British West Indies (BWI) [BWI],1,0,0,2,2,0,0,0,0,0,1,0,0,2,2
Bulgaria (BUL) [H],19,51,85,78,214,19,1,2,3,6,38,52,87,81,220
Burundi (BDI),5,1,0,0,1,0,0,0,0,0,5,1,0,0,1
Cameroon (CMR),13,3,1,1,5,1,0,0,0,0,14,3,1,1,5
Canada (CAN),25,59,99,121,279,22,62,56,52,170,47,121,155,173,449
Chile (CHI) [I],22,2,7,4,13,16,0,0,0,0,38,2,7,4,13
China (CHN) [CHN],9,201,146,126,473,10,12,22,19,53,19,213,168,145,526
Colombia (COL),18,2,6,11,19,1,0,0,0,0,19,2,6,11,19
Costa Rica (CRC),14,1,1,2,4,6,0,0,0,0,20,1,1,2,4
Ivory Coast (CIV) [CIV],12,0,1,0,1,0,0,0,0,0,12,0,1,0,1
Croatia (CRO),6,6,7,10,23,7,4,6,1,11,13,10,13,11,34
Cuba (CUB) [Z],19,72,67,70,209,0,0,0,0,0,19,72,67,70,209
Cyprus (CYP),9,0,1,0,1,10,0,0,0,0,19,0,1,0,1
Czech Republic (CZE) [CZE],5,14,15,15,44,6,7,9,8,24,11,21,24,23,68
Czechoslovakia (TCH) [TCH],16,49,49,45,143,16,2,8,15,25,32,51,57,60,168
Denmark (DEN) [Z],26,43,68,68,179,13,0,1,0,1,39,43,69,68,180
Djibouti (DJI) [B],7,0,0,1,1,0,0,0,0,0,7,0,0,1,1
Dominican Republic (DOM),13,3,2,1,6,0,0,0,0,0,13,3,2,1,6
Ecuador (ECU),13,1,1,0,2,0,0,0,0,0,13,1,1,0,2
Egypt (EGY) [EGY] [Z],21,7,9,10,26,1,0,0,0,0,22,7,9,10,26
Eritrea (ERI),4,0,0,1,1,0,0,0,0,0,4,0,0,1,1
Estonia (EST),11,9,9,15,33,9,4,2,1,7,20,13,11,16,40
Ethiopia (ETH),12,21,7,17,45,2,0,0,0,0,14,21,7,17,45
Finland (FIN),24,101,84,117,302,22,42,62,57,161,46,143,146,174,463
France (FRA) [O] [P] [Z],27,202,223,246,671,22,31,31,47,109,49,233,254,293,780
Gabon (GAB),9,0,1,0,1,0,0,0,0,0,9,0,1,0,1
Georgia (GEO),5,6,5,14,25,6,0,0,0,0,11,6,5,14,25
Germany (GER) [GER] [Z],15,174,182,217,573,11,78,78,53,209,26,252,260,270,782
United Team of Germany (EUA) [EUA],3,28,54,36,118,3,8,6,5,19,6,36,60,41,137
East Germany (GDR) [GDR],5,153,129,127,409,6,39,36,35,110,11,192,165,162,519
West Germany (FRG) [FRG],5,56,67,81,204,6,11,15,13,39,11,67,82,94,243
Ghana (GHA) [GHA],13,0,1,3,4,1,0,0,0,0,14,0,1,3,4
Great Britain (GBR) [GBR] [Z],27,236,272,272,780,22,10,4,12,26,49,246,276,284,806
Greece (GRE) [Z],27,30,42,39,111,18,0,0,0,0,45,30,42,39,111
Grenada (GRN),8,1,0,0,1,0,0,0,0,0,8,1,0,0,1
Guatemala (GUA),13,0,1,0,1,1,0,0,0,0,14,0,1,0,1
Guyana (GUY) [GUY],16,0,0,1,1,0,0,0,0,0,16,0,0,1,1
Haiti (HAI) [J],14,0,1,1,2,0,0,0,0,0,14,0,1,1,2
Hong Kong (HKG) [HKG],15,1,1,1,3,4,0,0,0,0,19,1,1,1,3
Hungary (HUN),25,167,144,165,476,22,0,2,4,6,47,167,146,169,482
Iceland (ISL),19,0,2,2,4,17,0,0,0,0,36,0,2,2,4
India (IND) [F],23,9,6,11,26,9,0,0,0,0,32,9,6,11,26
Indonesia (INA),14,6,10,11,27,0,0,0,0,0,14,6,10,11,27
Iran (IRI) [K],15,15,20,25,60,10,0,0,0,0,25,15,20,25,60
Iraq (IRQ),13,0,0,1,1,0,0,0,0,0,13,0,0,1,1
Ireland (IRL),20,9,8,12,29,6,0,0,0,0,26,9,8,12,29
Israel (ISR),15,1,1,5,7,6,0,0,0,0,21,1,1,5,7
Italy (ITA) [M] [S],26,198,166,185,549,22,37,34,43,114,48,235,200,228,663
Jamaica (JAM) [JAM],16,17,30,20,67,7,0,0,0,0,23,17,30,20,67
Japan (JPN),21,130,126,142,398,20,10,17,18,45,41,140,143,160,443
Kazakhstan (KAZ),5,16,17,19,52,6,1,3,3,7,11,17,20,22,59
Kenya (KEN),13,25,32,29,86,3,0,0,0,0,16,25,32,29,86
North Korea (PRK),9,14,12,21,47,8,0,1,1,2,17,14,13,22,49
South Korea (KOR),16,81,82,80,243,17,26,17,10,53,33,107,99,90,296
Kuwait (KUW),12,0,0,2,2,0,0,0,0,0,12,0,0,2,2
Kyrgyzstan (KGZ),5,0,1,2,3,6,0,0,0,0,11,0,1,2,3
Latvia (LAT),10,3,11,5,19,10,0,4,3,7,20,3,15,8,26
Lebanon (LIB),16,0,2,2,4,16,0,0,0,0,32,0,2,2,4
Liechtenstein (LIE),16,0,0,0,0,18,2,2,5,9,34,2,2,5,9
Lithuania (LTU),8,6,5,10,21,8,0,0,0,0,16,6,5,10,21
Luxembourg (LUX) [O],22,1,1,0,2,8,0,2,0,2,30,1,3,0,4
Macedonia (MKD),5,0,0,1,1,5,0,0,0,0,10,0,0,1,1
Malaysia (MAS) [MAS],12,0,3,3,6,0,0,0,0,0,12,0,3,3,6
Mauritius (MRI),8,0,0,1,1,0,0,0,0,0,8,0,0,1,1
Mexico (MEX),22,13,21,28,62,8,0,0,0,0,30,13,21,28,62
Moldova (MDA),5,0,2,5,7,6,0,0,0,0,11,0,2,5,7
Mongolia (MGL),12,2,9,13,24,13,0,0,0,0,25,2,9,13,24
Montenegro (MNE),2,0,1,0,1,2,0,0,0,0,4,0,1,0,1
Morocco (MAR),13,6,5,11,22,6,0,0,0,0,19,6,5,11,22
Mozambique (MOZ),9,1,0,1,2,0,0,0,0,0,9,1,0,1,2
Namibia (NAM),6,0,4,0,4,0,0,0,0,0,6,0,4,0,4
Netherlands (NED) [Z],25,77,85,104,266,20,37,38,35,110,45,114,123,139,376
Netherlands Antilles (AHO) [AHO] [I],13,0,1,0,1,2,0,0,0,0,15,0,1,0,1
New Zealand (NZL) [NZL],22,42,18,39,99,15,0,1,0,1,37,42,19,39,100
Niger (NIG),11,0,0,1,1,0,0,0,0,0,11,0,0,1,1
Nigeria (NGR),15,3,8,12,23,0,0,0,0,0,15,3,8,12,23
Norway (NOR) [Q],24,56,49,43,148,22,118,111,100,329,46,174,160,143,477
Pakistan (PAK),16,3,3,4,10,2,0,0,0,0,18,3,3,4,10
Panama (PAN),16,1,0,2,3,0,0,0,0,0,16,1,0,2,3
Paraguay (PAR),11,0,1,0,1,1,0,0,0,0,12,0,1,0,1
Peru (PER) [L],17,1,3,0,4,2,0,0,0,0,19,1,3,0,4
Philippines (PHI),20,0,2,7,9,4,0,0,0,0,24,0,2,7,9
Poland (POL),20,64,82,125,271,22,6,7,7,20,42,70,89,132,291
Portugal (POR),23,4,8,11,23,7,0,0,0,0,30,4,8,11,23
Puerto Rico (PUR),17,0,2,6,8,6,0,0,0,0,23,0,2,6,8
Qatar (QAT),8,0,0,4,4,0,0,0,0,0,8,0,0,4,4
Romania (ROU),20,88,94,119,301,20,0,0,1,1,40,88,94,120,302
Russia (RUS) [RUS],5,132,121,142,395,6,49,40,35,124,11,181,161,177,519
Russian Empire (RU1) [RU1],3,1,4,3,8,0,0,0,0,0,3,1,4,3,8
Soviet Union (URS) [URS],9,395,319,296,1010,9,78,57,59,194,18,473,376,355,1204
Unified Team (EUN) [EUN],1,45,38,29,112,1,9,6,8,23,2,54,44,37,135
Saudi Arabia (KSA),10,0,1,2,3,0,0,0,0,0,10,0,1,2,3
Senegal (SEN),13,0,1,0,1,5,0,0,0,0,18,0,1,0,1
Serbia (SRB) [SRB],3,1,2,4,7,2,0,0,0,0,5,1,2,4,7
Serbia and Montenegro (SCG) [SCG],3,2,4,3,9,3,0,0,0,0,6,2,4,3,9
Singapore (SIN),15,0,2,2,4,0,0,0,0,0,15,0,2,2,4
Slovakia (SVK) [SVK],5,7,9,8,24,6,2,2,1,5,11,9,11,9,29
Slovenia (SLO),6,4,6,9,19,7,2,4,9,15,13,6,10,18,34
South Africa (RSA),18,23,26,27,76,6,0,0,0,0,24,23,26,27,76
Spain (ESP) [Z],22,37,59,35,131,19,1,0,1,2,41,38,59,36,133
Sri Lanka (SRI) [SRI],16,0,2,0,2,0,0,0,0,0,16,0,2,0,2
Sudan (SUD),11,0,1,0,1,0,0,0,0,0,11,0,1,0,1
Suriname (SUR) [E],11,1,0,1,2,0,0,0,0,0,11,1,0,1,2
Sweden (SWE) [Z],26,143,164,176,483,22,50,40,54,144,48,193,204,230,627
Switzerland (SUI),27,47,73,65,185,22,50,40,48,138,49,97,113,113,323
Syria (SYR),12,1,1,1,3,0,0,0,0,0,12,1,1,1,3
Chinese Taipei (TPE) [TPE] [TPE2],13,2,7,12,21,11,0,0,0,0,24,2,7,12,21
Tajikistan (TJK),5,0,1,2,3,4,0,0,0,0,9,0,1,2,3
Tanzania (TAN) [TAN],12,0,2,0,2,0,0,0,0,0,12,0,2,0,2
Thailand (THA),15,7,6,11,24,3,0,0,0,0,18,7,6,11,24
Togo (TOG),9,0,0,1,1,1,0,0,0,0,10,0,0,1,1
Tonga (TGA),8,0,1,0,1,1,0,0,0,0,9,0,1,0,1
Trinidad and Tobago (TRI) [TRI],16,2,5,11,18,3,0,0,0,0,19,2,5,11,18
Tunisia (TUN),13,3,3,4,10,0,0,0,0,0,13,3,3,4,10
Turkey (TUR),21,39,25,24,88,16,0,0,0,0,37,39,25,24,88
Uganda (UGA),14,2,3,2,7,0,0,0,0,0,14,2,3,2,7
Ukraine (UKR),5,33,27,55,115,6,2,1,4,7,11,35,28,59,122
United Arab Emirates (UAE),8,1,0,0,1,0,0,0,0,0,8,1,0,0,1
United States (USA) [P] [Q] [R] [Z],26,976,757,666,2399,22,96,102,84,282,48,1072,859,750,2681
Uruguay (URU),20,2,2,6,10,1,0,0,0,0,21,2,2,6,10
Uzbekistan (UZB),5,5,5,10,20,6,1,0,0,1,11,6,5,10,21
Venezuela (VEN),17,2,2,8,12,4,0,0,0,0,21,2,2,8,12
Vietnam (VIE),14,0,2,0,2,0,0,0,0,0,14,0,2,0,2
Virgin Islands (ISV),11,0,1,0,1,7,0,0,0,0,18,0,1,0,1
Yugoslavia (YUG) [YUG],16,26,29,28,83,14,0,3,1,4,30,26,32,29,87
Independent Olympic Participants (IOP) [IOP],1,0,1,2,3,0,0,0,0,0,1,0,1,2,3
Zambia (ZAM) [ZAM],12,0,1,1,2,0,0,0,0,0,12,0,1,1,2
Zimbabwe (ZIM) [ZIM],12,3,4,1,8,1,0,0,0,0,13,3,4,1,8
Mixed team (ZZX) [ZZX],3,8,5,4,17,0,0,0,0,0,3,8,5,4,17
Totals,27,4809,4775,5130,14714,22,959,958,948,2865,49,5768,5733,6078,17579

In [69]:
df = pd.read_csv('olympics.csv')
df.head()


Out[69]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
0 NaN № Summer 01 ! 02 ! 03 ! Total № Winter 01 ! 02 ! 03 ! Total № Games 01 ! 02 ! 03 ! Combined total
1 Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2
2 Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
3 Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
4 Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12

In [70]:
df = pd.read_csv('olympics.csv', index_col = 0, skiprows=1)
df.head()


Out[70]:
№ Summer 01 ! 02 ! 03 ! Total № Winter 01 !.1 02 !.1 03 !.1 Total.1 № Games 01 !.2 02 !.2 03 !.2 Combined total
Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12

In [71]:
df.columns


Out[71]:
Index([u'№ Summer', u'01 !', u'02 !', u'03 !', u'Total', u'№ Winter',
       u'01 !.1', u'02 !.1', u'03 !.1', u'Total.1', u'№ Games', u'01 !.2',
       u'02 !.2', u'03 !.2', u'Combined total'],
      dtype='object')

In [72]:
for col in df.columns:
    if col[:2]=='01':
        df.rename(columns={col:'Gold' + col[4:]}, inplace=True)
    if col[:2]=='02':
        df.rename(columns={col:'Silver' + col[4:]}, inplace=True)
    if col[:2]=='03':
        df.rename(columns={col:'Bronze' + col[4:]}, inplace=True)
    if col[:1]=='№':
        df.rename(columns={col:'#' + col[1:]}, inplace=True) 

df.head()


Out[72]:
№ Summer Gold Silver Bronze Total № Winter Gold.1 Silver.1 Bronze.1 Total.1 № Games Gold.2 Silver.2 Bronze.2 Combined total
Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12

Querying a DataFrame


In [73]:
df['Gold'] > 0


Out[73]:
Afghanistan (AFG)                               False
Algeria (ALG)                                    True
Argentina (ARG)                                  True
Armenia (ARM)                                    True
Australasia (ANZ) [ANZ]                          True
Australia (AUS) [AUS] [Z]                        True
Austria (AUT)                                    True
Azerbaijan (AZE)                                 True
Bahamas (BAH)                                    True
Bahrain (BRN)                                   False
Barbados (BAR) [BAR]                            False
Belarus (BLR)                                    True
Belgium (BEL)                                    True
Bermuda (BER)                                   False
Bohemia (BOH) [BOH] [Z]                         False
Botswana (BOT)                                  False
Brazil (BRA)                                     True
British West Indies (BWI) [BWI]                 False
Bulgaria (BUL) [H]                               True
Burundi (BDI)                                    True
Cameroon (CMR)                                   True
Canada (CAN)                                     True
Chile (CHI) [I]                                  True
China (CHN) [CHN]                                True
Colombia (COL)                                   True
Costa Rica (CRC)                                 True
Ivory Coast (CIV) [CIV]                         False
Croatia (CRO)                                    True
Cuba (CUB) [Z]                                   True
Cyprus (CYP)                                    False
                                                ...  
Sri Lanka (SRI) [SRI]                           False
Sudan (SUD)                                     False
Suriname (SUR) [E]                               True
Sweden (SWE) [Z]                                 True
Switzerland (SUI)                                True
Syria (SYR)                                      True
Chinese Taipei (TPE) [TPE] [TPE2]                True
Tajikistan (TJK)                                False
Tanzania (TAN) [TAN]                            False
Thailand (THA)                                   True
Togo (TOG)                                      False
Tonga (TGA)                                     False
Trinidad and Tobago (TRI) [TRI]                  True
Tunisia (TUN)                                    True
Turkey (TUR)                                     True
Uganda (UGA)                                     True
Ukraine (UKR)                                    True
United Arab Emirates (UAE)                       True
United States (USA) [P] [Q] [R] [Z]              True
Uruguay (URU)                                    True
Uzbekistan (UZB)                                 True
Venezuela (VEN)                                  True
Vietnam (VIE)                                   False
Virgin Islands (ISV)                            False
Yugoslavia (YUG) [YUG]                           True
Independent Olympic Participants (IOP) [IOP]    False
Zambia (ZAM) [ZAM]                              False
Zimbabwe (ZIM) [ZIM]                             True
Mixed team (ZZX) [ZZX]                           True
Totals                                           True
Name: Gold, dtype: bool

In [79]:
only_gold = df.where(df['Gold'] > 0) # this won't drop the na data...
only_gold.head()


Out[79]:
№ Summer Gold Silver Bronze Total № Winter Gold.1 Silver.1 Bronze.1 Total.1 № Games Gold.2 Silver.2 Bronze.2 Combined total
Afghanistan (AFG) NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12

In [75]:
only_gold['Gold'].count()


Out[75]:
100

In [76]:
df['Gold'].count()


Out[76]:
147

In [77]:
only_gold = only_gold.dropna()
only_gold.head()


Out[77]:
№ Summer Gold Silver Bronze Total № Winter Gold.1 Silver.1 Bronze.1 Total.1 № Games Gold.2 Silver.2 Bronze.2 Combined total
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12
Australia (AUS) [AUS] [Z] 25 139 152 177 468 18 5 3 4 12 43 144 155 181 480

In [80]:
only_gold = df[df['Gold'] > 0] # this will automatically drop the NaN in Gold column, nice...
only_gold.head()


Out[80]:
№ Summer Gold Silver Bronze Total № Winter Gold.1 Silver.1 Bronze.1 Total.1 № Games Gold.2 Silver.2 Bronze.2 Combined total
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12
Australia (AUS) [AUS] [Z] 25 139 152 177 468 18 5 3 4 12 43 144 155 181 480

In [81]:
len(df[(df['Gold'] > 0) | (df['Gold.1'] > 0)])


Out[81]:
101

In [82]:
df[(df['Gold.1'] > 0) & (df['Gold'] == 0)]


Out[82]:
№ Summer Gold Silver Bronze Total № Winter Gold.1 Silver.1 Bronze.1 Total.1 № Games Gold.2 Silver.2 Bronze.2 Combined total
Liechtenstein (LIE) 16 0 0 0 0 18 2 2 5 9 34 2 2 5 9

Indexing Dataframes


In [83]:
df.head()


Out[83]:
№ Summer Gold Silver Bronze Total № Winter Gold.1 Silver.1 Bronze.1 Total.1 № Games Gold.2 Silver.2 Bronze.2 Combined total
Afghanistan (AFG) 13 0 0 2 2 0 0 0 0 0 13 0 0 2 2
Algeria (ALG) 12 5 2 8 15 3 0 0 0 0 15 5 2 8 15
Argentina (ARG) 23 18 24 28 70 18 0 0 0 0 41 18 24 28 70
Armenia (ARM) 5 1 2 9 12 6 0 0 0 0 11 1 2 9 12
Australasia (ANZ) [ANZ] 2 3 4 5 12 0 0 0 0 0 2 3 4 5 12

In [84]:
df['country'] = df.index
df = df.set_index('Gold')
df.head()


Out[84]:
№ Summer Silver Bronze Total № Winter Gold.1 Silver.1 Bronze.1 Total.1 № Games Gold.2 Silver.2 Bronze.2 Combined total country
Gold
0 13 0 2 2 0 0 0 0 0 13 0 0 2 2 Afghanistan (AFG)
5 12 2 8 15 3 0 0 0 0 15 5 2 8 15 Algeria (ALG)
18 23 24 28 70 18 0 0 0 0 41 18 24 28 70 Argentina (ARG)
1 5 2 9 12 6 0 0 0 0 11 1 2 9 12 Armenia (ARM)
3 2 4 5 12 0 0 0 0 0 2 3 4 5 12 Australasia (ANZ) [ANZ]

In [85]:
df = df.reset_index()
df.head()


Out[85]:
Gold № Summer Silver Bronze Total № Winter Gold.1 Silver.1 Bronze.1 Total.1 № Games Gold.2 Silver.2 Bronze.2 Combined total country
0 0 13 0 2 2 0 0 0 0 0 13 0 0 2 2 Afghanistan (AFG)
1 5 12 2 8 15 3 0 0 0 0 15 5 2 8 15 Algeria (ALG)
2 18 23 24 28 70 18 0 0 0 0 41 18 24 28 70 Argentina (ARG)
3 1 5 2 9 12 6 0 0 0 0 11 1 2 9 12 Armenia (ARM)
4 3 2 4 5 12 0 0 0 0 0 2 3 4 5 12 Australasia (ANZ) [ANZ]

In [86]:
df = pd.read_csv('census.csv')
df.head()


Out[86]:
SUMLEV REGION DIVISION STATE COUNTY STNAME CTYNAME CENSUS2010POP ESTIMATESBASE2010 POPESTIMATE2010 ... RDOMESTICMIG2011 RDOMESTICMIG2012 RDOMESTICMIG2013 RDOMESTICMIG2014 RDOMESTICMIG2015 RNETMIG2011 RNETMIG2012 RNETMIG2013 RNETMIG2014 RNETMIG2015
0 40 3 6 1 0 Alabama Alabama 4779736 4780127 4785161 ... 0.002295 -0.193196 0.381066 0.582002 -0.467369 1.030015 0.826644 1.383282 1.724718 0.712594
1 50 3 6 1 1 Alabama Autauga County 54571 54571 54660 ... 7.242091 -2.915927 -3.012349 2.265971 -2.530799 7.606016 -2.626146 -2.722002 2.592270 -2.187333
2 50 3 6 1 3 Alabama Baldwin County 182265 182265 183193 ... 14.832960 17.647293 21.845705 19.243287 17.197872 15.844176 18.559627 22.727626 20.317142 18.293499
3 50 3 6 1 5 Alabama Barbour County 27457 27457 27341 ... -4.728132 -2.500690 -7.056824 -3.904217 -10.543299 -4.874741 -2.758113 -7.167664 -3.978583 -10.543299
4 50 3 6 1 7 Alabama Bibb County 22915 22919 22861 ... -5.527043 -5.068871 -6.201001 -0.177537 0.177258 -5.088389 -4.363636 -5.403729 0.754533 1.107861

5 rows × 100 columns


In [87]:
df['SUMLEV'].unique()


Out[87]:
array([40, 50])

In [88]:
df=df[df['SUMLEV'] == 50]
df.head()


Out[88]:
SUMLEV REGION DIVISION STATE COUNTY STNAME CTYNAME CENSUS2010POP ESTIMATESBASE2010 POPESTIMATE2010 ... RDOMESTICMIG2011 RDOMESTICMIG2012 RDOMESTICMIG2013 RDOMESTICMIG2014 RDOMESTICMIG2015 RNETMIG2011 RNETMIG2012 RNETMIG2013 RNETMIG2014 RNETMIG2015
1 50 3 6 1 1 Alabama Autauga County 54571 54571 54660 ... 7.242091 -2.915927 -3.012349 2.265971 -2.530799 7.606016 -2.626146 -2.722002 2.592270 -2.187333
2 50 3 6 1 3 Alabama Baldwin County 182265 182265 183193 ... 14.832960 17.647293 21.845705 19.243287 17.197872 15.844176 18.559627 22.727626 20.317142 18.293499
3 50 3 6 1 5 Alabama Barbour County 27457 27457 27341 ... -4.728132 -2.500690 -7.056824 -3.904217 -10.543299 -4.874741 -2.758113 -7.167664 -3.978583 -10.543299
4 50 3 6 1 7 Alabama Bibb County 22915 22919 22861 ... -5.527043 -5.068871 -6.201001 -0.177537 0.177258 -5.088389 -4.363636 -5.403729 0.754533 1.107861
5 50 3 6 1 9 Alabama Blount County 57322 57322 57373 ... 1.807375 -1.177622 -1.748766 -2.062535 -1.369970 1.859511 -0.848580 -1.402476 -1.577232 -0.884411

5 rows × 100 columns


In [89]:
columns_to_keep = ['STNAME',
                   'CTYNAME',
                   'BIRTHS2010',
                   'BIRTHS2011',
                   'BIRTHS2012',
                   'BIRTHS2013',
                   'BIRTHS2014',
                   'BIRTHS2015',
                   'POPESTIMATE2010',
                   'POPESTIMATE2011',
                   'POPESTIMATE2012',
                   'POPESTIMATE2013',
                   'POPESTIMATE2014',
                   'POPESTIMATE2015']
df = df[columns_to_keep]
df.head()


Out[89]:
STNAME CTYNAME BIRTHS2010 BIRTHS2011 BIRTHS2012 BIRTHS2013 BIRTHS2014 BIRTHS2015 POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2012 POPESTIMATE2013 POPESTIMATE2014 POPESTIMATE2015
1 Alabama Autauga County 151 636 615 574 623 600 54660 55253 55175 55038 55290 55347
2 Alabama Baldwin County 517 2187 2092 2160 2186 2240 183193 186659 190396 195126 199713 203709
3 Alabama Barbour County 70 335 300 283 260 269 27341 27226 27159 26973 26815 26489
4 Alabama Bibb County 44 266 245 259 247 253 22861 22733 22642 22512 22549 22583
5 Alabama Blount County 183 744 710 646 618 603 57373 57711 57776 57734 57658 57673

In [90]:
df = df.set_index(['STNAME', 'CTYNAME'])
df.head()


Out[90]:
BIRTHS2010 BIRTHS2011 BIRTHS2012 BIRTHS2013 BIRTHS2014 BIRTHS2015 POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2012 POPESTIMATE2013 POPESTIMATE2014 POPESTIMATE2015
STNAME CTYNAME
Alabama Autauga County 151 636 615 574 623 600 54660 55253 55175 55038 55290 55347
Baldwin County 517 2187 2092 2160 2186 2240 183193 186659 190396 195126 199713 203709
Barbour County 70 335 300 283 260 269 27341 27226 27159 26973 26815 26489
Bibb County 44 266 245 259 247 253 22861 22733 22642 22512 22549 22583
Blount County 183 744 710 646 618 603 57373 57711 57776 57734 57658 57673

In [91]:
df.loc['Michigan', 'Washtenaw County']


Out[91]:
BIRTHS2010            977
BIRTHS2011           3826
BIRTHS2012           3780
BIRTHS2013           3662
BIRTHS2014           3683
BIRTHS2015           3709
POPESTIMATE2010    345563
POPESTIMATE2011    349048
POPESTIMATE2012    351213
POPESTIMATE2013    354289
POPESTIMATE2014    357029
POPESTIMATE2015    358880
Name: (Michigan, Washtenaw County), dtype: int64

In [92]:
df.loc[ [('Michigan', 'Washtenaw County'),
         ('Michigan', 'Wayne County')] ]


Out[92]:
BIRTHS2010 BIRTHS2011 BIRTHS2012 BIRTHS2013 BIRTHS2014 BIRTHS2015 POPESTIMATE2010 POPESTIMATE2011 POPESTIMATE2012 POPESTIMATE2013 POPESTIMATE2014 POPESTIMATE2015
STNAME CTYNAME
Michigan Washtenaw County 977 3826 3780 3662 3683 3709 345563 349048 351213 354289 357029 358880
Wayne County 5918 23819 23270 23377 23607 23586 1815199 1801273 1792514 1775713 1766008 1759335

In [93]:
# Homework:
purchase_1 = pd.Series({'Name': 'Chris',
                        'Item Purchased': 'Dog Food',
                        'Cost': 22.50})
purchase_2 = pd.Series({'Name': 'Kevyn',
                        'Item Purchased': 'Kitty Litter',
                        'Cost': 2.50})
purchase_3 = pd.Series({'Name': 'Vinod',
                        'Item Purchased': 'Bird Seed',
                        'Cost': 5.00})

df = pd.DataFrame([purchase_1, purchase_2, purchase_3], index=['Store 1', 'Store 1', 'Store 2'])


df = df.set_index([df.index, 'Name'])
df.index.names = ['Location', 'Name']
df = df.append(pd.Series(data={'Cost': 3.00, 'Item Purchased': 'Kitty Food'}, name=('Store 2', 'Kevyn')))
df


Out[93]:
Cost Item Purchased
Location Name
Store 1 Chris 22.5 Dog Food
Kevyn 2.5 Kitty Litter
Store 2 Vinod 5.0 Bird Seed
Kevyn 3.0 Kitty Food

Missing values


In [101]:
df = pd.read_csv('log.csv')
df


Out[101]:
time user video playback position paused volume
0 1469974424 cheryl intro.html 5 False 10
1 1469974454 cheryl intro.html 6 NaN NaN
2 1469974544 cheryl intro.html 9 NaN NaN
3 1469974574 cheryl intro.html 10 NaN NaN
4 1469977514 bob intro.html 1 NaN NaN
5 1469977544 bob intro.html 1 NaN NaN
6 1469977574 bob intro.html 1 NaN NaN
7 1469977604 bob intro.html 1 NaN NaN
8 1469974604 cheryl intro.html 11 NaN NaN
9 1469974694 cheryl intro.html 14 NaN NaN
10 1469974724 cheryl intro.html 15 NaN NaN
11 1469974454 sue advanced.html 24 NaN NaN
12 1469974524 sue advanced.html 25 NaN NaN
13 1469974424 sue advanced.html 23 False 10
14 1469974554 sue advanced.html 26 NaN NaN
15 1469974624 sue advanced.html 27 NaN NaN
16 1469974654 sue advanced.html 28 NaN 5
17 1469974724 sue advanced.html 29 NaN NaN
18 1469974484 cheryl intro.html 7 NaN NaN
19 1469974514 cheryl intro.html 8 NaN NaN
20 1469974754 sue advanced.html 30 NaN NaN
21 1469974824 sue advanced.html 31 NaN NaN
22 1469974854 sue advanced.html 32 NaN NaN
23 1469974924 sue advanced.html 33 NaN NaN
24 1469977424 bob intro.html 1 True 10
25 1469977454 bob intro.html 1 NaN NaN
26 1469977484 bob intro.html 1 NaN NaN
27 1469977634 bob intro.html 1 NaN NaN
28 1469977664 bob intro.html 1 NaN NaN
29 1469974634 cheryl intro.html 12 NaN NaN
30 1469974664 cheryl intro.html 13 NaN NaN
31 1469977694 bob intro.html 1 NaN NaN
32 1469977724 bob intro.html 1 NaN NaN

In [99]:
df.fillna?

In [102]:
df = df.set_index('time')
df = df.sort_index()
df


Out[102]:
user video playback position paused volume
time
1469974424 cheryl intro.html 5 False 10
1469974424 sue advanced.html 23 False 10
1469974454 cheryl intro.html 6 NaN NaN
1469974454 sue advanced.html 24 NaN NaN
1469974484 cheryl intro.html 7 NaN NaN
1469974514 cheryl intro.html 8 NaN NaN
1469974524 sue advanced.html 25 NaN NaN
1469974544 cheryl intro.html 9 NaN NaN
1469974554 sue advanced.html 26 NaN NaN
1469974574 cheryl intro.html 10 NaN NaN
1469974604 cheryl intro.html 11 NaN NaN
1469974624 sue advanced.html 27 NaN NaN
1469974634 cheryl intro.html 12 NaN NaN
1469974654 sue advanced.html 28 NaN 5
1469974664 cheryl intro.html 13 NaN NaN
1469974694 cheryl intro.html 14 NaN NaN
1469974724 cheryl intro.html 15 NaN NaN
1469974724 sue advanced.html 29 NaN NaN
1469974754 sue advanced.html 30 NaN NaN
1469974824 sue advanced.html 31 NaN NaN
1469974854 sue advanced.html 32 NaN NaN
1469974924 sue advanced.html 33 NaN NaN
1469977424 bob intro.html 1 True 10
1469977454 bob intro.html 1 NaN NaN
1469977484 bob intro.html 1 NaN NaN
1469977514 bob intro.html 1 NaN NaN
1469977544 bob intro.html 1 NaN NaN
1469977574 bob intro.html 1 NaN NaN
1469977604 bob intro.html 1 NaN NaN
1469977634 bob intro.html 1 NaN NaN
1469977664 bob intro.html 1 NaN NaN
1469977694 bob intro.html 1 NaN NaN
1469977724 bob intro.html 1 NaN NaN

In [104]:
df = df.reset_index()
df = df.set_index(['time', 'user'])
df


Out[104]:
video playback position paused volume
time user
1469974424 cheryl intro.html 5 False 10
sue advanced.html 23 False 10
1469974454 cheryl intro.html 6 False 10
sue advanced.html 24 False 10
1469974484 cheryl intro.html 7 False 10
1469974514 cheryl intro.html 8 False 10
1469974524 sue advanced.html 25 False 10
1469974544 cheryl intro.html 9 False 10
1469974554 sue advanced.html 26 False 10
1469974574 cheryl intro.html 10 False 10
1469974604 cheryl intro.html 11 False 10
1469974624 sue advanced.html 27 False 10
1469974634 cheryl intro.html 12 False 10
1469974654 sue advanced.html 28 False 5
1469974664 cheryl intro.html 13 False 5
1469974694 cheryl intro.html 14 False 5
1469974724 cheryl intro.html 15 False 5
sue advanced.html 29 False 5
1469974754 sue advanced.html 30 False 5
1469974824 sue advanced.html 31 False 5
1469974854 sue advanced.html 32 False 5
1469974924 sue advanced.html 33 False 5
1469977424 bob intro.html 1 True 10
1469977454 bob intro.html 1 True 10
1469977484 bob intro.html 1 True 10
1469977514 bob intro.html 1 True 10
1469977544 bob intro.html 1 True 10
1469977574 bob intro.html 1 True 10
1469977604 bob intro.html 1 True 10
1469977634 bob intro.html 1 True 10
1469977664 bob intro.html 1 True 10
1469977694 bob intro.html 1 True 10
1469977724 bob intro.html 1 True 10

In [105]:
df = df.fillna(method='ffill')
df


Out[105]:
video playback position paused volume
time user
1469974424 cheryl intro.html 5 False 10
sue advanced.html 23 False 10
1469974454 cheryl intro.html 6 False 10
sue advanced.html 24 False 10
1469974484 cheryl intro.html 7 False 10
1469974514 cheryl intro.html 8 False 10
1469974524 sue advanced.html 25 False 10
1469974544 cheryl intro.html 9 False 10
1469974554 sue advanced.html 26 False 10
1469974574 cheryl intro.html 10 False 10
1469974604 cheryl intro.html 11 False 10
1469974624 sue advanced.html 27 False 10
1469974634 cheryl intro.html 12 False 10
1469974654 sue advanced.html 28 False 5
1469974664 cheryl intro.html 13 False 5
1469974694 cheryl intro.html 14 False 5
1469974724 cheryl intro.html 15 False 5
sue advanced.html 29 False 5
1469974754 sue advanced.html 30 False 5
1469974824 sue advanced.html 31 False 5
1469974854 sue advanced.html 32 False 5
1469974924 sue advanced.html 33 False 5
1469977424 bob intro.html 1 True 10
1469977454 bob intro.html 1 True 10
1469977484 bob intro.html 1 True 10
1469977514 bob intro.html 1 True 10
1469977544 bob intro.html 1 True 10
1469977574 bob intro.html 1 True 10
1469977604 bob intro.html 1 True 10
1469977634 bob intro.html 1 True 10
1469977664 bob intro.html 1 True 10
1469977694 bob intro.html 1 True 10
1469977724 bob intro.html 1 True 10

In [ ]: