Pandas



In [1]:

    
import pandas as pd
import numpy as np
print("Pandas version:",pd.__version__)
print("Numpy version:",np.__version__)

import sys
print ("Python version:", sys.version)









    



Pandas version: 0.20.1
Numpy version: 1.12.1
Python version: 3.6.1 |Anaconda 4.4.0 (x86_64)| (default, May 11 2017, 13:04:09) 
[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]

Pandas Series objects



In [2]:

    
data = pd.Series([5, 10, 15, 20, 25])
data









    Out[2]:





0     5
1    10
2    15
3    20
4    25
dtype: int64



In [3]:

    
# to access the values of pandas series
data.values









    Out[3]:





array([ 5, 10, 15, 20, 25])



In [4]:

    
# access the index of pandas series
data.index









    Out[4]:





RangeIndex(start=0, stop=5, step=1)



In [5]:

    
# access data via associated index
print(data[1])
print('--------')
print(data[3:5])









    



10
--------
3    20
4    25
dtype: int64



In [6]:

    
#index need not be an integer
dt = pd.Series([1, 2, 3, 4, 5],
              index=['a', 'b', 'c', 'd', 'e'])
dt









    Out[6]:





a    1
b    2
c    3
d    4
e    5
dtype: int64



In [7]:

    
# access the data as expected 
dt['c']









    Out[7]:





3



In [8]:

    
# Series as specialized dictionary
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
population









    Out[8]:





California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64



In [9]:

    
population['California']









    Out[9]:





38332521



In [10]:

    
# unlike dictionary, Series support slicing
population['California' : 'Illinous']









    Out[10]:





California    38332521
Florida       19552860
Illinois      12882135
dtype: int64

Constracting Series objects (pd.Series(data, index=index))



In [11]:

    
# 1- from a list or numpy array
pd.Series([1, 2, 4])









    Out[11]:





0    1
1    2
2    4
dtype: int64



In [12]:

    
# 2a- from a dictionary
pd.Series({1:'a', 5:'c' , 2:'g'}, index=[1,5,2])









    Out[12]:





1    a
5    c
2    g
dtype: object



In [13]:

    
# 2b- from a dictionary
pd.Series({'a': 1, 'c':5, 'b':67}, index=['a','b','c'])









    Out[13]:





a     1
b    67
c     5
dtype: int64

Pandas DataFrame object



In [14]:

    
# lets create another pandas series object first
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
area









    Out[14]:





California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
dtype: int64



In [15]:

    
# use dictionary to create a DataFrame from population and area Series objects
states = pd.DataFrame({'population': population , 'area':area})
states



In [16]:

    
# access the index
states.index









    Out[16]:





Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')



In [17]:

    
# access column names
states.columns









    Out[17]:





Index(['area', 'population'], dtype='object')



In [18]:

    
# access values
states.values









    Out[18]:





array([[  423967, 38332521],
       [  170312, 19552860],
       [  149995, 12882135],
       [  141297, 19651127],
       [  695662, 26448193]])



In [19]:

    
# access a column
states['area']









    Out[19]:





California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

Constracting DataFrame objects



In [20]:

    
# 1- from a Series object
pd.DataFrame(population, columns=['population'])









    Out[20]:







  
    
      
      population
    
  
  
    
      California
      38332521
    
    
      Florida
      19552860
    
    
      Illinois
      12882135
    
    
      New York
      19651127
    
    
      Texas
      26448193



In [21]:

    
# 2- from a list of dictionary
data = [{'a': 1, 'b':2}
       for i in range(3)]
pd.DataFrame(data)



In [22]:

    
# 3- from a dictionary of Series object
pd.DataFrame({'population':population , 'area':area})



In [23]:

    
# 4- from 2d Numpy array
pd.DataFrame(np.random.rand(3,2),
            columns=['aaa', 'bbb'],
            index=['a','b','c'])

Pandas Index object



In [24]:

    
ind = pd.Index([1,2,5,16])
ind









    Out[24]:





Int64Index([1, 2, 5, 16], dtype='int64')



In [25]:

    
# access index values
ind[3]









    Out[25]:





16



In [26]:

    
print(ind.size, ind.shape, ind.ndim, ind.dtype)









    



4 (4,) 1 int64



In [27]:

    
# indexes are immutable. code below would throw an error message
#ind[3] = 0

Data Indexing and Selection



In [28]:

    
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=['a', 'b', 'c', 'd'])
data









    Out[28]:





a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64



In [29]:

    
data['b']









    Out[29]:





0.5



In [30]:

    
'a' in data









    Out[30]:





True



In [31]:

    
1.0 in data









    Out[31]:





False



In [32]:

    
data.keys()









    Out[32]:





Index(['a', 'b', 'c', 'd'], dtype='object')



In [33]:

    
list(data.items())









    Out[33]:





[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]



In [34]:

    
# assign a new index and an index value
data['e'] = 1.25
data









    Out[34]:





a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64



In [35]:

    
# slicing explicitly
data['b':'e']









    Out[35]:





b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64



In [36]:

    
# slicing implicitly
data[0:2]









    Out[36]:





a    0.25
b    0.50
dtype: float64



In [37]:

    
# masking
data[(data > 0.3) & (data < 0.8)]









    Out[37]:





b    0.50
c    0.75
dtype: float64



In [38]:

    
# fancy indexing
data[['a', 'e']]









    Out[38]:





a    0.25
e    1.25
dtype: float64



In [39]:

    
data.loc['a']









    Out[39]:





0.25



In [40]:

    
data.loc[['a', 'c']]









    Out[40]:





a    0.25
c    0.75
dtype: float64



In [41]:

    
data.loc['a':'d']









    Out[41]:





a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64



In [42]:

    
data.iloc[1]









    Out[42]:





0.5



In [43]:

    
data.iloc[1:3]









    Out[43]:





b    0.50
c    0.75
dtype: float64

Data Selection in DataFrames



In [44]:

    
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'population':pop})
data



In [45]:

    
# access the individual Series that make up the column
data['area']









    Out[45]:





California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64



In [46]:

    
# equivalent code
data.area









    Out[46]:





California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64



In [47]:

    
data.area is data['area']









    Out[47]:





True



In [48]:

    
data.area == data['area']









    Out[48]:





California    True
Florida       True
Illinois      True
New York      True
Texas         True
Name: area, dtype: bool



In [49]:

    
data['density'] = data['population'] / data['area']
data



In [50]:

    
data.values









    Out[50]:





array([[  4.23967000e+05,   3.83325210e+07,   9.04139261e+01],
       [  1.70312000e+05,   1.95528600e+07,   1.14806121e+02],
       [  1.49995000e+05,   1.28821350e+07,   8.58837628e+01],
       [  1.41297000e+05,   1.96511270e+07,   1.39076746e+02],
       [  6.95662000e+05,   2.64481930e+07,   3.80187404e+01]])



In [51]:

    
# 2D DataFrame
data.T









    Out[51]:







  
    
      
      California
      Florida
      Illinois
      New York
      Texas
    
  
  
    
      area
      4.239670e+05
      1.703120e+05
      1.499950e+05
      1.412970e+05
      6.956620e+05
    
    
      population
      3.833252e+07
      1.955286e+07
      1.288214e+07
      1.965113e+07
      2.644819e+07
    
    
      density
      9.041393e+01
      1.148061e+02
      8.588376e+01
      1.390767e+02
      3.801874e+01



In [52]:

    
data.values[0]









    Out[52]:





array([  4.23967000e+05,   3.83325210e+07,   9.04139261e+01])



In [53]:

    
data.values[1]









    Out[53]:





array([  1.70312000e+05,   1.95528600e+07,   1.14806121e+02])



In [54]:

    
data['area']









    Out[54]:





California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64



In [55]:

    
data



In [56]:

    
# iloc
data.iloc[1]









    Out[56]:





area          1.703120e+05
population    1.955286e+07
density       1.148061e+02
Name: Florida, dtype: float64



In [57]:

    
data.iloc[1:3]



In [58]:

    
data.iloc[1:3, 0:2]









    Out[58]:







  
    
      
      area
      population
    
  
  
    
      Florida
      170312
      19552860
    
    
      Illinois
      149995
      12882135



In [59]:

    
data.loc['Florida']









    Out[59]:





area          1.703120e+05
population    1.955286e+07
density       1.148061e+02
Name: Florida, dtype: float64



In [60]:

    
data.loc[['Florida', 'Illinois']]



In [61]:

    
data.loc[['Florida','Illinois']][['area']]



In [62]:

    
data.loc['Florida':'Illinois', :'population']









    Out[62]:







  
    
      
      area
      population
    
  
  
    
      Florida
      170312
      19552860
    
    
      Illinois
      149995
      12882135



In [63]:

    
# ix indexer is hybrid of loc and iloc
data.ix[1:3 , :'population']









    



/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:2: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  






    Out[63]:







  
    
      
      area
      population
    
  
  
    
      Florida
      170312
      19552860
    
    
      Illinois
      149995
      12882135



In [64]:

    
data.ix[1:3 , ['area', 'population']]









    Out[64]:







  
    
      
      area
      population
    
  
  
    
      Florida
      170312
      19552860
    
    
      Illinois
      149995
      12882135



In [65]:

    
# use masking and fancy indexing with loc
data.loc[data['density'] > 100 , ['area', 'density']]

Operating on DataFrames



In [66]:

    
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0, 10, 4))
ser









    Out[66]:





0    6
1    3
2    7
3    4
dtype: int64



In [67]:

    
df = pd.DataFrame(rng.randint(0, 10, (3,4)),
                  columns=['A','B','C','D'])
df



In [68]:

    
np.exp(ser)









    Out[68]:





0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64



In [69]:

    
def expo(x):
    y = np.e ** x
    return y



In [70]:

    
expo(6)









    Out[70]:





403.428793492735



In [71]:

    
np.sin(df * np.pi / 4)









    Out[71]:







  
    
      
      A
      B
      C
      D
    
  
  
    
      0
      -1.000000
      7.071068e-01
      1.000000
      -1.000000e+00
    
    
      1
      -0.707107
      1.224647e-16
      0.707107
      -7.071068e-01
    
    
      2
      -0.707107
      1.000000e+00
      -0.707107
      1.224647e-16



In [72]:

    
# index alignment
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'Alaska':1723337})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, })



In [73]:

    
pop / area









    Out[73]:





Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64



In [74]:

    
# any missing values are filled in with NaN by default in Python
A = pd.Series([2, 4, 6], index=[0,1,2])
B = pd.Series([1, 3, 5], index=[1,2,3])
A + B









    Out[74]:





0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64



In [75]:

    
# use fill_value 
A.add(B, fill_value=0)









    Out[75]:





0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64



In [76]:

    
# index aligment in DataFrames
A = pd.DataFrame(rng.randint(0, 20, (2,2)),
                columns=list('AB'))
B = pd.DataFrame(rng.randint(0, 10, (3,3)),
                columns=list('BAC'))
A



In [77]:

    
B



In [78]:

    
A + B



In [79]:

    
# fill the missing values with the mean of all values in A 
fill = A.stack().mean()
A.add(B, fill_value=fill)

Hierarhical Indexing



In [80]:

    
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]



In [81]:

    
index = pd.MultiIndex.from_tuples(index)
index









    Out[81]:





MultiIndex(levels=[['California', 'New York', 'Texas'], [2000, 2010]],
           labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]])



In [82]:

    
pop = pop.reindex(index)
pop









    Out[82]:





California  2000   NaN
            2010   NaN
New York    2000   NaN
            2010   NaN
Texas       2000   NaN
            2010   NaN
dtype: float64



In [83]:

    
# all data for which the second index is 2010
pop[:,2010]









    Out[83]:





California   NaN
New York     NaN
Texas        NaN
dtype: float64



In [84]:

    
pop.loc['California']









    Out[84]:





2000   NaN
2010   NaN
dtype: float64



In [85]:

    
pop.ix['California', 2010]









    



/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:1: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  """Entry point for launching an IPython kernel.






    Out[85]:





nan



In [86]:

    
pop.unstack()









    Out[86]:







  
    
      
      2000
      2010
    
  
  
    
      California
      NaN
      NaN
    
    
      New York
      NaN
      NaN
    
    
      Texas
      NaN
      NaN



In [87]:

    
pop_df = pd.DataFrame({'total': pop,
                       'under18': [9267089, 9284094,
                                   4687374, 4318033,
                                   5906301, 6879014]})
pop_df

indexing and slicing a MultiIndex



In [88]:

    
pop_df[pop_df['under18'] > 6000000]



In [89]:

    
pop.unstack().reset_index()









    Out[89]:







  
    
      
      index
      2000
      2010
    
  
  
    
      0
      California
      NaN
      NaN
    
    
      1
      New York
      NaN
      NaN
    
    
      2
      Texas
      NaN
      NaN

merge and join



In [90]:

    
pop = pd.read_csv('data/state-population.csv')
areas = pd.read_csv('data/state-areas.csv')
abbrevs = pd.read_csv('data/state-abbrevs.csv')



In [91]:

    
print(pop.head()); print(areas.head()); print(abbrevs.head())









    



  state/region     ages  year  population
0           AL  under18  2012   1117489.0
1           AL    total  2012   4817528.0
2           AL  under18  2010   1130966.0
3           AL    total  2010   4785570.0
4           AL  under18  2011   1125763.0
        state  area (sq. mi)
0     Alabama          52423
1      Alaska         656425
2     Arizona         114006
3    Arkansas          53182
4  California         163707
        state abbreviation
0     Alabama           AL
1      Alaska           AK
2     Arizona           AZ
3    Arkansas           AR
4  California           CA



In [92]:

    
merged = pd.merge(pop, abbrevs, how='outer', left_on='state/region', right_on='abbreviation')
merged.head()









    Out[92]:







  
    
      
      state/region
      ages
      year
      population
      state
      abbreviation
    
  
  
    
      0
      AL
      under18
      2012
      1117489.0
      Alabama
      AL
    
    
      1
      AL
      total
      2012
      4817528.0
      Alabama
      AL
    
    
      2
      AL
      under18
      2010
      1130966.0
      Alabama
      AL
    
    
      3
      AL
      total
      2010
      4785570.0
      Alabama
      AL
    
    
      4
      AL
      under18
      2011
      1125763.0
      Alabama
      AL



In [93]:

    
merged = merged.drop('abbreviation', 1)



In [94]:

    
merged.head()









    Out[94]:







  
    
      
      state/region
      ages
      year
      population
      state
    
  
  
    
      0
      AL
      under18
      2012
      1117489.0
      Alabama
    
    
      1
      AL
      total
      2012
      4817528.0
      Alabama
    
    
      2
      AL
      under18
      2010
      1130966.0
      Alabama
    
    
      3
      AL
      total
      2010
      4785570.0
      Alabama
    
    
      4
      AL
      under18
      2011
      1125763.0
      Alabama



In [95]:

    
merged.isnull().any()









    Out[95]:





state/region    False
ages            False
year            False
population       True
state            True
dtype: bool



In [96]:

    
# number of nulls
merged.isnull().sum()









    Out[96]:





state/region     0
ages             0
year             0
population      20
state           96
dtype: int64



In [97]:

    
merged[merged['population'].isnull()].head()









    Out[97]:







  
    
      
      state/region
      ages
      year
      population
      state
    
  
  
    
      2448
      PR
      under18
      1990
      NaN
      NaN
    
    
      2449
      PR
      total
      1990
      NaN
      NaN
    
    
      2450
      PR
      total
      1991
      NaN
      NaN
    
    
      2451
      PR
      under18
      1991
      NaN
      NaN
    
    
      2452
      PR
      total
      1993
      NaN
      NaN



In [98]:

    
merged.loc[merged['state'].isnull(), 'state/region'].unique()









    Out[98]:





array(['PR', 'USA'], dtype=object)



In [99]:

    
merged.loc[merged['state/region']=='PR', 'state'] = 'Puerto Rico'



In [100]:

    
merged.loc[merged['state/region']=='USA', 'state'] = 'United States'



In [101]:

    
merged.isnull().sum()









    Out[101]:





state/region     0
ages             0
year             0
population      20
state            0
dtype: int64



In [102]:

    
final = pd.merge(merged, areas, on='state', how='left')
final.head()









    Out[102]:







  
    
      
      state/region
      ages
      year
      population
      state
      area (sq. mi)
    
  
  
    
      0
      AL
      under18
      2012
      1117489.0
      Alabama
      52423.0
    
    
      1
      AL
      total
      2012
      4817528.0
      Alabama
      52423.0
    
    
      2
      AL
      under18
      2010
      1130966.0
      Alabama
      52423.0
    
    
      3
      AL
      total
      2010
      4785570.0
      Alabama
      52423.0
    
    
      4
      AL
      under18
      2011
      1125763.0
      Alabama
      52423.0



In [103]:

    
final.isnull().sum()









    Out[103]:





state/region      0
ages              0
year              0
population       20
state             0
area (sq. mi)    48
dtype: int64



In [104]:

    
final.dropna(inplace=True)



In [105]:

    
final.head()









    Out[105]:







  
    
      
      state/region
      ages
      year
      population
      state
      area (sq. mi)
    
  
  
    
      0
      AL
      under18
      2012
      1117489.0
      Alabama
      52423.0
    
    
      1
      AL
      total
      2012
      4817528.0
      Alabama
      52423.0
    
    
      2
      AL
      under18
      2010
      1130966.0
      Alabama
      52423.0
    
    
      3
      AL
      total
      2010
      4785570.0
      Alabama
      52423.0
    
    
      4
      AL
      under18
      2011
      1125763.0
      Alabama
      52423.0



In [106]:

    
data2010 = final.query("year==2010 & ages == 'total'" )
data2010.head()









    Out[106]:







  
    
      
      state/region
      ages
      year
      population
      state
      area (sq. mi)
    
  
  
    
      3
      AL
      total
      2010
      4785570.0
      Alabama
      52423.0
    
    
      91
      AK
      total
      2010
      713868.0
      Alaska
      656425.0
    
    
      101
      AZ
      total
      2010
      6408790.0
      Arizona
      114006.0
    
    
      189
      AR
      total
      2010
      2922280.0
      Arkansas
      53182.0
    
    
      197
      CA
      total
      2010
      37333601.0
      California
      163707.0



In [107]:

    
df = pd.DataFrame({'A' : np.random.randint(0, 10, size=4),
                   'B' : np.random.randint(0, 10, size=4)})
df



In [108]:

    
df['A'].sum()









    Out[108]:





18



In [109]:

    
df['B'].mean()









    Out[109]:





3.0



In [110]:

    
df.mean(axis='rows')









    Out[110]:





A    4.5
B    3.0
dtype: float64



In [111]:

    
# product of all items
df.prod()









    Out[111]:





A    196
B      0
dtype: int64



In [112]:

    
import seaborn as sns
planets = sns.load_dataset('planets')
planets.shape









    Out[112]:





(1035, 6)



In [113]:

    
planets.head()









    Out[113]:







  
    
      
      method
      number
      orbital_period
      mass
      distance
      year
    
  
  
    
      0
      Radial Velocity
      1
      269.300
      7.10
      77.40
      2006
    
    
      1
      Radial Velocity
      1
      874.774
      2.21
      56.95
      2008
    
    
      2
      Radial Velocity
      1
      763.000
      2.60
      19.84
      2011
    
    
      3
      Radial Velocity
      1
      326.030
      19.40
      110.62
      2007
    
    
      4
      Radial Velocity
      1
      516.220
      10.50
      119.47
      2009



In [114]:

    
planets.dropna().describe()









    Out[114]:







  
    
      
      number
      orbital_period
      mass
      distance
      year
    
  
  
    
      count
      498.00000
      498.000000
      498.000000
      498.000000
      498.000000
    
    
      mean
      1.73494
      835.778671
      2.509320
      52.068213
      2007.377510
    
    
      std
      1.17572
      1469.128259
      3.636274
      46.596041
      4.167284
    
    
      min
      1.00000
      1.328300
      0.003600
      1.350000
      1989.000000
    
    
      25%
      1.00000
      38.272250
      0.212500
      24.497500
      2005.000000
    
    
      50%
      1.00000
      357.000000
      1.245000
      39.940000
      2009.000000
    
    
      75%
      2.00000
      999.600000
      2.867500
      59.332500
      2011.000000
    
    
      max
      6.00000
      17337.500000
      25.000000
      354.000000
      2014.000000



In [115]:

    
planets.describe()









    Out[115]:







  
    
      
      number
      orbital_period
      mass
      distance
      year
    
  
  
    
      count
      1035.000000
      992.000000
      513.000000
      808.000000
      1035.000000
    
    
      mean
      1.785507
      2002.917596
      2.638161
      264.069282
      2009.070531
    
    
      std
      1.240976
      26014.728304
      3.818617
      733.116493
      3.972567
    
    
      min
      1.000000
      0.090706
      0.003600
      1.350000
      1989.000000
    
    
      25%
      1.000000
      5.442540
      0.229000
      32.560000
      2007.000000
    
    
      50%
      1.000000
      39.979500
      1.260000
      55.250000
      2010.000000
    
    
      75%
      2.000000
      526.005000
      3.040000
      178.500000
      2012.000000
    
    
      max
      7.000000
      730000.000000
      25.000000
      8500.000000
      2014.000000



In [116]:

    
df = pd.DataFrame({'key':['A','B','C','A','B','C'],
                   'data':range(6)}, columns=['key', 'data'])
df



In [117]:

    
df.groupby('key')









    Out[117]:





<pandas.core.groupby.DataFrameGroupBy object at 0x11bf528d0>



In [118]:

    
df.groupby('key').sum()



In [119]:

    
df.groupby('key')['data'].median()









    Out[119]:





key
A    1.5
B    2.5
C    3.5
Name: data, dtype: float64



In [120]:

    
df



In [121]:

    
df.groupby('key').aggregate([sum, max, np.median])



In [122]:

    
planets.groupby('method')['orbital_period'].median()









    Out[122]:





method
Astrometry                         631.180000
Eclipse Timing Variations         4343.500000
Imaging                          27500.000000
Microlensing                      3300.000000
Orbital Brightness Modulation        0.342887
Pulsar Timing                       66.541900
Pulsation Timing Variations       1170.000000
Radial Velocity                    360.200000
Transit                              5.714932
Transit Timing Variations           57.011000
Name: orbital_period, dtype: float64



In [123]:

    
planets.groupby('method')['year'].describe()









    Out[123]:







  
    
      
      count
      mean
      std
      min
      25%
      50%
      75%
      max
    
    
      method
      
      
      
      
      
      
      
      
    
  
  
    
      Astrometry
      2.0
      2011.500000
      2.121320
      2010.0
      2010.75
      2011.5
      2012.25
      2013.0
    
    
      Eclipse Timing Variations
      9.0
      2010.000000
      1.414214
      2008.0
      2009.00
      2010.0
      2011.00
      2012.0
    
    
      Imaging
      38.0
      2009.131579
      2.781901
      2004.0
      2008.00
      2009.0
      2011.00
      2013.0
    
    
      Microlensing
      23.0
      2009.782609
      2.859697
      2004.0
      2008.00
      2010.0
      2012.00
      2013.0
    
    
      Orbital Brightness Modulation
      3.0
      2011.666667
      1.154701
      2011.0
      2011.00
      2011.0
      2012.00
      2013.0
    
    
      Pulsar Timing
      5.0
      1998.400000
      8.384510
      1992.0
      1992.00
      1994.0
      2003.00
      2011.0
    
    
      Pulsation Timing Variations
      1.0
      2007.000000
      NaN
      2007.0
      2007.00
      2007.0
      2007.00
      2007.0
    
    
      Radial Velocity
      553.0
      2007.518987
      4.249052
      1989.0
      2005.00
      2009.0
      2011.00
      2014.0
    
    
      Transit
      397.0
      2011.236776
      2.077867
      2002.0
      2010.00
      2012.0
      2013.00
      2014.0
    
    
      Transit Timing Variations
      4.0
      2012.500000
      1.290994
      2011.0
      2011.75
      2012.5
      2013.25
      2014.0



In [124]:

    
len(pd.unique(planets.method))









    Out[124]:





10



In [125]:

    
pd.unique(planets.method).shape









    Out[125]:





(10,)



In [126]:

    
planets.groupby('method').sum()









    Out[126]:







  
    
      
      number
      orbital_period
      mass
      distance
      year
    
    
      method
      
      
      
      
      
    
  
  
    
      Astrometry
      2
      1.262360e+03
      NaN
      35.75
      4023
    
    
      Eclipse Timing Variations
      15
      4.276480e+04
      10.25000
      1261.44
      18090
    
    
      Imaging
      50
      1.418973e+06
      NaN
      2166.91
      76347
    
    
      Microlensing
      27
      2.207500e+04
      NaN
      41440.00
      46225
    
    
      Orbital Brightness Modulation
      5
      2.127920e+00
      NaN
      2360.00
      6035
    
    
      Pulsar Timing
      11
      3.671511e+04
      NaN
      1200.00
      9992
    
    
      Pulsation Timing Variations
      1
      1.170000e+03
      NaN
      NaN
      2007
    
    
      Radial Velocity
      952
      4.553151e+05
      1341.65638
      27348.11
      1110158
    
    
      Transit
      776
      8.377523e+03
      1.47000
      134242.77
      798461
    
    
      Transit Timing Variations
      9
      2.393505e+02
      NaN
      3313.00
      8050



In [127]:

    
planets.groupby('method')[['year', 'mass']].sum()









    Out[127]:







  
    
      
      year
      mass
    
    
      method
      
      
    
  
  
    
      Astrometry
      4023
      NaN
    
    
      Eclipse Timing Variations
      18090
      10.25000
    
    
      Imaging
      76347
      NaN
    
    
      Microlensing
      46225
      NaN
    
    
      Orbital Brightness Modulation
      6035
      NaN
    
    
      Pulsar Timing
      9992
      NaN
    
    
      Pulsation Timing Variations
      2007
      NaN
    
    
      Radial Velocity
      1110158
      1341.65638
    
    
      Transit
      798461
      1.47000
    
    
      Transit Timing Variations
      8050
      NaN



In [128]:

    
planets.groupby('method').aggregate([sum, np.mean])









    Out[128]:







  
    
      
      number
      orbital_period
      mass
      distance
      year
    
    
      
      sum
      mean
      sum
      mean
      sum
      mean
      sum
      mean
      sum
      mean
    
    
      method
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      Astrometry
      2
      1.000000
      1.262360e+03
      631.180000
      NaN
      NaN
      35.75
      17.875000
      4023
      2011.500000
    
    
      Eclipse Timing Variations
      15
      1.666667
      4.276480e+04
      4751.644444
      10.25000
      5.125000
      1261.44
      315.360000
      18090
      2010.000000
    
    
      Imaging
      50
      1.315789
      1.418973e+06
      118247.737500
      NaN
      NaN
      2166.91
      67.715937
      76347
      2009.131579
    
    
      Microlensing
      27
      1.173913
      2.207500e+04
      3153.571429
      NaN
      NaN
      41440.00
      4144.000000
      46225
      2009.782609
    
    
      Orbital Brightness Modulation
      5
      1.666667
      2.127920e+00
      0.709307
      NaN
      NaN
      2360.00
      1180.000000
      6035
      2011.666667
    
    
      Pulsar Timing
      11
      2.200000
      3.671511e+04
      7343.021201
      NaN
      NaN
      1200.00
      1200.000000
      9992
      1998.400000
    
    
      Pulsation Timing Variations
      1
      1.000000
      1.170000e+03
      1170.000000
      NaN
      NaN
      NaN
      NaN
      2007
      2007.000000
    
    
      Radial Velocity
      952
      1.721519
      4.553151e+05
      823.354680
      1341.65638
      2.630699
      27348.11
      51.600208
      1110158
      2007.518987
    
    
      Transit
      776
      1.954660
      8.377523e+03
      21.102073
      1.47000
      1.470000
      134242.77
      599.298080
      798461
      2011.236776
    
    
      Transit Timing Variations
      9
      2.250000
      2.393505e+02
      79.783500
      NaN
      NaN
      3313.00
      1104.333333
      8050
      2012.500000



In [129]:

    
%matplotlib inline
planets.groupby('method').sum().plot(kind='bar')









    Out[129]:





<matplotlib.axes._subplots.AxesSubplot at 0x11bfe26a0>



In [130]:

    
import matplotlib.pyplot as plt
%matplotlib inline



In [131]:

    
planets.groupby('year').sum()['number']









    Out[131]:





year
1989      1
1992      6
1994      3
1995      1
1996     15
1997      1
1998     11
1999     24
2000     27
2001     15
2002     46
2003     35
2004     41
2005     64
2006     43
2007     65
2008    120
2009    131
2010    193
2011    354
2012    258
2013    277
2014    117
Name: number, dtype: int64



In [132]:

    
plt.figure(figsize=(12, 5))
plt.plot(planets.groupby('year').sum()['number'])
plt.style.use('fivethirtyeight')
plt.title('Number of planets discovered by year')









    Out[132]:





<matplotlib.text.Text at 0x11c419668>



In [133]:

    
planets.head()









    Out[133]:







  
    
      
      method
      number
      orbital_period
      mass
      distance
      year
    
  
  
    
      0
      Radial Velocity
      1
      269.300
      7.10
      77.40
      2006
    
    
      1
      Radial Velocity
      1
      874.774
      2.21
      56.95
      2008
    
    
      2
      Radial Velocity
      1
      763.000
      2.60
      19.84
      2011
    
    
      3
      Radial Velocity
      1
      326.030
      19.40
      110.62
      2007
    
    
      4
      Radial Velocity
      1
      516.220
      10.50
      119.47
      2009



In [134]:

    
planets.boxplot(column=['distance'],
                figsize=(12, 10),
                by=['method'])









    Out[134]:





<matplotlib.axes._subplots.AxesSubplot at 0x11c4144a8>



In [135]:

    
planets[['distance', 'year']].head()



In [136]:

    
plt.figure(figsize=(13, 7))
plt.plot(planets.year, planets.distance, 'o');



In [137]:

    
import seaborn as sns



In [138]:

    
# regression
sns.lmplot(x='year', y='distance', data=planets, size=8);

Grouping example



In [139]:

    
decade = 10 * (planets['year'] // 10)
decade = decade.astype(str) + 's'
decade.name = 'decade'



In [140]:

    
planets.groupby(['method', decade])['number'].sum().unstack().fillna(0)









    Out[140]:







  
    
      decade
      1980s
      1990s
      2000s
      2010s
    
    
      method
      
      
      
      
    
  
  
    
      Astrometry
      0.0
      0.0
      0.0
      2.0
    
    
      Eclipse Timing Variations
      0.0
      0.0
      5.0
      10.0
    
    
      Imaging
      0.0
      0.0
      29.0
      21.0
    
    
      Microlensing
      0.0
      0.0
      12.0
      15.0
    
    
      Orbital Brightness Modulation
      0.0
      0.0
      0.0
      5.0
    
    
      Pulsar Timing
      0.0
      9.0
      1.0
      1.0
    
    
      Pulsation Timing Variations
      0.0
      0.0
      1.0
      0.0
    
    
      Radial Velocity
      1.0
      52.0
      475.0
      424.0
    
    
      Transit
      0.0
      0.0
      64.0
      712.0
    
    
      Transit Timing Variations
      0.0
      0.0
      0.0
      9.0

Pivot Table



In [141]:

    
titanic = sns.load_dataset('titanic')
titanic.head()









    Out[141]:







  
    
      
      survived
      pclass
      sex
      age
      sibsp
      parch
      fare
      embarked
      class
      who
      adult_male
      deck
      embark_town
      alive
      alone
    
  
  
    
      0
      0
      3
      male
      22.0
      1
      0
      7.2500
      S
      Third
      man
      True
      NaN
      Southampton
      no
      False
    
    
      1
      1
      1
      female
      38.0
      1
      0
      71.2833
      C
      First
      woman
      False
      C
      Cherbourg
      yes
      False
    
    
      2
      1
      3
      female
      26.0
      0
      0
      7.9250
      S
      Third
      woman
      False
      NaN
      Southampton
      yes
      True
    
    
      3
      1
      1
      female
      35.0
      1
      0
      53.1000
      S
      First
      woman
      False
      C
      Southampton
      yes
      False
    
    
      4
      0
      3
      male
      35.0
      0
      0
      8.0500
      S
      Third
      man
      True
      NaN
      Southampton
      no
      True



In [142]:

    
titanic.shape









    Out[142]:





(891, 15)



In [143]:

    
# number of missing data for each column
titanic.isnull().sum()









    Out[143]:





survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64



In [144]:

    
titanic.groupby('sex')[['survived']].mean()



In [145]:

    
titanic.groupby(['sex', 'class'])[['survived']].aggregate('mean').unstack()



In [146]:

    
# pivot table
titanic.pivot_table('survived', index='sex', columns='class')



In [147]:

    
age = pd.cut(titanic['age'], [0, 18, 80])
titanic.pivot_table('survived', ['sex', age], 'class')



In [148]:

    
fare = pd.qcut(titanic['fare'], 2)
titanic.pivot_table('survived', ['sex', age], [fare, 'class'])









    Out[148]:





fare            (-0.001, 14.454]                     (14.454, 512.329]  \
class                      First    Second     Third             First   
sex    age                                                               
female (0, 18]               NaN  1.000000  0.714286          0.909091   
       (18, 80]              NaN  0.880000  0.444444          0.972973   
male   (0, 18]               NaN  0.000000  0.260870          0.800000   
       (18, 80]              0.0  0.098039  0.125000          0.391304   

fare                                 
class              Second     Third  
sex    age                           
female (0, 18]   1.000000  0.318182  
       (18, 80]  0.914286  0.391304  
male   (0, 18]   0.818182  0.178571  
       (18, 80]  0.030303  0.192308



In [149]:

    
titanic.pivot_table('survived', index='sex', columns='class', margins=True)



In [150]:

    
births = pd.read_csv('data//births.csv')
births.head()



In [151]:

    
births['decade'] =  10 * (births['year'] // 10)
births.head()



In [152]:

    
births.pivot_table('births', index='year',columns='gender', aggfunc='sum' ).plot()
plt.ylabel('total births per year');



In [153]:

    
# sigma clipping  to remove outliars
quartiles = np.percentile(births['births'], [25,50,75])
quartiles









    Out[153]:





array([ 4358. ,  4814. ,  5289.5])



In [154]:

    
mu = quartiles[1]
mu









    Out[154]:





4814.0



In [155]:

    
sig = 0.74 * (quartiles[2] - quartiles[0])
sig









    Out[155]:





689.30999999999995



In [156]:

    
births = births.query('(births > @mu - 5 * @sig) & (births < @mu + 5 * @sig)')



In [157]:

    
# set day column to integer
births['day'] = births['day'].astype(int)



In [158]:

    
births.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 14610 entries, 0 to 15066
Data columns (total 6 columns):
year      14610 non-null int64
month     14610 non-null int64
day       14610 non-null int64
gender    14610 non-null object
births    14610 non-null int64
decade    14610 non-null int64
dtypes: int64(5), object(1)
memory usage: 799.0+ KB



In [159]:

    
births.index = pd.to_datetime(10000 * births.year +
                              100 * births.month +
                              births.day, 
                             format='%Y%m%d')
births['dayofweek'] = births.index.dayofweek
births.head()



In [160]:

    
births.pivot_table('births', index='dayofweek', columns='decade', aggfunc='mean').plot()
plt.gca().set_xticklabels(['Monday', 'Tuesday', 'Wed', 'Thursday', 'Fri', 'Sat', 'Sun'])
plt.ylabel('mean births by day');



In [161]:

    
births_by_day = births.pivot_table('births', [births.index.month, births.index.day])
births_by_day.head()



In [162]:

    
births_by_day.index = [pd.datetime(2012, month, day) for (month, day) in births_by_day.index]



In [163]:

    
births_by_day.head()



In [164]:

    
fig, ax = plt.subplots(figsize = (12, 4))
births_by_day.plot(ax =ax);

Time Series



In [165]:

    
date = np.array('2015-04-03', dtype=np.datetime64)
date









    Out[165]:





array(datetime.date(2015, 4, 3), dtype='datetime64[D]')



In [166]:

    
date + 2









    Out[166]:





numpy.datetime64('2015-04-05')



In [167]:

    
date + np.arange(10)









    Out[167]:





array(['2015-04-03', '2015-04-04', '2015-04-05', '2015-04-06',
       '2015-04-07', '2015-04-08', '2015-04-09', '2015-04-10',
       '2015-04-11', '2015-04-12'], dtype='datetime64[D]')



In [168]:

    
date2 = pd.to_datetime('3th of Feb, 1895')
date2.strftime('%A')









    Out[168]:





'Sunday'



In [169]:

    
date = date + np.arange(5)
index = pd.DatetimeIndex(date)
index









    Out[169]:





DatetimeIndex(['2015-04-03', '2015-04-04', '2015-04-05', '2015-04-06',
               '2015-04-07'],
              dtype='datetime64[ns]', freq=None)



In [170]:

    
data = pd.Series([0,1,2,3,4], index = index)
data









    Out[170]:





2015-04-03    0
2015-04-04    1
2015-04-05    2
2015-04-06    3
2015-04-07    4
dtype: int64



In [171]:

    
data.loc['2015-04-06']









    Out[171]:





3



In [172]:

    
data.loc['2015-04-06'] - data.loc['2015-04-04']









    Out[172]:





2



In [173]:

    
#!pip install pandas_Datareader



In [174]:

    
from pandas_datareader import data



In [175]:

    
goog = data.DataReader('GOOG', start='2014', end='2016', data_source='google')
goog.head()



In [176]:

    
goog = goog['Close']



In [177]:

    
goog.plot();



In [178]:

    
goog.plot(alpha=0.5, style='-')
goog.resample('BA').mean().plot(style=':')
goog.asfreq('BA').plot(style='--')
plt.legend(['input', 'resample','asfreq'], loc='upper left');



In [179]:

    
goog = goog.asfreq('D', method='pad')
ROI = 100 * (goog.tshift(-365) / goog - 1)
ROI.plot();

Seattle Bicycle Counts



In [180]:

    
URL = 'https://data.seattle.gov/api/views/65db-xm6k/rows.csv?accessType=DOWNLOAD'
from urllib.request import urlretrieve
urlretrieve(URL, 'FremontBridge.csv')









    Out[180]:





('FremontBridge.csv', <http.client.HTTPMessage at 0x1218068d0>)



In [181]:

    
data = pd.read_csv('FremontBridge.csv', index_col='Date', parse_dates=True)
data.head()









    Out[181]:







  
    
      
      Fremont Bridge West Sidewalk
      Fremont Bridge East Sidewalk
    
    
      Date
      
      
    
  
  
    
      2012-10-03 00:00:00
      4.0
      9.0
    
    
      2012-10-03 01:00:00
      4.0
      6.0
    
    
      2012-10-03 02:00:00
      1.0
      1.0
    
    
      2012-10-03 03:00:00
      2.0
      3.0
    
    
      2012-10-03 04:00:00
      6.0
      1.0



In [182]:

    
# rename columns
data.columns = ['West', 'East']



In [183]:

    
data['Total'] = data.eval('West + East')



In [184]:

    
data.head()









    Out[184]:







  
    
      
      West
      East
      Total
    
    
      Date
      
      
      
    
  
  
    
      2012-10-03 00:00:00
      4.0
      9.0
      13.0
    
    
      2012-10-03 01:00:00
      4.0
      6.0
      10.0
    
    
      2012-10-03 02:00:00
      1.0
      1.0
      2.0
    
    
      2012-10-03 03:00:00
      2.0
      3.0
      5.0
    
    
      2012-10-03 04:00:00
      6.0
      1.0
      7.0



In [185]:

    
# summary stats
data.dropna().describe()









    Out[185]:







  
    
      
      West
      East
      Total
    
  
  
    
      count
      40841.000000
      40841.00000
      40841.000000
    
    
      mean
      55.043706
      53.48777
      108.531476
    
    
      std
      74.173400
      77.24312
      137.372419
    
    
      min
      0.000000
      0.00000
      0.000000
    
    
      25%
      7.000000
      7.00000
      15.000000
    
    
      50%
      29.000000
      27.00000
      59.000000
    
    
      75%
      72.000000
      66.00000
      142.000000
    
    
      max
      913.000000
      717.00000
      1302.000000



In [186]:

    
import seaborn; seaborn.set()
data.plot()
plt.ylabel('Hourly bicycle count');



In [187]:

    
weekly = data.resample('W').sum()
weekly.plot(style=[':', '--', '-'])
plt.ylabel('Weekly bicycle count');



In [188]:

    
daily = data.resample('D').sum()
daily.rolling(30, center=True).sum().plot(style=[':', '--', '-'])
plt.ylabel('mean hourly count');



In [189]:

    
daily.rolling(50, center=True,
             win_type='gaussian').sum(std=10).plot(style=[':', '--', '-']);



In [190]:

    
by_time = data.groupby(data.index.time).mean()
hourly_ticks = 4 * 60 * 60 * np.arange(6)
by_time.plot(xticks = hourly_ticks, style=[':', '--', '-'])
plt.ylabel('Average hourly biycle count');



In [191]:

    
by_weekday = data.groupby(data.index.dayofweek).mean()
by_weekday.index=['Mon', 'Tue', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']
by_weekday.plot(style=[':', '--', '-'])
plt.ylabel('Average daily bicycle count');



In [192]:

    
data.head()









    Out[192]:







  
    
      
      West
      East
      Total
    
    
      Date
      
      
      
    
  
  
    
      2012-10-03 00:00:00
      4.0
      9.0
      13.0
    
    
      2012-10-03 01:00:00
      4.0
      6.0
      10.0
    
    
      2012-10-03 02:00:00
      1.0
      1.0
      2.0
    
    
      2012-10-03 03:00:00
      2.0
      3.0
      5.0
    
    
      2012-10-03 04:00:00
      6.0
      1.0
      7.0



In [193]:

    
weekend = np.where(data.index.weekday < 5, 'Weekday', 'Weekend')
by_time = data.groupby([weekend, data.index.time]).mean()



In [194]:

    
fig, ax = plt.subplots(1,2, figsize=(14,5))
by_time.loc['Weekday'].plot(ax=ax[0], title='Weekdays', xticks=hourly_ticks, style=[':', '--', '-'])
by_time.loc['Weekend'].plot(ax= ax[1], title='Weekends', xticks=hourly_ticks, style=[':', '--', '-']);



In [195]:

    
df1, df2, df3, df4, df5 = (pd.DataFrame(rng.randint(0, 1000, (100, 3)))
                          for i in range(5))



In [204]:

    
result1 = -df1 * df2 / (df3 + df4) - df5



In [203]:

    
result2 = pd.eval('-df1 * df2 / (df3 + df4) - df5')



In [205]:

    
np.allclose(result1, result2)









    Out[205]:





True



In [231]:

    
df = pd.DataFrame(rng.randint(0 , 300, (100,3) ), columns=['A', 'B', 'C'])
df.head()



In [232]:

    
r1 = (df['A'] + df['B'] / (df['C'] - 1 ))



In [233]:

    
r2 = pd.eval('(df.A + df.B / (df.C - 1 ))')



In [234]:

    
np.allclose(r1, r2)









    Out[234]:





False



In [237]:

    
%timeit pd.eval('(df.A + df.B / (df.C- 1 ))')









    



197 ms ± 24.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)



In [238]:

    
%timeit (df['A'] + df['B'] / (df['C'] - 1 ))









    



115 ms ± 285 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)



In [ ]:

	aaa	bbb
a	0.417933	0.634213
b	0.742840	0.110415
c	0.476192	0.821592

class	First	Second	Third	All
sex
female	0.968085	0.921053	0.500000	0.742038
male	0.368852	0.157407	0.135447	0.188908
All	0.629630	0.472826	0.242363	0.383838

	year	month	day	gender	births
0	1969	1	1	F	4046
1	1969	1	1	M	4440
2	1969	1	2	F	4454
3	1969	1	2	M	4548
4	1969	1	3	F	4548

	year	month	day	gender	births	decade
0	1969	1	1	F	4046	1960
1	1969	1	1	M	4440	1960
2	1969	1	2	F	4454	1960
3	1969	1	2	M	4548	1960
4	1969	1	3	F	4548	1960

	year	month	day	gender	births	decade	dayofweek
1969-01-01	1969	1	1	F	4046	1960	2
1969-01-01	1969	1	1	M	4440	1960	2
1969-01-02	1969	1	2	F	4454	1960	3
1969-01-02	1969	1	2	M	4548	1960	3
1969-01-03	1969	1	3	F	4548	1960	4

	area	population
California	423967	38332521
Florida	170312	19552860
Illinois	149995	12882135
New York	141297	19651127
Texas	695662	26448193

	California	Florida	Illinois	New York	Texas
area	4.239670e+05	1.703120e+05	1.499950e+05	1.412970e+05	6.956620e+05
population	3.833252e+07	1.955286e+07	1.288214e+07	1.965113e+07	2.644819e+07
density	9.041393e+01	1.148061e+02	8.588376e+01	1.390767e+02	3.801874e+01

	A	B	C	D
0	-1.000000	7.071068e-01	1.000000	-1.000000e+00
1	-0.707107	1.224647e-16	0.707107	-7.071068e-01
2	-0.707107	1.000000e+00	-0.707107	1.224647e-16

		total	under18
California	2000	NaN	9267089
California	2010	NaN	9284094
New York	2000	NaN	4687374
New York	2010	NaN	4318033
Texas	2000	NaN	5906301
Texas	2010	NaN	6879014

	state/region	ages	year	population	state	abbreviation
0	AL	under18	2012	1117489.0	Alabama	AL
1	AL	total	2012	4817528.0	Alabama	AL
2	AL	under18	2010	1130966.0	Alabama	AL
3	AL	total	2010	4785570.0	Alabama	AL
4	AL	under18	2011	1125763.0	Alabama	AL

	state/region	ages	year	population	state
2448	PR	under18	1990	NaN	NaN
2449	PR	total	1990	NaN	NaN
2450	PR	total	1991	NaN	NaN
2451	PR	under18	1991	NaN	NaN
2452	PR	total	1993	NaN	NaN

	method	number	orbital_period	mass	distance	year
0	Radial Velocity	1	269.300	7.10	77.40	2006
1	Radial Velocity	1	874.774	2.21	56.95	2008
2	Radial Velocity	1	763.000	2.60	19.84	2011
3	Radial Velocity	1	326.030	19.40	110.62	2007
4	Radial Velocity	1	516.220	10.50	119.47	2009

	number	orbital_period	mass	distance	year
count	498.00000	498.000000	498.000000	498.000000	498.000000
mean	1.73494	835.778671	2.509320	52.068213	2007.377510
std	1.17572	1469.128259	3.636274	46.596041	4.167284
min	1.00000	1.328300	0.003600	1.350000	1989.000000
25%	1.00000	38.272250	0.212500	24.497500	2005.000000
50%	1.00000	357.000000	1.245000	39.940000	2009.000000
75%	2.00000	999.600000	2.867500	59.332500	2011.000000
max	6.00000	17337.500000	25.000000	354.000000	2014.000000

	number	orbital_period	mass	distance	year
count	1035.000000	992.000000	513.000000	808.000000	1035.000000
mean	1.785507	2002.917596	2.638161	264.069282	2009.070531
std	1.240976	26014.728304	3.818617	733.116493	3.972567
min	1.000000	0.090706	0.003600	1.350000	1989.000000
25%	1.000000	5.442540	0.229000	32.560000	2007.000000
50%	1.000000	39.979500	1.260000	55.250000	2010.000000
75%	2.000000	526.005000	3.040000	178.500000	2012.000000
max	7.000000	730000.000000	25.000000	8500.000000	2014.000000

	count	mean	std	min	25%	50%	75%	max
method
Astrometry	2.0	2011.500000	2.121320	2010.0	2010.75	2011.5	2012.25	2013.0
Eclipse Timing Variations	9.0	2010.000000	1.414214	2008.0	2009.00	2010.0	2011.00	2012.0
Imaging	38.0	2009.131579	2.781901	2004.0	2008.00	2009.0	2011.00	2013.0
Microlensing	23.0	2009.782609	2.859697	2004.0	2008.00	2010.0	2012.00	2013.0
Orbital Brightness Modulation	3.0	2011.666667	1.154701	2011.0	2011.00	2011.0	2012.00	2013.0
Pulsar Timing	5.0	1998.400000	8.384510	1992.0	1992.00	1994.0	2003.00	2011.0
Pulsation Timing Variations	1.0	2007.000000	NaN	2007.0	2007.00	2007.0	2007.00	2007.0
Radial Velocity	553.0	2007.518987	4.249052	1989.0	2005.00	2009.0	2011.00	2014.0
Transit	397.0	2011.236776	2.077867	2002.0	2010.00	2012.0	2013.00	2014.0
Transit Timing Variations	4.0	2012.500000	1.290994	2011.0	2011.75	2012.5	2013.25	2014.0

	number	orbital_period	mass	distance	year
method
Astrometry	2	1.262360e+03	NaN	35.75	4023
Eclipse Timing Variations	15	4.276480e+04	10.25000	1261.44	18090
Imaging	50	1.418973e+06	NaN	2166.91	76347
Microlensing	27	2.207500e+04	NaN	41440.00	46225
Orbital Brightness Modulation	5	2.127920e+00	NaN	2360.00	6035
Pulsar Timing	11	3.671511e+04	NaN	1200.00	9992
Pulsation Timing Variations	1	1.170000e+03	NaN	NaN	2007
Radial Velocity	952	4.553151e+05	1341.65638	27348.11	1110158
Transit	776	8.377523e+03	1.47000	134242.77	798461
Transit Timing Variations	9	2.393505e+02	NaN	3313.00	8050

	survived	pclass	sex	age	sibsp	fare	embarked	class	who	adult_male	deck	embark_town	alive	alone
0	0	3	male	22.0	1	7.2500	S	Third	man	True	NaN	Southampton	no	False
1	1	1	female	38.0	1	71.2833	C	First	woman	False	C	Cherbourg	yes	False
2	1	3	female	26.0	0	7.9250	S	Third	woman	False	NaN	Southampton	yes	True
3	1	1	female	35.0	1	53.1000	S	First	woman	False	C	Southampton	yes	False
4	0	3	male	35.0	0	8.0500	S	Third	man	True	NaN	Southampton	no	True

	class	First	Second	Third
sex	age
female	(0, 18]	0.909091	1.000000	0.511628
female	(18, 80]	0.972973	0.900000	0.423729
male	(0, 18]	0.800000	0.600000	0.215686
male	(18, 80]	0.375000	0.071429	0.133663

	births
2012-01-01	4009.225
2012-01-02	4247.400
2012-01-03	4500.900
2012-01-04	4571.350
2012-01-05	4603.625

	Open	High	Low	Close	Volume
Date
2014-01-02	557.17	558.32	553.58	556.00	NaN
2014-01-03	556.94	557.91	551.91	551.95	NaN
2014-01-06	555.95	558.87	552.67	558.10	NaN
2014-01-07	561.94	569.28	560.02	568.86	NaN
2014-01-08	572.43	573.09	566.08	570.04	NaN

	Fremont Bridge West Sidewalk	Fremont Bridge East Sidewalk
Date
2012-10-03 00:00:00	4.0	9.0
2012-10-03 01:00:00	4.0	6.0
2012-10-03 02:00:00	1.0	1.0
2012-10-03 03:00:00	2.0	3.0
2012-10-03 04:00:00	6.0	1.0

	West	East	Total
Date
2012-10-03 00:00:00	4.0	9.0	13.0
2012-10-03 01:00:00	4.0	6.0	10.0
2012-10-03 02:00:00	1.0	1.0	2.0
2012-10-03 03:00:00	2.0	3.0	5.0
2012-10-03 04:00:00	6.0	1.0	7.0

	West	East	Total
count	40841.000000	40841.00000	40841.000000
mean	55.043706	53.48777	108.531476
std	74.173400	77.24312	137.372419
min	0.000000	0.00000	0.000000
25%	7.000000	7.00000	15.000000
50%	29.000000	27.00000	59.000000
75%	72.000000	66.00000	142.000000
max	913.000000	717.00000	1302.000000