In [ ]:
## Pandas basic
  # uses dataframe to put data in easy to use format
  # SQL like - relational
  # fast read right from storage

In [3]:
import pandas as pd
import numpy as np

In [63]:
# DataFrame from dict
students = pd.DataFrame( {
    'phone': ['123-1234', '321-4321', '321-4321'],
    'age': [13, 12, 13],
    'grade': [78, 77, 92]}, 
    index = ['alice', 'bob', 'eve'] )
students


Out[63]:
age grade phone
alice 13 78 123-1234
bob 12 77 321-4321
eve 13 92 321-4321

In [5]:
students.index


Out[5]:
Index(['alice', 'bob', 'eve'], dtype='object')

In [6]:
# Dataframe from numpy array
df = pd.DataFrame(np.random.randn(6,4), 
                  index=['index1', 'index2', 'index3', 'index4','index5','index6'], 
                  columns = list('ABCD'))
df


Out[6]:
A B C D
index1 -0.156835 -0.361788 0.230273 1.107232
index2 -0.982796 -0.694776 1.188058 -0.873255
index3 1.372878 0.592330 0.238245 1.098033
index4 -0.206060 -0.638952 0.335568 -2.091381
index5 -0.963493 -0.273156 -0.113527 -0.155731
index6 0.382200 0.072323 0.216336 0.055544

In [ ]:
# DataFrame from csv file
load csv --> df = pd.read_csv('./PWD/file.csv', delimiter="\t")
save csv --> df.to_csv('./PWD/newFile.csv  ')

In [7]:
#first few lines
df.head()


Out[7]:
A B C D
index1 -0.156835 -0.361788 0.230273 1.107232
index2 -0.982796 -0.694776 1.188058 -0.873255
index3 1.372878 0.592330 0.238245 1.098033
index4 -0.206060 -0.638952 0.335568 -2.091381
index5 -0.963493 -0.273156 -0.113527 -0.155731

In [9]:
df.head(3)


Out[9]:
A B C D
index1 -0.156835 -0.361788 0.230273 1.107232
index2 -0.982796 -0.694776 1.188058 -0.873255
index3 1.372878 0.592330 0.238245 1.098033

In [10]:
#last few lines
df.tail()


Out[10]:
A B C D
index2 -0.982796 -0.694776 1.188058 -0.873255
index3 1.372878 0.592330 0.238245 1.098033
index4 -0.206060 -0.638952 0.335568 -2.091381
index5 -0.963493 -0.273156 -0.113527 -0.155731
index6 0.382200 0.072323 0.216336 0.055544

In [11]:
df.tail(2)


Out[11]:
A B C D
index5 -0.963493 -0.273156 -0.113527 -0.155731
index6 0.382200 0.072323 0.216336 0.055544

In [12]:
# type of data
type(df)


Out[12]:
pandas.core.frame.DataFrame

In [14]:
# number of rows and columns
df.shape


Out[14]:
(6, 4)

In [15]:
# information about data
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, index1 to index6
Data columns (total 4 columns):
A    6 non-null float64
B    6 non-null float64
C    6 non-null float64
D    6 non-null float64
dtypes: float64(4)
memory usage: 240.0+ bytes

In [17]:
df.describe()


Out[17]:
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean -0.092351 -0.217336 0.349159 -0.143260
std 0.887802 0.483560 0.438655 1.222480
min -0.982796 -0.694776 -0.113527 -2.091381
25% -0.774135 -0.569661 0.219821 -0.693874
50% -0.181447 -0.317472 0.234259 -0.050094
75% 0.247441 -0.014047 0.311237 0.837411
max 1.372878 0.592330 1.188058 1.107232

In [24]:
# series --> 1D set of data with an index
s = pd.Series([1, 3, 4, np.nan, 4, 2])
s


Out[24]:
0    1.0
1    3.0
2    4.0
3    NaN
4    4.0
5    2.0
dtype: float64

In [26]:
# adding index later
s.index = ['a','b','c','d','e', 'f']
s


Out[26]:
a    1.0
b    3.0
c    4.0
d    NaN
e    4.0
f    2.0
dtype: float64

In [30]:
# Missing data
s.isnull()


Out[30]:
a    False
b    False
c    False
d     True
e    False
f    False
dtype: bool

In [29]:
# plotting 
import matplotlib.pyplot as plt
%matplotlib inline

In [32]:
s.plot()


Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa8d44c4a90>

In [33]:
df.plot()


Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa8d447f400>

In [34]:
# see portion of data in dataframe or series
s[s.index > 'c']


Out[34]:
d    NaN
e    4.0
f    2.0
dtype: float64

In [36]:
s[s.isnull() == False]


Out[36]:
a    1.0
b    3.0
c    4.0
e    4.0
f    2.0
dtype: float64

In [37]:
df.head()


Out[37]:
A B C D
index1 1.041647 -0.281104 -2.357189 -0.497115
index2 -0.613589 0.232681 -0.130158 -1.382819
index3 -0.523167 0.249878 0.422815 0.628948
index4 0.711498 -0.111896 -0.359935 0.050962
index5 2.005816 -0.111702 0.495628 -0.243297

In [41]:
# select column
df.A


Out[41]:
index1    1.041647
index2   -0.613589
index3   -0.523167
index4    0.711498
index5    2.005816
index6    1.127781
Name: A, dtype: float64

In [40]:
df['B']


Out[40]:
index1   -0.281104
index2    0.232681
index3    0.249878
index4   -0.111896
index5   -0.111702
index6    0.606418
Name: B, dtype: float64

In [20]:
# column types
df.dtypes


Out[20]:
A    float64
B    float64
C    float64
D    float64
dtype: object

In [21]:
# column names
df.columns


Out[21]:
Index(['A', 'B', 'C', 'D'], dtype='object')

In [22]:
# multiple columns
df[ ['A','B']]


Out[22]:
A B
index1 -0.156835 -0.361788
index2 -0.982796 -0.694776
index3 1.372878 0.592330
index4 -0.206060 -0.638952
index5 -0.963493 -0.273156
index6 0.382200 0.072323

In [23]:
# number of rows
df.shape[0]


Out[23]:
6

In [42]:
# select label / name
df.loc['index2']


Out[42]:
A   -0.613589
B    0.232681
C   -0.130158
D   -1.382819
Name: index2, dtype: float64

In [43]:
# label and column intersection
df.loc['index3', ['A', 'C']]


Out[43]:
A   -0.523167
C    0.422815
Name: index3, dtype: float64

In [45]:
# select by label location/ index
df.iloc[1,3]


Out[45]:
-1.3828185812005171

In [27]:
# get rows by name or number
df.ix[0]


Out[27]:
A   -0.156835
B   -0.361788
C    0.230273
D    1.107232
Name: index1, dtype: float64

In [42]:
df.ix[ ['index2', 'index1' ]]


Out[42]:
A B C D
index2 -0.982796 -0.694776 1.188058 -0.873255
index1 -0.156835 -0.361788 0.230273 1.107232

In [43]:
# df.ix[row, column]
df.ix[ [0,3], ['A', 'D']]


Out[43]:
A D
index1 -0.156835 1.107232
index4 -0.206060 -2.091381

In [44]:
df.iloc[3, :]


Out[44]:
A   -0.206060
B   -0.638952
C    0.335568
D   -2.091381
Name: index4, dtype: float64

In [45]:
df.iloc[:, 2]


Out[45]:
index1    0.230273
index2    1.188058
index3    0.238245
index4    0.335568
index5   -0.113527
index6    0.216336
Name: C, dtype: float64

In [46]:
# add column to dataframe
df['Z'] = [1,3,4,5,6,8]
df


Out[46]:
A B C D Z
index1 -0.156835 -0.361788 0.230273 1.107232 1
index2 -0.982796 -0.694776 1.188058 -0.873255 3
index3 1.372878 0.592330 0.238245 1.098033 4
index4 -0.206060 -0.638952 0.335568 -2.091381 5
index5 -0.963493 -0.273156 -0.113527 -0.155731 6
index6 0.382200 0.072323 0.216336 0.055544 8

In [47]:
df[df['A'] == df['A'].max()]


Out[47]:
A B C D Z
index3 1.372878 0.59233 0.238245 1.098033 4

In [51]:
#function
df.Z.apply(lambda Z: Z + 0.1)


Out[51]:
index1    1.1
index2    3.1
index3    4.1
index4    5.1
index5    6.1
index6    8.1
Name: Z, dtype: float64

In [55]:
#built-in
df.A.mean()


Out[55]:
0.6249978542633915

In [56]:
df.A.max()


Out[56]:
2.00581642006934

In [57]:
df.A.min()


Out[57]:
-0.6135885851876196

In [52]:
df.count()


Out[52]:
A    6
B    6
C    6
D    6
Z    6
dtype: int64

In [54]:
#correlation between columns
df.corr()


Out[54]:
A B C D Z
A 1.000000 0.864102 -0.287728 0.500292 0.104440
B 0.864102 1.000000 -0.452648 0.694440 0.261574
C -0.287728 -0.452648 1.000000 -0.333016 -0.366497
D 0.500292 0.694440 -0.333016 1.000000 -0.282841
Z 0.104440 0.261574 -0.366497 -0.282841 1.000000

In [55]:
# cummulative max
df.cummax()


Out[55]:
A B C D Z
index1 -0.156835 -0.361788 0.230273 1.107232 1.0
index2 -0.156835 -0.361788 1.188058 1.107232 3.0
index3 1.372878 0.592330 1.188058 1.107232 4.0
index4 1.372878 0.592330 1.188058 1.107232 5.0
index5 1.372878 0.592330 1.188058 1.107232 6.0
index6 1.372878 0.592330 1.188058 1.107232 8.0

In [67]:
students


Out[67]:
age grade phone
alice 13 78 123-1234
bob 12 77 321-4321
eve 13 92 321-4321

In [68]:
# aggregation: mean grade grouped by age
students.groupby('age')['grade'].mean()


Out[68]:
age
12    77
13    85
Name: grade, dtype: int64

In [76]:
#bins
bin = np.linspace(70, 100, 4)
bin


Out[76]:
array([ 70.,  80.,  90., 100.])

In [77]:
# grades grouped by 70s 80s and 90s and age of students
students.groupby( np.digitize(students.grade, bin)).age.mean()


Out[77]:
1    12.5
3    13.0
Name: age, dtype: float64