Pandas basic

  • uses dataframe to put data in easy to use format
  • SQL like - relational
  • fast read right from storage

In [12]:
import pandas as pd
import numpy as np

creating dataframe

  1. from dict
  2. from numpy array
  3. from csv
  4. from fwf

In [13]:
# DataFrame from dict
students = pd.DataFrame( {
    'phone': ['123-1234', '321-4321', '321-4321'],
    'age': [13, 12, 13],
    'grade': [78, 77, 92]}, 
    index = ['alice', 'bob', 'eve'] )
students


Out[13]:
age grade phone
alice 13 78 123-1234
bob 12 77 321-4321
eve 13 92 321-4321

In [14]:
students.index


Out[14]:
Index(['alice', 'bob', 'eve'], dtype='object')

In [15]:
# Dataframe from numpy array
df = pd.DataFrame(np.random.randn(6,4), 
                  index=['index1', 'index2', 'index3', 'index4','index5','index6'], 
                  columns = list('ABCD'))
df


Out[15]:
A B C D
index1 0.730595 1.187878 -0.057026 0.497039
index2 0.163752 -1.658283 -0.660962 1.165278
index3 -1.203011 1.454400 0.611386 0.166255
index4 0.644283 -0.875351 1.336898 0.632972
index5 -0.136466 0.879580 0.894363 0.302853
index6 -1.627352 -0.430911 0.371515 -1.582012

DataFrame from csv file

load csv --> df = pd.read_csv('./PWD/file.csv', delimiter="\t")
save csv --> df.to_csv('./PWD/newFile.csv ')

fwf with time series

data = pd.read_fwf('location/file.txt', header=None, index_col=0, parse_dates=[[0,1]], infer_datetime_format=True)


In [16]:
#first few lines
df.head()


Out[16]:
A B C D
index1 0.730595 1.187878 -0.057026 0.497039
index2 0.163752 -1.658283 -0.660962 1.165278
index3 -1.203011 1.454400 0.611386 0.166255
index4 0.644283 -0.875351 1.336898 0.632972
index5 -0.136466 0.879580 0.894363 0.302853

In [17]:
df.head(3)


Out[17]:
A B C D
index1 0.730595 1.187878 -0.057026 0.497039
index2 0.163752 -1.658283 -0.660962 1.165278
index3 -1.203011 1.454400 0.611386 0.166255

In [18]:
#last few lines
df.tail()


Out[18]:
A B C D
index2 0.163752 -1.658283 -0.660962 1.165278
index3 -1.203011 1.454400 0.611386 0.166255
index4 0.644283 -0.875351 1.336898 0.632972
index5 -0.136466 0.879580 0.894363 0.302853
index6 -1.627352 -0.430911 0.371515 -1.582012

In [19]:
df.tail(2)


Out[19]:
A B C D
index5 -0.136466 0.879580 0.894363 0.302853
index6 -1.627352 -0.430911 0.371515 -1.582012

Understanding data


In [20]:
# type of data
type(df)


Out[20]:
pandas.core.frame.DataFrame

In [21]:
# number of rows and columns
df.shape


Out[21]:
(6, 4)

In [22]:
# information about data
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, index1 to index6
Data columns (total 4 columns):
A    6 non-null float64
B    6 non-null float64
C    6 non-null float64
D    6 non-null float64
dtypes: float64(4)
memory usage: 240.0+ bytes

In [23]:
df.describe()


Out[23]:
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean -0.238033 0.092886 0.416029 0.197064
std 0.974665 1.260956 0.707197 0.937517
min -1.627352 -1.658283 -0.660962 -1.582012
25% -0.936375 -0.764241 0.050109 0.200404
50% 0.013643 0.224334 0.491450 0.399946
75% 0.524150 1.110804 0.823618 0.598989
max 0.730595 1.454400 1.336898 1.165278

Series


In [24]:
# series --> 1D set of data with an index
s = pd.Series([1, 3, 4, np.nan, 4, 2])
s


Out[24]:
0    1.0
1    3.0
2    4.0
3    NaN
4    4.0
5    2.0
dtype: float64

In [25]:
# adding index later
s.index = ['a','b','c','d','e', 'f']
s


Out[25]:
a    1.0
b    3.0
c    4.0
d    NaN
e    4.0
f    2.0
dtype: float64

In [26]:
# Missing data
s.isnull()


Out[26]:
a    False
b    False
c    False
d     True
e    False
f    False
dtype: bool

Plotting


In [27]:
# plotting 
import matplotlib.pyplot as plt
%matplotlib inline

In [28]:
s.plot()


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3377435a58>

In [29]:
df.plot()


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f3375395518>

selecting data


In [30]:
# see portion of data in dataframe or series
s[s.index > 'c']


Out[30]:
d    NaN
e    4.0
f    2.0
dtype: float64

In [31]:
s[s.isnull() == False]


Out[31]:
a    1.0
b    3.0
c    4.0
e    4.0
f    2.0
dtype: float64

In [32]:
df.head()


Out[32]:
A B C D
index1 0.730595 1.187878 -0.057026 0.497039
index2 0.163752 -1.658283 -0.660962 1.165278
index3 -1.203011 1.454400 0.611386 0.166255
index4 0.644283 -0.875351 1.336898 0.632972
index5 -0.136466 0.879580 0.894363 0.302853

In [33]:
# select column
df.A


Out[33]:
index1    0.730595
index2    0.163752
index3   -1.203011
index4    0.644283
index5   -0.136466
index6   -1.627352
Name: A, dtype: float64

In [34]:
df['B']


Out[34]:
index1    1.187878
index2   -1.658283
index3    1.454400
index4   -0.875351
index5    0.879580
index6   -0.430911
Name: B, dtype: float64

In [35]:
# column types
df.dtypes


Out[35]:
A    float64
B    float64
C    float64
D    float64
dtype: object

In [36]:
# column names
df.columns


Out[36]:
Index(['A', 'B', 'C', 'D'], dtype='object')

In [37]:
# multiple columns
df[ ['A','B']]


Out[37]:
A B
index1 0.730595 1.187878
index2 0.163752 -1.658283
index3 -1.203011 1.454400
index4 0.644283 -0.875351
index5 -0.136466 0.879580
index6 -1.627352 -0.430911

In [38]:
# number of rows
df.shape[0]


Out[38]:
6

In [39]:
# select label / name
df.loc['index2']


Out[39]:
A    0.163752
B   -1.658283
C   -0.660962
D    1.165278
Name: index2, dtype: float64

In [40]:
# label and column intersection
df.loc['index3', ['A', 'C']]


Out[40]:
A   -1.203011
C    0.611386
Name: index3, dtype: float64

In [41]:
# select by label location/ index
df.iloc[1,3]


Out[41]:
1.1652775947912888

In [42]:
# get rows by name or number
df.ix[0]


/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:2: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
Out[42]:
A    0.730595
B    1.187878
C   -0.057026
D    0.497039
Name: index1, dtype: float64

In [43]:
df.ix[ ['index2', 'index1' ]]


Out[43]:
A B C D
index2 0.163752 -1.658283 -0.660962 1.165278
index1 0.730595 1.187878 -0.057026 0.497039

In [44]:
# df.ix[row, column]
df.ix[ [0,3], ['A', 'D']]


Out[44]:
A D
index1 0.730595 0.497039
index4 0.644283 0.632972

In [45]:
df.iloc[3, :]


Out[45]:
A    0.644283
B   -0.875351
C    1.336898
D    0.632972
Name: index4, dtype: float64

In [46]:
df.iloc[:, 2]


Out[46]:
index1   -0.057026
index2   -0.660962
index3    0.611386
index4    1.336898
index5    0.894363
index6    0.371515
Name: C, dtype: float64

In [47]:
# add column to dataframe
df['Z'] = [1,3,4,5,6,8]
df


Out[47]:
A B C D Z
index1 0.730595 1.187878 -0.057026 0.497039 1
index2 0.163752 -1.658283 -0.660962 1.165278 3
index3 -1.203011 1.454400 0.611386 0.166255 4
index4 0.644283 -0.875351 1.336898 0.632972 5
index5 -0.136466 0.879580 0.894363 0.302853 6
index6 -1.627352 -0.430911 0.371515 -1.582012 8

In [48]:
df[df['A'] == df['A'].max()]


Out[48]:
A B C D Z
index1 0.730595 1.187878 -0.057026 0.497039 1

function


In [49]:
#function
df.Z.apply(lambda Z: Z + 0.1)


Out[49]:
index1    1.1
index2    3.1
index3    4.1
index4    5.1
index5    6.1
index6    8.1
Name: Z, dtype: float64

In [50]:
#built-in
df.A.mean()


Out[50]:
-0.23803309717511253

In [51]:
df.A.max()


Out[51]:
0.7305949004218479

In [52]:
df.A.min()


Out[52]:
-1.6273516826385388

In [53]:
df.count()


Out[53]:
A    6
B    6
C    6
D    6
Z    6
dtype: int64

In [54]:
#correlation between columns
df.corr()


Out[54]:
A B C D Z
A 1.000000 -0.163294 -0.045417 0.782776 -0.657204
B -0.163294 1.000000 0.256123 -0.138042 -0.197453
C -0.045417 0.256123 1.000000 -0.198932 0.488496
D 0.782776 -0.138042 -0.198932 1.000000 -0.732204
Z -0.657204 -0.197453 0.488496 -0.732204 1.000000

In [55]:
# cummulative max
df.cummax()


Out[55]:
A B C D Z
index1 0.730595 1.187878 -0.057026 0.497039 1.0
index2 0.730595 1.187878 -0.057026 1.165278 3.0
index3 0.730595 1.454400 0.611386 1.165278 4.0
index4 0.730595 1.454400 1.336898 1.165278 5.0
index5 0.730595 1.454400 1.336898 1.165278 6.0
index6 0.730595 1.454400 1.336898 1.165278 8.0

aggregation


In [56]:
students


Out[56]:
age grade phone
alice 13 78 123-1234
bob 12 77 321-4321
eve 13 92 321-4321

In [57]:
# aggregation: mean grade grouped by age
students.groupby('age')['grade'].mean()


Out[57]:
age
12    77
13    85
Name: grade, dtype: int64

In [58]:
#bins
bin = np.linspace(70, 100, 4)
bin


Out[58]:
array([ 70.,  80.,  90., 100.])

In [59]:
# grades grouped by 70s 80s and 90s and age of students
students.groupby( np.digitize(students.grade, bin)).age.mean()


Out[59]:
1    12.5
3    13.0
Name: age, dtype: float64

more indexing


In [60]:
num = list(range(20))

In [61]:
num


Out[61]:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]

In [63]:
# num[ start: end: jump]
num[ : : 2]


Out[63]:
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

In [66]:
num[ 15: 1 : -3]


Out[66]:
[15, 12, 9, 6, 3]

In [ ]: