In [ ]:
## Pandas basics
# pandas uses the DataFrame to hold data in an easy-to-use tabular format
# SQL-like, relational style of working with data
# reads data quickly, straight from storage
In [3]:
import pandas as pd
import numpy as np
In [63]:
# Build a DataFrame from a dict: every key becomes a column and every list
# supplies that column's values; `index` provides the row labels.
students = pd.DataFrame(
    data=dict(
        phone=['123-1234', '321-4321', '321-4321'],
        age=[13, 12, 13],
        grade=[78, 77, 92],
    ),
    index=['alice', 'bob', 'eve'],
)
students
Out[63]:
In [5]:
# row labels of the frame (the names passed as `index` when it was built)
students.index
Out[5]:
In [6]:
# DataFrame from a numpy array: 6 rows x 4 columns of standard-normal values.
# A locally seeded RandomState is used instead of the global np.random state so
# the values (and every output derived from `df`) are identical on each
# Restart & Run All.
df = pd.DataFrame(np.random.RandomState(0).randn(6, 4),
                  index=['index1', 'index2', 'index3', 'index4', 'index5', 'index6'],
                  columns=list('ABCD'))
df
Out[6]:
In [ ]:
# DataFrame from csv file
# Reference snippets only: they are kept as comments because the paths are
# placeholders and the "load csv -->" prefix is not valid Python.
# load csv --> df = pd.read_csv('./PWD/file.csv', delimiter="\t")
# save csv --> df.to_csv('./PWD/newFile.csv')
In [7]:
# first few rows of the frame (head() with no argument)
df.head()
Out[7]:
In [9]:
# only the first 3 rows
df.head(3)
Out[9]:
In [10]:
# last few rows of the frame (tail() with no argument)
df.tail()
Out[10]:
In [11]:
# only the last 2 rows
df.tail(2)
Out[11]:
In [12]:
# Python type of the object itself (df was built with pd.DataFrame)
type(df)
Out[12]:
In [14]:
# (number of rows, number of columns) as a tuple
df.shape
Out[14]:
In [15]:
# printed summary of the frame: index, columns, dtypes and memory usage
df.info()
In [17]:
# summary statistics (count, mean, std, min, quartiles, max) per numeric column
df.describe()
Out[17]:
In [24]:
# Series: a one-dimensional set of values paired with an index.
# np.nan marks a missing entry.
s = pd.Series(data=[1, 3, 4, np.nan, 4, 2])
s
Out[24]:
In [26]:
# Replace the index after construction: one label per existing value.
s.index = list('abcdef')
s
Out[26]:
In [30]:
# Missing data: boolean mask that is True wherever a value is missing
pd.isnull(s)
Out[30]:
In [29]:
# plotting
import matplotlib.pyplot as plt
%matplotlib inline
In [32]:
# plot the Series with the default plot settings
s.plot()
Out[32]:
In [33]:
# plot the DataFrame with the default plot settings
df.plot()
Out[33]:
In [34]:
# View a portion of a Series (same idea works for a DataFrame): keep only the
# entries whose index label sorts after 'c'.
s.loc[s.index > 'c']
Out[34]:
In [36]:
# keep only the non-missing entries; notnull() builds the mask directly
# instead of comparing isnull() against False
s[s.notnull()]
Out[36]:
In [37]:
# quick look at the frame again before selecting columns
df.head()
Out[37]:
In [41]:
# select one column by attribute access (the column name is used as an attribute)
df.A
Out[41]:
In [40]:
# select one column by bracket access with the column name as a string
df['B']
Out[40]:
In [20]:
# data type of each column
df.dtypes
Out[20]:
In [21]:
# column labels of the frame
df.columns
Out[21]:
In [22]:
# Several columns at once: pass a list of column names (all rows kept).
df.loc[:, ['A', 'B']]
Out[22]:
In [23]:
# Row count of the frame.
len(df)
Out[23]:
In [42]:
# select a row by its index label
df.loc['index2']
Out[42]:
In [43]:
# intersection of a row label and a list of column labels
df.loc['index3', ['A', 'C']]
Out[43]:
In [45]:
# select a single value by integer position (not by label):
# row position 1, column position 3, both zero-based
df.iloc[1,3]
Out[45]:
In [27]:
# get a row by number (integer position).
# `.ix` was removed in pandas 1.0; use `.iloc` for positions and `.loc` for
# labels. The index here holds strings, so `.ix[0]` meant position 0.
df.iloc[0]
Out[27]:
In [42]:
# get rows by name, in the order requested (`.loc` replaces the removed `.ix`)
df.loc[['index2', 'index1']]
Out[42]:
In [43]:
# df.loc[row, column]: rows given by position, columns given by label.
# The removed `.ix` allowed mixing the two; with `.loc` the row positions are
# first translated to labels through df.index.
df.loc[df.index[[0, 3]], ['A', 'D']]
Out[43]:
In [44]:
# Entire row at integer position 3 (all columns).
df.iloc[3]
Out[44]:
In [45]:
# entire column at integer position 2 (all rows)
df.iloc[:, 2]
Out[45]:
In [46]:
# add column to dataframe: assigning a list to a new column name creates it;
# the list supplies one value per row.
# NOTE: this modifies df in place, so later cells see the extra 'Z' column.
df['Z'] = [1,3,4,5,6,8]
df
Out[46]:
In [47]:
# Row(s) where column A reaches its maximum value.
df.loc[df['A'].eq(df['A'].max())]
Out[47]:
In [51]:
# Apply a function to every element of a column; the result is a new Series
# and df itself is left unchanged.
df['Z'].apply(lambda value: value + 0.1)
Out[51]:
In [55]:
# built-in aggregations: mean of column A
df.A.mean()
Out[55]:
In [56]:
# largest value in column A
df.A.max()
Out[56]:
In [57]:
# smallest value in column A
df.A.min()
Out[57]:
In [52]:
# number of non-missing values in each column
df.count()
Out[52]:
In [54]:
# pairwise correlation between the columns of df
df.corr()
Out[54]:
In [55]:
# cumulative max: running maximum down each column
df.cummax()
Out[55]:
In [67]:
# show the students frame again before aggregating it
students
Out[67]:
In [68]:
# Aggregation: split the grade column by each student's age, then average
# the grades within every age group.
students['grade'].groupby(students['age']).mean()
Out[68]:
In [76]:
# Bin edges: 4 evenly spaced points from 70 to 100 inclusive.
# NOTE(review): the name `bin` shadows Python's builtin bin(); it is kept
# because the grouping cell further down refers to it by this name.
bin = np.linspace(start=70, stop=100, num=4)
bin
Out[76]:
In [77]:
# Mean age of students per grade bucket. np.digitize turns every grade into
# the number of the interval (between the edges in `bin`) that it falls in,
# and those interval numbers are used as the group keys.
students.groupby(np.digitize(students['grade'], bin))['age'].mean()
Out[77]: