In [12]:
import pandas as pd
import numpy as np
In [13]:
# Build a small student table from a dict of columns.
# The student names are used as the row labels (index).
student_names = ['alice', 'bob', 'eve']
student_columns = {
    'phone': ['123-1234', '321-4321', '321-4321'],
    'age': [13, 12, 13],
    'grade': [78, 77, 92],
}
students = pd.DataFrame(student_columns, index=student_names)
students
Out[13]:
In [14]:
# row labels of the frame: 'alice', 'bob', 'eve'
students.index
Out[14]:
In [15]:
# Dataframe from numpy array
# Seed the global NumPy RNG first so the "random" 6x4 values -- and every
# output derived from them in later cells -- are identical on each
# Restart & Run All.
np.random.seed(42)
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=['index1', 'index2', 'index3', 'index4', 'index5', 'index6'],
    columns=list('ABCD'),
)
df
Out[15]:
In [16]:
#first few lines
# head() with no argument shows the first 5 rows
df.head()
Out[16]:
In [17]:
# pass n to control how many rows are shown
df.head(3)
Out[17]:
In [18]:
#last few lines
# tail() with no argument shows the last 5 rows
df.tail()
Out[18]:
In [19]:
df.tail(2)
Out[19]:
In [20]:
# type of data
type(df)
Out[20]:
In [21]:
# number of rows and columns
# returned as a (rows, columns) tuple -- (6, 4) for the frame built above
df.shape
Out[21]:
In [22]:
# information about data
# prints index range, per-column dtype and non-null count, and memory usage
df.info()
In [23]:
# summary statistics (count, mean, std, min, quartiles, max) per numeric column
df.describe()
Out[23]:
In [24]:
# A Series is a one-dimensional set of values with an index.
# No index is supplied here, so pandas assigns the default 0..5 labels;
# np.nan marks a missing value.
raw_values = [1, 3, 4, np.nan, 4, 2]
s = pd.Series(raw_values)
s
Out[24]:
In [25]:
# adding index later
s.index = ['a','b','c','d','e', 'f']
s
Out[25]:
In [26]:
# Missing data
s.isnull()
Out[26]:
In [27]:
# plotting
import matplotlib.pyplot as plt
%matplotlib inline
In [28]:
# line plot of the Series values against its index
s.plot()
Out[28]:
In [29]:
# line plot of the DataFrame: one line per column, index on the x-axis
df.plot()
Out[29]:
In [30]:
# see portion of data in dataframe or series
s[s.index > 'c']
Out[30]:
In [31]:
s[s.isnull() == False]
Out[31]:
In [32]:
df.head()
Out[32]:
In [33]:
# select column
df.A
Out[33]:
In [34]:
df['B']
Out[34]:
In [35]:
# column types
df.dtypes
Out[35]:
In [36]:
# column names
df.columns
Out[36]:
In [37]:
# multiple columns
df[ ['A','B']]
Out[37]:
In [38]:
# number of rows
df.shape[0]
Out[38]:
In [39]:
# select a row by its index label (.loc is label-based)
df.loc['index2']
Out[39]:
In [40]:
# row label and column labels together: columns 'A' and 'C' of row 'index3'
df.loc['index3', ['A', 'C']]
Out[40]:
In [41]:
# select by integer position (.iloc is position-based, zero-indexed):
# row 1, column 3 -> the 'D' value of 'index2'
df.iloc[1,3]
Out[41]:
In [42]:
# get rows by name or number
df.ix[0]
Out[42]:
In [43]:
df.ix[ ['index2', 'index1' ]]
Out[43]:
In [44]:
# df.ix[row, column]
df.ix[ [0,3], ['A', 'D']]
Out[44]:
In [45]:
df.iloc[3, :]
Out[45]:
In [46]:
df.iloc[:, 2]
Out[46]:
In [47]:
# add column to dataframe
# the list supplies one value per row (6 rows); this modifies df in place,
# so every later cell sees the extra 'Z' column
df['Z'] = [1,3,4,5,6,8]
df
Out[47]:
In [48]:
# boolean-mask filter: the row(s) whose 'A' value equals the column maximum
df[df['A'] == df['A'].max()]
Out[48]:
In [49]:
#function
df.Z.apply(lambda Z: Z + 0.1)
Out[49]:
In [50]:
#built-in
df.A.mean()
Out[50]:
In [51]:
df.A.max()
Out[51]:
In [52]:
df.A.min()
Out[52]:
In [53]:
df.count()
Out[53]:
In [54]:
#correlation between columns
df.corr()
Out[54]:
In [55]:
# cummulative max
df.cummax()
Out[55]:
In [56]:
# redisplay the students table used by the grouping examples
students
Out[56]:
In [57]:
# aggregation: mean grade grouped by age
# the result is a Series indexed by the distinct ages (12 and 13)
students.groupby('age')['grade'].mean()
Out[57]:
In [58]:
#bins
bin = np.linspace(70, 100, 4)
bin
Out[58]:
In [59]:
# grades grouped by 70s 80s and 90s and age of students
students.groupby( np.digitize(students.grade, bin)).age.mean()
Out[59]:
In [60]:
# plain Python list of the integers 0..19, used for the slicing examples
num = [*range(20)]
In [61]:
# show the full list
num
Out[61]:
In [63]:
# num[ start: end: jump]
# start and end omitted -> whole list; step 2 -> every second item (0, 2, ..., 18)
num[ : : 2]
Out[63]:
In [66]:
# negative step walks backwards: from index 15 down to (but excluding) index 1,
# in steps of 3 -> [15, 12, 9, 6, 3]
num[ 15: 1 : -3]
Out[66]:
In [ ]: