In [1]:
## from numpy import * #Load all the numpy packages
%matplotlib inline
import numpy as np
from numpy import *
This loads every public numpy name directly into the notebook's namespace. Note: avoid this in deployed code, where it clutters the namespace and hides where each name came from. It is shown here only by example - it's fine to use for learning purposes and legibility.
As we'll see later, the proper convention is to use:
import numpy as np
And then to call the needed functions explicitly through the np prefix, for example np.zeros(3) or np.arange(10).
An array object represents a multidimensional, homogeneous array of fixed-size items.
In [2]:
# Build a few arrays with different shapes and contents
a = np.zeros(3)                               # 1-D: three zeros
b = np.ones((2, 3))                           # 2 x 3, filled with ones
c = np.random.randint(1, 10, size=(2, 3, 4))  # random integers in [1, 10), shape 2 x 3 x 4
d = np.arange(11)                             # 0, 1, ..., 10
What are these functions?
arange?
In [3]:
# Note the way each array is printed:
print("a: ",a)
print("b: ",b)
print("c: ",c)
print("d: ",d)
In [4]:
# Two 1-D arrays of equal length, used below to show element-wise arithmetic.
# (The pasted ">>>" doctest prompts were dropped: they are not Python syntax.)
a = np.array([20, 30, 40, 50])
b = np.arange(4)
b
Out[4]:
In [5]:
# Subtraction between arrays is applied element by element
c = a - b
c
Out[5]:
In [6]:
# Exponentiation is element-wise as well
b ** 2
Out[6]:
In [7]:
# one-dimensional arrays work like lists:
a = np.arange(10)**2
In [8]:
a
Out[8]:
In [9]:
a[2:5]
Out[9]:
In [10]:
b = np.random.randint(1,100,(4,4))
In [11]:
b
Out[11]:
In [12]:
# Guess the output
print(b[2,3])
In [13]:
# Guess the output
print(b[0,0])
In [14]:
# Guess the output
b[0:3,1]
Out[14]:
In [15]:
#Guess the output
b[:,1]
Out[15]:
In [16]:
#Guess the output
b[1:3,:]
Out[16]:
Source: pandas.pydata.org
In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [18]:
dates = pd.date_range('20140101',periods=6)
dates
Out[18]:
In [19]:
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
z = pd.DataFrame(index = df.index, columns = df.columns)
df.columns
Out[19]:
In [20]:
# Index, columns, underlying numpy data
# Call-style print is valid on both Python 2 and Python 3 for a single argument.
print(df)
In [21]:
df.T
Out[21]:
In [22]:
df
Out[22]:
In [23]:
temp = df.T
temp
Out[23]:
In [35]:
# Keep the transpose under its own name. Rebinding `df` itself would leave it
# with A-D as the index, while the later cells that use df.A, df[['A','B']]
# and df.loc[dates[0]] need the original orientation (dates as the index).
df_transposed = df.T
In [36]:
df2 = pd.DataFrame({ 'A' : 1.,
'B' : pd.Timestamp('20130102'),
'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
'D' : np.array([3] * 4,dtype='int32'),
'E' : 'foo' })
df2
Out[36]:
In [37]:
# With specific dtypes
df2.dtypes
Out[37]:
In [38]:
df
Out[38]:
In [39]:
df.head()
Out[39]:
In [40]:
df.tail(3)
Out[40]:
In [41]:
df.index
Out[41]:
In [42]:
df.describe()
Out[42]:
In [43]:
df
Out[43]:
In [44]:
z = df.sort_values(by="B")
z
Out[44]:
In [45]:
df[['A','B']]
Out[45]:
In [46]:
df[0:3]
Out[46]:
In [47]:
# By label
df.loc[dates[0]]
Out[47]:
In [48]:
# multi-axis by label
df.loc[:,['A','B']]
Out[48]:
In [49]:
# Date Range
df.loc['20140102':'20140104',['B']]
Out[49]:
In [50]:
# Fast access to scalar
df.at[dates[1],'B']
Out[50]:
In [51]:
df.describe()
Out[51]:
In [52]:
df.mean(),df.mean(1) # Operation on two different axes
# there are pandas' methods for min, max etc. as well
Out[52]:
In [53]:
df
Out[53]:
In [54]:
df[df.A < 0] # Basically a 'where' operation
Out[54]:
In [55]:
df.A < 0
Out[55]:
In [56]:
# Two conditions ANDed together; each comparison is parenthesised because
# `&` binds more tightly than `<` and `>`
df[(df.A < 0) & (df.B > .5) ]
Out[56]:
In [57]:
# Two conditions ORed together
df[(df.A < -1) | (df.D > 0)]
Out[57]:
In [58]:
np.random.randn(10,4)
Out[58]:
In [59]:
##Concatenating pandas objects together
# create a dataframe to use as an example
df2 = pd.DataFrame(np.random.randn(10,4))
df2
Out[59]:
In [60]:
# Break it into pieces
pieces = [df2[:3], df2[3:7],df2[7:]]
pieces
Out[60]:
In [61]:
pd.concat(pieces)
Out[61]:
In [ ]:
# Also can "Join" and "Append"
In [62]:
# Work on an independent copy so the assignment below leaves `df` unchanged
df_posA = df.copy() # without .copy(), df_posA would be another name for df and the edit would change df too
# Rows whose column A is negative are replaced by their negated values
df_posA[df_posA.A < 0] = -1*df_posA
In [65]:
# Show the modified copy next to the original frame for comparison.
# Call-style print is valid on both Python 2 and Python 3 for a single argument.
print(df_posA)
print(df)
In [ ]:
#Setting new column aligns data by index
s1 = pd.Series([1,2,3,4,5,6],index=pd.date_range('20140102',periods=6))
In [ ]:
s1
In [ ]:
df['F'] = s1
In [ ]:
df
In [66]:
# Add a column with missing data
df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + ['E'])
In [68]:
df1.loc[dates[0]:dates[1],'E'] = 1
In [69]:
df1
Out[69]:
In [70]:
# find where values are null
pd.isnull(df1)
Out[70]:
In [71]:
df
Out[71]:
In [72]:
df.apply(np.cumsum)
Out[72]:
In [73]:
df.apply(lambda x: x.max() - x.min())
Out[73]:
In [74]:
# Built in string methods
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.str.lower()
Out[74]:
In [75]:
df3 = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
In [76]:
df3
Out[76]:
In [78]:
df3.groupby(['A','B']).mean()
Out[78]:
In [ ]:
# You can also stack or unstack levels
In [79]:
a = df3.groupby(['A','B']).sum()
In [ ]:
# Pivot Tables
# pd.pivot_table(df3, values=['C','D'], index=['A'], columns=['B'])  # older pandas spelled these keywords rows= / cols=
In [80]:
import pandas as pd
import numpy as np
In [ ]:
# 100 Seconds starting on January 1st
rng = pd.date_range('1/1/2014', periods=100, freq='S')
In [ ]:
# Give each second a random value
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
In [ ]:
ts
In [ ]:
# Built-in resampling: aggregate the per-second samples into 1-minute means.
# resample() no longer accepts how=; call the aggregation on the returned
# resampler object instead.
ts.resample('1Min').mean()
In [ ]:
# Many additional time series features
# ts. #use tab
In [ ]:
ts.plot()
In [ ]:
def randwalk(startdate, points, plot=True):
    """Build a random walk as a date-indexed Series and optionally plot it.

    Parameters
    ----------
    startdate : str or datetime-like
        First entry of the index; passed to ``pd.date_range``.
    points : int
        Number of steps in the walk (and of index entries).
    plot : bool, default True
        When True, draw the walk with ``Series.plot`` as a side effect.
        Pass False to only compute the series.

    Returns
    -------
    pandas.Series
        Cumulative sum of ``points`` draws from ``np.random.randn``.
    """
    steps = pd.Series(np.random.randn(points),
                      index=pd.date_range(startdate, periods=points))
    walk = steps.cumsum()
    if plot:
        walk.plot()
    return walk
In [ ]:
# Using pandas to make a simple random walker by repeatedly running:
a=randwalk('1/1/2012',1000)
In [ ]:
# Pandas plot function will print with labels as default
In [ ]:
# Four independent random walks sharing the time index of `ts`.
# The row count is taken from that index instead of a hard-coded 100, so the
# frame still builds if the length of `ts` changes.
df = pd.DataFrame(np.random.randn(len(ts.index), 4), index=ts.index, columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
# Draw on one explicit Axes so the plot and its legend share a single figure
fig, ax = plt.subplots()
df.plot(ax=ax)
ax.legend(loc='best');
Recommended Resources
| Name | Description |
|---|---|
| Official Pandas Tutorials | Wes & Company's selection of tutorials and lectures |
| Julia Evans Pandas Cookbook | Great resource with examples from weather, bikes and 311 calls |
| Learn Pandas Tutorials | A great series of Pandas tutorials from Dave Rojas |
| Research Computing Python Data PYNBs | A super awesome set of python notebooks from a meetup-based course exclusively devoted to pandas |