In [1]:
import numpy as np
import pandas as pd
In [2]:
# Fix the global NumPy RNG state so every run of the notebook draws the same numbers.
np.random.seed(482010)
# 15 success probabilities, uniform on [0, 1)
x = np.random.random(size=15)
# for each probability, the number of successes in 10 Bernoulli trials
y = np.random.binomial(n=10, p=x)
# start from an empty frame and attach the two arrays as columns, one at a time
df = pd.DataFrame()
df['prob'] = x
df['count'] = y
df.head()
Out[2]:
In [3]:
# Equivalent construction: hand the constructor a {column name: values} mapping
# instead of attaching the columns one by one.
df2 = pd.DataFrame({
    'prob': x,
    'count': y,
})
df2.head()
Out[3]:
In [4]:
# column and row names
# A DataFrame carries two sets of labels: `columns` for the columns and
# `index` for the rows. Print both so they can be compared side by side.
print("Column names: ", df.columns)
print("Row names (index): ", df.index)
In [5]:
# add a new column to our existing df
# randomly give each observation a categorization
# The sample size is taken from the frame itself rather than hard-coded, so the
# new column always has exactly one label per row even if the number of
# observations created earlier changes.
df['category'] = np.random.choice(['A','B','C'], size=len(df))
df
Out[5]:
In [6]:
# create a new column representing locations
# using None to represent missing values
# As with the category column, draw exactly one value per existing row instead of
# repeating a hard-coded row count.
df['location'] = np.random.choice(["Raleigh","Durham","Chapel Hill", None], size=len(df))
df
Out[6]:
In [7]:
# Flag the observations whose location is missing.
# The result is a boolean Series; None entries are considered null and come back True.
df["location"].isna()
Out[7]:
In [8]:
# Transposing swaps the two axes: rows become columns and columns become rows.
df.transpose()
Out[8]:
In [9]:
# Numeric columns behave like numpy arrays under arithmetic:
# the scalar is applied to every element of the column.
2 * df['count']
Out[9]:
In [10]:
# Arithmetic between two columns is element-wise, row by row.
df['prob'] * df['count']
Out[10]:
Pandas DataFrames support three different indexing mechanisms.
.iloc
-- indexes rows and columns by integers. Indexes go from 0 to $n-1$ for rows, and 0 to $p-1$ for columns, where $n$ and $p$ are the number of rows and columns respectively. Slicing is like normal python slices.
.loc
-- indexes rows and columns by labels. If you use slices with .loc
, note that contrary to standard Python slices, both the start and the stop are included! Note that if the row names (df.index
) are integers, then slices passed to .loc are still interpreted as labels rather than as positions.
.ix
-- allows you to mix integers and labels. However, it defaults to label based indexing like .loc
, and only adopts integer based indexing if row or column labels don't have integers in them. For the most part, .ix
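works intuitively, but if you're not careful it can lead to some unexpected behavior. Note that .ix was deprecated in pandas 0.20 and removed in pandas 1.0, so new code should use .loc and .iloc instead.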
For a fuller explanation of the differences between these indexing mechanisms see:
In [11]:
# Plain square brackets with a single label select the column of that name.
df["prob"] # labels by themselves index column names
Out[11]:
In [12]:
# Attribute-style access to the same column as the bracket form df["prob"].
df.prob # if your column name is a valid python variable name you can access it like an attribute
Out[12]:
In [13]:
# get specific columns using list of column names
# Passing a list (note the double brackets) selects several columns at once;
# head() then limits the display to the first few rows.
df[["count","location"]].head() # lookup the head method in the docs
Out[13]:
In [14]:
# get specific columns using list of column names
# Same list-of-names selection as before; tail() shows the last few rows
# instead of the first.
df[["prob","category"]].tail() # lookup the tail method in the docs
Out[14]:
In [15]:
# get first three rows of df using slices
# A slice inside plain square brackets selects rows, whereas a label selects a column.
df[:3]
Out[15]:
In [16]:
# last three rows of df using slices
# Negative slice bounds count from the end, as with Python lists.
df[-3:]
Out[16]:
In [17]:
# Positional selection: the first three rows (positions 0, 1, 2) and every column.
df.iloc[:3,:] # integer indexing and slicing using iloc
Out[17]:
In [18]:
# integer indexing and slicing: first four rows (positions 0-3, stop excluded)
# and the columns at positions 1 and 3.
# The column positions are given as a list: current pandas rejects a tuple in
# this slot with "IndexingError: Too many indexers".
df.iloc[:4, [1, 3]]
Out[18]:
In [19]:
# Label indexing and slicing, in contrast to the positional .iloc call:
# the row slice is by label and includes the stop label 4, and the
# columns are picked by name.
df.loc[:4, ["count", "location"]]
Out[19]:
In [20]:
# Mixed label/position selection. The original `.ix` indexer was deprecated in
# pandas 0.20 and removed in pandas 1.0, so the same selection is spelled out
# explicitly here:
#   rows    -> by label, like .loc (the row names are integers; label 4 is included)
#   columns -> by position, like .iloc (the first three columns)
df.loc[:4, df.columns[:3]]
Out[20]:
In [21]:
# 'category' is a categorical variable. Count the number in each category
# value_counts() returns one row per distinct value with its frequency.
df['category'].value_counts()
Out[21]:
In [22]:
# Attribute-style column access gives the same Series as df['category'].
df.category.value_counts() # same as above
Out[22]:
In [23]:
# similar counts for location, note that None is treated as missing values and not counted
# (so the counts shown can add up to fewer than the number of rows)
df['location'].value_counts()
Out[23]:
In [24]:
# Average of the "prob" column across all observations.
df["prob"].mean()
Out[24]:
In [25]:
# Split the rows by location, then average "prob" within each group.
df.groupby("location")["prob"].mean()
Out[25]:
In [26]:
# mean of all the continuous variables, group by category
# numeric_only=True restricts the aggregation to the numeric columns. Without
# it, pandas >= 2.0 raises a TypeError when it tries to average the string
# column "location"; older versions silently dropped such columns.
df.groupby("category").mean(numeric_only=True)
Out[26]:
In [27]:
# Group on several columns at once by passing a list of keys;
# the result has one row per (location, category) combination.
df.groupby(by=["location", "category"]).mean()
Out[27]:
In [ ]: