In [1]:
import pandas as pd
In [2]:
# IPython help lookup: the trailing `?` displays the documentation for pd.ExcelFile
pd.ExcelFile?
In [3]:
# IPython help lookup: the trailing `?` displays the documentation for pd.read_csv
pd.read_csv?
In [4]:
# Load data
# PIB-PET values come from sheet 'Sheet1' of an Excel workbook. The workbook is
# opened in a `with` block so its file handle is closed as soon as the sheet
# has been parsed, instead of staying open for the life of the kernel.
with pd.ExcelFile('data/fake_pib_pet_data.xlsx') as workbook:
    pib_pet_df = workbook.parse('Sheet1')
# APOE values come from a comma-separated text file.
apoe_df = pd.read_csv('data/fake_apoe_data.csv', sep=',')
In [5]:
# Peek at the first five rows of the PIB-PET DataFrame
pib_pet_df.head(5)
Out[5]:
In [6]:
# Peek at the first five rows of the APOE DataFrame
apoe_df.head(5)
Out[6]:
In [7]:
# Remove the leftmost column of the APOE DataFrame, then preview the result.
# NOTE: apoe_df is rebound to the trimmed frame, so running this cell a second
# time would drop one more column.
leftmost_label = apoe_df.columns[0]
apoe_df = apoe_df.drop(leftmost_label, axis='columns')
apoe_df.head(5)
Out[7]:
In [8]:
# Get column names
# (`columns` holds the labels along the column axis of apoe_df)
apoe_df.columns
Out[8]:
In [9]:
# IPython help lookup: the trailing `?` displays the documentation for pd.DataFrame
pd.DataFrame?
pandas user guide on indexing and selecting data: https://pandas.pydata.org/docs/user_guide/indexing.html
In [10]:
# Label the rows of each DataFrame with its subject IDs.
# The raw values of the 'Subject' column are assigned as the index; the
# 'Subject' column itself stays in the frame and is used again further down
# for sorting and merging.
pib_pet_df.index = pib_pet_df['Subject'].values
apoe_df.index = apoe_df['Subject'].values
In [11]:
# The row labels (leftmost in the display) are now the same as the Subject column
pib_pet_df.head(5)
Out[11]:
In [12]:
# Check row names
# (`index` holds the row labels, which an earlier cell set to the Subject values)
pib_pet_df.index
Out[12]:
In [13]:
# APOE data for Subject 2:
# label-based selection -- the row whose index label is 'S2', all columns
apoe_df.loc['S2', :]
Out[13]:
In [14]:
# Get PIB-PET data for Subject 2
# i.e. select a single row using row number
# `.iloc` is purely positional, so 1 is the second row whatever the index
# labels are. (The older `.ix` indexer was deprecated and later removed from
# pandas; `.iloc` is the explicit positional replacement.)
pib_pet_df.iloc[1].head()
Out[14]:
In [15]:
# All PIB-PET values for the Left Hippocampus:
# label-based selection -- every row, the single column named 'Left-Hippocampus'
pib_pet_df.loc[:, 'Left-Hippocampus']
Out[15]:
In [16]:
pd.DataFrame.sort?
In [17]:
# Sort by a column name
# `sort_values` returns a new, sorted DataFrame and leaves pib_pet_df itself
# unchanged. (It replaces the removed `DataFrame.sort` method.)
pib_pet_df.sort_values('Subject')
Out[17]:
In [18]:
# Keep both DataFrames ordered by subject ID.
# `sort_values` replaces the removed `DataFrame.sort`; it returns a new frame,
# so the result is assigned back to the same name.
pib_pet_df = pib_pet_df.sort_values('Subject')
apoe_df = apoe_df.sort_values('Subject')
pandas user guide on merging, joining and concatenating: https://pandas.pydata.org/docs/user_guide/merging.html
In [19]:
# IPython help lookup: the trailing `?` displays the documentation for pd.DataFrame.merge
pd.DataFrame.merge?
In [20]:
# These subjects have both APOE and PIB data
# The intersection of the two sets of row labels is computed once and reused.
shared_subjects = set(pib_pet_df.index) & set(apoe_df.index)
# print() as a function call works on Python 3 (and, with a single argument,
# prints the same thing on Python 2).
print(len(shared_subjects))
shared_subjects
Out[20]:
In [21]:
# Combine the APOE and PIB data into one DataFrame with an "inner" join:
# only subjects that appear in both inputs are kept.
# NOTE: the join matches rows on the 'Subject' key alone, so the other
# columns of the merged rows can still contain missing data.
inner_df = apoe_df.merge(pib_pet_df, on='Subject', how='inner')
inner_df
Out[21]:
In [22]:
# Combine the APOE and PIB data into one DataFrame with an "outer" join:
# every subject/row from either input is kept.
outer_df = pd.merge(apoe_df, pib_pet_df, on='Subject', how='outer')
outer_df
Out[22]:
In [23]:
# IPython help lookup: the trailing `?` displays the documentation for pd.DataFrame.dropna
pd.DataFrame.dropna?
In [24]:
# Drop rows that have no subject ID.
# Missing values elsewhere in outer_df are tolerated, but a row whose
# Subject is NaN is unusable, so only the 'Subject' column is checked.
# The filtered frame is displayed, not stored: outer_df itself is unchanged.
outer_df.dropna(subset=['Subject'], how='any', axis='index')
Out[24]:
https://github.com/jvns/pandas-cookbook/tree/master/cookbook
In [ ]: