In [1]:
import pandas as pd

Load Data


In [2]:
pd.ExcelFile?

In [3]:
pd.read_csv?

In [4]:
# Load data
pib_pet_df = pd.ExcelFile('data/fake_pib_pet_data.xlsx').parse('Sheet1')
apoe_df = pd.read_csv('data/fake_apoe_data.csv', sep=',')

In [5]:
# Preview DataFrame
pib_pet_df.head()


Out[5]:
Subject Left-Cerebellum-Cortex Left-Thalamus-Proper Left-Caudate Left-Putamen Left-Pallidum Left-Hippocampus Left-Amygdala Right-Cerebellum-Cortex Right-Thalamus-Proper ... ctx-rh-rostralanteriorcingulate ctx-rh-rostralmiddlefrontal ctx-rh-superiorfrontal ctx-rh-superiorparietal ctx-rh-superiortemporal ctx-rh-supramarginal ctx-rh-frontalpole ctx-rh-temporalpole ctx-rh-transversetemporal ctx-rh-insula
0 S4 0.425247 0.665001 0.460931 1.025706 0.138759 0.986567 1.469463 0.463216 0.333045 ... 0.530403 0.755903 1.788937 1.547954 0.864034 0.517150 0.944125 0.104910 0.636804 0.458578
1 S2 0.481369 0.970674 0.328054 0.985217 2.364674 1.645989 0.601938 0.643652 0.943901 ... 0.244447 2.785548 2.333329 0.098286 1.088393 0.939494 2.003004 0.836558 0.506255 0.402447
2 S7 0.783799 0.661049 0.175157 0.475327 0.838076 1.518468 0.566472 0.398546 1.004035 ... 1.171654 0.709581 0.546841 0.716018 0.072203 0.783156 2.060965 1.024808 1.269127 0.492403
3 S14 0.484094 0.676801 1.437728 2.077313 0.547277 0.555102 1.401707 0.287049 1.342868 ... 0.275494 1.440458 0.287285 0.432101 0.824702 0.846591 0.557986 1.232137 0.742445 0.360133
4 S9 0.913431 0.659872 1.658435 0.549286 0.329609 0.366407 0.559081 0.622520 0.650571 ... 0.387701 0.836313 1.577652 0.321051 1.441353 0.204672 0.916508 1.395682 1.125795 0.229797

5 rows × 81 columns


In [6]:
# Preview DataFrame
apoe_df.head()


Out[6]:
Unnamed: 0 Subject APOE_Copy1 APOE_Copy2
0 0 S2 E4 E3
1 1 S13 E4 E2
2 2 S8 E2 E3
3 3 S4 E2 E4
4 4 S5 E4 E4

In [7]:
# Delete the first column
apoe_df = apoe_df.drop(apoe_df.columns[0], axis=1)
apoe_df.head()


Out[7]:
Subject APOE_Copy1 APOE_Copy2
0 S2 E4 E3
1 S13 E4 E2
2 S8 E2 E3
3 S4 E2 E4
4 S5 E4 E4

In [8]:
# Get column names
apoe_df.columns


Out[8]:
Index([u'Subject', u'APOE_Copy1', u'APOE_Copy2'], dtype='object')

Note: You can easioly create a DataFrame from a numpy array. See documentation in cell below.


In [9]:
pd.DataFrame?

Indexing

Pandas documentation on "Indexing and Selecting Data"

http://pandas.pydata.org/pandas-docs/stable/indexing.html

  • NOTE: Indexing can get complicated in Pandas. Check out the Pandas Cookbook, Chapter 2 (see "Additional Resources below) for a guided introduction to the basics.

In [10]:
# Set up row indexing of DataFrame based on subject IDs
pib_pet_df.index = pib_pet_df.Subject.values
apoe_df.index = apoe_df.Subject.values

In [11]:
#Notice that the index, on the far left, now matches the Subject column
pib_pet_df.head()


Out[11]:
Subject Left-Cerebellum-Cortex Left-Thalamus-Proper Left-Caudate Left-Putamen Left-Pallidum Left-Hippocampus Left-Amygdala Right-Cerebellum-Cortex Right-Thalamus-Proper ... ctx-rh-rostralanteriorcingulate ctx-rh-rostralmiddlefrontal ctx-rh-superiorfrontal ctx-rh-superiorparietal ctx-rh-superiortemporal ctx-rh-supramarginal ctx-rh-frontalpole ctx-rh-temporalpole ctx-rh-transversetemporal ctx-rh-insula
S4 S4 0.425247 0.665001 0.460931 1.025706 0.138759 0.986567 1.469463 0.463216 0.333045 ... 0.530403 0.755903 1.788937 1.547954 0.864034 0.517150 0.944125 0.104910 0.636804 0.458578
S2 S2 0.481369 0.970674 0.328054 0.985217 2.364674 1.645989 0.601938 0.643652 0.943901 ... 0.244447 2.785548 2.333329 0.098286 1.088393 0.939494 2.003004 0.836558 0.506255 0.402447
S7 S7 0.783799 0.661049 0.175157 0.475327 0.838076 1.518468 0.566472 0.398546 1.004035 ... 1.171654 0.709581 0.546841 0.716018 0.072203 0.783156 2.060965 1.024808 1.269127 0.492403
S14 S14 0.484094 0.676801 1.437728 2.077313 0.547277 0.555102 1.401707 0.287049 1.342868 ... 0.275494 1.440458 0.287285 0.432101 0.824702 0.846591 0.557986 1.232137 0.742445 0.360133
S9 S9 0.913431 0.659872 1.658435 0.549286 0.329609 0.366407 0.559081 0.622520 0.650571 ... 0.387701 0.836313 1.577652 0.321051 1.441353 0.204672 0.916508 1.395682 1.125795 0.229797

5 rows × 81 columns


In [12]:
# Check Row Names
pib_pet_df.index


Out[12]:
Index([u'S4', u'S2', u'S7', u'S14', u'S9', u'S6', u'S11', u'S8', u'S5',
       u'S10'],
      dtype='object')

In [13]:
# Get APOE data for Subject 2
# i.e. select a row using an index
apoe_df.loc['S2']


Out[13]:
Subject       S2
APOE_Copy1    E4
APOE_Copy2    E3
Name: S2, dtype: object

In [14]:
# Get PIB-PET data for Subject 2
# i.e. select a single row using row number
pib_pet_df.ix[1].head()


Out[14]:
Subject                          S2
Left-Cerebellum-Cortex    0.4813693
Left-Thalamus-Proper      0.9706744
Left-Caudate              0.3280536
Left-Putamen              0.9852168
Name: S2, dtype: object

In [15]:
# Get all PIB-PET data on the Left Hippocampus
# i.e. select a column using column name
pib_pet_df['Left-Hippocampus']


Out[15]:
S4     0.986567
S2     1.645989
S7     1.518468
S14    0.555102
S9     0.366407
S6     2.525695
S11    0.642106
S8     1.250021
S5     0.995690
S10    1.362993
Name: Left-Hippocampus, dtype: float64

Sort


In [16]:
pd.DataFrame.sort?

In [17]:
# Sort by a column name
pib_pet_df.sort('Subject')


Out[17]:
Subject Left-Cerebellum-Cortex Left-Thalamus-Proper Left-Caudate Left-Putamen Left-Pallidum Left-Hippocampus Left-Amygdala Right-Cerebellum-Cortex Right-Thalamus-Proper ... ctx-rh-rostralanteriorcingulate ctx-rh-rostralmiddlefrontal ctx-rh-superiorfrontal ctx-rh-superiorparietal ctx-rh-superiortemporal ctx-rh-supramarginal ctx-rh-frontalpole ctx-rh-temporalpole ctx-rh-transversetemporal ctx-rh-insula
S10 S10 0.220978 0.207769 1.737944 1.619239 0.311993 1.362993 0.396153 0.588856 1.853778 ... 1.320036 1.608255 1.547771 0.283531 3.212926 0.624633 0.278144 0.832677 1.280211 3.906851
S11 S11 0.779993 1.227137 1.021403 1.199690 0.196101 0.642106 0.833604 0.724067 2.811528 ... 1.148492 2.698353 1.523860 1.148073 0.380291 1.007278 1.124297 0.795056 0.653855 1.251686
S14 S14 0.484094 0.676801 1.437728 2.077313 0.547277 0.555102 1.401707 0.287049 1.342868 ... 0.275494 1.440458 0.287285 0.432101 0.824702 0.846591 0.557986 1.232137 0.742445 0.360133
S2 S2 0.481369 0.970674 0.328054 0.985217 2.364674 1.645989 0.601938 0.643652 0.943901 ... 0.244447 2.785548 2.333329 0.098286 1.088393 0.939494 2.003004 0.836558 0.506255 0.402447
S4 S4 0.425247 0.665001 0.460931 1.025706 0.138759 0.986567 1.469463 0.463216 0.333045 ... 0.530403 0.755903 1.788937 1.547954 0.864034 0.517150 0.944125 0.104910 0.636804 0.458578
S5 S5 0.316929 0.072831 2.470544 1.323165 0.935996 0.995690 1.410956 1.969602 0.642618 ... 0.609223 0.669490 0.645484 1.284197 1.813887 3.994287 0.564868 1.616204 0.637810 0.716793
S6 S6 0.687471 1.467664 2.188556 1.453456 1.027919 2.525695 0.307863 1.303975 1.091399 ... 0.235592 0.582665 1.461953 1.056232 0.422989 0.269740 1.662686 0.227491 0.744177 0.873712
S7 S7 0.783799 0.661049 0.175157 0.475327 0.838076 1.518468 0.566472 0.398546 1.004035 ... 1.171654 0.709581 0.546841 0.716018 0.072203 0.783156 2.060965 1.024808 1.269127 0.492403
S8 S8 0.607726 0.256968 1.188404 1.035735 0.354894 1.250021 0.527916 1.868921 0.349697 ... 0.528014 0.284339 0.283722 2.806958 0.243856 1.101390 1.126577 1.200524 1.051035 1.170293
S9 S9 0.913431 0.659872 1.658435 0.549286 0.329609 0.366407 0.559081 0.622520 0.650571 ... 0.387701 0.836313 1.577652 0.321051 1.441353 0.204672 0.916508 1.395682 1.125795 0.229797

10 rows × 81 columns


In [18]:
pib_pet_df = pib_pet_df.sort('Subject')
apoe_df = apoe_df.sort('Subject')

Merge DataFrames

Pandas documentation on how to "Merge, join, and concatenate" DataFrames:

http://pandas.pydata.org/pandas-docs/stable/merging.html

  • NOTE: Merging can get complicated in Pandas. Check out the Pandas Cookbook, Chapter 5 (see "Additional Resources below) for more examples.

In [19]:
pd.DataFrame.merge?

In [20]:
# These subjects have both APOE and PIB data
print len(set.intersection(set(pib_pet_df.index), set(apoe_df.index)))
set.intersection(set(pib_pet_df.index), set(apoe_df.index))


10
Out[20]:
{u'S10', u'S11', u'S14', u'S2', u'S4', u'S5', u'S6', u'S7', u'S8', u'S9'}

In [21]:
# Make a new DataFrame that combines APOE and PIB data into a single DataFrame
# Exclude subjects without both APOE and PIB data (i.e. do an "inner" join)
# NOTE: Inner joins merge using the "on" keys of the DataFrames, therefore they can still have missing data.
inner_df = pd.merge(apoe_df, pib_pet_df, how='inner', on=['Subject'])
inner_df


Out[21]:
Subject APOE_Copy1 APOE_Copy2 Left-Cerebellum-Cortex Left-Thalamus-Proper Left-Caudate Left-Putamen Left-Pallidum Left-Hippocampus Left-Amygdala ... ctx-rh-rostralanteriorcingulate ctx-rh-rostralmiddlefrontal ctx-rh-superiorfrontal ctx-rh-superiorparietal ctx-rh-superiortemporal ctx-rh-supramarginal ctx-rh-frontalpole ctx-rh-temporalpole ctx-rh-transversetemporal ctx-rh-insula
0 S10 E4 E2 0.220978 0.207769 1.737944 1.619239 0.311993 1.362993 0.396153 ... 1.320036 1.608255 1.547771 0.283531 3.212926 0.624633 0.278144 0.832677 1.280211 3.906851
1 S11 E3 E4 0.779993 1.227137 1.021403 1.199690 0.196101 0.642106 0.833604 ... 1.148492 2.698353 1.523860 1.148073 0.380291 1.007278 1.124297 0.795056 0.653855 1.251686
2 S14 E2 E2 0.484094 0.676801 1.437728 2.077313 0.547277 0.555102 1.401707 ... 0.275494 1.440458 0.287285 0.432101 0.824702 0.846591 0.557986 1.232137 0.742445 0.360133
3 S2 E4 E3 0.481369 0.970674 0.328054 0.985217 2.364674 1.645989 0.601938 ... 0.244447 2.785548 2.333329 0.098286 1.088393 0.939494 2.003004 0.836558 0.506255 0.402447
4 S4 E2 E4 0.425247 0.665001 0.460931 1.025706 0.138759 0.986567 1.469463 ... 0.530403 0.755903 1.788937 1.547954 0.864034 0.517150 0.944125 0.104910 0.636804 0.458578
5 S5 E4 E4 0.316929 0.072831 2.470544 1.323165 0.935996 0.995690 1.410956 ... 0.609223 0.669490 0.645484 1.284197 1.813887 3.994287 0.564868 1.616204 0.637810 0.716793
6 S6 E2 E3 0.687471 1.467664 2.188556 1.453456 1.027919 2.525695 0.307863 ... 0.235592 0.582665 1.461953 1.056232 0.422989 0.269740 1.662686 0.227491 0.744177 0.873712
7 S7 E4 E2 0.783799 0.661049 0.175157 0.475327 0.838076 1.518468 0.566472 ... 1.171654 0.709581 0.546841 0.716018 0.072203 0.783156 2.060965 1.024808 1.269127 0.492403
8 S8 E2 E3 0.607726 0.256968 1.188404 1.035735 0.354894 1.250021 0.527916 ... 0.528014 0.284339 0.283722 2.806958 0.243856 1.101390 1.126577 1.200524 1.051035 1.170293
9 S9 E2 E4 0.913431 0.659872 1.658435 0.549286 0.329609 0.366407 0.559081 ... 0.387701 0.836313 1.577652 0.321051 1.441353 0.204672 0.916508 1.395682 1.125795 0.229797

10 rows × 83 columns


In [22]:
# Make a new DataFrame that combines APOE and PIB data into a single DataFrame
# Include all subjects/data (i.e. do an "outer" join)
outer_df = apoe_df.merge(pib_pet_df, how='outer', on=['Subject'])
outer_df


Out[22]:
Subject APOE_Copy1 APOE_Copy2 Left-Cerebellum-Cortex Left-Thalamus-Proper Left-Caudate Left-Putamen Left-Pallidum Left-Hippocampus Left-Amygdala ... ctx-rh-rostralanteriorcingulate ctx-rh-rostralmiddlefrontal ctx-rh-superiorfrontal ctx-rh-superiorparietal ctx-rh-superiortemporal ctx-rh-supramarginal ctx-rh-frontalpole ctx-rh-temporalpole ctx-rh-transversetemporal ctx-rh-insula
0 S1 E2 E2 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 S10 E4 E2 0.220978 0.207769 1.737944 1.619239 0.311993 1.362993 0.396153 ... 1.320036 1.608255 1.547771 0.283531 3.212926 0.624633 0.278144 0.832677 1.280211 3.906851
2 S11 E3 E4 0.779993 1.227137 1.021403 1.199690 0.196101 0.642106 0.833604 ... 1.148492 2.698353 1.523860 1.148073 0.380291 1.007278 1.124297 0.795056 0.653855 1.251686
3 S12 E4 E2 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 S13 E4 E2 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
5 S14 E2 E2 0.484094 0.676801 1.437728 2.077313 0.547277 0.555102 1.401707 ... 0.275494 1.440458 0.287285 0.432101 0.824702 0.846591 0.557986 1.232137 0.742445 0.360133
6 S2 E4 E3 0.481369 0.970674 0.328054 0.985217 2.364674 1.645989 0.601938 ... 0.244447 2.785548 2.333329 0.098286 1.088393 0.939494 2.003004 0.836558 0.506255 0.402447
7 S3 E3 E2 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
8 S4 E2 E4 0.425247 0.665001 0.460931 1.025706 0.138759 0.986567 1.469463 ... 0.530403 0.755903 1.788937 1.547954 0.864034 0.517150 0.944125 0.104910 0.636804 0.458578
9 S5 E4 E4 0.316929 0.072831 2.470544 1.323165 0.935996 0.995690 1.410956 ... 0.609223 0.669490 0.645484 1.284197 1.813887 3.994287 0.564868 1.616204 0.637810 0.716793
10 S6 E2 E3 0.687471 1.467664 2.188556 1.453456 1.027919 2.525695 0.307863 ... 0.235592 0.582665 1.461953 1.056232 0.422989 0.269740 1.662686 0.227491 0.744177 0.873712
11 S7 E4 E2 0.783799 0.661049 0.175157 0.475327 0.838076 1.518468 0.566472 ... 1.171654 0.709581 0.546841 0.716018 0.072203 0.783156 2.060965 1.024808 1.269127 0.492403
12 S8 E2 E3 0.607726 0.256968 1.188404 1.035735 0.354894 1.250021 0.527916 ... 0.528014 0.284339 0.283722 2.806958 0.243856 1.101390 1.126577 1.200524 1.051035 1.170293
13 S9 E2 E4 0.913431 0.659872 1.658435 0.549286 0.329609 0.366407 0.559081 ... 0.387701 0.836313 1.577652 0.321051 1.441353 0.204672 0.916508 1.395682 1.125795 0.229797
14 NaN E2 E2 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

15 rows × 83 columns

Clean Data


In [23]:
pd.DataFrame.dropna?

In [24]:
# Removing rows with NaN
# Although we accept that there is some missing data in outer_df, 
# we cannot use rows with Subject==NaN
outer_df.dropna(axis=0, how='any', subset=['Subject'])


Out[24]:
Subject APOE_Copy1 APOE_Copy2 Left-Cerebellum-Cortex Left-Thalamus-Proper Left-Caudate Left-Putamen Left-Pallidum Left-Hippocampus Left-Amygdala ... ctx-rh-rostralanteriorcingulate ctx-rh-rostralmiddlefrontal ctx-rh-superiorfrontal ctx-rh-superiorparietal ctx-rh-superiortemporal ctx-rh-supramarginal ctx-rh-frontalpole ctx-rh-temporalpole ctx-rh-transversetemporal ctx-rh-insula
0 S1 E2 E2 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 S10 E4 E2 0.220978 0.207769 1.737944 1.619239 0.311993 1.362993 0.396153 ... 1.320036 1.608255 1.547771 0.283531 3.212926 0.624633 0.278144 0.832677 1.280211 3.906851
2 S11 E3 E4 0.779993 1.227137 1.021403 1.199690 0.196101 0.642106 0.833604 ... 1.148492 2.698353 1.523860 1.148073 0.380291 1.007278 1.124297 0.795056 0.653855 1.251686
3 S12 E4 E2 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 S13 E4 E2 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
5 S14 E2 E2 0.484094 0.676801 1.437728 2.077313 0.547277 0.555102 1.401707 ... 0.275494 1.440458 0.287285 0.432101 0.824702 0.846591 0.557986 1.232137 0.742445 0.360133
6 S2 E4 E3 0.481369 0.970674 0.328054 0.985217 2.364674 1.645989 0.601938 ... 0.244447 2.785548 2.333329 0.098286 1.088393 0.939494 2.003004 0.836558 0.506255 0.402447
7 S3 E3 E2 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
8 S4 E2 E4 0.425247 0.665001 0.460931 1.025706 0.138759 0.986567 1.469463 ... 0.530403 0.755903 1.788937 1.547954 0.864034 0.517150 0.944125 0.104910 0.636804 0.458578
9 S5 E4 E4 0.316929 0.072831 2.470544 1.323165 0.935996 0.995690 1.410956 ... 0.609223 0.669490 0.645484 1.284197 1.813887 3.994287 0.564868 1.616204 0.637810 0.716793
10 S6 E2 E3 0.687471 1.467664 2.188556 1.453456 1.027919 2.525695 0.307863 ... 0.235592 0.582665 1.461953 1.056232 0.422989 0.269740 1.662686 0.227491 0.744177 0.873712
11 S7 E4 E2 0.783799 0.661049 0.175157 0.475327 0.838076 1.518468 0.566472 ... 1.171654 0.709581 0.546841 0.716018 0.072203 0.783156 2.060965 1.024808 1.269127 0.492403
12 S8 E2 E3 0.607726 0.256968 1.188404 1.035735 0.354894 1.250021 0.527916 ... 0.528014 0.284339 0.283722 2.806958 0.243856 1.101390 1.126577 1.200524 1.051035 1.170293
13 S9 E2 E4 0.913431 0.659872 1.658435 0.549286 0.329609 0.366407 0.559081 ... 0.387701 0.836313 1.577652 0.321051 1.441353 0.204672 0.916508 1.395682 1.125795 0.229797

14 rows × 83 columns

Additional Resources

Pandas Cookbook

https://github.com/jvns/pandas-cookbook/tree/master/cookbook

  • To download without using GitHub just click on the "Download ZIP" link.

In [ ]: