In [1]:
%matplotlib inline
In [2]:
import os
import numpy as np
import pandas
from matplotlib import pyplot
# local reference to a data directory
datadir = 'data'
I will use pandas to load my Excel file of demographic data from the data directory.
Note that Excel files can have multiple sheets, so I first query my options and then load the sheet I want to work with, in this case sheet1.
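As an aside, a pandas recent enough to ship read_excel can load a named sheet in one call; a minimal sketch, assuming that function exists in your install:
# sketch: read one sheet directly, assuming pandas provides read_excel
demog_sketch = pandas.read_excel(os.path.join(datadir, 'demographics.xls'),
                                 'sheet1', index_col=0)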
In [3]:
demographics_file = os.path.join(datadir, 'demographics.xls')
tmpdf = pandas.ExcelFile(demographics_file)
tmpdf.sheet_names
Out[3]:
In [4]:
## create DataFrame called demog by parsing ExcelFile
## Note I use the first column as my default index since
## this has unique IDs for each person
demog = tmpdf.parse('sheet1', index_col = 0)
In [5]:
print 'Column Names', demog.columns
print 'Index Values', demog.index[:5]
In [6]:
#look at clunky columns
print demog.columns
In [7]:
## clean up column names
cols = demog.columns
newcols = [x.replace(' ','').replace('#','').replace('GRAB','')
           for x in cols]
newcols = [x.replace('TstScrsSession1::','') for x in newcols]
# Use cleaned headers with DataFrame
demog.columns = newcols
print demog.columns
In [8]:
demog[demog.columns[3]]
Out[8]:
In [9]:
# by cleaning up column names I can now easily access fields
# this mapping does not work if fields contain special characters or math operators
print demog.BAC[:10]
In [10]:
# write data with cleaned headers to new excel file
demog.to_excel('data/newdemog.xls')
!ls data
In [11]:
demog.describe()
Out[11]:
So let's see the structure of this missing data.
I will make a mask of where data is missing and visualize the table as a heat map
In [12]:
## set default colormap to gray
pyplot.gray()
# create binary mask where data is NOT null
mask = ~pandas.isnull(demog.values)
# set interpolation to nearest (nearest neighbor) to
# avoid fuzzy image
pyplot.imshow(mask, aspect='auto', interpolation='nearest')
pyplot.ylabel('subjects')
pyplot.xlabel('demographic scores')
pyplot.colorbar(ticks=[0,1])
Out[12]:
In [13]:
demog.mean()
Out[13]:
The pattern suggests that a handful of tests were collected for only a few subjects, while the remaining columns are essentially complete.
What we want to do is drop the tests with very few samples.
This is easy since we can just use pandas' built-in method dropna and specify axis=1 so it works across columns.
This will drop any column that has NaN values in it.
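Before dropping anything, here is a hedged sketch (reusing the demog DataFrame above) that counts the missing values per column, plus an alternative dropna call using the thresh argument to keep any column observed for at least 90% of subjects:
# sketch: how many values are missing in each column
print(pandas.isnull(demog).sum())

# alternative sketch: keep columns observed for at least 90% of subjects
min_obs = int(0.9 * len(demog))
mostly_complete = demog.dropna(axis=1, thresh=min_obs)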
In [14]:
## To drop the tests that are missing for most subjects is easy,
## I use the built in dropna
final_demog = demog.dropna(axis=1)
In [15]:
# set interpolation to nearest (nearest neighbor) to
# avoid fuzzy image
pyplot.imshow(pandas.isnull(final_demog),
aspect='auto',
interpolation='nearest')
pyplot.ylabel('subjects')
pyplot.xlabel('demographic scores')
pyplot.colorbar(ticks=[0,1])
Out[15]:
We have REAL data for all our data points!
In [16]:
print final_demog.shape
print final_demog.mean()
In [17]:
print 'Years Education\n', final_demog['YearsEd'].describe()
Now I want to take this data and, for the subjects I have useful demographics for, grab their biomarker data.
In [18]:
## Use pandas to load the biomarker csv data
biomarker = pandas.read_csv('data/subject_biomarker.csv', sep=None)
In [19]:
biomarker.describe()
Out[19]:
In [20]:
biomarker.columns
Out[20]:
In [21]:
biomarker.index
Out[21]:
In [22]:
biomarker.index = biomarker.SUBID
In [23]:
biomarker.index
Out[23]:
Currently I have two data frames:
final_demog uses unique SUBIDs for its index, while biomarker uses integers.
I could go back and use the index_col = 0 flag when I parse the csv file.
But I can also use the SUBID column and just reset the index to reflect the SUBID (which is unique).
This will allow me to join the two dataframes database-style
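For reference, the index_col alternative mentioned above would look like this (a sketch; I do not rerun it below):
# sketch: index the csv by its first column (SUBID) at load time
biomarker_indexed = pandas.read_csv('data/subject_biomarker.csv',
                                    sep=None, index_col=0)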
In [24]:
## reindex based on subject ID
#biomarker.index = biomarker['SUBID']
## Join dataframes
join_data = final_demog.join(biomarker)
## or Concatenate
concat_data = pandas.concat((final_demog, biomarker))
In [25]:
print concat_data.shape
pyplot.imshow(pandas.isnull(concat_data), interpolation='nearest',
aspect='auto')
Out[25]:
In [26]:
## we can fix this by forcing concat to concatenate on a
## specific axis (in this case axis = 1)
##
concat_data2 = pandas.concat((final_demog, biomarker), axis=1)
In [27]:
print concat_data2.shape
pyplot.imshow(pandas.isnull(concat_data2), interpolation='nearest',
aspect='auto')
Out[27]:
In [28]:
combined_data = concat_data2.dropna(axis=0)
In [29]:
pyplot.imshow(pandas.isnull(combined_data))
print combined_data.SUBID[:10]
In [30]:
final_demog.join?
In [31]:
## Quick check that my data is aligned
## Note non-overlapping data
print join_data[2:6]['SUBID']
In [32]:
## by looking I can see not all my subjects with demographics
# have biomarker data
join_data
Out[32]:
In [33]:
## quick look at image, I can see a number of people are missing
## biomarker data
mask = ~pandas.isnull(join_data)
pyplot.imshow(mask, interpolation = 'nearest', aspect = 'auto')
Out[33]:
I can use an inner join, which will match the data on the index and keep only the rows present in both tables, leaving me with only good data.
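An equivalent way to express this (a sketch, assuming both frames are indexed by subject ID as above) is pandas.merge with how='inner':
# sketch: inner merge on the shared subject-ID index
merged_inner = pandas.merge(final_demog, biomarker,
                            left_index=True, right_index=True,
                            how='inner')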
In [34]:
join_inner = final_demog.join(biomarker, how='inner')
In [35]:
print join_inner.shape
pyplot.imshow(pandas.isnull(join_inner))
Out[35]:
In [36]:
final_data = join_inner
In [37]:
# add a column of random numbers, one per remaining subject
final_data['random'] = np.random.random(len(final_data))
In [38]:
final_data['silly_thing'] = 1
In [39]:
print final_data.columns[11:14]
subsample = final_data.columns[11:14]
jnk = final_data[subsample].boxplot()
In [40]:
subsample
Out[40]:
In [41]:
## data entry is sloppy for gender
final_data.Gender
Out[41]:
So I am now going to clean up these values much as I cleaned up the headers
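If your pandas version provides the Series.str accessor, the same cleanup can also be written in a vectorized style; a sketch (the cell below uses a plain list comprehension instead):
# sketch: vectorized cleanup, assuming Series.str is available
gender_clean = final_data['Gender'].str.strip().str[0].str.upper()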
In [42]:
## fix gender to reflect just upper case letter M or F
final_data.Gender = [x[0].upper() for x in final_data.Gender]
print final_data.Gender
In [43]:
## now I can create masks to parse the data
females = final_data['Gender'] == 'F'
female_data = final_data[females]
male_data = final_data[~females]
In [44]:
female_data.shape
Out[44]:
In [45]:
male_data.shape
Out[45]:
In [46]:
## This allows me to make a simple box plot to compare
## years of education between males and females
pyplot.boxplot((female_data['YearsEd'], male_data['YearsEd']))
pyplot.axis([0,3,10,25])
pyplot.xticks([1,2], ['Female', 'Male'])
pyplot.ylabel('Years of Education')
Out[46]:
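As an aside, pandas can draw a comparable grouped boxplot straight from the DataFrame; a sketch, assuming DataFrame.boxplot accepts the by keyword in your version:
# sketch: grouped boxplot handled by pandas itself
final_data.boxplot(column='YearsEd', by='Gender')
pyplot.ylabel('Years of Education')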
In [47]:
for val, (name, group) in enumerate(final_data.groupby('Gender')):
    pyplot.subplot(1, 2, val + 1)
    pyplot.boxplot(group['Post-cingulate'])
    pyplot.axis([0, 2, 0.8, 2.3])
    pyplot.xticks([1,], [name,])
    pyplot.title('Post-cingulate')
In [48]:
for val, (name, group) in enumerate(final_data.groupby('Gender')):
    print val, name, group.shape
In [49]:
just_girls = final_data.groupby('Gender').get_group('F')
In [50]:
final_data.groupby('Gender').apply
Out[50]:
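Grouped objects also support aggregation directly; a minimal sketch summarizing one column per gender group:
# sketch: mean years of education per gender group
print(final_data.groupby('Gender')['YearsEd'].mean())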
In [51]:
final_data.columns
# quick look at influence of MMSE on Post-cingulate Biomarker
ols = pandas.stats.ols.OLS(final_data['Post-cingulate'],
final_data['MMSE'])
In [52]:
print ols.summary
In [53]:
import statsmodels.regression.linear_model as lm
I will grab some variables I care about:
the dependent variable (Post-cingulate) and
the independent variables (Gender, AgeatSession, Precuneus).
In [54]:
final_data.columns
Out[54]:
In [55]:
items_of_interest = ['Gender', 'AgeatSession', 'Precuneus']
independent = final_data.reindex(columns = items_of_interest)
independent
Out[55]:
In [56]:
## demean my dependent variable
y = final_data['Post-cingulate'] - final_data['Post-cingulate'].mean()
## cast and demean my independent variables
independent.Gender[independent.Gender == 'F'] = 0
independent.Gender[independent.Gender == 'M'] = 1
independent.AgeatSession = independent.AgeatSession - independent.AgeatSession.mean()
independent.Precuneus = independent.Precuneus - independent.Precuneus.mean()
## Add constant (intercept)
## note how this creates a new column and populates it with ones
independent['const'] = 1
In [57]:
print 'dependent\n'
print y.values[:10]
print '\nindependent\n'
print independent.values[:10]
In [58]:
lm.OLS?
lmfit = lm.OLS(endog = y.values, exog = independent.values)
In [59]:
result = lmfit.fit()
In [60]:
print independent.columns
result.summary()
Out[60]:
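For comparison, the same slopes can be fit with the statsmodels formula interface, which builds the design matrix (intercept and categorical coding of Gender) for us. A sketch, assuming statsmodels.formula.api is available; the column is renamed because the hyphen in 'Post-cingulate' is not a valid formula name:
# sketch: formula-based fit of the same model
import statsmodels.formula.api as smf
model_data = final_data.rename(columns={'Post-cingulate': 'PostCingulate'})
formula_fit = smf.ols('PostCingulate ~ C(Gender) + AgeatSession + Precuneus',
                      data=model_data).fit()
print(formula_fit.summary())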
In [61]:
pyplot.subplot(121)
pyplot.plot(independent.AgeatSession, y, 'bo')
pyplot.xlabel('Age')
pyplot.ylabel('Post-cingulate Biomarker')
pyplot.subplot(122)
pyplot.plot(independent.Precuneus, y, 'ro')
pyplot.xlabel('Precuneus Biomarker')
Out[61]:
In [62]:
import sqlite3
In [63]:
data_to_db = final_data.reindex(columns=('Post-cingulate',
                                         'Precuneus',
                                         'MMSE',
                                         'AgeatSession'))
In [64]:
con = sqlite3.connect(':memory:')
con.commit?
In [65]:
query = """
CREATE TABLE test
(postcing FLOAT, precun FLOAT, mmse FLOAT, age FLOAT);"""
# open connection to database in memory
con = sqlite3.connect(':memory:')
## execute SQL statement to CREATE TABLE (create new table)
con.execute(query)
## Commit transaction
con.commit()
In [66]:
# coerce data to format that can be input into the table
# basically create a list of tuples
data = [tuple(item) for item in data_to_db.values.tolist()]
In [67]:
data[:2]
Out[67]:
In [68]:
# new statement to insert our data into database
statement = "INSERT INTO test VALUES(?,?,?,?)"
con.executemany(statement, data)
con.commit()
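Depending on your pandas version, this create-and-insert round trip can also be done in one call; a sketch, assuming DataFrame.to_sql is available and using the same in-memory connection:
# sketch: let pandas create and populate a table in one call
data_to_db.to_sql('test_sketch', con, index=False)
print(con.execute('select count(*) from test_sketch').fetchone())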
In [69]:
## roundtrip: let's grab all the data out of the database
cursor = con.execute('select * from test')
newrows = cursor.fetchall()
In [70]:
cursor.description
Out[70]:
In [71]:
# zip the column entries in cursor.description and grab the first element (the column names)
columns = zip(*cursor.description)[0]
columns
Out[71]:
In [72]:
# create a new DataFrame
newdf = pandas.DataFrame(newrows, columns = columns)
newdf
Out[72]:
In [73]:
import pandas.io.sql as sql
newdf = sql.read_frame('select * from test', con)
print newdf[:3]
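In newer pandas the read_frame helper was folded into the read_sql family; a sketch of the equivalent call, assuming read_sql_query exists in your install:
# sketch: the newer spelling of the same query
newdf_sketch = pandas.read_sql_query('select * from test', con)
print(newdf_sketch[:3])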
In [74]:
# I still have an open connection (con) to my database table,
# so I can create a new DataFrame ordered by age
agedf = sql.read_frame('select * from test order by age',con)
agedf
Out[74]:
In [75]:
## I can create a dataframe where MMSE > 28 ordered by age
query = 'select * from test where mmse > 28 order by age'
mmsedf = sql.read_frame(query,con)
mmsedf
Out[75]: