In [220]:
# Imports from __future__ in case we're running Python 2
from __future__ import division, print_function
from __future__ import absolute_import, unicode_literals
#pandas
import pandas as pd
# Our numerical workhorses
import numpy as np
import scipy.integrate
# Import pyplot for plotting
import matplotlib.pyplot as plt
# Seaborn, useful for graphics
import seaborn as sns
#pretty tables with plotly
import plotly
plotly.offline.init_notebook_mode() # run at the start of every notebook
# Import Bokeh modules for interactive plotting
import bokeh.io
import bokeh.mpl
import bokeh.plotting
# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline
# This enables SVG graphics inline. There is a bug, so uncomment if it works.
# %config InlineBackend.figure_formats = {'svg',}
# This enables high resolution PNGs. SVG is preferred, but has problems
# rendering vertical and horizontal lines
%config InlineBackend.figure_formats = {'png', 'retina'}
# JB's favorite Seaborn settings for notebooks
rc = {'lines.linewidth': 2,
'axes.labelsize': 18,
'axes.titlesize': 18,
'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style('darkgrid', rc=rc)
# Set up Bokeh for inline viewing
bokeh.io.output_notebook()
In [221]:
#load genotype data
fname= '../input/gandhi_et_al/130315_1A_genotypes.txt'
df_gt=pd.read_csv(fname, delimiter= '\t', comment= '#', header= [0,1])
#take a look
df_gt
Out[221]:
In [222]:
df_gt.columns
Out[222]:
In [223]:
#remove multi indexing by reseting the columns to be the second level of the index
df_gt.columns= df_gt.columns.get_level_values(1)
#Check out the new columns
df_gt.columns
Out[223]:
In [224]:
df_gt.columns= ['wt', 'het', 'mut'] #rename the columns
In [320]:
#this isn't tidy data.
#tidy data should have 2 columns: genotype and fish id
#let's use melt
df_gt= pd.melt(df_gt, var_name='genotype', value_name='fish')
df_gt.head()
Out[320]:
In [321]:
df_gt= df_gt.dropna()
df_gt.head()
Out[321]:
In [227]:
#load raw data
fname= '../input/gandhi_et_al/130315_01_rawDATA.txt'
df= pd.read_csv(fname, delimiter= '\t', comment= '#', header= [0,1 ])
#take a look
df.head()
Out[227]:
In [228]:
#make list of columns
#level 1 :2 -> start, end; level 0 2: -> FISH 1... CLOCK
df.columns= list(df.columns.get_level_values(1)[:2])\
+ list(df.columns.get_level_values(0)[2:])
In [229]:
df.columns
Out[229]:
In [230]:
cols_to_drop= df.columns[df.columns.str.contains('Unnamed')]
#drop the columns (axis 1 to drop cols, axis 0 for rows)
df= df.drop(cols_to_drop, axis= 1)
In [231]:
df= df.drop(['start', 'end'], axis= 1)
In [232]:
int('FISH42'.lstrip('FISH')) #how to separate number from FISH
Out[232]:
In [233]:
#Now loop through fish columns and drop ones that don't have genotypes
cols_to_drop= []
for col in df.columns:
if 'FISH' in col and int(col.lstrip('FISH')) not in df_gt['fish'].values:
cols_to_drop.append(col)
#drop em
df= df.drop(cols_to_drop, axis= 1)
In [234]:
#plot clock and t_start values
plt.plot(df.CLOCK, '.')
plt.xlabel('index')
plt.ylabel('time(hours)')
Out[234]:
In [235]:
df['light']= pd.Series(df.CLOCK < 14.0, index= df.index)
In [236]:
dark_to_light= np.where(np.diff(df.light.astype(np.int)) == 1)[0]
#initialize array with day numbers
day= np.zeros_like(df.light, dtype= np.int)
for i in range(len(dark_to_light)-1):
day[dark_to_light[i] +1: dark_to_light[i+1]+1]= i+1
day[dark_to_light[-1]+1:] = len(dark_to_light)
df['day']= pd.Series(day, index=df.index)
In [237]:
#build zeitgeiber clock
zeit= 24.0*df.day+df.CLOCK
df['zeit']= pd.Series(zeit, index= df.index)
In [238]:
genotypes= []
#check each column, put 'dummy' for non-fish column
for col in df.columns:
if 'FISH' in col:
fish_id= int(col.lstrip('FISH'))
# print(fish_id, df_gt.genotype[df_gt.fish== fish_id].iloc[0])
genotypes.append(df_gt.genotype[df_gt.fish== fish_id].iloc[0])
else:
genotypes.append(None)
#make a multi-index dataframe
df.columns= pd.MultiIndex.from_arrays((genotypes, df.columns),\
names= ['genotype', 'variable'])
In [239]:
df.head()
Out[239]:
In [240]:
#melt the dataframe
#value_vars get turned into data
#id_vars remain as columns
#value variables are the ones with FISH in the name
col_1= df.columns.get_level_values(1)
value_vars= list(df.columns[col_1.str.contains('FISH')])
#id_vars are non fish entries
id_vars= list(df.columns[~col_1.str.contains('FISH')])
#perform the melt
df1= pd.melt(df, value_vars= value_vars, id_vars= id_vars, value_name=\
'activity', var_name= ['genotype', 'fish'])
In [241]:
df1.head()
Out[241]:
In [242]:
#rename tuple columns:
for i, col in enumerate(df1.columns):
if type(col) is tuple:
df1.rename(columns={col: col[1]}, inplace= True)
In [243]:
df1= df1[['fish', 'genotype', 'day', 'light', 'CLOCK', 'zeit', 'activity']]
In [244]:
df1.head()
Out[244]:
In [245]:
df2= df.copy()
df2.head()
Out[245]:
In [246]:
#zero level columns
# 'variable is the 2nd row of headings
cols= [col.replace('FISH', 'activity_')\
for col in df2.columns.get_level_values('variable')]
df2.columns= pd.MultiIndex.from_arrays([df2.columns.get_level_values('genotype'),\
cols], names= ['genotype', 'variable'])
In [247]:
df2.head()
Out[247]:
In [248]:
#slice using cross sections
df2.xs('zeit', level= 'variable', axis= 1).head()
Out[248]:
In [249]:
df2['wt'].head()
Out[249]:
In [250]:
# Get view into DataFrame with only fish 2
df_fish = df1[df1['fish'] == 'FISH2']
# Get figure and axis objects using plt.subplots utility function
fig, ax = plt.subplots()
# Plot activity versus time for fish 2. lw denotes the line width
ax.plot(df_fish.zeit, df_fish.activity, '-', lw=0.25)
# Set axis labels
ax.set_xlabel('time (hr)')
ax.set_ylabel('activity (sec / min)')
# Set axis limits
ax.set_xlim((df_fish.zeit.min(), df_fish.zeit.max()))
ax.set_ylim((0, ax.get_ylim()[1]))
# We can overlay day and night. We'll make night shaded.
# The where keyword is useful, and ~df_wt.light means "not light."
ax.fill_between(df_fish.zeit, 0, ax.get_ylim()[1], where=~df_fish.light,
color='gray', alpha=0.3)
print('Fish 2 is', df_fish.genotype.iloc[0])
In [316]:
def pretty_activity_plot(ax, selector, selection, col, df, xlabel='time (hr)',
ylabel='activity (sec / min)', lw=0.25,
color=None):
"""
Makes a pretty plot of sleep traces. Generates the plot on axes ax,
and then returns the updated ax.
"""
# Make sure selection input is iterable
if type(selection) in [str, int, float, unicode]:
selection = [selection]
# Plot time traces of column col for each fish
for sel in selection:
# Pull out record of interest
df_plot = df[df[selector]==sel]
# Generate plot
if color is None:
ax.plot(df_plot.zeit, df_plot[col], '-', lw=lw)
else:
ax.plot(df_plot.zeit, df_plot[col], '-', lw=lw, color=color)
# Label axes
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
# Set axis limits
ax.set_xlim((df_plot.zeit.min(), df_plot.zeit.max()))
ax.set_ylim((0.0, ax.get_ylim()[1]))
# Overlay night shading
ax.fill_between(df_plot.zeit, 0.0, ax.get_ylim()[1],
where=~df_plot.light, color='gray', alpha=0.3, zorder=0)
return ax
In [252]:
# Perform the rolling sum and return it as a new DataFrame
time_win = 10 # in units of no. of indices to sum over, equal to min here
# Perform rolling sum and store in window column
df1['window']= df1.groupby('fish')['activity'].apply(pd.rolling_sum, time_win)
In [304]:
fig, ax = plt.subplots()
ax = pretty_activity_plot(ax, 'fish', str('FISH2'), 'window', df1, lw=1,
ylabel='activity (sec/10 min)')
In [307]:
# Time points/lightness
t = df1.zeit[df1.fish=='FISH1']
light = df1.light[df1.fish=='FISH1']
# Index for right edge of first averaging win. (ensure win. ends at lights out)
start_ind = time_win + np.where(np.diff(light.astype(int)))[0][0] % time_win
# The time points we want
times = np.sort(np.unique(df1.zeit))[start_ind::time_win]
# The indicies of the tidy DataFrame we want
inds = df1.zeit.isin(times)
# New DataFrame
new_cols = ['fish', 'genotype', 'day', 'light', 'CLOCK', 'zeit', 'window']
df1_10 = df1[new_cols][inds]
# Rename window column to activity
df1_10 = df1_10.rename(columns={'window': 'activity'})
In [318]:
# Plot the results for non-overlapping ten minute intervals
fig, ax = plt.subplots()
ax = pretty_activity_plot(ax, 'fish', 'FISH2', 'activity', df1_10, lw=1,
ylabel='activity (sec/10 min)')
In [319]:
# Save our new DataFrame to CSV file, using kwarg index=False to suppress
# writing of (trivial) indices
df1.to_csv('../input/130315_1_minute_intervals.csv', index=False)
df1_10.to_csv('../input/130315_10_minute_intervals.csv', index=False)