Connectomics Challenge

This challenge requires discovering directed neural connections from time series of neural imaging data that depict neuron activation patterns.

More at http://www.kaggle.com/c/connectomics

We start by listing the installed Python packages with pip freeze, so that all of this can be reproduced. We also set matplotlib to inline output and load the cythonmagic extension for IPython (see http://ipython.org/ipython-doc/dev/config/extensions/cythonmagic.html ).


In [32]:
!pip freeze


Cython==0.20.1
Jinja2==2.7.2
MarkupSafe==0.18
Pygments==1.6
Sphinx==1.2.1
Theano==0.6.0
argparse==1.2.1
backports.ssl-match-hostname==3.4.0.2
distribute==0.7.3
docutils==0.11
h5py==2.2.1
ipython==1.1.0
matplotlib==1.3.1
networkx==1.8.1
nose==1.3.0
numexpr==2.3.1
numpy==1.8.0
pandas==0.13.0
patsy==0.2.1
pebl==1.01
pydot==1.0.28
pymc==2.3
pyparsing==2.0.1
python-dateutil==2.2
pytz==2013.9
pyzmq==14.0.1
scikit-learn==0.14.1
scipy==0.13.3
six==1.5.2
statsmodels==0.5.0
tables==3.1.0
tornado==3.2
wsgiref==0.1.2

In [2]:
%matplotlib inline
%load_ext cythonmagic

Next, download and extract the data packages linked at http://www.kaggle.com/c/connectomics/data into a suitable data directory.


In [1]:
# Some core imports
import os
import sys
from subprocess import call
import time
import pandas
import numpy as np
import h5py
# These are IPython specific
from IPython.display import display, clear_output

In [2]:
# Core configuration

datadir = "/data/quick/connectomics"
mirror = "http://www.causality.inf.ethz.ch/data/connectomics"

nbdir = os.getcwd()

sources = [
           "http://www.kaggle.com/c/connectomics/download/normal-1.tgz",
           "http://www.kaggle.com/c/connectomics/download/normal-2.tgz",
           "http://www.kaggle.com/c/connectomics/download/normal-3.tgz",
           "http://www.kaggle.com/c/connectomics/download/normal-4.tgz",
           "http://www.kaggle.com/c/connectomics/download/normal-4-lownoise.tgz",
           "http://www.kaggle.com/c/connectomics/download/normal-4-lownoise.tgz",
           "http://www.kaggle.com/c/connectomics/download/small.tgz",
           "http://www.kaggle.com/c/connectomics/download/highcc.tgz",
           "http://www.kaggle.com/c/connectomics/download/lowcc.tgz",
           "http://www.kaggle.com/c/connectomics/download/lowcon.tgz",
           "http://www.kaggle.com/c/connectomics/download/highcon.tgz",
           "http://www.kaggle.com/c/connectomics/download/normal-3-highrate.tgz",
           "http://www.kaggle.com/c/connectomics/download/test.tgz",
           "http://www.kaggle.com/c/connectomics/download/valid.tgz",
           "http://www.kaggle.com/c/connectomics/download/sampleSubmission.csv.zip"
          ]

In [3]:
print "Creating data directory"
call(['mkdir', '-p', datadir + "/downloads"])
call(['mkdir', '-p', datadir + "/tmp"])
os.chdir(datadir + "/tmp")


Creating data directory

In [6]:
for src in sources:
    clear_output()
    print "Downloading %s" % (src)
    sys.stdout.flush()
    filename = src.rsplit('/', 1)[-1]
    target = "%s/tmp/%s" % (datadir, filename)
    # Fetch each file from the mirror rather than kaggle.com
    mirror_src = "%s/%s" % (mirror, filename)
    # Remove any stale partial download first
    try:
        os.unlink(target)
    except OSError:
        pass
    call(['wget', '-O', target, mirror_src])
    print "Extracting %s" % (filename)
    sys.stdout.flush()
    call(["tar", "-xf", filename])
    print "Removing %s" % (filename)
    sys.stdout.flush()
    os.unlink(target)
clear_output()
print "Downloads finished"


Downloads finished

Making the data accessible

We are going to make the data a bit more accessible by loading it into a hierarchical HDF5 data store using the h5py library (see http://docs.h5py.org/en/latest/ ). While pandas also supports HDF5, it does not natively support multidimensional tensors/matrices, which we will use later on in conjunction with Theano.
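
The h5py pattern we rely on below is small: create a group per network, attach compressed datasets to it, and label the axes via HDF5 dimension scales. Here is a minimal, self-contained sketch of that pattern; the file path and the group/dataset names are throwaway examples, not part of the pipeline:


In [ ]:
import numpy as np
import h5py

demo = h5py.File('/tmp/h5py-demo.h5')   # hypothetical scratch file; default mode is append
g = demo.create_group('example-net')
dset = g.create_dataset('fluorescence',
                        data=np.zeros((10, 100), dtype=np.float32),
                        compression='lzf')   # lzf: fast compression filter shipped with h5py
dset.dims[0].label = 'neuron'   # label the axes via HDF5 dimension scales
dset.dims[1].label = 'time'
print demo['example-net']['fluorescence']
demo.close()
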


In [7]:
def import_network(store, name, f_file, p_file, n_file):
    '''Import one network (fluorescence traces, positions and, if available, ground-truth connectivity) into the HDF5 store.'''
    print "Importing %s" % (name)
    fdata = pandas.read_csv(f_file, sep=',', skipinitialspace=True, header=None, prefix='N', dtype=np.float32)
    # Concatenating the columns yields one long neuron-major vector, which
    # reshapes cleanly into a (neuron, time) matrix
    concatenated = pandas.concat([fdata[c] for c in fdata.columns])
    timecount = len(fdata)
    neuroncount = len(fdata.columns)
    fdata = None  # release the DataFrame early; the fluorescence files are large
    tdata = concatenated.values.reshape((neuroncount, timecount))
    g = store.create_group(name)
    dset = g.create_dataset("fluorescence", data=tdata, compression='lzf')
    dset.dims[0].label = 'neuron'
    dset.dims[1].label = 'time'
    # Neuron positions: two columns (x, y) reshaped into an (n, 2) array
    pdata = pandas.read_csv(p_file, sep=',', skipinitialspace=True, header=None, names=['x', 'y'], dtype=np.float64)
    concatenated = pandas.concat([pdata[c] for c in pdata.columns])
    tensor = concatenated.values.reshape((2, len(pdata))).T
    g.create_dataset("networkPositions", data=tensor)
    # Ground-truth connectivity is only shipped for the training networks
    if ((n_file is not None) and os.path.isfile(n_file)):
        ndata = pandas.read_csv(n_file, sep=',', skipinitialspace=True, header=None, names=['i', 'j', 'weight'], dtype={'i': np.int32, 'j': np.int32, 'weight': np.float32})
        connection_matrix = np.zeros((neuroncount, neuroncount), dtype=np.float32)
        for idx in range(len(ndata)):
            i = int(ndata.i[idx])
            j = int(ndata.j[idx])
            weight = float(ndata.weight[idx])
            if (weight == -1.0):
                weight = 0.0  # entries marked -1 are treated as "no connection"
            connection_matrix[i - 1, j - 1] = weight  # indices in the files are 1-based
        g.create_dataset("connectionMatrix", data=connection_matrix, compression='lzf')
    return g

In [4]:
store = h5py.File(datadir + '/store.h5')

In [10]:
for i in range(1,7):
    import_network(store, 'small-%d' % (i),
                   'fluorescence_iNet1_Size100_CC0%dinh.txt' % (i), 
                   'networkPositions_iNet1_Size100_CC0%dinh.txt'  % (i),
                   'network_iNet1_Size100_CC0%dinh.txt'  % (i))
    
for nname in ['normal-1', 'normal-2', 'normal-3', 'normal-4', 'normal-4-lownoise', 'lowcc', 'highcc', 'highcon', 'test', 'valid']:
    import_network(store, nname,
                   "%s/fluorescence_%s.txt" % (nname, nname),
                   "%s/networkPositions_%s.txt" % (nname, nname),
                   "%s/network_%s.txt" % (nname, nname))
print "Done"
print store


Importing small-1
Importing small-2
Importing small-3
Importing small-4
Importing small-5
Importing small-6
Importing normal-1
Importing normal-2
Importing normal-3
Importing normal-4
Importing normal-4-lownoise
Importing lowcc
Importing highcc
Importing highcon
Importing test
Importing valid
Done
<HDF5 file "store.h5" (mode r+)>

That's it: we now have the raw data in a nicely accessible format in an HDF5 store and can access the individual datasets like this:


In [29]:
store['normal-1']['connectionMatrix']


Out[29]:
<HDF5 dataset "connectionMatrix": shape (1000, 1000), type "<f4">
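
Datasets support NumPy-style slicing, so only the requested part is read from disk. As a quick illustration (a minimal sketch, not part of the pipeline), the start of one neuron's fluorescence trace can be plotted directly thanks to %matplotlib inline:


In [ ]:
import matplotlib.pyplot as plt

trace = store['normal-1']['fluorescence'][0, :1000]   # neuron 0, first 1000 time steps
plt.plot(trace)
plt.xlabel('time step')
plt.ylabel('fluorescence')
plt.title('normal-1, neuron 0')
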

Network Connectivity Postprocessing

Next, we are going to calculate a neuron distance matrix and "causal distance" matrices, which capture how close one neuron is to another: both the spatial (Euclidean) distance, and how many "hops" would be necessary for a spike to traverse from one neuron to the other.


In [63]:
from math import sqrt

def create_spatial_distance_matrix(net):
    '''Compute the pairwise Euclidean distance between all neuron positions.'''
    positions = net['networkPositions'].value

    def distance(n1, n2):
        x1 = positions[n1, 0]
        y1 = positions[n1, 1]
        x2 = positions[n2, 0]
        y2 = positions[n2, 1]
        return sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2))

    nneurons = positions.shape[0]
    distanceMatrix = np.zeros((nneurons, nneurons), dtype=np.float32)
    for n1 in range(nneurons):
        for n2 in range(nneurons):
            distanceMatrix[n1, n2] = distance(n1, n2)
    # Replace any stale matrix from a previous run
    try:
        del net['spatialDistanceMatrix']
    except KeyError:
        pass
    net.create_dataset("spatialDistanceMatrix", data=distanceMatrix)

In [105]:
def update_signal_distance_matrix(net):
    '''Compute the shortest signal-path distance between all neuron pairs by iterative relaxation.'''
    if 'connectionMatrix' not in net.keys():
        return False
    connections = net['connectionMatrix'].value
    # Replace any stale matrix from a previous run
    try:
        del net['signalDistanceMatrix']
    except KeyError:
        pass
    inf = float('inf')

    # Start from the direct connections: unconnected pairs are infinitely far
    # apart, and every neuron has distance 0 to itself
    signalMatrix = np.array(connections, dtype=np.float32)
    signalMatrix[signalMatrix == 0.0] = inf
    for i in range(signalMatrix.shape[0]):
        signalMatrix[i, i] = 0.0

    def relax(src, via):
        # Routing src -> via -> anywhere may shorten src's distances
        # (a row-vectorized Bellman-Ford relaxation step)
        depth = signalMatrix[src, via]
        candidate = signalMatrix[via, :] + float(depth)
        signalMatrix[src, :] = np.minimum(signalMatrix[src, :], candidate)

    # Sweep until the matrix reaches a fixed point (1000 is a safe upper bound)
    for i in range(1000):
        orig = signalMatrix.copy()
        for src in range(signalMatrix.shape[0]):
            for via in range(signalMatrix.shape[0]):
                if (via == src):
                    continue
                if (signalMatrix[src, via] >= inf):
                    continue
                relax(src, via)
        if (np.all(signalMatrix == orig)):
            break

    net.create_dataset("signalDistanceMatrix", data=signalMatrix)
    print "Done updating"

In [ ]:
from sys import stdout
for nwname in store.keys():
    net = store[nwname]
    print "Network: %s" % (nwname)
    if 'networkPositions' in net.keys():
        print "Calculating spatial distance matrix"
        stdout.flush()
        create_spatial_distance_matrix(net)
        print "Calculating signal distance matrix"
        stdout.flush()
        update_signal_distance_matrix(net)

Let's verify that the code is correct by cross-checking against the shortest paths computed by the NetworkX library:


In [89]:
import networkx as nx

def create_nx_graph(net):
    ''' Load a neural network with known connectivity as a NetworkX directed graph'''
    if (not 'connectionMatrix' in net):
        return None
    G = nx.DiGraph() 
    conn = net['connectionMatrix']
    for i in range(conn.shape[0]):
        G.add_node(i)
    for i in range(conn.shape[0]):
        for j in range(conn.shape[0]):
            if (conn[i,j]>0.0):
                G.add_edge(i,j)
    return G

In [104]:
# Simple test case: check all connections of the small-1 network
G = create_nx_graph(store['small-1'])
sdm = store['small-1']['signalDistanceMatrix']
for i in range(100):
    for j in range(100):
        try:
            plen = len(nx.shortest_path(G, i, j)) - 1
        except nx.NetworkXNoPath:
            # No path at all: the signal distance must be infinite
            plen = float('inf')
        assert sdm[i, j] == plen
print "Test passed"


Test passed

In [5]:
store.close()