In [23]:
import numpy as np
import pandas as pd
from collections import defaultdict
import math
%matplotlib inline
import matplotlib.pyplot as plt

Parsing ABA experimental data


In [24]:
# Region ontology table, one row per region; the file is tab-separated.
aba_onto = pd.read_csv('mouse_regions.tsv', sep='\t')

# Lookup table: region acronym -> region id.
aba_onto_to_aba_id = pd.Series(
    aba_onto['id'].values, index=aba_onto['acronym'].values
).to_dict()
# Sanity check that the mapping points from acronym to id.
assert aba_onto_to_aba_id['TMv'] == 1

aba_onto.head()


Out[24]:
id myid atlas_id acronym name parent_structure_id
0 1 1 424 TMv Tuberomammillary nucleus, ventral part 557
1 2 2 990 SSp-m6b Primary somatosensory area, mouth, layer 6b 345
2 3 3 707 sec secondary fissure 1040
3 4 4 141 IC Inferior colliculus 339
4 6 5 566 int internal capsule 784

5 rows × 6 columns


In [25]:
# Projection volumes, one row per data set (experiment).
# The file was created with a script from Lydia@ABA.
aba_raw = pd.read_csv('normalized_projection_volume.csv')

# Distinct injection structures, in order of first appearance;
# these are the regions on the y-axis.
y_es = aba_raw['primary_injection_structure'].unique()
aba_raw.head()


Out[25]:
data_set_id primary_injection_structure FRP-R MOp-R MOs-R SSp-n-R SSp-bfd-R SSp-ll-R SSp-m-R SSp-ul-R SSp-tr-R SSp-un-R SSs-R GU-R VISC-R AUDd-R AUDp-R AUDpo-R AUDv-R VISal-R
0 180719293 MOp 0.068580 2.468630 0.815549 0.357992 0.454036 0.081658 0.979404 0.527580 0.017429 1.087990 0.684646 0.478943 0.183690 0.002365 0.001693 0.000000e+00 0.018263 0.000406 ...
1 180709942 MOp 0.057397 2.925960 1.021330 0.522288 0.515159 0.041720 1.248320 0.454361 0.001301 1.499530 1.304780 0.718774 0.325738 0.000419 0.001437 2.569000e-08 0.026881 0.000009 ...
2 166082128 MOp 0.112593 4.741260 1.894710 0.337731 1.394310 0.124539 1.330040 1.263350 0.008491 1.431670 1.415500 0.313845 0.213622 0.004193 0.006252 0.000000e+00 0.041305 0.000922 ...
3 180720175 MOp 0.002641 0.823139 0.725565 0.004429 0.136374 0.728642 0.006659 0.209588 1.106390 0.044225 0.088390 0.016314 0.010245 0.228062 0.241397 4.405670e-03 0.136536 0.067005 ...
4 120814821 MOp 0.291335 9.751350 4.992740 0.098949 5.297970 0.854422 0.685045 2.701720 0.282305 0.395045 1.864880 0.145435 0.090214 0.172363 0.058344 1.695270e-05 0.225992 0.014359 ...

5 rows × 594 columns


In [26]:
# Group all data sets (experiments) that share an injection structure into a
# single row, keeping the per-column maximum. After this step the injection
# structure is the row index.
# TODO .max() or .mean()?
aba_raw = aba_raw.groupby('primary_injection_structure').max()

del aba_raw['data_set_id'] # not needed anymore

# Remove the hemisphere marker (trailing -R or -L) from the column labels.
# Only a real hemisphere suffix is stripped, so a column without one keeps its
# full name instead of silently losing its last two characters.
# Note: a region that has both an -R and an -L column ends up with two columns
# sharing the same label.
aba_raw.columns = [c[:-2] if c.endswith(('-R', '-L')) else c
                   for c in aba_raw.columns]

# regions on the x-axis
x_es = aba_raw.columns
aba_raw.head()


Out[26]:
FRP MOp MOs SSp-n SSp-bfd SSp-ll SSp-m SSp-ul SSp-tr SSp-un SSs GU VISC AUDd AUDp AUDpo AUDv VISal VISam VISl
primary_injection_structure
AAA 0.008259 0.039756 0.020719 0.002492 0.006706 0.043079 0.004413 0.021725 0.001265 0.004855 0.009930 0.043916 0.015284 0.002637 0.001815 0.000090 0.005926 0.000785 0.000085 0.001074 ...
ACA 0.019118 0.497297 1.000740 0.039186 0.209422 0.027633 0.024815 0.049324 0.073476 0.023429 0.146186 0.040924 0.031911 0.045518 0.035344 0.010326 0.038093 0.095197 0.897499 0.041352 ...
ACAd 0.051942 1.513790 3.999290 0.048974 0.238441 0.572160 0.045477 0.321856 0.427127 0.189453 0.196698 0.126714 0.034762 0.194629 0.074409 0.008546 0.047286 0.842452 1.472160 0.258359 ...
ACAv 0.019947 0.336812 1.246400 0.034205 0.046766 0.020294 0.017483 0.007384 0.075899 0.020424 0.093040 0.011684 0.026757 0.021256 0.007016 0.002764 0.012144 0.065850 0.618113 0.024922 ...
ACB 0.010619 0.048564 0.153678 0.001667 0.002769 0.001848 0.001419 0.003253 0.001753 0.001518 0.004475 0.005439 0.002025 0.000952 0.000700 0.000239 0.002160 0.001265 0.001310 0.000415 ...

5 rows × 592 columns


In [27]:
# Peek at the first five region labels on each matrix axis (x first, then y).
(x_es[:5], y_es[:5])


Out[27]:
(Index([u'FRP', u'MOp', u'MOs', u'SSp-n', u'SSp-bfd'], dtype='object'),
 array(['MOp', 'MOs', 'SSp', 'SSp-bfd', 'SSp-ll'], dtype=object))

In [28]:
# Flatten the matrix into (region_a_id, region_b_id, projection_volume) records,
# where region a is the column (x) label and region b the row (y) label.
# A column label can match 2 items (X-R and X-L); the largest one is kept.
# TODO again: .max() or .mean()?
aba_conn_dto = [
    (aba_onto_to_aba_id[col_region],
     aba_onto_to_aba_id[row_region],
     aba_raw.loc[row_region][col_region].max())
    for col_region in x_es
    for row_region in y_es
]
aba_gold = pd.DataFrame(aba_conn_dto)

# Collapse identical records; the two counts printed below are the number of
# records before and after this step.
aba_gold.drop_duplicates(inplace=True)
print(len(aba_conn_dto))
print(len(aba_gold))

aba_gold.columns = ['a_id', 'b_id', 'gold_score']
# Store each pair with the smaller id in `a` and the larger id in `b`
# (for sorting later).
# NOTE(review): duplicates are dropped before this ordering step, so the same
# (a, b) pair may still occur more than once with different scores -- confirm
# that this is intended.
for ordered_col, pick in (('a', min), ('b', max)):
    aba_gold[ordered_col] = aba_gold['a_id'].combine(aba_gold['b_id'], pick, 0)
aba_gold.drop(['a_id', 'b_id'], axis=1, inplace=True)


139712
69856

In [29]:
# Show the five records with the highest gold score.
# `DataFrame.sort` no longer exists in current pandas (removed in 0.20);
# `sort_values` is its replacement and is available from pandas 0.17 on.
aba_gold.sort_values('gold_score', ascending=False).head()


Out[29]:
gold_score a b
69839 41.7725 773 1009
12089 38.6056 382 463
69751 33.0490 223 1009
17153 28.0085 672 749
17167 26.4854 374 672

5 rows × 3 columns


In [30]:
# Summary statistics of the gold scores.
aba_gold['gold_score'].describe()


Out[30]:
count    69856.000000
mean         0.106288
std          0.629020
min          0.000000
25%          0.000295
50%          0.003611
75%          0.030513
max         41.772500
Name: gold_score, dtype: float64

In [31]:
# Plot the gold scores from largest to smallest, one dot per record. The
# original index is ignored (use_index=False), so the x position is simply the
# rank of the record after sorting.
# `DataFrame.sort` no longer exists in current pandas (removed in 0.20);
# `sort_values` is its replacement and is available from pandas 0.17 on.
# The axes are created explicitly and handed to pandas so the plot does not
# depend on pyplot's implicit "current figure".
fig, ax = plt.subplots(figsize=(18, 18))
aba_gold.sort_values('gold_score', ascending=False).gold_score.plot(
    ax=ax, style='.', use_index=False,
    title='Distribution of projection_volume values')


Out[31]:
<matplotlib.axes.AxesSubplot at 0x67f6810>

In [31]: