In [23]:
    
import numpy as np
import pandas as pd
from collections import defaultdict
import math
%matplotlib inline
import matplotlib.pyplot as plt
    
In [24]:
    
# Load the mouse brain-region table (tab-separated file).
aba_onto = pd.read_csv('mouse_regions.tsv', sep='\t')
# Lookup table: region acronym -> id
aba_onto_to_aba_id = pd.Series(
    aba_onto['id'].values, index=aba_onto['acronym']
).to_dict()
# Sanity check on one known entry of the mapping.
assert aba_onto_to_aba_id['TMv'] == 1
aba_onto.head()
    
    Out[24]:
In [25]:
    
# Projection volumes; the CSV was created with a script from Lydia@ABA.
aba_raw = pd.read_csv('normalized_projection_volume.csv')
# Distinct injection structures: these are the regions on the y-axis.
y_es = aba_raw['primary_injection_structure'].unique()
aba_raw.head()
    
    Out[25]:
In [26]:
    
# Collapse all data sets (experiments) that share an injection structure into
# a single row holding the column-wise maximum, then discard the experiment id.
# TODO .max() or .mean()?
# NOTE: this cell rebinds `aba_raw`; re-running it without re-running the load
# cell fails because 'data_set_id' has already been removed.
aba_raw = (
    aba_raw
    .groupby('primary_injection_structure')
    .max()
    .drop('data_set_id', axis=1)  # not needed anymore
)
# Drop the last two characters of every column name (hemisphere suffix -R / -L).
aba_raw.columns = [name[:-2] for name in aba_raw.columns]
# Regions on the x-axis
x_es = aba_raw.columns
aba_raw.head()
    
    Out[26]:
In [27]:
    
# Peek at the first five region labels on each axis of the matrix.
(x_es[:5], y_es[:5])
    
    Out[27]:
In [28]:
    
# Flatten the matrix into records (x_id, y_id, projection_volume), where the
# ids come from the acronym -> id lookup built from the ontology table.
aba_conn_dto = []
for x in x_es:
    x_id = aba_onto_to_aba_id[x]  # does not depend on y, so look it up once
    for y in y_es:
        y_id = aba_onto_to_aba_id[y]
        # TODO again: .max() or .mean()?
        # A column label can occur twice (X-R and X-L were both renamed to X),
        # in which case .loc returns both values; just take the max.
        value = aba_raw.loc[y, x].max()
        aba_conn_dto.append((x_id, y_id, value))

# Name the columns at construction time so an empty record list still yields a
# well-formed frame, then drop the repeated records produced by visiting a
# duplicated column label more than once.
aba_pairs = pd.DataFrame(
    aba_conn_dto, columns=['a_id', 'b_id', 'gold_score']
).drop_duplicates()
# Record counts before / after de-duplication.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(len(aba_conn_dto))
print(len(aba_pairs))

# Ensure a < b (for sorting later): 'a' is the smaller id of each pair and 'b'
# the larger one. The row index of the de-duplicated records is kept.
id_pairs = aba_pairs[['a_id', 'b_id']]
aba_gold = pd.DataFrame(
    {
        'gold_score': aba_pairs['gold_score'],
        'a': id_pairs.min(axis=1),
        'b': id_pairs.max(axis=1),
    },
    columns=['gold_score', 'a', 'b'],
)
    
    
In [29]:
    
# Show the pairs with the highest gold_score.
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
aba_gold.sort_values('gold_score', ascending=False).head()
    
    Out[29]:
In [30]:
    
# Summary statistics of the gold_score column.
aba_gold['gold_score'].describe()
    
    Out[30]:
In [31]:
    
# Plot every gold_score as a dot, ordered from largest to smallest.
# use_index=False plots against position in the sorted order instead of the
# (non-contiguous) row index.
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
fig, ax = plt.subplots(figsize=(18, 18))
aba_gold.sort_values('gold_score', ascending=False).gold_score.plot(
    ax=ax,
    style='.',
    use_index=False,
    title='Distribution of projection_volume values',
)
    
    Out[31]:
    
In [31]: