In [23]:
import numpy as np
import pandas as pd
from collections import defaultdict
import math
%matplotlib inline
import matplotlib.pyplot as plt
In [24]:
# Load the region table (tab-separated file).
aba_onto = pd.read_table('mouse_regions.tsv', sep='\t')
# lookup table: region acronym -> region id
id_by_acronym = aba_onto.set_index('acronym')['id']
aba_onto_to_aba_id = id_by_acronym.to_dict()
# sanity check against one known entry of the table
assert aba_onto_to_aba_id['TMv'] == 1
aba_onto.head()
Out[24]:
In [25]:
# Input file was created with a script from Lydia@ABA.
aba_raw = pd.read_csv('normalized_projection_volume.csv')
# distinct injection structures: these are the regions on the y-axis
y_es = aba_raw['primary_injection_structure'].unique()
aba_raw.head()
Out[25]:
In [26]:
# Collapse all data_sets (experiments) into one row per injection structure.
# data_set_id is dropped up front since it is not needed after grouping.
# TODO .max() or .mean()?
aba_raw = (
    aba_raw
    .drop('data_set_id', axis=1)
    .groupby('primary_injection_structure')
    .max()
)
# Remove the hemisphere suffix (trailing -R or -L). Labels that do not carry
# such a suffix are kept unchanged instead of losing their last two characters.
aba_raw.columns = [
    c[:-2] if c.endswith(('-R', '-L')) else c
    for c in aba_raw.columns
]
# regions on the x-axis; a label repeats when both of its hemisphere columns
# were present, because both collapse to the same name
x_es = aba_raw.columns
aba_raw.head()
Out[26]:
In [27]:
# peek at the first five region labels of each matrix axis
(x_es[:5], y_es[:5])
Out[27]:
In [28]:
# Flatten the matrix into tuples (region_a, region_b, projection_volume).
aba_conn_dto = []
for x in x_es:
    x_id = aba_onto_to_aba_id[x]  # same for every y, so look it up once
    for y in y_es:
        y_id = aba_onto_to_aba_id[y]
        # TODO again: .max() or .mean()?
        # Column label x can occur twice (X-R and X-L share one name after the
        # suffix was removed), so this selects 2 items; just take the max.
        # A single .loc[row, col] call replaces the chained .loc[y][x] lookup.
        value = aba_raw.loc[y, x].max()
        aba_conn_dto.append((x_id, y_id, value))

# Repeated column labels produce identical tuples; keep each one once.
aba_pairs = pd.DataFrame(
    aba_conn_dto, columns=['a_id', 'b_id', 'gold_score']
).drop_duplicates()

# Number of tuples before and after de-duplication.
# print(...) with a single argument gives the same output on Python 2 and 3.
print(len(aba_conn_dto))
print(len(aba_pairs))

# Ensure a <= b (for sorting later): 'a' is the smaller and 'b' the larger id
# of each pair, computed row-wise in one vectorised pass. Building a fresh
# frame avoids assigning into the de-duplicated frame and replaces the
# add-then-delete of the a_id/b_id columns.
aba_gold = pd.DataFrame(
    {
        'gold_score': aba_pairs['gold_score'],
        'a': aba_pairs[['a_id', 'b_id']].min(axis=1),
        'b': aba_pairs[['a_id', 'b_id']].max(axis=1),
    },
    columns=['gold_score', 'a', 'b'],
)
In [29]:
# Five strongest connections. DataFrame.sort was removed from pandas;
# sort_values is its replacement.
aba_gold.sort_values('gold_score', ascending=False).head()
Out[29]:
In [30]:
# summary statistics of the gold scores
aba_gold['gold_score'].describe()
Out[30]:
In [31]:
# Plot every gold score as a dot, ordered from largest to smallest.
fig, ax = plt.subplots(figsize=(18, 18))
# DataFrame.sort was removed from pandas; sort_values is its replacement.
ranked_scores = aba_gold.sort_values('gold_score', ascending=False)['gold_score']
# use_index=False plots against position in the sorted order, not the row index
ranked_scores.plot(
    ax=ax,
    style='.',
    use_index=False,
    title='Distribution of projection_volume values',
)
ax.set_xlabel('rank (descending gold_score)')
ax.set_ylabel('gold_score')
ax
Out[31]:
In [31]: