In [1]:
# Run the shared start-up script. Later cells use os, PROJ, pd, np and plt
# without importing them, so those names are expected to be defined by it.
%run '../ipython_startup.py'
In [2]:
# Import additional libraries
# sas7bdat reads the SAS dataset, combinations enumerates pairs of lines,
# and seaborn draws the heatmaps further down.
import sas7bdat as sas
from itertools import combinations
import seaborn
# Shorthand for building file paths. `os` is not imported in this cell, so it
# is expected to be provided by the start-up script run in the first cell.
pjoin = os.path.join
In [3]:
# Read the cis_eq SAS dataset into a pandas DataFrame
cisEqPath = pjoin(PROJ, 'sas_data/cis_eq.sas7bdat')
with sas.SAS7BDAT(cisEqPath) as sasReader:
    df = sasReader.to_data_frame()
In [7]:
# Preview the first five rows of the imported data
df.head(n=5)
Out[7]:
In [90]:
# Split by mating status: rows with status 'M' go to `mated`, 'V' to `virgin`
isMated = df['mating_status'] == 'M'
isVirgin = df['mating_status'] == 'V'
mated = df[isMated]
virgin = df[isVirgin]
In [108]:
def loop(df, func):
    """Apply a pairwise metric to every pair of lines.

    For each unordered pair of distinct values in df['line'], the rows
    belonging to either line are selected and handed to `func`. The value
    returned by `func` is written to both (a, b) and (b, a) of a square
    DataFrame indexed and labelled by line, so the result is symmetric.
    The matrix starts out as an identity matrix, so diagonal entries stay
    at 1. Keeping the metric as an argument makes it easy to swap in
    different distance measures.
    """
    # Unique line labels, in order of first appearance
    lineLabels = df['line'].factorize()[1]
    nLines = lineLabels.shape[0]

    # Square results matrix, identity-initialised
    dfResults = pd.DataFrame(np.eye(nLines, nLines), index=lineLabels, columns=lineLabels)

    # Visit each unordered pair of lines exactly once
    for lineA, lineB in combinations(lineLabels, 2):
        # Rows that belong to either member of the current pair
        inPair = (df['line'] == lineA) | (df['line'] == lineB)

        # Evaluate the chosen metric on the pair's rows
        score = func(df[inPair])

        # Mirror the score across the diagonal
        dfResults.loc[lineA, lineB] = score
        dfResults.loc[lineB, lineA] = score

    return dfResults
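Here I use `flag_AI_combined` from the Bayesian machine to score each pair of lines. For each pair, I drop all fusions that are not present in both lines (i.e., drop NaN), count the fusions that are flagged for AI in both lines, and divide that count by the number of non-missing fusions, to normalize for the number of fusions present in both datasets. Note that a larger value means the two lines share more AI calls, so this score is a similarity rather than a distance.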
In [109]:
def AI_dist(df):
    """Fraction of shared fusions that are flagged for AI in both lines.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a pair of lines, with columns 'line', 'fusion_id' and
        'flag_AI_combined'. Each (fusion_id, line) combination must occur at
        most once, otherwise the unstack below raises.
        NOTE(review): assumes flag_AI_combined is a 0/1 flag, so a row sum of
        2 means both lines were flagged -- confirm against the data.

    Returns
    -------
    float
        (# fusions flagged in both lines) / (# fusions with a non-missing
        flag in both lines). NaN when the two lines share no fusions, so the
        caller's pairwise loop can continue instead of dividing by zero.
    """
    # Side-by-side table: fusion_id as rows and lines as columns.
    dfSbs = df[['line', 'fusion_id', 'flag_AI_combined']].set_index(['fusion_id', 'line']).unstack()

    # Keep only fusions measured in both lines. This must happen BEFORE the
    # row sum: DataFrame.sum skips NaN by default, so dropping NaN from the
    # sums afterwards would leave fusions seen in only one line in the
    # denominator.
    dfBoth = dfSbs.dropna()

    # How many fusions did we test in both lines
    total = dfBoth.shape[0]
    if total == 0:
        return float('nan')

    # Sum flag_AI_combined across lines; a sum of 2 means both lines had AI
    dfSum = dfBoth.sum(axis=1)
    num = int((dfSum == 2).sum())

    return num / float(total)
In [121]:
# Pairwise AI scores between lines, computed separately for each mating status
aiMated = loop(df=mated, func=AI_dist)
aiVirgin = loop(df=virgin, func=AI_dist)
In [113]:
# Preview the first five rows of the mated AI matrix
aiMated.head(n=5)
Out[113]:
In [139]:
# Both panels share a single colorbar, so they must share the same colour
# limits. Derive them from the off-diagonal entries of both matrices: the
# diagonal is fixed at 1 by loop() and would otherwise dominate the scale.
aiOffDiag = np.concatenate([
    mat.values[~np.eye(mat.shape[0], dtype=bool)]
    for mat in (aiMated, aiVirgin)
])
aiVmin, aiVmax = np.nanmin(aiOffDiag), np.nanmax(aiOffDiag)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
# Dedicated axes on the right-hand edge for the shared colorbar
cbar_ax = fig.add_axes([.94, .2, .03, .5])
seaborn.heatmap(aiMated, vmin=aiVmin, vmax=aiVmax, square=True, ax=ax1, cbar=False)
ax1.set_title('Mated')
seaborn.heatmap(aiVirgin, vmin=aiVmin, vmax=aiVmax, square=True, ax=ax2, cbar_ax=cbar_ax)
# Trailing semicolon suppresses the text repr of the returned title object
ax2.set_title('Virgin');
Out[139]:
In [152]:
def cis_direction_dist(df):
    """Fraction of shared fusions whose cis effects have the same sign.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a pair of lines, with columns 'line', 'fusion_id' and
        'cis_line'. Each (fusion_id, line) combination must occur at most
        once, otherwise the unstack below raises.

    Returns
    -------
    float
        (# fusions where both lines are > 0 or both are < 0) / (# fusions
        with a non-missing cis_line in both lines). A value of exactly 0 in
        either line never counts as "same direction". NaN when the two lines
        share no fusions.
    """
    # Side-by-side table: fusion_id as rows and lines as columns.
    dfSbs = df[['line', 'fusion_id', 'cis_line']].set_index(['fusion_id', 'line']).unstack()

    # Drop fusions that are missing in either line
    noMiss = dfSbs.dropna()

    # How many fusions did we test, after removing NaN
    total = noMiss.shape[0]
    if total == 0:
        return float('nan')

    # Select the two line COLUMNS. Plain .iloc[0] / .iloc[1] would pick the
    # first two fusion rows instead and compare those with each other.
    first = noMiss.iloc[:, 0]
    second = noMiss.iloc[:, 1]

    # True where both lines have a positive, or both a negative, cis effect
    flag = ((first > 0) & (second > 0)) | ((first < 0) & (second < 0))

    # Number in same direction
    num = flag.sum()

    return num / float(total)
In [153]:
# Pairwise cis-direction scores between lines, separately for each mating status
cis_direction_Mated = loop(df=mated, func=cis_direction_dist)
cis_direction_Virgin = loop(df=virgin, func=cis_direction_dist)
In [154]:
# Preview the first five rows of the mated cis-direction matrix
cis_direction_Mated.head(n=5)
Out[154]:
In [155]:
# Both panels share a single colorbar, so they must share the same colour
# limits. Derive them from the off-diagonal entries of both matrices instead
# of hard-coding a ceiling: the diagonal is fixed at 1 by loop() and would
# otherwise dominate the scale, and a fixed ceiling silently saturates the
# whole plot whenever the scores change magnitude.
cisOffDiag = np.concatenate([
    mat.values[~np.eye(mat.shape[0], dtype=bool)]
    for mat in (cis_direction_Mated, cis_direction_Virgin)
])
cisVmin, cisVmax = np.nanmin(cisOffDiag), np.nanmax(cisOffDiag)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
# Dedicated axes on the right-hand edge for the shared colorbar
cbar_ax = fig.add_axes([.94, .2, .03, .5])
seaborn.heatmap(cis_direction_Mated, vmin=cisVmin, vmax=cisVmax, square=True, ax=ax1, cbar=False)
ax1.set_title('Mated')
seaborn.heatmap(cis_direction_Virgin, vmin=cisVmin, vmax=cisVmax, square=True, ax=ax2, cbar_ax=cbar_ax)
# Trailing semicolon suppresses the text repr of the returned title object
ax2.set_title('Virgin');
Out[155]:
In [ ]: