In [1]:
    
# Set-up default environment
%run '../ipython_startup.py'
# Import additional libraries
import sas7bdat as sas
import cPickle as pickle
import statsmodels.formula.api as smf
from ase_cisEq import marenEq
from ase_cisEq import marenPrintTable
from ase_normalization import meanCenter
from ase_normalization import q3Norm
from ase_normalization import meanStd
pjoin = os.path.join
    
    
This data set was created by: ase_summarize_ase_filters.sas
The data has had the following droped:
In [2]:
    
# Import clean dataset
with sas.SAS7BDAT(pjoin(PROJ, 'sas_data/clean_ase_stack.sas7bdat')) as FH:
    df = FH.to_data_frame()
    
dfClean = df[['line', 'mating_status', 'fusion_id', 'flag_AI_combined', 'q5_mean_theta', 'sum_both', 'sum_line', 'sum_tester', 'sum_total', 'mean_apn']]
print 'Rows ' + str(dfClean.shape[0])
print 'Columns ' + str(dfClean.shape[1])
print 'Number of Genotypes ' + str(len(set(dfClean['line'].tolist())))
print 'Number of Exonic Regions ' + str(len(set(dfClean['fusion_id'].tolist())))
    
    
    
For the maren equations, I am also going to drop exonic regions with less than 10 genotypes. The maren equations make some assumptions about the population level sums. Obvisouly the more genotypes that are present for each fusions the better, but I am comfortable with as few as 10 genotypes.
In [3]:
    
# Drop groups with less than 10 lines per fusion
grp = dfClean.groupby(['mating_status', 'fusion_id'])
dfGt10 = grp.filter(lambda x: x['line'].count() >= 10).copy()
print 'Rows ' + str(dfGt10.shape[0])
print 'Columns ' + str(dfGt10.shape[1])
print 'Number of Genotypes ' + str(len(set(dfGt10['line'].tolist())))
print 'Number of Exonic Regions ' + str(len(set(dfGt10['fusion_id'].tolist())))
    
    
In [4]:
    
# Calculate Maren TIG equations by mating status and exonic region
marenRawCounts = marenEq(dfGt10, Eii='sum_line', Eti='sum_tester', group=['mating_status', 'fusion_id'])
marenRawCounts['mag_cis'] = abs(marenRawCounts['cis_line'])
marenRawCounts.columns
    
    Out[4]:
In [5]:
    
# Plot densities
def panelKde(df, **kwargs):
    options = {'subplots': True, 
               'layout': (7, 7), 
               'figsize': (20, 20), 
               'xlim': (-500, 500), 
               'legend': False,
               'color': 'k'}
    options.update(kwargs)
    
    # Make plot
    axes = df.plot(kind='kde', **options)
    
    # Add titles intead of legends
    try:
        for ax in axes.ravel():
            h, l = ax.get_legend_handles_labels()
            ax.set_title(l[0])
            ax.get_yaxis().set_visible(False)
            ax.axvline(0, lw=1)
    except:
        ax = axes
        ax.get_yaxis().set_visible(False)
        ax.axvline(0, lw=1)
    
    return plt.gcf()
    
def linePanel(df, value='cis_line', index='fusion_id', columns='line'):
    
    mymap = {
        'cis_line': 'cis-Line Effects',
        'trans_line': 'trans-Line Effects',
        'line': 'genotype',
        'fusion_id': 'exonic_region'
    }
    
    # Iterate over mated and virgin
    for k, v in {'M': 'Mated', 'V': 'Virgin'}.iteritems():
        
        # Pivot data frame so that the thing you want to make panels by is in columns.
        dfPiv = pd.pivot_table(df[df['mating_status'] == k], 
                               values=value, index=index, columns=columns)
        
        # Generate panel plot with at most 49 panels
        if value == 'cis_line':
            xlim = (-500, 500)
        else:
            # trans-effects appear to be larger in magnitude
            xlim = (-1500, 1500)
            
        fig = panelKde(dfPiv.iloc[:, :49], xlim=xlim)
        
        title = '{}\n{}'.format(mymap[value], v)
        fig.suptitle(title, fontsize=18, fontweight='bold')
        
        fname = pjoin(PROJ, 'pipeline_output/cis_effects/density_plot_by_{}_{}_{}.png'.format(mymap[columns], value, v.lower()))
        plt.savefig(fname, bbox_inches='tight')
        print("Saved figure to: " + fname)
        
        plt.close()
        
def testerPanel(df, value='cis_tester'):
    mymap = {
        'cis_tester': 'cis-Tester Effects',
        'trans_tester': 'trans-Trans Effects'
    }
    
    # Iterate over mated and virgin
    for k, v in {'M': 'Mated', 'V': 'Virgin'}.iteritems():
        
        # Split table by mating status and drop duplicates, because 
        # there is only one tester value for each exonic region
        dfSub = df.ix[df['mating_status'] == k,['fusion_id', value]].drop_duplicates()
        
        # Generate Panel Plot
        fig = panelKde(dfSub, subplots=False)
        
        title = '{}\n{}'.format(mymap[value], v)
        fig.suptitle(title, fontsize=18, fontweight='bold')
        
        fname = pjoin(PROJ, 'pipeline_output/cis_effects/density_plot_{}_{}.png'.format(value, v.lower()))
        plt.savefig(fname, bbox_inches='tight')
        print("Saved figure to: " + fname)
        
        plt.close()
    
In [7]:
    
# Cis and trans line effects by genotype
linePanel(marenRawCounts, value='cis_line', index='fusion_id', columns='line')
linePanel(marenRawCounts, value='trans_line', index='fusion_id', columns='line')
# Cis and trans line effects by exonic region
linePanel(marenRawCounts, value='cis_line', index='line', columns='fusion_id')
linePanel(marenRawCounts, value='trans_line', index='line', columns='fusion_id')
# Cis and trans tester effects
testerPanel(marenRawCounts, value='cis_tester')
testerPanel(marenRawCounts, value='trans_tester')
    
    
In [14]:
    
# Set Globals
SHAPES = {'M': 'o', 'V': '^'}
CMAP='jet'
# Add color column to color by genotype
colors = {}
cnt = 0
genos = set(dfGt10['line'].tolist())
for l in genos:
    colors[l] = cnt
    cnt += 1
marenRawCounts['color'] = marenRawCounts['line'].map(colors)
    
In [15]:
    
# Plotting scatter
def getR2(df, x, y):
    """ Calculate the R-squared using OLS with an intercept """
    formula = '{} ~ {} + 1'.format(y, x)
    return smf.ols(formula, df).fit().rsquared
def scatPlt(df, x, y, c=None, cmap='jet', s=50, marker='o', ax=None, title=None, xlab=None, ylab=None, diag='pos'):
    """ Make a scatter plot using some default options """
    
    ax = df.plot(x, y, kind='scatter', ax=ax, c=c, cmap=cmap, s=s, marker=marker, title=title, colorbar=False)
    
    # Add a diag line
    if diag == 'neg':
        # draw a diag line with negative slope
        ax.plot([0, 1], [1, 0], transform=ax.transAxes)
    elif diag == 'pos':
        # draw a diag line with positive slope
        ax.plot([0, 1], [0, 1], transform=ax.transAxes)
        
    ax.set_xlabel(xlab)
    ax.set_ylabel(ylab)
    
    return ax
def scatPltPanel(df, line='sum_line', tester='sum_tester', x='cis_line', y='prop', cmap='jet', s=60, panel_title=None, diag='pos'):
    """ Make a panel of scatter plots using pandas """
    # Plot the cis-line effects x proportion Line by fusion
    df['prop'] = 1 - df[tester] / (df[line] + df[tester])
    
    # Create 5x5 panel plot
    fig, axes = plt.subplots(5, 5, figsize=(20, 20))
    fig.suptitle(panel_title, fontsize=12, fontweight='bold')
    axes = axes.ravel()
    # Group by fusion_id
    for i, (n, gdf) in enumerate(df.groupby('fusion_id')):
        ax = axes[i]
                    
        # Calculate R-squared value
        r2 = getR2(gdf, x, y)
        # Make new title with R-squared in it
        t = '{}\nR^2: {}'.format(n, round(r2, 3))
        # Change marker style based on mating status and plot
        for ms, msdf in gdf.groupby('mating_status'):
            scatPlt(msdf, x, y, c='color', cmap=cmap, s=s, marker=SHAPES[ms], ax=ax, title=t, xlab=x, ylab=y, diag=diag)
        # only plot 25 fusions
        if i == 24:
            break
    
    fname = pjoin(PROJ, 'pipeline_output/cis_effects/scatter_plot_by_exonic_region_{}_v_{}.png'.format(x, y))
    plt.savefig(fname, bbox_inches='tight')
    print("Saved figure to: " + fname)
    
    plt.close()
    
In [16]:
    
# Plot the cis-line effects x proportion by fusion
scatPltPanel(marenRawCounts, line='sum_line', tester='sum_tester', cmap=CMAP, panel_title='Raw Counts: cis-line')
    
    
In [17]:
    
# Plot the trans-line effects x proportion by fusion
scatPltPanel(marenRawCounts, line='sum_line', tester='sum_tester', x='trans_line', cmap=CMAP, panel_title='Raw Counts: trans-line', diag='neg')
    
    
In [18]:
    
# Plot F10005_SI
FUSION='F10005_SI'
dfFus = marenRawCounts[marenRawCounts['fusion_id'] == FUSION].copy()
dfFus['prop'] = 1 - dfFus['sum_tester'] / (dfFus['sum_line'] + dfFus['sum_tester'])
# Generate 3 panel plot
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
fig.suptitle(FUSION, fontsize=14, fontweight='bold')
for n, mdf in dfFus.groupby('mating_status'):
    # Plot the cis-line effects x proportion by fusion
    scatPlt(mdf, x='cis_line', y='prop', ax=axes[0], c='color', cmap=CMAP, marker=SHAPES[n], title='cis-line', xlab='cis-line', ylab='prop')
    # Plot the trans-line effects x proportion by fusion
    scatPlt(mdf, x='trans_line', y='prop', ax=axes[1], c='color', cmap=CMAP, marker=SHAPES[n], title='trans-line', xlab='trans-line', ylab='prop', diag='neg')
    # Plot the Tester effects x proportion by fusion
    scatPlt(mdf, x='cis_tester', y='prop', ax=axes[2], c='color', cmap=CMAP,  marker=SHAPES[n], title='Tester', xlab='cis-tester', ylab='prop', diag=None)
fname = pjoin(PROJ, 'pipeline_output/cis_effects/scatter_plot_{}_effects_v_prop.png'.format(FUSION))
plt.savefig(fname, bbox_inches='tight')
print("Saved figure to: " + fname)
plt.close()
    
    
In [19]:
    
# Plot F10317_SI
FUSION='F10317_SI'
dfFus = marenRawCounts[marenRawCounts['fusion_id'] == FUSION].copy()
dfFus['prop'] = 1 - dfFus['sum_tester'] / (dfFus['sum_line'] + dfFus['sum_tester'])
# Generate 3 panel plot
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
fig.suptitle(FUSION, fontsize=14, fontweight='bold')
for n, mdf in dfFus.groupby('mating_status'):
    # Plot the cis-line effects x proportion by fusion
    scatPlt(mdf, x='cis_line', y='prop', ax=axes[0], c='color', cmap=CMAP, marker=SHAPES[n], title='cis-line', xlab='cis-line', ylab='prop')
    # Plot the trans-line effects x proportion by fusion
    scatPlt(mdf, x='trans_line', y='prop', ax=axes[1], c='color', cmap=CMAP, marker=SHAPES[n], title='trans-line', xlab='trans-line', ylab='prop', diag='neg')
    # Plot the Tester effects x proportion by fusion
    scatPlt(mdf, x='cis_tester', y='prop', ax=axes[2], c='color', cmap=CMAP,  marker=SHAPES[n], title='Tester', xlab='cis-tester', ylab='prop', diag=None)
fname = pjoin(PROJ, 'pipeline_output/cis_effects/scatter_plot_{}_effects_v_prop.png'.format(FUSION))
plt.savefig(fname, bbox_inches='tight')
print("Saved figure to: " + fname)
plt.close()
    
    
In [20]:
    
# Plot F10482_SI
FUSION='F10482_SI'
dfFus = marenRawCounts[marenRawCounts['fusion_id'] == FUSION].copy()
dfFus['prop'] = 1 - dfFus['sum_tester'] / (dfFus['sum_line'] + dfFus['sum_tester'])
# Generate 3 panel plot
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
fig.suptitle(FUSION, fontsize=14, fontweight='bold')
for n, mdf in dfFus.groupby('mating_status'):
    # Plot the cis-line effects x proportion by fusion
    scatPlt(mdf, x='cis_line', y='prop', ax=axes[0], c='color', cmap=CMAP, marker=SHAPES[n], title='cis-line', xlab='cis-line', ylab='prop')
    # Plot the trans-line effects x proportion by fusion
    scatPlt(mdf, x='trans_line', y='prop', ax=axes[1], c='color', cmap=CMAP, marker=SHAPES[n], title='trans-line', xlab='trans-line', ylab='prop', diag='neg')
    # Plot the Tester effects x proportion by fusion
    scatPlt(mdf, x='cis_tester', y='prop', ax=axes[2], c='color', cmap=CMAP,  marker=SHAPES[n], title='Tester', xlab='cis-tester', ylab='prop', diag=None)
fname = pjoin(PROJ, 'pipeline_output/cis_effects/scatter_plot_{}_effects_v_prop.png'.format(FUSION))
plt.savefig(fname, bbox_inches='tight')
print("Saved figure to: " + fname)
plt.close()
    
    
In [71]:
    
f1005
    
    Out[71]:
In [ ]:
    
    
In [ ]:
    
    
In [17]:
    
marenRawCounts.columns
    
    Out[17]:
In [15]:
    
meanByMsLine = marenRawCounts[['mean_apn', 'cis_line', 'mating_status', 'line']].groupby(['mating_status', 'line']).mean()
meanByMsLine.columns
    
    Out[15]:
In [60]:
    
meanByMsLine.plot(kind='scatter', x='mean_apn', y='cis_line')
    
    Out[60]:
    
In [61]:
    
    
In [68]:
    
def cisAPN(df, fusion, value='cis_line', xcutoff='>=150', ycutoff='<=-180'):
    """ Plot effects vs mean apn"""
    # Pull out fusion of interest
    dfSub = marenRawCounts[marenRawCounts['fusion_id'] == fusion]
    # Make scatter plot
    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    dfSub.plot(kind='scatter', x='mean_apn', y='cis_line', ax=ax, title=fusion)
    # Annotate outliers
    xc = 
    filt = dfSub.loc[(dfSub[value] eval(ycutoff)) | (dfSub['mean_apn'] eval(xcutoff)), ['line', 'mating_status', 'mean_apn', 'cis_line']]
    for row in filt.values:
        line, ms, apn, cis = row
        ax.annotate(line + '_' + ms, xy=(apn, cis))
    fname = pjoin(PROJ, 'pipeline_output/cis_effects/scatter_plot_{}_{}_v_meanApn.png'.format(fusion, value))
    plt.savefig(fname, bbox_inches='tight')
    
    
In [70]:
    
eval("{} == 'M'".format(marenRawCounts['mating_status']))
    
    
In [ ]: