Maren Equations Summary

This notebook is just pulling out the important figures and tables for the manuscript. For more detailed explanations and exploring see other notebooks.


In [1]:
# Set-up default environment
%run '../ipython_startup.py'

# Import additional libraries
import sas7bdat as sas
import cPickle as pickle
import statsmodels.formula.api as smf

from ase_cisEq import marenEq
from ase_cisEq import marenPrintTable

from ase_normalization import meanCenter
from ase_normalization import q3Norm
from ase_normalization import meanStd

pjoin = os.path.join


Importing commonly used libraries: 
            os, sys 
            numpy as np 
            scipy as sp 
            pandas as pd 
            matplotlib as mp 
            matplotlib.pyplot as plt
            datetime as dt 
            mclib_Python/flagging as fg

Creating project level variables: 
        MCLAB = /home/jfear/mclab 
        PROJ = /home/jfear/mclab/cegs_ase_paper 
        TODAY = 20150929

Adding ['scripts/mclib_Python', 'scripts/ase_Python'] to PYTHONPATH

Import clean data set

This data set was created by: ase_summarize_ase_filters.sas

The data has had the following droped:

  • regions that were always bias in 100 genome simulation
  • regions with APN $\le 25$
  • regions not in at least 10% of genotypes
  • regions not in mated and virgin
  • genotypes with extreme bias in median(q5_mean_theta)
  • genotypes with $\le500$ regions

In [2]:
# Import clean dataset
with sas.SAS7BDAT(pjoin(PROJ, 'sas_data/clean_ase_stack.sas7bdat')) as FH:
    df = FH.to_data_frame()
    
dfClean = df[['line', 'mating_status', 'fusion_id', 'flag_AI_combined', 'q5_mean_theta', 'sum_both', 'sum_line', 'sum_tester', 'sum_total', 'mean_apn']]
print 'Rows ' + str(dfClean.shape[0])
print 'Columns ' + str(dfClean.shape[1])
print 'Number of Genotypes ' + str(len(set(dfClean['line'].tolist())))
print 'Number of Exonic Regions ' + str(len(set(dfClean['fusion_id'].tolist())))


[clean_ase_stack.sas7bdat] header length 65536 != 8192
WARNING:/home/jfear/mclab/cegs_ase_paper/sas_data/clean_ase_stack.sas7bdat:[clean_ase_stack.sas7bdat] header length 65536 != 8192
Rows 159934
Columns 10
Number of Genotypes 49
Number of Exonic Regions 5391

Additional cleaning

For the maren equations, I am also going to drop exonic regions with less than 10 genotypes. The maren equations make some assumptions about the population level sums. Obvisouly the more genotypes that are present for each fusions the better, but I am comfortable with as few as 10 genotypes.


In [3]:
# Drop groups with less than 10 lines per fusion
grp = dfClean.groupby(['mating_status', 'fusion_id'])
dfGt10 = grp.filter(lambda x: x['line'].count() >= 10).copy()
print 'Rows ' + str(dfGt10.shape[0])
print 'Columns ' + str(dfGt10.shape[1])
print 'Number of Genotypes ' + str(len(set(dfGt10['line'].tolist())))
print 'Number of Exonic Regions ' + str(len(set(dfGt10['fusion_id'].tolist())))


Rows 131700
Columns 10
Number of Genotypes 49
Number of Exonic Regions 3053

Raw Counts


In [4]:
# Calculate Maren TIG equations by mating status and exonic region
marenRawCounts = marenEq(dfGt10, Eii='sum_line', Eti='sum_tester', group=['mating_status', 'fusion_id'])
marenRawCounts['mag_cis'] = abs(marenRawCounts['cis_line'])
marenRawCounts.columns


Out[4]:
Index([            u'line',    u'mating_status',        u'fusion_id',
       u'flag_AI_combined',    u'q5_mean_theta',         u'sum_both',
               u'sum_line',       u'sum_tester',        u'sum_total',
               u'mean_apn',               u'mu',         u'cis_line',
             u'trans_line',       u'cis_tester',     u'trans_tester',
                u'mag_cis'],
      dtype='object')

Plot Distribution of cis- and trans-effects


In [5]:
# Plot densities
def panelKde(df, **kwargs):
    options = {'subplots': True, 
               'layout': (7, 7), 
               'figsize': (20, 20), 
               'xlim': (-500, 500), 
               'legend': False,
               'color': 'k'}
    options.update(kwargs)
    
    # Make plot
    axes = df.plot(kind='kde', **options)
    
    # Add titles intead of legends
    try:
        for ax in axes.ravel():
            h, l = ax.get_legend_handles_labels()
            ax.set_title(l[0])
            ax.get_yaxis().set_visible(False)
            ax.axvline(0, lw=1)
    except:
        ax = axes
        ax.get_yaxis().set_visible(False)
        ax.axvline(0, lw=1)
    
    return plt.gcf()
    
def linePanel(df, value='cis_line', index='fusion_id', columns='line'):
    
    mymap = {
        'cis_line': 'cis-Line Effects',
        'trans_line': 'trans-Line Effects',
        'line': 'genotype',
        'fusion_id': 'exonic_region'
    }
    
    # Iterate over mated and virgin
    for k, v in {'M': 'Mated', 'V': 'Virgin'}.iteritems():
        
        # Pivot data frame so that the thing you want to make panels by is in columns.
        dfPiv = pd.pivot_table(df[df['mating_status'] == k], 
                               values=value, index=index, columns=columns)
        
        # Generate panel plot with at most 49 panels
        if value == 'cis_line':
            xlim = (-500, 500)
        else:
            # trans-effects appear to be larger in magnitude
            xlim = (-1500, 1500)
            
        fig = panelKde(dfPiv.iloc[:, :49], xlim=xlim)
        
        title = '{}\n{}'.format(mymap[value], v)
        fig.suptitle(title, fontsize=18, fontweight='bold')
        
        fname = pjoin(PROJ, 'pipeline_output/cis_effects/density_plot_by_{}_{}_{}.png'.format(mymap[columns], value, v.lower()))
        plt.savefig(fname, bbox_inches='tight')
        print("Saved figure to: " + fname)
        
        plt.close()
        
def testerPanel(df, value='cis_tester'):
    mymap = {
        'cis_tester': 'cis-Tester Effects',
        'trans_tester': 'trans-Trans Effects'
    }
    
    # Iterate over mated and virgin
    for k, v in {'M': 'Mated', 'V': 'Virgin'}.iteritems():
        
        # Split table by mating status and drop duplicates, because 
        # there is only one tester value for each exonic region
        dfSub = df.ix[df['mating_status'] == k,['fusion_id', value]].drop_duplicates()
        
        # Generate Panel Plot
        fig = panelKde(dfSub, subplots=False)
        
        title = '{}\n{}'.format(mymap[value], v)
        fig.suptitle(title, fontsize=18, fontweight='bold')
        
        fname = pjoin(PROJ, 'pipeline_output/cis_effects/density_plot_{}_{}.png'.format(value, v.lower()))
        plt.savefig(fname, bbox_inches='tight')
        print("Saved figure to: " + fname)
        
        plt.close()

In [7]:
# Cis and trans line effects by genotype
linePanel(marenRawCounts, value='cis_line', index='fusion_id', columns='line')
linePanel(marenRawCounts, value='trans_line', index='fusion_id', columns='line')

# Cis and trans line effects by exonic region
linePanel(marenRawCounts, value='cis_line', index='line', columns='fusion_id')
linePanel(marenRawCounts, value='trans_line', index='line', columns='fusion_id')

# Cis and trans tester effects
testerPanel(marenRawCounts, value='cis_tester')
testerPanel(marenRawCounts, value='trans_tester')


Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_by_genotype_cis_line_mated.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_by_genotype_cis_line_virgin.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_by_genotype_trans_line_mated.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_by_genotype_trans_line_virgin.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_by_exonic_region_cis_line_mated.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_by_exonic_region_cis_line_virgin.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_by_exonic_region_trans_line_mated.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_by_exonic_region_trans_line_virgin.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_cis_tester_mated.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_cis_tester_virgin.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_trans_tester_mated.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_trans_tester_virgin.png

Plot cis- and trans-effects vs Allelic Proportion


In [14]:
# Set Globals
SHAPES = {'M': 'o', 'V': '^'}
CMAP='jet'

# Add color column to color by genotype
colors = {}
cnt = 0
genos = set(dfGt10['line'].tolist())
for l in genos:
    colors[l] = cnt
    cnt += 1

marenRawCounts['color'] = marenRawCounts['line'].map(colors)

In [15]:
# Plotting scatter
def getR2(df, x, y):
    """ Calculate the R-squared using OLS with an intercept """
    formula = '{} ~ {} + 1'.format(y, x)
    return smf.ols(formula, df).fit().rsquared

def scatPlt(df, x, y, c=None, cmap='jet', s=50, marker='o', ax=None, title=None, xlab=None, ylab=None, diag='pos'):
    """ Make a scatter plot using some default options """
    
    ax = df.plot(x, y, kind='scatter', ax=ax, c=c, cmap=cmap, s=s, marker=marker, title=title, colorbar=False)
    
    # Add a diag line
    if diag == 'neg':
        # draw a diag line with negative slope
        ax.plot([0, 1], [1, 0], transform=ax.transAxes)
    elif diag == 'pos':
        # draw a diag line with positive slope
        ax.plot([0, 1], [0, 1], transform=ax.transAxes)
        
    ax.set_xlabel(xlab)
    ax.set_ylabel(ylab)
    
    return ax

def scatPltPanel(df, line='sum_line', tester='sum_tester', x='cis_line', y='prop', cmap='jet', s=60, panel_title=None, diag='pos'):
    """ Make a panel of scatter plots using pandas """
    # Plot the cis-line effects x proportion Line by fusion
    df['prop'] = 1 - df[tester] / (df[line] + df[tester])
    
    # Create 5x5 panel plot
    fig, axes = plt.subplots(5, 5, figsize=(20, 20))
    fig.suptitle(panel_title, fontsize=12, fontweight='bold')
    axes = axes.ravel()

    # Group by fusion_id
    for i, (n, gdf) in enumerate(df.groupby('fusion_id')):
        ax = axes[i]
                    
        # Calculate R-squared value
        r2 = getR2(gdf, x, y)

        # Make new title with R-squared in it
        t = '{}\nR^2: {}'.format(n, round(r2, 3))

        # Change marker style based on mating status and plot
        for ms, msdf in gdf.groupby('mating_status'):
            scatPlt(msdf, x, y, c='color', cmap=cmap, s=s, marker=SHAPES[ms], ax=ax, title=t, xlab=x, ylab=y, diag=diag)

        # only plot 25 fusions
        if i == 24:
            break
    
    fname = pjoin(PROJ, 'pipeline_output/cis_effects/scatter_plot_by_exonic_region_{}_v_{}.png'.format(x, y))
    plt.savefig(fname, bbox_inches='tight')
    print("Saved figure to: " + fname)
    
    plt.close()

In [16]:
# Plot the cis-line effects x proportion by fusion
scatPltPanel(marenRawCounts, line='sum_line', tester='sum_tester', cmap=CMAP, panel_title='Raw Counts: cis-line')


Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/scatter_plot_by_exonic_region_cis_line_v_prop.png

In [17]:
# Plot the trans-line effects x proportion by fusion
scatPltPanel(marenRawCounts, line='sum_line', tester='sum_tester', x='trans_line', cmap=CMAP, panel_title='Raw Counts: trans-line', diag='neg')


Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/scatter_plot_by_exonic_region_trans_line_v_prop.png

Plot cis- and trans-effects vs Allelic Proportion for Specific Exonic Regions


In [18]:
# Plot F10005_SI
FUSION='F10005_SI'

dfFus = marenRawCounts[marenRawCounts['fusion_id'] == FUSION].copy()
dfFus['prop'] = 1 - dfFus['sum_tester'] / (dfFus['sum_line'] + dfFus['sum_tester'])

# Generate 3 panel plot
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
fig.suptitle(FUSION, fontsize=14, fontweight='bold')

for n, mdf in dfFus.groupby('mating_status'):
    # Plot the cis-line effects x proportion by fusion
    scatPlt(mdf, x='cis_line', y='prop', ax=axes[0], c='color', cmap=CMAP, marker=SHAPES[n], title='cis-line', xlab='cis-line', ylab='prop')

    # Plot the trans-line effects x proportion by fusion
    scatPlt(mdf, x='trans_line', y='prop', ax=axes[1], c='color', cmap=CMAP, marker=SHAPES[n], title='trans-line', xlab='trans-line', ylab='prop', diag='neg')

    # Plot the Tester effects x proportion by fusion
    scatPlt(mdf, x='cis_tester', y='prop', ax=axes[2], c='color', cmap=CMAP,  marker=SHAPES[n], title='Tester', xlab='cis-tester', ylab='prop', diag=None)

fname = pjoin(PROJ, 'pipeline_output/cis_effects/scatter_plot_{}_effects_v_prop.png'.format(FUSION))
plt.savefig(fname, bbox_inches='tight')
print("Saved figure to: " + fname)

plt.close()


Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/scatter_plot_F10005_SI_effects_v_prop.png

F10317_SI

This fusion has a weaker cis-line effects but trans-line effects look more linear.


In [19]:
# Plot F10317_SI
FUSION='F10317_SI'

dfFus = marenRawCounts[marenRawCounts['fusion_id'] == FUSION].copy()
dfFus['prop'] = 1 - dfFus['sum_tester'] / (dfFus['sum_line'] + dfFus['sum_tester'])

# Generate 3 panel plot
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
fig.suptitle(FUSION, fontsize=14, fontweight='bold')

for n, mdf in dfFus.groupby('mating_status'):
    # Plot the cis-line effects x proportion by fusion
    scatPlt(mdf, x='cis_line', y='prop', ax=axes[0], c='color', cmap=CMAP, marker=SHAPES[n], title='cis-line', xlab='cis-line', ylab='prop')

    # Plot the trans-line effects x proportion by fusion
    scatPlt(mdf, x='trans_line', y='prop', ax=axes[1], c='color', cmap=CMAP, marker=SHAPES[n], title='trans-line', xlab='trans-line', ylab='prop', diag='neg')

    # Plot the Tester effects x proportion by fusion
    scatPlt(mdf, x='cis_tester', y='prop', ax=axes[2], c='color', cmap=CMAP,  marker=SHAPES[n], title='Tester', xlab='cis-tester', ylab='prop', diag=None)

fname = pjoin(PROJ, 'pipeline_output/cis_effects/scatter_plot_{}_effects_v_prop.png'.format(FUSION))
plt.savefig(fname, bbox_inches='tight')
print("Saved figure to: " + fname)

plt.close()


Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/scatter_plot_F10317_SI_effects_v_prop.png

F10482_SI

This fusion has a weaker cis-line effects but trans-line effects look more linear.


In [20]:
# Plot F10482_SI
FUSION='F10482_SI'

dfFus = marenRawCounts[marenRawCounts['fusion_id'] == FUSION].copy()
dfFus['prop'] = 1 - dfFus['sum_tester'] / (dfFus['sum_line'] + dfFus['sum_tester'])

# Generate 3 panel plot
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
fig.suptitle(FUSION, fontsize=14, fontweight='bold')

for n, mdf in dfFus.groupby('mating_status'):
    # Plot the cis-line effects x proportion by fusion
    scatPlt(mdf, x='cis_line', y='prop', ax=axes[0], c='color', cmap=CMAP, marker=SHAPES[n], title='cis-line', xlab='cis-line', ylab='prop')

    # Plot the trans-line effects x proportion by fusion
    scatPlt(mdf, x='trans_line', y='prop', ax=axes[1], c='color', cmap=CMAP, marker=SHAPES[n], title='trans-line', xlab='trans-line', ylab='prop', diag='neg')

    # Plot the Tester effects x proportion by fusion
    scatPlt(mdf, x='cis_tester', y='prop', ax=axes[2], c='color', cmap=CMAP,  marker=SHAPES[n], title='Tester', xlab='cis-tester', ylab='prop', diag=None)

fname = pjoin(PROJ, 'pipeline_output/cis_effects/scatter_plot_{}_effects_v_prop.png'.format(FUSION))
plt.savefig(fname, bbox_inches='tight')
print("Saved figure to: " + fname)

plt.close()


Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/scatter_plot_F10482_SI_effects_v_prop.png

In [71]:
f1005


Out[71]:
line mating_status fusion_id flag_AI_combined q5_mean_theta sum_both sum_line sum_tester sum_total mean_apn mu cis_line trans_line cis_tester trans_tester mag_cis
10 r101 M F10005_SI 0 0.475 1274 155 133 1562 29.523922 353.733333 28.666667 -448.133333 6.666667 -6.666667 28.666667
11 r280 M F10005_SI 0 0.545 1208 126 143 1477 27.917307 353.733333 -10.333333 -428.133333 6.666667 -6.666667 10.333333
12 r315 M F10005_SI 0 0.490 1231 237 218 1686 31.867690 353.733333 25.666667 -278.133333 6.666667 -6.666667 25.666667
13 r324 M F10005_SI 1 0.450 2554 349 276 3179 60.087419 353.733333 79.666667 -162.133333 6.666667 -6.666667 79.666667
14 r335 M F10005_SI 0 0.514 1339 215 217 1771 33.474306 353.733333 4.666667 -280.133333 6.666667 -6.666667 4.666667
15 r340 M F10005_SI 1 0.241 1897 332 102 2331 44.059067 353.733333 236.666667 -510.133333 6.666667 -6.666667 236.666667
16 r357 M F10005_SI 1 0.542 2797 358 408 3563 67.345540 353.733333 -43.333333 101.866667 6.666667 -6.666667 43.333333
17 r358 M F10005_SI 0 0.480 1494 185 162 1841 34.797401 353.733333 29.666667 -390.133333 6.666667 -6.666667 29.666667
18 r365 M F10005_SI 0 0.508 2313 385 386 3084 58.291790 353.733333 5.666667 57.866667 6.666667 -6.666667 5.666667
19 r373 M F10005_SI 0 0.450 1546 146 113 1805 34.116952 353.733333 39.666667 -488.133333 6.666667 -6.666667 39.666667
20 r374 M F10005_SI 0 0.412 1703 231 155 2089 39.484938 353.733333 82.666667 -404.133333 6.666667 -6.666667 82.666667
21 r380 M F10005_SI 0 0.472 1584 295 254 2133 40.316598 353.733333 47.666667 -206.133333 6.666667 -6.666667 47.666667
22 r427 M F10005_SI 0 0.501 4246 522 510 5278 99.761370 353.733333 18.666667 305.866667 6.666667 -6.666667 18.666667
23 r491 M F10005_SI 0 0.484 3827 657 599 5083 96.075605 353.733333 64.666667 483.866667 6.666667 -6.666667 64.666667
24 r517 M F10005_SI 0 0.480 2591 580 520 3691 69.764914 353.733333 66.666667 325.866667 6.666667 -6.666667 66.666667
25 r732 M F10005_SI 0 0.471 1281 276 237 1794 33.909037 353.733333 45.666667 -240.133333 6.666667 -6.666667 45.666667
26 r737 M F10005_SI 0 0.566 1576 126 156 1858 35.118724 353.733333 -23.333333 -402.133333 6.666667 -6.666667 23.333333
27 r799 M F10005_SI 0 0.487 2798 520 480 3798 71.787360 353.733333 46.666667 245.866667 6.666667 -6.666667 46.666667
28 r820 M F10005_SI 0 0.442 1880 236 180 2296 43.397519 353.733333 62.666667 -354.133333 6.666667 -6.666667 62.666667
29 r85 M F10005_SI 0 0.606 1086 131 191 1408 26.613113 353.733333 -53.333333 -332.133333 6.666667 -6.666667 53.333333
30 w114 M F10005_SI 0 0.524 1042 230 243 1515 28.635558 353.733333 -6.333333 -228.133333 6.666667 -6.666667 6.333333
31 w38 M F10005_SI 1 0.537 2046 444 501 2991 56.533963 353.733333 -50.333333 287.866667 6.666667 -6.666667 50.333333
32 w47 M F10005_SI 0 0.504 5492 1425 1417 8334 157.523922 353.733333 14.666667 2119.866667 6.666667 -6.666667 14.666667
33 w52 M F10005_SI 0 0.501 3848 407 397 4652 87.929120 353.733333 16.666667 79.866667 6.666667 -6.666667 16.666667
34 w55 M F10005_SI 1 0.784 4191 252 882 5325 100.649734 353.733333 -623.333333 1049.866667 6.666667 -6.666667 623.333333
35 w59 M F10005_SI 0 0.481 1429 367 329 2125 40.165387 353.733333 44.666667 -56.133333 6.666667 -6.666667 44.666667
36 w64 M F10005_SI 0 0.467 2153 284 239 2676 50.580035 353.733333 51.666667 -236.133333 6.666667 -6.666667 51.666667
37 w68 M F10005_SI 0 0.460 1915 371 305 2591 48.973420 353.733333 72.666667 -104.133333 6.666667 -6.666667 72.666667
38 w76 M F10005_SI 0 0.578 1757 286 376 2419 45.722386 353.733333 -83.333333 37.866667 6.666667 -6.666667 83.333333
39 w79 M F10005_SI 0 0.611 2596 384 583 3563 67.345540 353.733333 -192.333333 451.866667 6.666667 -6.666667 192.333333
65860 r101 V F10005_SI 1 0.461 4444 454 377 5275 99.704666 370.816667 74.433333 14.933333 -2.566667 2.566667 74.433333
65861 r280 V F10005_SI 0 0.481 1472 236 210 1918 36.252806 370.816667 23.433333 -319.066667 -2.566667 2.566667 23.433333
65862 r315 V F10005_SI 1 0.436 2754 674 508 3936 74.395747 370.816667 163.433333 276.933333 -2.566667 2.566667 163.433333
65863 r324 V F10005_SI 0 0.441 2473 331 252 3056 57.762552 370.816667 76.433333 -235.066667 -2.566667 2.566667 76.433333
65864 r335 V F10005_SI 0 0.498 1359 185 175 1719 32.491435 370.816667 7.433333 -389.066667 -2.566667 2.566667 7.433333
65865 r340 V F10005_SI 0 0.511 1661 166 165 1992 37.651506 370.816667 -1.566667 -409.066667 -2.566667 2.566667 1.566667
65866 r357 V F10005_SI 0 0.520 2373 322 335 3030 57.271116 370.816667 -15.566667 -69.066667 -2.566667 2.566667 15.566667
65867 r358 V F10005_SI 0 0.526 2075 238 252 2565 48.481985 370.816667 -16.566667 -235.066667 -2.566667 2.566667 16.566667
65868 r365 V F10005_SI 0 0.500 3119 489 474 4082 77.155346 370.816667 12.433333 208.933333 -2.566667 2.566667 12.433333
65869 r373 V F10005_SI 0 0.426 3498 371 266 4135 78.157118 370.816667 102.433333 -207.066667 -2.566667 2.566667 102.433333
65870 r374 V F10005_SI 0 0.527 2930 401 433 3764 71.144714 370.816667 -34.566667 126.933333 -2.566667 2.566667 34.566667
65871 r380 V F10005_SI 0 0.477 1724 371 327 2422 45.779090 370.816667 41.433333 -85.066667 -2.566667 2.566667 41.433333
65872 r427 V F10005_SI 0 0.479 3023 367 326 3716 70.237448 370.816667 38.433333 -87.066667 -2.566667 2.566667 38.433333
65873 r491 V F10005_SI 0 0.517 2550 448 465 3463 65.455405 370.816667 -19.566667 190.933333 -2.566667 2.566667 19.566667
65874 r517 V F10005_SI 0 0.425 1393 363 258 2014 38.067336 370.816667 102.433333 -223.066667 -2.566667 2.566667 102.433333
65875 r732 V F10005_SI 0 0.532 1126 186 202 1514 28.616657 370.816667 -18.566667 -335.066667 -2.566667 2.566667 18.566667
65876 r737 V F10005_SI 0 0.457 1353 148 118 1619 30.601299 370.816667 27.433333 -503.066667 -2.566667 2.566667 27.433333
65877 r799 V F10005_SI 0 0.488 1609 272 250 2131 40.278795 370.816667 19.433333 -239.066667 -2.566667 2.566667 19.433333
65878 r820 V F10005_SI 0 0.506 2337 248 245 2830 53.490845 370.816667 0.433333 -249.066667 -2.566667 2.566667 0.433333
65879 r85 V F10005_SI 0 0.562 1859 234 289 2382 45.023036 370.816667 -57.566667 -161.066667 -2.566667 2.566667 57.566667
65880 w114 V F10005_SI 0 0.397 1095 472 300 1867 35.288836 370.816667 169.433333 -139.066667 -2.566667 2.566667 169.433333
65881 w38 V F10005_SI 0 0.514 3793 704 727 5224 98.740697 370.816667 -25.566667 714.933333 -2.566667 2.566667 25.566667
65882 w47 V F10005_SI 0 0.495 2971 772 737 4480 84.678086 370.816667 32.433333 734.933333 -2.566667 2.566667 32.433333
65883 w52 V F10005_SI 0 0.495 2136 225 212 2573 48.633196 370.816667 10.433333 -315.066667 -2.566667 2.566667 10.433333
65884 w55 V F10005_SI 1 0.773 3809 228 750 4787 90.480803 370.816667 -524.566667 760.933333 -2.566667 2.566667 524.566667
65885 w59 V F10005_SI 0 0.567 1531 329 417 2277 43.038393 370.816667 -90.566667 94.933333 -2.566667 2.566667 90.566667
65886 w64 V F10005_SI 0 0.486 3494 423 387 4304 81.351447 370.816667 33.433333 34.933333 -2.566667 2.566667 33.433333
65887 w68 V F10005_SI 1 0.438 2826 639 487 3952 74.698169 370.816667 149.433333 234.933333 -2.566667 2.566667 149.433333
65888 w76 V F10005_SI 0 0.514 2561 475 488 3524 66.608387 370.816667 -15.566667 236.933333 -2.566667 2.566667 15.566667
65889 w79 V F10005_SI 1 0.633 2754 392 654 3800 71.825162 370.816667 -264.566667 568.933333 -2.566667 2.566667 264.566667

In [ ]:


In [ ]:


In [17]:
marenRawCounts.columns


Out[17]:
Index([            u'line',    u'mating_status',        u'fusion_id',
       u'flag_AI_combined',    u'q5_mean_theta',         u'sum_both',
               u'sum_line',       u'sum_tester',        u'sum_total',
               u'mean_apn',               u'mu',         u'cis_line',
             u'trans_line',       u'cis_tester',     u'trans_tester',
                u'mag_cis'],
      dtype='object')

In [15]:
meanByMsLine = marenRawCounts[['mean_apn', 'cis_line', 'mating_status', 'line']].groupby(['mating_status', 'line']).mean()
meanByMsLine.columns


Out[15]:
Index([u'mean_apn', u'cis_line'], dtype='object')

In [60]:
meanByMsLine.plot(kind='scatter', x='mean_apn', y='cis_line')


Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f2b4a987450>

In [61]:


In [68]:
def cisAPN(df, fusion, value='cis_line', xcutoff='>=150', ycutoff='<=-180'):
    """ Plot effects vs mean apn"""
    # Pull out fusion of interest
    dfSub = marenRawCounts[marenRawCounts['fusion_id'] == fusion]

    # Make scatter plot
    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    dfSub.plot(kind='scatter', x='mean_apn', y='cis_line', ax=ax, title=fusion)

    # Annotate outliers
    xc = 
    filt = dfSub.loc[(dfSub[value] eval(ycutoff)) | (dfSub['mean_apn'] eval(xcutoff)), ['line', 'mating_status', 'mean_apn', 'cis_line']]
    for row in filt.values:
        line, ms, apn, cis = row
        ax.annotate(line + '_' + ms, xy=(apn, cis))

    fname = pjoin(PROJ, 'pipeline_output/cis_effects/scatter_plot_{}_{}_v_meanApn.png'.format(fusion, value))
    plt.savefig(fname, bbox_inches='tight')


  File "<ipython-input-68-ab89754b1cf2>", line 11
    filt = dfSub.loc[(dfSub[value] eval(ycutoff)) | (dfSub['mean_apn'] eval(xcutoff)), ['line', 'mating_status', 'mean_apn', 'cis_line']]
                                      ^
SyntaxError: invalid syntax

In [70]:
eval("{} == 'M'".format(marenRawCounts['mating_status']))


  File "<string>", line 1
    0         M
              ^
SyntaxError: invalid syntax

In [ ]: