Maren Equations Summary

This notebook is just pulling out the important figures and tables for the manuscript. For more detailed explanations and exploring see other notebooks.


In [1]:
# Set-up default environment
%run '../ipython_startup.py'

# Import additional libraries
import sas7bdat as sas
import cPickle as pickle
import statsmodels.formula.api as smf

from ase_cisEq import marenEq
from ase_cisEq import marenPrintTable

from ase_normalization import meanCenter
from ase_normalization import q3Norm
from ase_normalization import meanStd

pjoin = os.path.join


Importing commonly used libraries: 
            os, sys 
            numpy as np 
            scipy as sp 
            pandas as pd 
            matplotlib as mp 
            matplotlib.pyplot as plt
            datetime as dt 
            mclib_Python/flagging as fg

Creating project level variables: 
        MCLAB = /home/jfear/mclab 
        PROJ = /home/jfear/mclab/cegs_ase_paper 
        TODAY = 20150929

Adding ['scripts/mclib_Python', 'scripts/ase_Python'] to PYTHONPATH

Import clean data set

This data set was created by: ase_summarize_ase_filters.sas

The data has had the following droped:

  • regions that were always bias in 100 genome simulation
  • regions with APN $\le 25$
  • regions not in at least 10% of genotypes
  • regions not in mated and virgin
  • genotypes with extreme bias in median(q5_mean_theta)
  • genotypes with $\le500$ regions

In [2]:
# Import clean dataset
with sas.SAS7BDAT(pjoin(PROJ, 'sas_data/clean_ase_stack.sas7bdat')) as FH:
    df = FH.to_data_frame()
    
dfClean = df[['line', 'mating_status', 'fusion_id', 'flag_AI_combined', 'q5_mean_theta', 'sum_both', 'sum_line', 'sum_tester', 'sum_total', 'mean_apn']]
print 'Rows ' + str(dfClean.shape[0])
print 'Columns ' + str(dfClean.shape[1])
print 'Number of Genotypes ' + str(len(set(dfClean['line'].tolist())))
print 'Number of Exonic Regions ' + str(len(set(dfClean['fusion_id'].tolist())))


[clean_ase_stack.sas7bdat] header length 65536 != 8192
WARNING:/home/jfear/mclab/cegs_ase_paper/sas_data/clean_ase_stack.sas7bdat:[clean_ase_stack.sas7bdat] header length 65536 != 8192
Rows 159934
Columns 10
Number of Genotypes 49
Number of Exonic Regions 5391

Additional cleaning

For the maren equations, I am also going to drop exonic regions with less than 10 genotypes. The maren equations make some assumptions about the population level sums. Obvisouly the more genotypes that are present for each fusions the better, but I am comfortable with as few as 10 genotypes.


In [3]:
# Drop groups with less than 10 lines per fusion
grp = dfClean.groupby(['mating_status', 'fusion_id'])
dfGt10 = grp.filter(lambda x: x['line'].count() >= 10).copy()
print 'Rows ' + str(dfGt10.shape[0])
print 'Columns ' + str(dfGt10.shape[1])
print 'Number of Genotypes ' + str(len(set(dfGt10['line'].tolist())))
print 'Number of Exonic Regions ' + str(len(set(dfGt10['fusion_id'].tolist())))


Rows 131700
Columns 10
Number of Genotypes 49
Number of Exonic Regions 3053

Raw Counts


In [4]:
# Calculate Maren TIG equations by mating status and exonic region
marenRawCounts = marenEq(dfGt10, Eii='sum_line', Eti='sum_tester', group=['mating_status', 'fusion_id'])
marenRawCounts['mag_cis'] = abs(marenRawCounts['cis_line'])
marenRawCounts.columns


Out[4]:
Index([            u'line',    u'mating_status',        u'fusion_id',
       u'flag_AI_combined',    u'q5_mean_theta',         u'sum_both',
               u'sum_line',       u'sum_tester',        u'sum_total',
               u'mean_apn',               u'mu',         u'cis_line',
             u'trans_line',       u'cis_tester',     u'trans_tester',
                u'mag_cis'],
      dtype='object')

In [ ]:
marenRawCounts

In [ ]:

Plot Distribution of cis- and trans-effects


In [5]:
# Plot densities
def panelKde(df, **kwargs):
    options = {'subplots': True, 
               'layout': (7, 7), 
               'figsize': (20, 20), 
               'xlim': (-500, 500), 
               'legend': False,
               'color': 'k'}
    options.update(kwargs)
    
    # Make plot
    axes = df.plot(kind='kde', **options)
    
    # Add titles intead of legends
    try:
        for ax in axes.ravel():
            h, l = ax.get_legend_handles_labels()
            ax.set_title(l[0])
            ax.get_yaxis().set_visible(False)
            ax.axvline(0, lw=1)
    except:
        ax = axes
        ax.get_yaxis().set_visible(False)
        ax.axvline(0, lw=1)
    
    return plt.gcf()
    
def linePanel(df, value='cis_line', index='fusion_id', columns='line'):
    
    mymap = {
        'cis_line': 'cis-Line Effects',
        'trans_line': 'trans-Line Effects',
        'line': 'genotype',
        'fusion_id': 'exonic_region'
    }
    
    # Iterate over mated and virgin
    for k, v in {'M': 'Mated', 'V': 'Virgin'}.iteritems():
        
        # Pivot data frame so that the thing you want to make panels by is in columns.
        dfPiv = pd.pivot_table(df[df['mating_status'] == k], 
                               values=value, index=index, columns=columns)
        
        # Generate panel plot with at most 49 panels
        if value == 'cis_line':
            xlim = (-500, 500)
        else:
            # trans-effects appear to be larger in magnitude
            xlim = (-1500, 1500)
            
        fig = panelKde(dfPiv.iloc[:, :49], xlim=xlim)
        
        title = '{}\n{}'.format(mymap[value], v)
        fig.suptitle(title, fontsize=18, fontweight='bold')
        
        fname = pjoin(PROJ, 'pipeline_output/cis_effects/density_plot_by_{}_{}_{}.png'.format(mymap[columns], value, v.lower()))
        plt.savefig(fname, bbox_inches='tight')
        print("Saved figure to: " + fname)
        
        plt.close()
        
def testerPanel(df, value='cis_tester'):
    mymap = {
        'cis_tester': 'cis-Tester Effects',
        'trans_tester': 'trans-Trans Effects'
    }
    
    # Iterate over mated and virgin
    for k, v in {'M': 'Mated', 'V': 'Virgin'}.iteritems():
        
        # Split table by mating status and drop duplicates, because 
        # there is only one tester value for each exonic region
        dfSub = df.ix[df['mating_status'] == k,['fusion_id', value]].drop_duplicates()
        
        # Generate Panel Plot
        fig = panelKde(dfSub, subplots=False)
        
        title = '{}\n{}'.format(mymap[value], v)
        fig.suptitle(title, fontsize=18, fontweight='bold')
        
        fname = pjoin(PROJ, 'pipeline_output/cis_effects/density_plot_{}_{}.png'.format(value, v.lower()))
        plt.savefig(fname, bbox_inches='tight')
        print("Saved figure to: " + fname)
        
        plt.close()

In [7]:
# Cis and trans line effects by genotype
linePanel(marenRawCounts, value='cis_line', index='fusion_id', columns='line')
linePanel(marenRawCounts, value='trans_line', index='fusion_id', columns='line')

# Cis and trans line effects by exonic region
linePanel(marenRawCounts, value='cis_line', index='line', columns='fusion_id')
linePanel(marenRawCounts, value='trans_line', index='line', columns='fusion_id')

# Cis and trans tester effects
testerPanel(marenRawCounts, value='cis_tester')
testerPanel(marenRawCounts, value='trans_tester')


Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_by_genotype_cis_line_mated.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_by_genotype_cis_line_virgin.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_by_genotype_trans_line_mated.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_by_genotype_trans_line_virgin.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_by_exonic_region_cis_line_mated.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_by_exonic_region_cis_line_virgin.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_by_exonic_region_trans_line_mated.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_by_exonic_region_trans_line_virgin.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_cis_tester_mated.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_cis_tester_virgin.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_trans_tester_mated.png
Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/density_plot_trans_tester_virgin.png

Plot cis- and trans-effects vs Allelic Proportion


In [14]:
# Set Globals
SHAPES = {'M': 'o', 'V': '^'}
CMAP='jet'

# Add color column to color by genotype
colors = {}
cnt = 0
genos = set(dfGt10['line'].tolist())
for l in genos:
    colors[l] = cnt
    cnt += 1

marenRawCounts['color'] = marenRawCounts['line'].map(colors)

In [15]:
# Plotting scatter
def getR2(df, x, y):
    """ Calculate the R-squared using OLS with an intercept """
    formula = '{} ~ {} + 1'.format(y, x)
    return smf.ols(formula, df).fit().rsquared

def scatPlt(df, x, y, c=None, cmap='jet', s=50, marker='o', ax=None, title=None, xlab=None, ylab=None, diag='pos'):
    """ Make a scatter plot using some default options """
    
    ax = df.plot(x, y, kind='scatter', ax=ax, c=c, cmap=cmap, s=s, marker=marker, title=title, colorbar=False)
    
    # Add a diag line
    if diag == 'neg':
        # draw a diag line with negative slope
        ax.plot([0, 1], [1, 0], transform=ax.transAxes)
    elif diag == 'pos':
        # draw a diag line with positive slope
        ax.plot([0, 1], [0, 1], transform=ax.transAxes)
        
    ax.set_xlabel(xlab)
    ax.set_ylabel(ylab)
    
    return ax

def scatPltPanel(df, line='sum_line', tester='sum_tester', x='cis_line', y='prop', cmap='jet', s=60, panel_title=None, diag='pos'):
    """ Make a panel of scatter plots using pandas """
    # Plot the cis-line effects x proportion Line by fusion
    df['prop'] = 1 - df[tester] / (df[line] + df[tester])
    
    # Create 5x5 panel plot
    fig, axes = plt.subplots(5, 5, figsize=(20, 20))
    fig.suptitle(panel_title, fontsize=12, fontweight='bold')
    axes = axes.ravel()

    # Group by fusion_id
    for i, (n, gdf) in enumerate(df.groupby('fusion_id')):
        ax = axes[i]
                    
        # Calculate R-squared value
        r2 = getR2(gdf, x, y)

        # Make new title with R-squared in it
        t = '{}\nR^2: {}'.format(n, round(r2, 3))

        # Change marker style based on mating status and plot
        for ms, msdf in gdf.groupby('mating_status'):
            scatPlt(msdf, x, y, c='color', cmap=cmap, s=s, marker=SHAPES[ms], ax=ax, title=t, xlab=x, ylab=y, diag=diag)

        # only plot 25 fusions
        if i == 24:
            break
    
    fname = pjoin(PROJ, 'pipeline_output/cis_effects/scatter_plot_by_exonic_region_{}_v_{}.png'.format(x, y))
    plt.savefig(fname, bbox_inches='tight')
    print("Saved figure to: " + fname)
    
    plt.close()

In [16]:
# Plot the cis-line effects x proportion by fusion
scatPltPanel(marenRawCounts, line='sum_line', tester='sum_tester', cmap=CMAP, panel_title='Raw Counts: cis-line')


Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/scatter_plot_by_exonic_region_cis_line_v_prop.png

In [17]:
# Plot the trans-line effects x proportion by fusion
scatPltPanel(marenRawCounts, line='sum_line', tester='sum_tester', x='trans_line', cmap=CMAP, panel_title='Raw Counts: trans-line', diag='neg')


Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/scatter_plot_by_exonic_region_trans_line_v_prop.png

Plot cis- and trans-effects vs Allelic Proportion for Specific Exonic Regions


In [18]:
# Plot F10005_SI
FUSION='F10005_SI'

dfFus = marenRawCounts[marenRawCounts['fusion_id'] == FUSION].copy()
dfFus['prop'] = 1 - dfFus['sum_tester'] / (dfFus['sum_line'] + dfFus['sum_tester'])

# Generate 3 panel plot
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
fig.suptitle(FUSION, fontsize=14, fontweight='bold')

for n, mdf in dfFus.groupby('mating_status'):
    # Plot the cis-line effects x proportion by fusion
    scatPlt(mdf, x='cis_line', y='prop', ax=axes[0], c='color', cmap=CMAP, marker=SHAPES[n], title='cis-line', xlab='cis-line', ylab='prop')

    # Plot the trans-line effects x proportion by fusion
    scatPlt(mdf, x='trans_line', y='prop', ax=axes[1], c='color', cmap=CMAP, marker=SHAPES[n], title='trans-line', xlab='trans-line', ylab='prop', diag='neg')

    # Plot the Tester effects x proportion by fusion
    scatPlt(mdf, x='cis_tester', y='prop', ax=axes[2], c='color', cmap=CMAP,  marker=SHAPES[n], title='Tester', xlab='cis-tester', ylab='prop', diag=None)

fname = pjoin(PROJ, 'pipeline_output/cis_effects/scatter_plot_{}_effects_v_prop.png'.format(FUSION))
plt.savefig(fname, bbox_inches='tight')
print("Saved figure to: " + fname)

plt.close()


Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/scatter_plot_F10005_SI_effects_v_prop.png

F10317_SI

This fusion has a weaker cis-line effects but trans-line effects look more linear.


In [19]:
# Plot F10317_SI
FUSION='F10317_SI'

dfFus = marenRawCounts[marenRawCounts['fusion_id'] == FUSION].copy()
dfFus['prop'] = 1 - dfFus['sum_tester'] / (dfFus['sum_line'] + dfFus['sum_tester'])

# Generate 3 panel plot
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
fig.suptitle(FUSION, fontsize=14, fontweight='bold')

for n, mdf in dfFus.groupby('mating_status'):
    # Plot the cis-line effects x proportion by fusion
    scatPlt(mdf, x='cis_line', y='prop', ax=axes[0], c='color', cmap=CMAP, marker=SHAPES[n], title='cis-line', xlab='cis-line', ylab='prop')

    # Plot the trans-line effects x proportion by fusion
    scatPlt(mdf, x='trans_line', y='prop', ax=axes[1], c='color', cmap=CMAP, marker=SHAPES[n], title='trans-line', xlab='trans-line', ylab='prop', diag='neg')

    # Plot the Tester effects x proportion by fusion
    scatPlt(mdf, x='cis_tester', y='prop', ax=axes[2], c='color', cmap=CMAP,  marker=SHAPES[n], title='Tester', xlab='cis-tester', ylab='prop', diag=None)

fname = pjoin(PROJ, 'pipeline_output/cis_effects/scatter_plot_{}_effects_v_prop.png'.format(FUSION))
plt.savefig(fname, bbox_inches='tight')
print("Saved figure to: " + fname)

plt.close()


Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/scatter_plot_F10317_SI_effects_v_prop.png

F10482_SI

This fusion has a weaker cis-line effects but trans-line effects look more linear.


In [20]:
# Plot F10482_SI
FUSION='F10482_SI'

dfFus = marenRawCounts[marenRawCounts['fusion_id'] == FUSION].copy()
dfFus['prop'] = 1 - dfFus['sum_tester'] / (dfFus['sum_line'] + dfFus['sum_tester'])

# Generate 3 panel plot
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
fig.suptitle(FUSION, fontsize=14, fontweight='bold')

for n, mdf in dfFus.groupby('mating_status'):
    # Plot the cis-line effects x proportion by fusion
    scatPlt(mdf, x='cis_line', y='prop', ax=axes[0], c='color', cmap=CMAP, marker=SHAPES[n], title='cis-line', xlab='cis-line', ylab='prop')

    # Plot the trans-line effects x proportion by fusion
    scatPlt(mdf, x='trans_line', y='prop', ax=axes[1], c='color', cmap=CMAP, marker=SHAPES[n], title='trans-line', xlab='trans-line', ylab='prop', diag='neg')

    # Plot the Tester effects x proportion by fusion
    scatPlt(mdf, x='cis_tester', y='prop', ax=axes[2], c='color', cmap=CMAP,  marker=SHAPES[n], title='Tester', xlab='cis-tester', ylab='prop', diag=None)

fname = pjoin(PROJ, 'pipeline_output/cis_effects/scatter_plot_{}_effects_v_prop.png'.format(FUSION))
plt.savefig(fname, bbox_inches='tight')
print("Saved figure to: " + fname)

plt.close()


Saved figure to: /home/jfear/mclab/cegs_ase_paper/pipeline_output/cis_effects/scatter_plot_F10482_SI_effects_v_prop.png

In [15]:
meanByMsLine = marenRawCounts[['mean_apn', 'cis_line', 'mating_status', 'line']].groupby(['mating_status', 'line']).mean()
meanByMsLine.columns


Out[15]:
Index([u'mean_apn', u'cis_line'], dtype='object')

In [60]:
meanByMsLine.plot(kind='scatter', x='mean_apn', y='cis_line')


Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f2b4a987450>

In [61]:


In [68]:
def cisAPN(df, fusion, value='cis_line', xcutoff='>=150', ycutoff='<=-180'):
    """ Plot effects vs mean apn"""
    # Pull out fusion of interest
    dfSub = marenRawCounts[marenRawCounts['fusion_id'] == fusion]

    # Make scatter plot
    fig, ax = plt.subplots(1, 1, figsize=(10, 10))
    dfSub.plot(kind='scatter', x='mean_apn', y='cis_line', ax=ax, title=fusion)

    # Annotate outliers
    xc = 
    filt = dfSub.loc[(dfSub[value] eval(ycutoff)) | (dfSub['mean_apn'] eval(xcutoff)), ['line', 'mating_status', 'mean_apn', 'cis_line']]
    for row in filt.values:
        line, ms, apn, cis = row
        ax.annotate(line + '_' + ms, xy=(apn, cis))

    fname = pjoin(PROJ, 'pipeline_output/cis_effects/scatter_plot_{}_{}_v_meanApn.png'.format(fusion, value))
    plt.savefig(fname, bbox_inches='tight')


  File "<ipython-input-68-ab89754b1cf2>", line 11
    filt = dfSub.loc[(dfSub[value] eval(ycutoff)) | (dfSub['mean_apn'] eval(xcutoff)), ['line', 'mating_status', 'mean_apn', 'cis_line']]
                                      ^
SyntaxError: invalid syntax

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [5]:
marenRawCounts.columns


Out[5]:
Index([            u'line',    u'mating_status',        u'fusion_id',
       u'flag_AI_combined',    u'q5_mean_theta',         u'sum_both',
               u'sum_line',       u'sum_tester',        u'sum_total',
               u'mean_apn',               u'mu',         u'cis_line',
             u'trans_line',       u'cis_tester',     u'trans_tester',
                u'mag_cis'],
      dtype='object')

In [6]:
mated = marenRawCounts[marenRawCounts['mating_status'] == 'M']

In [8]:
genos = set(mated['line'].tolist())

In [11]:
mated.head()


Out[11]:
line mating_status fusion_id flag_AI_combined q5_mean_theta sum_both sum_line sum_tester sum_total mean_apn mu cis_line trans_line cis_tester trans_tester mag_cis
0 r324 M F10001_SI 0 0.507 1465 107 104 1676 35.659574 108.7 14 -20.4 11 -11 14
1 r365 M F10001_SI 0 0.646 1281 67 115 1463 31.127660 108.7 -37 1.6 11 -11 37
2 r373 M F10001_SI 0 0.589 1743 181 247 2171 46.191489 108.7 -55 265.6 11 -11 55
3 r427 M F10001_SI 0 0.500 2071 94 88 2253 47.936170 108.7 17 -52.4 11 -11 17
4 r491 M F10001_SI 0 0.401 2102 74 46 2222 47.276596 108.7 39 -136.4 11 -11 39

In [14]:
grp = mated.groupby('line')

In [17]:
sub = grp.get_group('r324')

In [54]:
fig, ax = plt.subplots(1, 1, figsize=(10, 6), dpi=300)
ax.axvline(0, c='k', lw=1)
sub['cis_line'].plot(kind='kde', xlim=(-600, 600), ax=ax, label='all', style='b-', lw=3)
sub.ix[sub['flag_AI_combined'] == 1, 'cis_line'].plot(kind='kde', xlim=(-600, 600), ax=ax, label='AI', style='r--')
sub.ix[sub['flag_AI_combined'] == 0, 'cis_line'].plot(kind='kde', xlim=(-600, 600), ax=ax, label='no AI', style='m-.')
plt.legend()
plt.title('r324')


Out[54]:
<matplotlib.text.Text at 0x7f1d44fdb250>

In [ ]: