Low Coverage Lines

Kjong wanted a list of low-coverage lines to check whether they are correlated with the eQTL results. Here I build these lists for mated samples, for virgin samples, and for both combined.

Import Raw APN Counts


In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [ ]:
# Path to the raw APN table written by the pipeline, loaded into a DataFrame
fname = '../pipeline_output/rawAPN_uqNormCenter_plusFlags_20140518.csv'
df = pd.read_csv(filepath_or_buffer=fname)

In [ ]:
# Pool bio-reps: total region_depth for every line / mating_status / fusion_id combination
grp = df.groupby(['line', 'mating_status', 'fusion_id'])
dfSums = grp['region_depth'].sum().reset_index()

In [ ]:
# Build a fusion_id -> region_length lookup: discard rows missing either value,
# then keep the first remaining row for each fusion_id
rlen = (
    df[['fusion_id', 'region_length']]
    .dropna()
    .drop_duplicates(subset='fusion_id')
)

In [ ]:
# Attach region_length to the summed depths, compute apn = region_depth / region_length,
# and keep only the identifier columns plus apn
merged = (
    dfSums.merge(rlen, how='left', on='fusion_id')
    .assign(apn=lambda d: d['region_depth'] / d['region_length'])
    .drop(['region_depth', 'region_length'], axis=1)
)

Mated


In [ ]:
# Mated subset: rows whose mating_status is 'M', without the now-constant column
mated = merged.loc[merged['mating_status'].eq('M')].drop('mating_status', axis=1)

In [ ]:
# Reshape to wide format for plotting: one row per fusion_id, one column per line
matedT = pd.pivot_table(mated, index='fusion_id', columns='line', values='apn')
matedT.head(3)

In [ ]:
# Rank lines by their mean APN (lowest mean first) and save the table
grp = mated.groupby('line')
dfMeanM = pd.DataFrame(grp['apn'].mean())
# DataFrame.sort() no longer exists in current pandas; sort_values() is its replacement
dfMeanM = dfMeanM.sort_values(by='apn')
# Rank the 'apn' Series directly so a Series (not a one-column DataFrame) is assigned
dfMeanM['rank'] = dfMeanM['apn'].rank()
dfMeanM.to_csv('../pipeline_output/mated_mean_apn_rank.csv')

In [ ]:
# Box plot of APN across fusions for each line; columns follow dfMeanM's order
# (lines sorted by mean APN). The y-axis is limited to 0-10000.
ax = matedT[dfMeanM.index].plot(kind='box', figsize=(20, 10), rot=90, ylim=(0, 10000))
# Label the figure so it can be told apart from the other box plots when skimmed
ax.set_title('Mated: APN by line (lines ordered by mean APN)')
ax.set_xlabel('line')
ax.set_ylabel('APN');

Virgin


In [ ]:
# Virgin subset: rows whose mating_status is 'V', without the now-constant column
virgin = merged.loc[merged['mating_status'].eq('V')].drop('mating_status', axis=1)

In [ ]:
# Reshape to wide format for plotting: one row per fusion_id, one column per line
virginT = pd.pivot_table(virgin, index='fusion_id', columns='line', values='apn')
virginT.head(3)

In [ ]:
# Rank lines by their mean APN (lowest mean first) and save the table
grp = virgin.groupby('line')
dfMeanV = pd.DataFrame(grp['apn'].mean())
# DataFrame.sort() no longer exists in current pandas; sort_values() is its replacement
dfMeanV = dfMeanV.sort_values(by='apn')
# Rank the 'apn' Series directly so a Series (not a one-column DataFrame) is assigned
dfMeanV['rank'] = dfMeanV['apn'].rank()
dfMeanV.to_csv('../pipeline_output/virgin_mean_apn_rank.csv')

In [ ]:
# Box plot of APN across fusions for each line; columns follow dfMeanV's order
# (lines sorted by mean APN). The y-axis is limited to 0-10000.
ax = virginT[dfMeanV.index].plot(kind='box', figsize=(20, 10), rot=90, ylim=(0, 10000))
# Label the figure so it can be told apart from the other box plots when skimmed
ax.set_title('Virgin: APN by line (lines ordered by mean APN)')
ax.set_xlabel('line')
ax.set_ylabel('APN');

Sum Across Mating Status

Here I want to combine mated and virgin and just look at line effects.


In [ ]:
# Total region_depth for every line / fusion_id pair; mating_status is not a
# grouping key here, so mated and virgin rows are pooled together with the bio-reps
grp = df.groupby(['line', 'fusion_id'])
dfSumsBoth = grp['region_depth'].sum().reset_index()

In [ ]:
# Build a fusion_id -> region_length lookup: discard rows missing either value,
# then keep the first remaining row for each fusion_id
rlen = (
    df[['fusion_id', 'region_length']]
    .dropna()
    .drop_duplicates(subset='fusion_id')
)

In [ ]:
# Attach region_length to the pooled depths, compute apn = region_depth / region_length,
# and keep only line, fusion_id and apn (this rebinds the name `merged`)
merged = (
    dfSumsBoth.merge(rlen, how='left', on='fusion_id')
    .assign(apn=lambda d: d['region_depth'] / d['region_length'])
    .drop(['region_depth', 'region_length'], axis=1)
)

In [ ]:
# Reshape to wide format for plotting: one row per fusion_id, one column per line
bothT = pd.pivot_table(merged, index='fusion_id', columns='line', values='apn')
bothT.head(3)

In [ ]:
# Rank lines by their mean APN (lowest mean first) and save the table
grp = merged.groupby('line')
dfMeanB = pd.DataFrame(grp['apn'].mean())
# DataFrame.sort() no longer exists in current pandas; sort_values() is its replacement
dfMeanB = dfMeanB.sort_values(by='apn')
# Rank the 'apn' Series directly so a Series (not a one-column DataFrame) is assigned
dfMeanB['rank'] = dfMeanB['apn'].rank()
dfMeanB.to_csv('../pipeline_output/mated_plus_virgin_mean_apn_rank.csv')

In [ ]:
# Box plot of APN across fusions for each line; columns follow dfMeanB's order
# (lines sorted by mean APN). The y-axis is limited to 0-10000.
ax = bothT[dfMeanB.index].plot(kind='box', figsize=(20, 10), rot=90, ylim=(0, 10000))
# Label the figure so it can be told apart from the other box plots when skimmed
ax.set_title('Mated + Virgin: APN by line (lines ordered by mean APN)')
ax.set_xlabel('line')
ax.set_ylabel('APN');

In [ ]: