This notebook is designed to 1) create a dataframe that brings together runtime information with the information provided by the QC group. It then 2) calculates the correlation of QC variables to runtime for a core variant calling workflows. Based on those correlations, it 3) generates a linear regression model to predict runtime from QC variables and then 4) evaluates the performance of this model on ~20% of the dataset that was excluded from the training.
In [1]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
# this allows plots to appear directly in the notebook
%matplotlib inline
In [2]:
import math
# read a dict for runtimes
f = open("dkfz_embl_timing.tsv", "r")
runtimes = {}
for line in iter(f):
a = line.split("\t")
runtimes[a[0]] = a[1]
f.close()
# read data into a DataFrame
data = pd.read_csv('PCAWG-QC_Summary-of-Measures_QC_Measures.tsv', delimiter='\t')
data['runtime'] = 0.0
# loop over and annotate with runtime
for index, row in data.iterrows():
key = row['Project_code'] + "::" + row['Submitter_donor_ID']
try:
curr_runtime = math.log(float(runtimes[key]))
#curr_runtime = float(runtimes[key])
data.set_value(index, 'runtime', curr_runtime)
except:
continue
data.head()
Out[2]:
In [3]:
# now I have a merged dataframe that has all QC values along with the runtime for the workflow
print(data.shape)
In [4]:
# Import matplotlib
import matplotlib.pyplot as plt
# Make a histogram of all the ratings in the average_rating column.
plt.hist(data["runtime"])
# Show the plot.
plt.show()
In [5]:
# remove 0 runtimes
data = data[data["runtime"] > 0.0]
plt.hist(data["runtime"])
plt.show()
In [6]:
# remove any NAN
data = data.dropna()
data.isnull().values.any()
Out[6]:
In [7]:
# showing zeros and nan have been removed
print(data.shape)
In [8]:
# general stats
data.describe()
Out[8]:
In [9]:
# look at correlation
data.corr()["runtime"]
Out[9]:
In [10]:
# visualize the relationship between the features and the response using scatterplots
import matplotlib.pyplot as plt
fig, axs = plt.subplots(4, 3, sharey=True)
fig.subplots_adjust(hspace=.5, wspace=.5)
data.plot(kind='scatter', x='Stars', y='runtime', ax=axs[0, 0], figsize=(16, 16))
data.plot(kind='scatter', x='Mean_Coverage_Normal', y='runtime', ax=axs[0, 1])
data.plot(kind='scatter', x='Mean_Coverage_Tumour', y='runtime', ax=axs[0, 2])
data.plot(kind='scatter', x='FWHM_Normal', y='runtime', ax=axs[1, 0])
data.plot(kind='scatter', x='Median/Mean_Coverage_Tumour', y='runtime', ax=axs[1, 1])
data.plot(kind='scatter', x='FWHM_Tumour', y='runtime', ax=axs[1, 2])
data.plot(kind='scatter', x='Somatic_Mutation_Calling_Coverage', y='runtime', ax=axs[2, 0])
data.plot(kind='scatter', x='%_of_paired_reads_mapping_to_different_chromosomes_Normal', y='runtime', ax=axs[2, 1])
data.plot(kind='scatter', x='%_of_paired_reads_mapping_to_different_chromosomes_Tumour', y='runtime', ax=axs[2, 2])
data.plot(kind='scatter', x='Ratio_of_difference_in_edits_between_paired_reads_Normal', y='runtime', ax=axs[3, 0])
data.plot(kind='scatter', x='Ratio_of_difference_in_edits_between_paired_reads_Tumour', y='runtime', ax=axs[3, 1])
data.plot(kind='scatter', x='runtime', y='runtime', ax=axs[3, 2])
Out[10]:
In [11]:
# now clear out the columns that we don't want to use
# Get all the columns from the dataframe.
columns = data.columns.tolist()
# Filter the columns to remove ones we don't want.
columns = [c for c in columns if c not in ["runtime", "Project_code", "Submitter_donor_ID", "Normal_WGS_aliquot_ID", "Tumour_WGS_aliquot_ID"]]
#columns = [c for c in columns if c not in ["runtime", "Project_code", "Submitter_donor_ID", "Normal_WGS_aliquot_ID", "Tumour_WGS_aliquot_ID", "Median/Mean_Coverage_Normal", "FWHM_Normal", "Median/Mean_Coverage_Tumour", "FWHM_Tumour", "Somatic_Mutation_Calling_Coverage", "%_of_paired_reads_mapping_to_different_chromosomes_Normal", "Ratio_of_difference_in_edits_between_paired_reads_Normal", "Ratio_of_difference_in_edits_between_paired_reads_Tumour"]]
# Store the variable we'll be predicting on.
target = "runtime"
In [12]:
# Import a convenience function to split the sets.
from sklearn.model_selection import train_test_split
# Generate the training set. Set random_state to be able to replicate results.
train = data.sample(frac=0.8, random_state=1)
# Select anything not in the training set and put it in the testing set.
test = data.loc[~data.index.isin(train.index)]
# Print the shapes of both sets.
print(train.shape)
print(test.shape)
In [13]:
# Import the linearregression model.
from sklearn.linear_model import LinearRegression
# Initialize the model class.
model = LinearRegression()
train.head()
# Fit the model to the training data.
model.fit(train[columns], train[target])
Out[13]:
In [14]:
# now test
# Import the scikit-learn function to compute error.
from sklearn.metrics import mean_squared_error
# Generate our predictions for the test set.
predictions = model.predict(test[columns])
# Compute error between our test predictions and the actual values.
mean_squared_error(predictions, test[target])
Out[14]:
In [15]:
# try random forest
# Import the random forest model.
from sklearn.ensemble import RandomForestRegressor
# Initialize the model with some parameters.
model = RandomForestRegressor(n_estimators=100, min_samples_leaf=10, random_state=1)
# Fit the model to the data.
model.fit(train[columns], train[target])
# Make predictions.
predictions = model.predict(test[columns])
# Compute the error.
mean_squared_error(predictions, test[target])
Out[15]:
In [19]:
# look at predicted vs. actual
import statsmodels.api as sm
import numpy as np
data['predicted_runtime'] = model.predict(data[columns])
#data.plot(kind='scatter', x='runtime', y='predicted_runtime')
results = sm.OLS(data['predicted_runtime'],sm.add_constant(data['runtime'])).fit()
print(results.summary())
plt.scatter(data['runtime'],data['predicted_runtime'])
#X_plot = np.linspace(0,1,100)
#plt.plot(X_plot, X_plot*results.params[0] + results.params[1])
# fit with np.polyfit
m, b = np.polyfit(data['runtime'], data['predicted_runtime'], 1)
#plt.plot(x, y, '.')
plt.plot(data['runtime'], m*data['runtime'] + b, '-', color='r')
plt.ylabel('predicted runtime (log)')
plt.xlabel('runtime (log)')
plt.show()
In [ ]: