This notebook takes the graphs from all the other notebooks and processes them into tikz files for plotting in the report. This involves loading the numpy files which have been saved and plotting the graphs again. The default settings at the top of this notebook are propagated to all the saved graphs, so it is very important that they are chosen appropriately.
It can take a long time to produce the results being graphed in each of the notebooks. Recreating the results would, in some cases, involve running the entire notebook again. Therefore, to make it easy to change the annotations or formatting of the plots, it is better to save the results in those notebooks and plot the graphs here. However, we would still like to see the results in those notebooks after running the analysis, so the graphs are plotted in those notebooks as well.
Of course, it would still save time to save the graphs as pgf once, in the notebook in which the results are generated. Unfortunately, many of the results were saved before it was clear what the best way to plot them would be, so they have been saved to be plotted in this notebook. Also, the backend for notebooks running pylab (which is all of them) can't be changed while the kernel is running. In new notebooks it would be preferable to save both the data and the final plots to pgf, but it's not clear whether that will be possible, and it may be easier to simply do all graph processing in this notebook.
Starting with just the default values from the parallel ML tutorial:
In [ ]:
import matplotlib
# Set backend to pgf
matplotlib.use('pgf')
import matplotlib.pyplot as plt
import numpy as np
In [ ]:
# Default plot configuration applied to the figures created after this cell.
plt.rcParams.update({
    'figure.figsize': (6, 4.5),
    'axes.grid': True,
})
# Make gray the default colormap.
plt.gray()
In [129]:
# Custom pgf/LaTeX configuration, adapted from the matplotlib pgf guide:
# http://matplotlib.org/users/pgf.html
# The dictionary is merged into matplotlib.rcParams at the end of this cell.
pgf_with_custom_preamble = {
"font.family": "serif", # use serif/main font for text elements
"text.usetex": True, # use inline math for ticks
"pgf.rcfonts": False, # don't setup fonts from rc parameters
# NOTE(review): the preamble is given as a list of lines; newer matplotlib
# releases may expect a single string here -- confirm against the installed version.
"pgf.preamble": [
r"\usepackage{units}", # load additional packages
r"\usepackage{metalogo}",
r"\usepackage{unicode-math}", # unicode math setup
r"\setmathfont{xits-math.otf}",
r"\setmainfont{DejaVu Serif}", # serif font via preamble
],
"figure.figsize":(6,4.5) # same size as the plt.rcParams default set earlier
}
matplotlib.rcParams.update(pgf_with_custom_preamble)
In [2]:
%matplotlib inline
In [3]:
cd ../../plots/bayes
In [4]:
import os
In [5]:
imagedir = os.path.abspath("../../opencast-bio/report/images/")
The graphs from the Classifier Training notebook fitting a logistic regression model to the DIP training data.
Loading the saved data file and plotting with a sample of the same code:
In [ ]:
lrlc = np.load("lrlc.npz")
In [5]:
# Indexing with an empty tuple unwraps the object stored in 'arr_0'
# (presumably a 0-d object array holding the scores dict -- TODO confirm).
mean_scores = lrlc['arr_0'][()]
Defining the function to plot the learning curve:
In [6]:
def plotlearningcurve(mean_scores):
    """Plot mean train and test scores against training set size.

    A new figure is created and left as the current figure so the caller can
    save it. Each score series is drawn as a line with markers, surrounded by
    a shaded band of twice the accompanying spread value (labelled as
    "+/- 2 standard errors" in the title).

    Parameters
    ----------
    mean_scores : dict
        Maps each training set size to a sequence read as
        ``[0]`` mean train score, ``[1]`` train spread,
        ``[2]`` mean test score, ``[3]`` test spread.

    Raises
    ------
    ValueError
        If ``mean_scores`` is empty.
    """
    if not mean_scores:
        raise ValueError("mean_scores is empty; there is nothing to plot")

    trainsizes = sorted(mean_scores)

    def column(index, scale=1):
        # Collect one entry per training size, in ascending size order.
        return np.array([mean_scores[size][index] * scale for size in trainsizes])

    # (legend label, colour, mean values, half-width of the shaded band)
    series = (
        ('Train score', 'b', column(0), column(1, 2)),
        ('Test score', 'g', column(2), column(3, 2)),
    )

    plt.figure()
    for label, colour, mean, band in series:
        plt.fill_between(trainsizes, mean - band, mean + band,
                         color=colour, alpha=.2)
        # The colour is given only once: combining a colour in the format
        # string with a colour keyword is redundant and conflicting.
        plt.plot(trainsizes, mean, 'o-', color=colour, label=label)

    # extra annotation
    plt.xlabel('Training set size')
    plt.ylabel('Score')
    plt.xlim(0, max(trainsizes))
    plt.ylim((None, 1.0))  # The best possible score is 1.0
    plt.legend(loc='best')
    plt.title('Mean train and test scores +/- 2 standard errors')
In [11]:
plotlearningcurve(mean_scores)
plt.savefig("lrlc.pgf", format='pgf')
In [6]:
import pandas as pd
from pandas.tools.plotting import andrews_curves
In [7]:
pd.options.display.mpl_style = 'default'
In [130]:
fnpz = np.load("parrallel.coordinates.plot.oop.npz")
In [131]:
X = fnpz['arr_0']
y = fnpz['arr_1']
In [132]:
maxes = np.amax(abs(X),axis=0) + 1e-14
In [133]:
plotdata = pd.DataFrame(X/maxes)
plotdata['training labels'] = y
In [137]:
andrews_curves(plotdata,'training labels')
#plt.savefig(os.path.join(imagedir,"out.andrews.curves.tikz"),format='pgf')
plt.savefig(os.path.join(imagedir,"out.andrews.curves.png"),format='png')
In [134]:
# One translucent line per sample, each feature divided by its column maximum.
# Samples labelled above 0.5 are green, the rest a much fainter red.
for sample, label in zip(X, y):
    is_positive = label > 0.5
    plt.plot(sample / maxes,
             color='green' if is_positive else 'red',
             alpha=0.5 if is_positive else 0.05)
# pgf output kept for reference; the png below is what is actually written.
#plt.savefig(os.path.join(imagedir,"out.parallel.lines.tikz"),format='pgf')
plt.savefig(os.path.join(imagedir, "out.parallel.lines.png"), format='png')
In [95]:
# Read both arrays inside a context manager so the archive is closed as soon
# as they are in memory (matching the other `with np.load(...)` cells).
with np.load("parrallel.coordinates.plot.ip.npz") as fnpz:
    X = fnpz['arr_0']
    y = fnpz['arr_1']
# Scale each column by its largest absolute value; the small offset keeps the
# division defined for columns that are entirely zero.
maxes = np.amax(np.abs(X), axis=0) + 1e-14
plotdata = pd.DataFrame(X / maxes)
plotdata['training labels'] = y
In [140]:
andrews_curves(plotdata,'training labels')
#plt.savefig(os.path.join(imagedir,"in.andrews.curves.tikz"),format='pgf')
plt.savefig(os.path.join(imagedir,"in.andrews.curves.png"),format='png')
In [94]:
# One translucent line per sample, each feature divided by its column maximum.
# Samples labelled above 0.5 are green, the rest a much fainter red.
for sample, label in zip(X, y):
    is_positive = label > 0.5
    plt.plot(sample / maxes,
             color='green' if is_positive else 'red',
             alpha=0.5 if is_positive else 0.05)
# pgf output kept for reference; the png below is what is actually written.
#plt.savefig(os.path.join(imagedir,"in.parallel.lines.tikz"),format='pgf')
plt.savefig(os.path.join(imagedir, "in.parallel.lines.png"), format='png')
In [96]:
nones,nzeros = 1000, 1000
In [97]:
X_pca = np.load("pca.oop.npz")['arr_0']
In [98]:
# Scatter columns 0 and 1 of X_pca: the first `nones` rows are drawn as
# interactions, every remaining row as a non-interaction.
interacting, non_interacting = X_pca[:nones], X_pca[nones:]
ones = plt.scatter(interacting[:, 0], interacting[:, 1], c='red', alpha=0.2)
zeros = plt.scatter(non_interacting[:, 0], non_interacting[:, 1],
                    c='blue', marker="x", alpha=0.2)
l = plt.legend((ones, zeros), ("interactions", "non-interactions"), loc=0)
plt.savefig(os.path.join(imagedir, "out.pca.png"), format='png')
In [237]:
nones,nzeros = 1000, 600000
In [238]:
X_pca = np.load("pca.ip.npz")['arr_0']
In [239]:
# Scatter columns 0 and 1 of X_pca: the first `nones` rows are drawn as
# interactions, every remaining row as a (fainter) non-interaction.
interacting, non_interacting = X_pca[:nones], X_pca[nones:]
ones = plt.scatter(interacting[:, 0], interacting[:, 1], c='red', alpha=0.2)
zeros = plt.scatter(non_interacting[:, 0], non_interacting[:, 1],
                    c='blue', marker="x", alpha=0.05)
l = plt.legend((ones, zeros), ("interactions", "non-interactions"), loc=3)
# pgf output kept for reference; the png below is what is actually written.
#plt.savefig(os.path.join(imagedir,"in.pca.tikz"),format='pgf')
plt.savefig(os.path.join(imagedir, "in.pca.png"), format='png')
In [102]:
nones,nzeros = 100, 100
In [103]:
X_tsne = np.load("tdsne.oop.npz")['arr_0']
In [104]:
# Scatter columns 0 and 1 of X_tsne: the first `nones` rows are drawn as
# interactions, every remaining row as a non-interaction.
ones = plt.scatter(X_tsne[:nones, 0], X_tsne[:nones, 1], c='red')
zeros = plt.scatter(X_tsne[nones:, 0], X_tsne[nones:, 1], c='blue', marker="x")
l = plt.legend((ones, zeros), ("interactions", "non-interactions"), loc=0)
# Hide the tick labels. Booleans are used because the string 'off' is a truthy
# value and is not interpreted as "hidden" by current matplotlib releases.
plt.tick_params(labelleft=False, labelbottom=False)
# pgf output kept for reference; the png below is what is actually written.
#plt.savefig(os.path.join(imagedir,"out.tsne.tikz"),format='pgf')
plt.savefig(os.path.join(imagedir, "out.tsne.png"), format='png')
In [105]:
nones,nzeros = 10, 6000
In [106]:
X_tsne = np.load("tdsne.ip.npz")['arr_0']
In [107]:
# Scatter columns 0 and 1 of X_tsne: the first `nones` rows are drawn as
# interactions, every remaining row as a (fainter) non-interaction.
ones = plt.scatter(X_tsne[:nones, 0], X_tsne[:nones, 1], c='red', alpha=0.6)
zeros = plt.scatter(X_tsne[nones:, 0], X_tsne[nones:, 1],
                    c='blue', marker="x", alpha=0.1)
l = plt.legend((ones, zeros), ("interactions", "non-interactions"), loc=0)
# Hide the tick labels. Booleans are used because the string 'off' is a truthy
# value and is not interpreted as "hidden" by current matplotlib releases.
plt.tick_params(labelleft=False, labelbottom=False)
# pgf output kept for reference; the png below is what is actually written.
#plt.savefig(os.path.join(imagedir,"in.tsne.tikz"),format='pgf')
plt.savefig(os.path.join(imagedir, "in.tsne.png"), format='png')
In [193]:
dimensions = (4,2.4)
In [174]:
rfimportances = np.load("../hippie/random.forest.importances.npz")['arr_0']
In [194]:
# Line plot of the loaded importances, created directly at the report size.
fig = plt.figure(figsize=dimensions)
ax = fig.add_subplot(111)
ax.plot(rfimportances)
ax.set_xlabel("Feature index")
ax.set_ylabel("Importance")
fig.savefig(os.path.join(imagedir, "unbalanced.weighting.tikz"), format='pgf')
In [176]:
logrefweights = np.load("logistic.regression.coef.npz")['arr_0']
In [195]:
# Line plot of the loaded coefficients, created directly at the report size.
fig = plt.figure(figsize=dimensions)
ax = fig.add_subplot(111)
ax.plot(logrefweights)
ax.set_xlabel("Feature index")
ax.set_ylabel("Coefficient")
fig.savefig(os.path.join(imagedir, "logreg.weights.tikz"), format='pgf')
In [196]:
rfimportances = np.load("random.forest.importances.npz")["arr_0"]
In [197]:
# Line plot of the loaded importances, created directly at the report size.
fig = plt.figure(figsize=dimensions)
ax = fig.add_subplot(111)
ax.plot(rfimportances)
ax.set_xlabel("Feature index")
ax.set_ylabel("Importance")
fig.savefig(os.path.join(imagedir, "rf.importances.tikz"), format='pgf')
In [198]:
def plotroc(fpr,tpr,name):
    """Plot ``tpr`` against ``fpr`` with a dashed diagonal and save it as pgf.

    The figure is sized from the module-level ``dimensions`` and written to
    the file ``name`` inside ``imagedir``. Both axes are limited to [0, 1].
    """
    figure = plt.figure()
    figure.set_size_inches(*dimensions)
    figure.clf()
    axes = figure.add_subplot(111)
    axes.plot(fpr, tpr)
    # Dashed black reference line from (0, 0) to (1, 1).
    axes.plot([0, 1], [0, 1], 'k--')
    axes.set_xlim([0.0, 1.0])
    axes.set_ylim([0.0, 1.0])
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    figure.savefig(os.path.join(imagedir, name), format='pgf')
In [199]:
# Open the archive once, read both arrays, and let the context manager close
# it (previously the file was opened twice and never closed).
with np.load("logistic.regression.roc.npz") as roc_data:
    fpr = roc_data['arr_0']
    tpr = roc_data['arr_1']
In [200]:
plotroc(fpr,tpr,"logreg.roc.tikz")
In [201]:
# Open the archive once and let the context manager close it (previously the
# file was opened twice and never closed). Rows 0 and 1 of 'arr_0' are used
# as the false and true positive rates respectively.
with np.load("random.forest.roc.npz") as roc_data:
    rf_roc = roc_data['arr_0']
fpr = rf_roc[0]
tpr = rf_roc[1]
In [202]:
plotroc(fpr,tpr,"rf.roc.tikz")
In [204]:
def drawprecisionrecall(precision,recall,name):
    """Plot ``precision`` against ``recall`` and save the figure as pgf.

    The figure is sized from the module-level ``dimensions`` and written to
    the file ``name`` inside ``imagedir``. The x axis is limited to [0, 1]
    and the y axis to [0, 1.05].
    """
    figure = plt.figure()
    figure.set_size_inches(*dimensions)
    figure.clf()
    axes = figure.add_subplot(111)
    axes.plot(recall, precision)
    axes.set_xlim([0.0, 1.0])
    axes.set_ylim([0.0, 1.05])
    axes.set_xlabel('Recall')
    axes.set_ylabel('Precision')
    figure.savefig(os.path.join(imagedir, name), format='pgf')
In [205]:
with np.load("lr_precisionrecall.npz") as data:
precision = data['arr_0']
recall = data['arr_1']
In [206]:
drawprecisionrecall(precision,recall,"logreg.pr.tikz")
In [207]:
with np.load("random.forest.precisionrecall.npz") as data:
precision = data['arr_0']
recall = data['arr_1']
In [208]:
drawprecisionrecall(precision,recall,"rf.pr.tikz")
In [190]:
with np.load("postweightings.npz") as nf:
weights = nf['arr_0']
In [209]:
# 50-bin histogram of the loaded weights, created directly at the report size.
fig = plt.figure(figsize=dimensions)
ax = fig.add_subplot(111)
h = ax.hist(weights, bins=50)
ax.set_xlabel("posterior probability")
ax.set_ylabel("frequency")
fig.savefig(os.path.join(imagedir, "bayes.weights.dist.tikz"), format='pgf')
In [210]:
with np.load("nx2933.npz") as nf:
uw29 = list(nf['arr_0'])
w33 = list(nf['arr_1'])
In [211]:
# Read every field of the weighted edge list as a string.
interactions = np.loadtxt("../../HBP/testdata/edgelist_update_weighted.txt",dtype=str)
# Drop the first row (presumably a header line -- TODO confirm).
interactions = interactions[1:]
In [212]:
import networkx as nx
In [213]:
def plotcommunities(com1,com2,fname,title):
    """Draw the interactions among two communities and save the figure.

    ``com1`` is laid out on one circle; the members of ``com2`` that are not
    in ``com1`` are laid out on a second circle shifted 1.5 units to the
    right. Edges come from the module-level ``interactions`` table (columns
    read as node, node, weight) and are drawn in red, with opacity growing
    from 0 at the smallest weight present to 1 at weight 1.0.

    Parameters
    ----------
    com1, com2 : sequence
        Node identifiers of the two communities.
    fname : str
        Name of the output file inside ``imagedir``.
    title : str
        Title placed on the figure.
    """
    fig = plt.figure()
    fig.set_size_inches(4,3)
    plt.title(title, size=12)

    # Build the membership set once rather than twice per edge-list row.
    members = set(com1) | set(com2)
    G = nx.Graph()
    for l in interactions:
        if l[0] in members and l[1] in members:
            G.add_edge(l[0], l[1], weight=float(l[2]))

    # First community on a circle, the remainder of the second on a circle
    # shifted to the right.
    pos = dict(nx.circular_layout(com1))
    pos2 = nx.circular_layout(set(com2) - set(com1))
    for k in pos2:
        pos[k] = np.array([pos2[k][0] + 1.5, pos2[k][1]])

    nx.draw_networkx_nodes(G, pos, node_size=20, alpha=0.5)

    weighted_edges = [(u, v, d['weight']) for (u, v, d) in G.edges(data=True)]
    if weighted_edges:
        lim = min(w for (_, _, w) in weighted_edges)
        span = 1.0 - lim
        if span <= 0:
            # Every weight is at least 1: nothing to grade, draw fully opaque.
            nx.draw_networkx_edges(G, pos,
                                   edgelist=[(u, v) for (u, v, _) in weighted_edges],
                                   alpha=1.0, edge_color='r')
        else:
            # Ten bin edges from the smallest weight up to 1.0. Each bin is
            # half-open [lower, upper) so that a weight lying exactly on an
            # edge (including 1.0) is drawn exactly once; the final bin is
            # open-ended.
            lowers = np.linspace(lim, 1.0, 10)
            uppers = list(lowers[1:]) + [np.inf]
            for lower, upper in zip(lowers, uppers):
                edgelist = [(u, v) for (u, v, w) in weighted_edges
                            if lower <= w < upper]
                if not edgelist:
                    continue
                # Clamp to guard against rounding pushing alpha past 1.
                alpha = min(1.0, max(0.0, (lower - lim) / span))
                nx.draw_networkx_edges(G, pos, edgelist=edgelist,
                                       alpha=alpha, edge_color='r')

    l = nx.draw_networkx_labels(G, pos, font_size=5, font_family='sans-serif')
    # Booleans rather than 'off': the string is truthy and is not treated as
    # "hidden" by current matplotlib releases.
    plt.tick_params(labelleft=False, labelbottom=False)
    plt.savefig(os.path.join(imagedir, fname))
    return None
In [214]:
plotcommunities(uw29,w33,"nx2933.pgf","Unweighted community 29 interactions")
In [215]:
plotcommunities(w33,uw29,"nx3329.pgf","Weighted community 33 interactions")
In [216]:
with np.load("nx6444.npz") as nf:
uw64 = list(nf['arr_0'])
w44 = list(nf['arr_1'])
In [217]:
plotcommunities(uw64,w44,"nx6444.pgf","Unweighted community 64 interactions")
In [218]:
plotcommunities(w44,uw64,"nx4464.pgf","Weighted community 44 interactions")