tutorial: Data Camp Jupyter Notebook Tutorial
cd /data640g1/data/documents/docs/projects/payton_venusar/VENUSAR_DEV/venusar
python3 thresholds.py -fpr 0.001 -m ../../data/HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.txt -o ../../data/HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.fpr_0p001.txt
python3 tf_expression.py -i ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.vcf -o 1 -m ../../data/HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.fpr_0p001.txt -e ../../data/ALL_ARRAYS_NORMALIZED_MAXPROBE_LOG2_COORDS.sorted.txt -mo ../../data/HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.fpr_0p001.txt.bed_reduced.txt
python3 motifs.py -i ../../data/FLDL_CCCB_RARE_VARIANTS.MERGED.RNA_DP10.RNA_NODUPS.CHIP_MULTIMARK.SORTED.vcf -r ../../data/genome_reference/reference_genome_hg19.fa -m ../../data/HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.fpr_0p001.txt.bed_reduced.txt -o ../../data/output.motif.20170114.vcf -fm -fp -ci ../../data/GM12878.ENCODE.ALL_TFS.bed -co ../../data/output.chip_peaks_output.20170114.bed &> ../../data/0_run_logs/20170114_motifs_run_stdout.txt
python3 activity.py -i ../../data/output.motif.20170114.vcf -a ../../data/QN_FLDL_CCCB_K27AC_PEAKS_SIGNAL.bed -ov ../../data/output.activity.20170114.vcf -ob ../../data/output.activity.20170114.bed -th 2 &> ../../data/0_run_logs/20170114_activity_run_stdout.txt
python3 gene_expression.py -i ../../data/output.activity.20170114.vcf -e ../../data/ALL_ARRAYS_NORMALIZED_MAXPROBE_LOG2_COORDS.sorted.txt -ov ../../data/output.gene_expression.20170114.vcf -eth 5 -th 2 &> ../../data/0_run_logs/20170114_gene_expression_run_stdout.txt
In [2]:
import venusar
import motif
import thresholds
import motifs
import activity
import tf_expression
import gene_expression
In [54]:
# plotly by default wants to put everything in the cloud
# ref: https://plot.ly/python/getting-started/
# must run this code before plotly command calls to avoid account setup request and allow local usage
# note: plotly.plotly methods are cloud only (ridiculous should be able to use localhost server)
# instead must use plotly.offline and plotly.iplot
# iplot is jupyter notebook specific
# Plotly Offline allows you to create graphs offline and save them locally.
# two methods for plotting offline: plotly.offline.plot() and plotly.offline.iplot().
# ref:https://plot.ly/python/offline/
#
# ref: generally plotly plot ref: https://plot.ly/python/user-guide/
#import plotly.plotly as py # cloud only
#import plotly.graph_objs as go
import plotly
#help(plotly.offline.iplot)
print( plotly.__version__ )
plotly.offline.init_notebook_mode(connected=True)
In [39]:
motif_f_base='../../data/HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.txt'
pc = 0.1
th = 0
bp = [0.25, 0.25, 0.25, 0.25]
motif_set_base = motif.get_motifs(motif_f_base, pc, th, bp)
In [40]:
motif_f_expressed='../../data/HOCOMOCOv10.JASPAR_FORMAT.TF_IDS.fpr_0p001.txt.bed_reduced.txt'
motif_set_expressed = motif.get_motifs(motif_f_expressed, pc, th, bp)
In [41]:
print( "length " + format(len(motif_set_expressed.motifs)) )
print( "length " + format(motif_set_expressed.length()) )
In [58]:
motif_lengths_base = motif_set_base.element_positions_list(False)
motif_lengths_expressed = motif_set_expressed.element_positions_list(False)
print( "motif_lengths_base type: " + format(type(motif_lengths_base)) + " has " + format(len(motif_lengths_base)) + " elements.")
print( "motif_lengths_expressed type: " + format(type(motif_lengths_expressed)) + " has " + format(len(motif_lengths_expressed)) + " elements.")
print( "motif_lengths_expressed has " + format(len(motif_lengths_base) - len(motif_lengths_expressed)) + " fewer elements.")
In [46]:
# ref: https://plot.ly/python/histograms/
data = [
plotly.graph_objs.Histogram(
x=motif_lengths_base
)
]
layout = plotly.graph_objs.Layout(
title='TF Length Histogram after running tf_expression.py',
xaxis=dict(
title='TF Length',
ticklen=5,
zeroline=False,
gridwidth=2,
),
yaxis={'title':'Number of TF with Length'}
)
#plotly.offline.iplot(data) # basic plot
go_figure = plotly.graph_objs.Figure(data=data, layout=layout)
plotly.offline.iplot(go_figure)
In [62]:
# repeating graph setup using reproducible code
# ref: https://plot.ly/python/histograms/
# -- setup the layout information to reuse
xaxis_template = {
'title':'TF Length',
'ticklen':5,
'zeroline':False,
'gridwidth':2,
}
yaxis_template ={'title':'Number of TF with Length'}
# -- plot 1
hist_all_TF = plotly.graph_objs.Histogram(
name='All TF',
x=motif_lengths_base,
opacity=0.75
)
layout = plotly.graph_objs.Layout(
title='TF Length Histogram (all TF)',
xaxis=xaxis_template,
yaxis=yaxis_template
)
go_figure = plotly.graph_objs.Figure(data=[hist_all_TF], layout=layout)
plotly.offline.iplot(go_figure)
# -- plot 2
hist_expressed_TF = plotly.graph_objs.Histogram(
name='Expressed TF',
x=motif_lengths_expressed,
opacity=0.75
)
layout = plotly.graph_objs.Layout(
title='TF Length Histogram after running tf_expression.py',
xaxis=xaxis_template,
yaxis=yaxis_template
)
go_figure = plotly.graph_objs.Figure(data=[hist_expressed_TF], layout=layout)
plotly.offline.iplot(go_figure)
# -- plot overlay
layout = plotly.graph_objs.Layout(
title='TF Length Histogram comparison tf_expression.py dropped',
xaxis=xaxis_template,
yaxis=yaxis_template,
barmode='overlay'
)
go_figure = plotly.graph_objs.Figure(data=[hist_all_TF, hist_expressed_TF], layout=layout)
plotly.offline.iplot(go_figure)
In [61]:
help(go_figure.layout)
go_figure.layout
Out[61]:
In [ ]:
# next create a reader that exports motifs=TF, samples, and variant information from the vcf file
# as a python structure
#
# get vcf reader for variant elements from motifs.py
# get vcf reader for samples from gene_expression.py
#
# then do similar and expanded analysis looking at which motifs were selected/exported by variant
# could also group by variant type
# a->t, etc
#
In [1]:
# create a pandas data frame of the data table
import pandas as pandas
vcf_table_file = '../../data/output.gene_expression.20170114.table_full.txt'
vcf_table = pandas.read_csv(vcf_table_file, sep='\t')
In [3]:
# confirm type
type(vcf_table)
Out[3]:
In [10]:
vcf_table.sort_values('DISTANCE', ascending=False)
Out[10]:
In [21]:
vcf_table.size
Out[21]:
In [22]:
vcf_table[vcf_table.DISTANCE > 16].size
Out[22]:
In [16]:
vcf_table.plot.hist()
Out[16]:
In [52]:
# XXX: This doesn't work yet; fails creating new column
# ref: https://github.com/pandas-dev/pandas/blob/master/doc/cheatsheet/Pandas_Cheat_Sheet.pdf
# ref: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.apply.html
import numpy
motif_score_scale = max(vcf_table['LOG2_VAR-REF_SCORE'] ** 2)
act_score_scale = max(vcf_table.ACT_ZSCORE ** 2)
gene_score_scale = max(vcf_table.EXP_ZSCORE ** 2)
print('Score Scaling Factors:\n\tMotif = \t', motif_score_scale, '\n\tActivity = \t', act_score_scale, '\n\tGene = \t\t', gene_score_scale )
#vcf_table['distance_w'] = vcf_table.apply( numpy.sqrt, ((vcf_table['LOG2_VAR-REF_SCORE'] ** 2) / motif_score_scale) +
# ((vcf_table.ACT_ZSCORE ** 2) / act_score_scale ) +
# ((vcf_table.EXP_ZSCORE ** 2) / gene_score_scale ) )
vcf_table['distance_w'] = numpy.sqrt( ((vcf_table['LOG2_VAR-REF_SCORE'] ** 2) / motif_score_scale) + ((vcf_table.ACT_ZSCORE ** 2) / act_score_scale ) + ((vcf_table.EXP_ZSCORE ** 2) / gene_score_scale ) )
vcf_table.sort_values('distance_w', ascending=False)
Out[52]:
In [55]:
data = [
plotly.graph_objs.Histogram(
x=vcf_table.distance_w
)
]
layout = plotly.graph_objs.Layout(
title='Scaled Score Distance Histogram',
xaxis=dict(
title='Scaled Score',
ticklen=5,
zeroline=False,
gridwidth=2,
),
yaxis={'title':'Number of Samples with Distance Score'}
)
#plotly.offline.iplot(data) # basic plot
go_figure = plotly.graph_objs.Figure(data=data, layout=layout)
plotly.offline.iplot(go_figure)
In [65]:
vcf_tabletopx = vcf_table[vcf_table.distance_w > .8]
vcf_tabletopx.size
Out[65]:
In [66]:
trace1 = plotly.graph_objs.Scatter3d(
name="Distances",
x=vcf_tabletopx['LOG2_VAR-REF_SCORE'],
y=vcf_tabletopx.ACT_ZSCORE,
z=vcf_tabletopx.EXP_ZSCORE,
mode='markers',
marker=dict(
size=4, # Using gene expression as color scale values for now.
color=vcf_tabletopx.EXP_ZSCORE, # set color to an array/list of desired values
colorscale='Viridis', # choose a colorscale
opacity=0.8
)
)
data = [trace1]
layout = plotly.graph_objs.Layout(
margin=dict(
l=0,
r=0,
b=0,
t=0
),
title='Distances from Average for Individual Variant Events',
scene=dict(
xaxis=dict(
title='log2 Var/Ref Motif Log-Odds Ratio Difference',
titlefont=dict(
family='Courier New, monospace',
size=18,
color='#7f7f7f'
)
),
yaxis=dict(
title='Enhancer Activity z-Score',
titlefont=dict(
family='Courier New, monospace',
size=18,
color='#7f7f7f'
)
),
zaxis=dict(
title='Gene Expression z-Score',
titlefont=dict(
family='Courier New, monospace',
size=18,
color='#7f7f7f'
)
)
)
)
go_figure = plotly.graph_objs.Figure(data=data, layout=layout)
plotly.offline.iplot(go_figure)
In [ ]: