In [1]:
from IPython.core.display import HTML
import urllib2
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
%matplotlib inline
# Render a script that hides the notebook's input cells once the page is ready,
# plus a link that toggles them back on and off.
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')
Out[1]:
Demo available at: https://github.com/dmarx/make_for_datascience
You're going to author a pipeline anyway (sometimes called a "master script", e.g. read_data_and_fit_models.r). You may as well do it right.
... https://en.wikipedia.org/wiki/List_of_build_automation_software
NB: Spaces != Tabs: recipes need to be indented with tabs
# Pattern rule: models/<name>.rdata depends on the shared training script,
# the base table and the matching spec code/models/<name>.r.
models/%.rdata: train_and_save_model.r data/basetable.rdata code/models/%.r
Rscript train_and_save_model.r $@
# The same rule written with automatic variables: $< is the first
# prerequisite (the training script) and $@ is the target being built.
models/%.rdata: train_and_save_model.r data/basetable.rdata code/models/%.r
Rscript $< $@
Running the following Makefile
# Recursively expanded ('=') variables store their right-hand side verbatim;
# the references are only resolved when the variable is used.
foo = $(bar)
bar = $(ugh)
ugh = Huh?
all:
echo $(foo)
will echo "Huh?":
$(foo) expands to "$(bar)", which expands to "$(ugh)", which finally expands to "Huh?".
# Simply expanded (':=') variables are evaluated once, at the point of
# assignment, so y captures the value x holds on that line.
x := foo
y := $(x) bar
x := baz
is equivalent to
y := foo bar
x := baz
# Each double-colon rule keeps its own recipe, but recipes are expanded when
# they run, after the whole file has been read, so both see the last value of x.
x := foo
test::
echo FIRST $(x)
x := bar
test::
echo SECOND $(x)
will not echo:
FIRST foo
SECOND bar
but rather:
FIRST bar
SECOND bar
3.7 How make Reads a Makefile
GNU make does its work in two distinct phases. During the first phase it reads all the makefiles, included makefiles, etc. and internalizes all the variables and their values, implicit and explicit rules, and constructs a dependency graph of all the targets and their prerequisites. During the second phase, make uses these internal structures to determine what targets will need to be rebuilt and to invoke the rules necessary to do so.
It’s important to understand this two-phase approach because it has a direct impact on how variable and function expansion happens; this is often a source of some confusion when writing makefiles.
-- https://www.gnu.org/software/make/manual/html_node/Reading-Makefiles.html#Reading-Makefiles
There are a ton of different functions. Here are a few that I find especially useful
$(patsubst pattern,replacement,text)
$(patsubst foo/%.bar,%.baz,foo/fname.bar)
: -> fname.baz
$(wildcard pattern)
$(wildcard data/*.csv)
: -> data/raw.csv data/results.csv data/errors.csv
$(dir names…)
$(dir path/to/file.txt foo/bar.csv)
: -> 'path/to/ foo/'
$(notdir names…)
$(notdir path/to/file.txt foo/bar.csv)
: -> 'file.txt bar.csv'
$(shell statement)
$(eval statement)
# $(eval ...) inside a recipe is processed when that recipe is expanded, i.e. at
# run time, so each rule re-assigns x from its own path-keyed variable
# ($(v1)_x / $(v2)_x) immediately before echoing it.
x := who
v1 = first/path
$(v1)_x := foo
test::
$(eval x := $($(v1)_x))
echo FIRST $(x)
x := cares
v2 = second/path
$(v2)_x := bar
test::
$(eval x := $($(v2)_x))
echo SECOND $(x)
will echo:
FIRST foo
SECOND bar
rather than:
FIRST cares
SECOND cares
In [2]:
# Builds the edge lists used by the plotting cells: `common` holds the shared
# upstream edges and `tasks[<task>]` the per-task downstream edges. Every pair is
# [from_node, to_node].
# NOTE(review): indentation was lost in this export, so loop nesting below is
# inferred from the statements themselves — confirm against the notebook.
# Seed NumPy's global RNG so the random feature picks below are repeatable.
np.random.seed(111)
common = [
["internet", "scraper"],
["database", "db queries"],
["scraper", "raw data 0"],
["db queries", "raw data 1"],
["local storage", "raw data 2"]
]
# Connect "raw data <i>" nodes to "features <j>" nodes and to "features 3".
j = 0
for i in range(3):
common.append(["raw data " + str(i), "features " + str(j)])
j+=1
common.append(["raw data " + str(i), "features " + str(3)])
n_tasks = 2
tasks = defaultdict(list)
for i in range(n_tasks):
k = []
n=2
# Redraw until the n sampled feature ids are distinct (the set drops duplicates).
# randint's `high` is exclusive, so ids come from 1 .. 2*n_tasks - 1.
while len(k)!=n:
k = set(np.random.randint(low=1, high=2*n_tasks, size=n))
task = "task" + str(i)
# Each sampled feature node feeds the task's "ABT" node.
for j in k:
tasks[task].append(["features " + str(j), task + " ABT"])
# Model nodes: "<task>model<j>" for j in 0..1, each fed by the task's ABT node.
models = defaultdict(list)
for i in range(n_tasks):
for j in range(2):
task = "task" + str(i)
task_model = task + "model" + str(j)
tasks[task].append([task + " ABT", task_model])
models[task].append(task_model)
# Evaluation nodes: each model gets an edge to a node named <evaluation><i>.
for ev in ["bootstrap", "target shuffle", "holdout"]:
i=0
# NOTE(review): dict.iteritems() exists only on Python 2.
for task, task_models in models.iteritems():
for mod in task_models:
mod_eval = ev + str(i)
tasks[task].append([mod, mod_eval])
i+=1
def get_giant_component(g):
    """Return the largest weakly connected component of the directed graph ``g``.

    The component is returned as an independent copy of the induced subgraph.
    When several components share the largest size, the first one reported by
    networkx is used. Returns ``None`` when ``g`` has no nodes.

    Uses ``nx.weakly_connected_components`` plus ``Graph.subgraph`` rather than
    ``weakly_connected_component_subgraphs``, which newer networkx releases no
    longer provide.
    """
    components = list(nx.weakly_connected_components(g))
    if not components:
        return None
    # max() keeps the first of equally sized components, like a strict '>' scan.
    largest = max(components, key=len)
    return g.subgraph(largest).copy()
In [3]:
# Draw the shared upstream edges in `common` as a directed graph.
g = nx.DiGraph()
g.add_edges_from(common)
# Re-seed before computing the layout (presumably spring_layout draws its
# starting positions from NumPy's global RNG — confirm for the installed networkx).
np.random.seed(111)
pos = nx.spring_layout(g,k=0.2, iterations = 20)
nx.draw(g, pos=pos, with_labels='true')
plt.title('"Common" Data Pipeline')
plt.show()
In [4]:
# Draw only the edges of task0, restricted to its largest weakly connected component.
g = nx.DiGraph()
g.add_edges_from(tasks['task0'])
g = get_giant_component(g)
# Re-seed before computing the layout (presumably for a repeatable picture — confirm).
np.random.seed(111)
pos = nx.spring_layout(g,k=0.15, iterations = 25)
nx.draw(g, pos=pos, with_labels='true')
plt.title("(Downstream) Modeling Task")
plt.show()
In [5]:
# Combine the shared upstream edges with task0's edges and draw the largest
# weakly connected component of the result.
g = nx.DiGraph()
g.add_edges_from(common)
g.add_edges_from(tasks['task0'])
g = get_giant_component(g)
# Re-seed before computing the layout (presumably for a repeatable picture — confirm).
np.random.seed(333)
pos = nx.spring_layout(g,k=0.08, iterations = 20)
nx.draw(g, pos = pos, with_labels='true')
plt.title("Modeling task + Upstream Data ETL")
plt.show()
In [6]:
# Combine the shared upstream edges with the edges of both tasks and draw the
# largest weakly connected component of the result.
g = nx.DiGraph()
g.add_edges_from(common)
g.add_edges_from(tasks['task0'])
g.add_edges_from(tasks['task1'])
g = get_giant_component(g)
# Re-seed before computing the layout (presumably for a repeatable picture — confirm).
np.random.seed(111)
# Earlier layout setting, kept for reference:
#pos = nx.spring_layout(g,k=0.10, iterations = 15)
pos = nx.spring_layout(g,k=0.10, iterations = 25)
nx.draw(g, pos=pos, with_labels='true')
plt.title("Full Pipeline, Multiple Tasks")
plt.show()
# Save R's built-in iris data set. The R expression is single-quoted for the
# shell, so its inner double quotes must not be backslash-escaped (a backslash
# inside single quotes reaches R literally).
data/iris.rdata:
	Rscript -e 'data(iris); save(iris, file="data/iris.rdata")'

# Both outputs share one recipe. A header with two targets declares two
# independent rules, so `make -j` may run the script once per target.
# NOTE(review): assumes one run of the script writes both files — confirm.
data/train.rdata data/test.rdata: data/iris.rdata src/train_test_split.r
	Rscript src/train_test_split.r

models/logreg.rdata: data/train.rdata src/logreg.r
	Rscript src/logreg.r

reports/confusion_matrix.txt: models/logreg.rdata data/test.rdata
	Rscript src/eval_model.r models/logreg.rdata
In [7]:
# Makefile text for the iris example, held in a string so that its rule headers
# can be parsed into a dependency graph.
makefile = """
data/iris.rdata:
Rscript -e 'data(iris); save(iris, file=\"data/iris.rdata\")'
data/train.rdata data/test.rdata: data/iris.rdata src/train_test_split.r
Rscript src/train_test_split.r
models/logreg.rdata: data/train.rdata src/logreg.r
Rscript src/logreg.r
reports/confusion_matrix.txt: models/logreg.rdata data/test.rdata
Rscript src/eval_model.r models/logreg.rdata
"""
In [8]:
# Turn the rule headers of `makefile` into a dependency graph with one edge
# prerequisite -> target per (target, prerequisite) pair.
g = nx.DiGraph()
for line in makefile.split('\n'):
    # Tab-indented recipe lines and comment lines never declare a rule.
    if line.startswith('\t') or line.lstrip().startswith('#'):
        continue
    # Split on the first colon only, so a later colon cannot break unpacking.
    tgts, sep, deps = line.partition(':')
    if not sep or deps.startswith('='):
        continue  # no colon at all, or a ':=' variable assignment
    deps = deps.lstrip(':')  # tolerate double-colon rules such as 'all::'
    for tgt in tgts.split():
        for dep in deps.split():
            g.add_edge(dep, tgt)
np.random.seed(10)
nx.draw(g, with_labels=True)
############################
## Project specific rules ##
############################

# The recipe writes data/base_table.rdata, so that file is the target; the
# train/test split rule below depends on it under that name.
data/base_table.rdata:
	Rscript -e 'data(iris); base_table <- iris; save(base_table, file="data/base_table.rdata")'

###################
## General rules ##
###################

# Model specs are discovered on disk: every src/models/<name>.r is one model.
# (Written without a leading '$': "$src" would expand the variable $(s).)
r_model_specs := $(wildcard src/models/*.r)
# File names only, extension kept: <name>.r
r_mod_names := $(notdir $(r_model_specs))
# One report per spec: reports/<name>.r_confusion_matrix.txt
r_reports := $(patsubst %,reports/%_confusion_matrix.txt, $(r_mod_names))

# `all` names a command, not a file. A bare `make` builds the first target in
# this file; run `make all` to build every report.
.PHONY: all
all: $(r_reports)

# $< is the base table (first prerequisite).
data/train.rdata data/test.rdata: data/base_table.rdata src/train_test_split.r
	Rscript src/train_test_split.r $<

# $< is the model spec. The recipe runs the training script that is listed as
# a prerequisite, so editing that script triggers a rebuild.
models/%.rdata: src/models/%.r train_and_save_model.r data/train.rdata
	Rscript train_and_save_model.r $<

# The ".r" kept in the report name is spelled out in the pattern so the stem is
# the bare model name and chains to models/<name>.rdata and src/models/<name>.r.
reports/%.r_confusion_matrix.txt: models/%.rdata data/test.rdata
	Rscript src/eval_model.r $<
## projectName/Makefile ##

# Module-relative locations; module rules prefix them with $(_MODULE)/.
train_data=data/processed/train.rdata
test_data=data/processed/test.rdata
abt=data/processed/analyticBaseTable.rdata
abt_script=src/data/build_base_table.r

R_INTERPRETER=Rscript

# Directories below the project root that carry their own Makefile (the root
# ./Makefile itself does not match ./%/Makefile). Simple assignment (:=) so the
# `find` runs once rather than on every expansion.
MODULES := $(patsubst ./%/Makefile,%, $(filter ./%/Makefile, $(shell find . -type f -name 'Makefile')))
include $(addsuffix /Makefile,$(MODULES))

# The special target needs its colon; without it make stops with a parse error.
.PHONY: all
all::
## projectName/_header.mak ##

# Makefiles read so far, minus the shared header/footer fragments.
_MAKEFILES := $(filter-out _header.mak _footer.mak,$(MAKEFILE_LIST))
# The most recently read of those is the Makefile that included this header;
# its directory, without the trailing slash, is the module name.
_MODULE := $(patsubst %/,%,$(dir $(lastword $(_MAKEFILES))))

# Default rule for the module's analytic base table; $< is the build script.
$(_MODULE)/$(abt): $(_MODULE)/$(abt_script) common/data/processed/features.rdata
	$(R_INTERPRETER) $<
## projectName/taskName/Makefile ##
# The header sets _MODULE for this directory and declares a base-table rule.
include _header.mak
# Task-specific base-table rule: same target as the header's rule, but with
# other_features.rdata as the upstream input.
$(_MODULE)/$(abt): $(_MODULE)/$(abt_script) common/data/processed/other_features.rdata
$(R_INTERPRETER) $<
# The footer adds the train/test split, model and report rules for this module
# and hooks the module into `all`.
include _footer.mak
## projectName/_footer.mak ##
# Included at the end of a module Makefile, after _header.mak has set _MODULE.

# Model specs of this module: every src/models/<name>.r is one model.
r_model_specs := $(wildcard $(_MODULE)/src/models/*.r)
r_mod_names := $(notdir $(r_model_specs))
r_test_acc := $(patsubst %,reports/%_confusion_matrix.txt, $(r_mod_names))

# Module-relative paths of everything this module builds.
TGTS += $(abt) $(train_data) $(test_data)
# <name>.r -> models/<name>.rdata
TGTS += $(patsubst %.r,models/%.rdata, $(r_mod_names))
TGTS += $(r_test_acc)

$(_MODULE)/reports/%.r_confusion_matrix.txt: $(_MODULE)/models/%.rdata $(_MODULE)/$(test_data) common/src/eval/eval_model.r
	$(R_INTERPRETER) common/src/eval/eval_model.r $@

$(_MODULE)/models/%.rdata: $(_MODULE)/src/models/%.r common/src/utils/train_and_save_model.r $(_MODULE)/$(train_data)
	$(R_INTERPRETER) common/src/utils/train_and_save_model.r $@

# $< is the module's analytic base table.
$(_MODULE)/$(train_data) $(_MODULE)/$(test_data): $(_MODULE)/$(abt) common/src/data/train_test_split.r
	$(R_INTERPRETER) common/src/data/train_test_split.r $<

# Prefix the targets with the module's output directory. $(_MODULE)_OUTPUT is
# honoured when set; it is not assigned in the project files shown alongside
# this one, and an empty prefix would yield "/..." paths that no rule builds,
# so fall back to the module directory used by the rules above.
$(_MODULE)_TGTS := $(addprefix $(or $($(_MODULE)_OUTPUT),$(_MODULE))/,$(TGTS))
# TGTS is global and only ever appended to; clear it so the next module that
# includes this footer does not pile its entries on top of these.
TGTS :=

# The module name is an alias for "build everything in this module", not a file
# to create (a directory of that name already exists).
.PHONY: $(_MODULE)
$(_MODULE): $($(_MODULE)_TGTS)
all:: $(_MODULE)
projectName/common/Makefile
_header.mak
and _footer.mak
projectName/taskName/src/models/modelName.r
train_model()
and predict_model()