This notebook runs the benchmarks and generates the graphs: the cells below bring up the database containers with Docker Compose, time bulk imports and queries against each system, and plot the results for comparison.
In [1]:
num_of_runs = 10  # how many times each benchmark is repeated and averaged
In [2]:
import subprocess
import os
import time
import re
import random

# Local helper modules; each exposes sql_import(csv_path), used by exec_and_time below.
import postgres_import as pos_i
import mysql_import as my_i
import memsql_import as mem_i
In [3]:
import numpy as np
from matplotlib import pyplot as plt

def bar_plot(data, items, colours, title):
    # Draw one bar per database, labelled with the container names.
    fig = plt.figure()
    fig.set_size_inches(15, 7)
    width = .45
    ind = np.arange(len(data))
    plt.bar(0.3 + ind, data, width=width, color=colours)
    plt.xticks(0.3 + ind, items)
    plt.title(title)
    fig.autofmt_xdate()
    plt.show()
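A quick smoke test of the helper, as a sketch; the values here are made-up placeholders rather than measurements:
In [ ]:
bar_plot([1.2, 0.8, 2.5], ["mongo", "redis", "neo4j"], ["lightgreen", "red", "deepskyblue"], "Example plot")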
In [4]:
def run_command(command, verbose=True, save_mem=False):
    # Run a shell command, folding stderr into stdout; returns the output as a
    # list of lines, or "" when save_mem is set (the pipe is never read).
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    returncode = process.wait()
    output = ""
    if not save_mem:
        output = process.stdout.read().decode('utf-8', 'replace').replace("\r", "").split("\n")
        if verbose:
            for line in output:
                print(line)
    return output

def run_command2(command, verbose=True, save_mem=False):
    # Same as run_command but without capturing stderr; exec_and_time routes
    # cassandra (and other long-output commands) through this variant.
    process = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
    returncode = process.wait()
    output = ""
    if not save_mem:
        output = process.stdout.read().decode('utf-8', 'replace').replace("\r", "").split("\n")
        if verbose:
            for line in output:
                print(line)
    return output
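A minimal usage sketch of the first helper, assuming Docker is running locally:
In [ ]:
lines = run_command('docker ps', verbose=False)
print(str(len(lines)) + " lines captured")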
In [5]:
_ = run_command(r'D: && cd "D:\GitHub\BigData\Project2\Docker" && docker-compose up -d')
In [6]:
_ = run_command(r'D: && cd "D:\GitHub\BigData\Project2\Docker" && docker-compose down')
In [7]:
postgres_container = "postgres"
containers = ["mongo", "redis", "neo4j", "orientdb", "cassandra", postgres_container, "mysql", "memsql", "voltdb", "cockroach"]
colours = ["lightgreen", "red", "deepskyblue", "orange", "skyblue", "lightsteelblue", "steelblue", "cornflowerblue", "firebrick", "darkgreen"]
In [8]:
def exec_and_time(container, command, verbose=True, save_mem=False):
    # Run an import/query command against the given container and return the elapsed wall-clock time.
    start = time.time()
    if container == postgres_container or container == "mysql" or container == "memsql":
        # These importers are Python functions taking the path of the CSV file.
        command[0](os.path.join(os.path.abspath('./Docker/shared_data'), command[1]))
    elif container == "cockroach":
        # CockroachDB: create the database, then load the dump through the postgres client.
        _ = run_command('docker exec --privileged ' + container + ' sh -c "' + command[0] + '"', verbose, save_mem)
        _ = run_command('docker exec --privileged --user postgres ' + postgres_container + ' sh -c "' + command[1] + '"', verbose, save_mem)
    elif container == "cassandra":
        _ = run_command2('docker exec --privileged ' + container + ' ' + command[0] + ' -c "' + command[1] + '"', verbose, save_mem)
    else:
        _ = run_command('docker exec --privileged ' + container + ' sh -c "' + command + '"', verbose, save_mem)
    end = time.time()
    return end - start
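For illustration, a hypothetical one-off call; any container outside the special-cased branches takes a plain shell command string like this:
In [ ]:
t = exec_and_time("mongo", "mongo testing_db --eval 'db.customers.count()'", verbose=False)
print(t)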
In [9]:
def average_time(commands, containers, iterations, verbose=True, import_data=False):
    times = [0] * len(containers)
    peak_times = [0] * len(containers)
    for j in range(0, iterations):
        if verbose:
            print("iteration " + str(j + 1))
        _ = run_command(r'D: && cd "D:\GitHub\BigData\Project2\Docker" && docker-compose up -d', verbose=False)
        if verbose:
            print("waiting...")
        time.sleep(25)  # give the containers time to finish starting up
        for i in range(0, len(containers)):
            if verbose:
                print(containers[i])
            cockroach_ip = ""
            app1 = data_import_commands[i]
            app2 = commands[i]
            if containers[i] == "cockroach":
                # cockroach gets a fresh IP on every compose up; read it from the container's routing table
                ip = run_command('docker exec cockroach ip route', verbose=False)
                ip = ip[len(ip) - 2].split(" ")
                ip = ip[len(ip) - 2]
                cockroach_ip = ip
                data_import_commands[i][1] = data_import_commands[i][1].replace("{{cockroach_ip}}", cockroach_ip)
                commands[i][1] = commands[i][1].replace("{{cockroach_ip}}", cockroach_ip)
            if import_data:
                exec_and_time(containers[i], data_import_commands[i], verbose=False, save_mem=True)
            time_taken = exec_and_time(containers[i], commands[i], verbose=False, save_mem=True)
            times[i] += time_taken
            if peak_times[i] < time_taken:
                peak_times[i] = time_taken
            # restore the templated commands for the next iteration
            data_import_commands[i] = app1
            commands[i] = app2
        if j < iterations - 1:
            _ = run_command(r'D: && cd "D:\GitHub\BigData\Project2\Docker" && docker-compose down', verbose=False)
    for i in range(0, len(containers)):
        times[i] = times[i] / iterations
        print(containers[i] + ": " + str(times[i]))
    return (times, peak_times)
In [10]:
data_import_commands = [
    "mongoimport --db testing_db --collection customers --type csv --headerline --file /shared_data/mock_data_10000_rows.csv && mongo testing_db --eval 'db.customers.createIndex({Id:1});'",
    "cat /shared_data/mock_data_10000_rows2.csv | redis-cli --pipe",
    "cp /shared_data/mock_data_10000_neo.csv /var/lib/neo4j/import && cat /shared_scripts/import_mock_10000.cypher | /var/lib/neo4j/bin/cypher-shell",
    "/orientdb/bin/oetl.sh /shared_scripts/import_orient_10000.json",
    ["bash", "cqlsh -f /shared_scripts/import_mock_10000.cql"],  # cassandra
    [pos_i.sql_import, "mock_data_10000_rows_utf8.csv"],  # postgres
    [my_i.sql_import, "mock_data_10000_rows_utf8.csv"],  # mysql
    [mem_i.sql_import, "mock_data_10000_rows_utf8.csv"],  # memsql
    "cat /shared_scripts/import_voltdb.sql | sqlcmd && /opt/voltdb/bin/csvloader customers -f /shared_data/mock_data_10000_rows.csv",
    ["./cockroach sql --insecure --execute='create database if not exists mock;'", "psql -p 26257 -h {{cockroach_ip}} -d mock -U root < /shared_data/postgres_dump.sql"]  # cockroach
]
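Note the mixed entry formats: most are plain shell commands run inside their container, cassandra and cockroach are two-element command pairs, and the three SQL importers are [python_function, csv_filename] pairs; exec_and_time dispatches on the container name accordingly.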
In [11]:
(import_times_avg, import_times_peak) = average_time(data_import_commands, containers, num_of_runs)
In [12]:
bar_plot(import_times_avg, containers, colours, "Average bulk import time")
In [13]:
def exec_and_time2(container, command, verbose=True, save_mem=False, long_output=False):
    # Run a complete `docker exec ...` command string and return the elapsed time.
    start = time.time()
    if container == "cassandra" or long_output:
        _ = run_command2(command, verbose, save_mem)
    else:
        _ = run_command(command, verbose, save_mem)
    end = time.time()
    return end - start
In [14]:
def average_time2(commands, containers, iterations, verbose=True, import_data=False):
    times = [0] * len(containers)
    peak_times = [0] * len(containers)
    for j in range(0, iterations):
        if verbose:
            print("iteration " + str(j + 1))
        _ = run_command(r'D: && cd "D:\GitHub\BigData\Project2\Docker" && docker-compose up -d', verbose=False)
        if verbose:
            print("waiting...")
        time.sleep(25)
        for i in range(0, len(containers)):
            if verbose:
                print(containers[i])
            cockroach_ip = ""
            app1 = data_import_commands[i]
            if containers[i] == "cockroach":
                ip = run_command('docker exec cockroach ip route', verbose=False)
                ip = ip[len(ip) - 2].split(" ")
                ip = ip[len(ip) - 2]
                cockroach_ip = ip
                data_import_commands[i][1] = data_import_commands[i][1].replace("{{cockroach_ip}}", cockroach_ip)
            if import_data:
                if containers[i] == "redis":
                    # redis gets a key-value dataset instead of the generic CSV import
                    exec_and_time2("redis", r"""docker exec --privileged redis sh -c "cat /shared_data/mock_data_10000_set.csv | redis-cli --pipe" """, verbose=False, save_mem=True)
                else:
                    exec_and_time(containers[i], data_import_commands[i], verbose=False, save_mem=True)
                if containers[i] == "neo4j":
                    # add a self-referencing edge on every customer so the join benchmark has relationships to traverse
                    exec_and_time2("neo4j", r"""docker exec --privileged neo4j sh -c "echo \"MATCH (n:Customer) CREATE (n)-[r:Autoedge]->(n) RETURN r;\" | /var/lib/neo4j/bin/cypher-shell" """, verbose=False, save_mem=True)
            data_import_commands[i] = app1
            # substitute a random id so repeated runs do not always hit the same row
            time_taken = exec_and_time2(containers[i], commands[i].replace("{{id}}", str(random.randrange(0, 9999, 1))), verbose=False, save_mem=True)
            times[i] += time_taken
            if peak_times[i] < time_taken:
                peak_times[i] = time_taken
        if j < iterations - 1:
            _ = run_command(r'D: && cd "D:\GitHub\BigData\Project2\Docker" && docker-compose down', verbose=False)
    for i in range(0, len(containers)):
        times[i] = times[i] / iterations
        print(containers[i] + ": " + str(times[i]))
    return (times, peak_times)
In [15]:
unindexed_read_commands = [
    r"""docker exec --privileged mongo mongo testing_db --eval "db.customers.find({first_name:'Hosea'}).pretty().shellPrint();" """,
    "",
    r"""docker exec --privileged neo4j sh -c "echo \"MATCH (n) WHERE n.firstName = 'Hosea' RETURN n;\" | /var/lib/neo4j/bin/cypher-shell" """,
    r"""docker exec --privileged orientdb /orientdb/bin/console.sh "connect plocal:/temp/databases/mock admin admin; select from Customer where first_name = 'Hosea' " """,
    r"""docker exec --privileged cassandra cqlsh -k mock_keyspace -e "select * from customers where first_name = 'Hosea' allow filtering;" """,
    r"""docker exec --privileged postgres sh -c "echo \"SELECT * FROM customers WHERE first_name = 'Hosea'\" | psql -U postgres mock" """,  # postgres
    r"""docker exec --privileged mysql mysql -u root --password=password --database mock -e "SELECT * FROM customers where first_name = 'Hosea'" """,
    r"""docker exec --privileged memsql memsql-shell --database mock -e "SELECT * FROM customers where first_name = 'Hosea'" """,
    r"""docker exec --privileged voltdb sh -c "echo \"select * from customers where first_name = 'Hosea';\" | sqlcmd" """,
    r"""docker exec --privileged cockroach ./cockroach sql --insecure --execute="SET DATABASE = mock; SELECT * FROM customers WHERE first_name = 'Hosea';" """
]
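Unlike data_import_commands, these are complete docker exec command strings that average_time2 runs verbatim; the redis slot stays empty because Redis has no unindexed search, and its average is zeroed out after the run.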
In [16]:
(unindexed_read_times_avg, unindexed_read_times_peak) = average_time2(unindexed_read_commands, containers, num_of_runs)
unindexed_read_times_avg[1] = 0  # redis has no unindexed search
In [17]:
bar_plot(unindexed_read_times_avg, containers, colours, "Average search time")
In [18]:
indexed_read_commands = [
    r"""docker exec --privileged mongo mongo testing_db --eval "db.customers.find({Id:{{id}}}).pretty().shellPrint()" """,
    r"""docker exec --privileged redis sh -c "echo GET {{id}} | redis-cli" """,
    r"""docker exec --privileged neo4j sh -c "echo \"MATCH (n) WHERE ID(n) = {{id}} RETURN n;\" | /var/lib/neo4j/bin/cypher-shell" """,
    r"""docker exec --privileged orientdb /orientdb/bin/console.sh "connect plocal:/temp/databases/mock admin admin; select from Customer where Id = {{id}}" """,
    r"""docker exec --privileged cassandra cqlsh -k mock_keyspace -e "select * from customers where id = {{id}} allow filtering;" """,
    r"""docker exec --privileged postgres sh -c "echo \"SELECT * FROM customers where id = {{id}}\" | psql -U postgres mock" """,  # postgres
    r"""docker exec --privileged mysql mysql -u root --password=password --database mock -e "SELECT * FROM customers where id = {{id}}" """,
    r"""docker exec --privileged memsql memsql-shell --database mock -e "SELECT * FROM customers where id = {{id}}" """,
    r"""docker exec --privileged voltdb sh -c "echo \"SELECT * FROM customers where id = {{id}}\" | sqlcmd" """,
    r"""docker exec --privileged cockroach ./cockroach sql --insecure --execute="SET DATABASE = mock; SELECT * FROM customers where id = {{id}};" """
]
In [19]:
(indexed_read_times_avg, indexed_read_times_peak) = average_time2(indexed_read_commands, containers, num_of_runs)
In [20]:
bar_plot(indexed_read_times_avg, containers, colours, "Average indexed search time")
In [21]:
aggregate_read_commands = [
    r"""docker exec --privileged mongo mongo testing_db --eval 'db.customers.aggregate( [ { $group: { "_id": "$first_name", "avg": { $avg: "$income" } } } ] ).pretty().shellPrint();' """,
    r"""docker exec --privileged redis sh -c "echo zcount mylist -inf +inf | redis-cli" """,
    r"""docker exec --privileged neo4j sh -c "echo \"match (n:Customer) return avg(n.income);\" | /var/lib/neo4j/bin/cypher-shell" """,
    r"""docker exec --privileged orientdb /orientdb/bin/console.sh "connect plocal:/temp/databases/mock admin admin; select avg(income) FROM Customer" """,
    r"""docker exec --privileged cassandra cqlsh -k mock_keyspace -e "select avg(income) as avg_income from customers;" """,
    r"""docker exec --privileged postgres sh -c "echo \"select avg(income) as avg_income from customers\" | psql -U postgres mock" """,  # postgres
    r"""docker exec --privileged mysql mysql -u root --password=password --database mock -e "select avg(income) as avg_income from customers" """,
    r"""docker exec --privileged memsql memsql-shell --database mock -e "select avg(income) as avg_income from customers" """,
    r"""docker exec --privileged voltdb sh -c "echo \"select avg(income) as avg_income from customers\" | sqlcmd" """,
    r"""docker exec --privileged cockroach ./cockroach sql --insecure --execute="SET DATABASE = mock; select avg(income) as avg_income from customers;" """
]
In [22]:
(aggregate_read_times_avg, aggregate_read_times_peak) = average_time2(aggregate_read_commands, containers, num_of_runs)
In [23]:
bar_plot(aggregate_read_times_avg, containers, colours, "Average aggregation time")
In [24]:
join_commands = [
    "",  # mongo
    "",  # redis
    r"""docker exec --privileged neo4j sh -c "echo \"MATCH (n:Customer)-[:Autoedge]->(m:Customer) WHERE n.firstName = m.firstName RETURN n, m;\" | /var/lib/neo4j/bin/cypher-shell" """,
    "",  # orientdb
    "",  # cassandra
    r"""docker exec --privileged postgres sh -c "psql -q -U postgres mock -c 'select * from customers a, customers b where a.first_name = b.first_name;' > join.txt" """,
    r"""docker exec --privileged mysql sh -c "mysql -u root --password=password --database mock -e 'select * from customers a, customers b where a.first_name = b.first_name;' > join.txt " """,
    r"""docker exec --privileged memsql sh -c "memsql-shell --database mock -e 'select * from customers a, customers b where a.first_name = b.first_name;' > join.txt" """,
    r"""docker exec --privileged voltdb sh -c "echo 'select * from customers a, customers b where a.first_name = b.first_name;' | sqlcmd > join.txt" """,
    r"""docker exec --privileged cockroach sh -c "./cockroach sql --insecure --execute='SET DATABASE = mock; select * from customers a, customers b where a.first_name = b.first_name;' > join.txt" """
]
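The SQL engines redirect the large self-join result to a file inside the container (> join.txt) so that printing it does not dominate the measured time; neo4j instead matches pairs across the Autoedge relationships created at import time.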
In [25]:
(join_times_avg, join_times_peak) = average_time2(join_commands, containers, num_of_runs)
join_times_avg[0] = 0  # mongodb does not support a relational join operation
join_times_avg[1] = 0  # redis does not support a relational join operation
join_times_avg[3] = 0  # orientdb does not support a relational join operation
join_times_avg[4] = 0  # cassandra does not support a relational join operation
In [26]:
bar_plot(join_times_avg, containers, colours, "Average join time")
In [27]:
def read_throughput(commands, containers, iterations=1000, verbose=True, import_data=False):
    times = [0] * len(containers)
    ratios = [0] * len(containers)
    _ = run_command(r'D: && cd "D:\GitHub\BigData\Project2\Docker" && docker-compose up -d', verbose=False)
    time.sleep(25)
    for i in range(0, len(containers)):
        # imports
        if verbose:
            print(containers[i])
        cockroach_ip = ""
        app1 = data_import_commands[i]
        app2 = commands[i]
        if containers[i] == "cockroach":
            ip = run_command('docker exec cockroach ip route', verbose=False)
            ip = ip[len(ip) - 2].split(" ")
            ip = ip[len(ip) - 2]
            cockroach_ip = ip
            data_import_commands[i][1] = data_import_commands[i][1].replace("{{cockroach_ip}}", cockroach_ip)
        if import_data:
            if containers[i] == "redis":
                exec_and_time2("redis", r"""docker exec --privileged redis sh -c "cat /shared_data/mock_data_10000_set.csv | redis-cli --pipe" """, verbose=False, save_mem=True)
            else:
                exec_and_time(containers[i], data_import_commands[i], verbose=False, save_mem=True)
            if containers[i] == "neo4j":
                exec_and_time2("neo4j", r"""docker exec --privileged neo4j sh -c "echo \"MATCH (n:Customer) CREATE (n)-[r:Autoedge]->(n) RETURN r;\" | /var/lib/neo4j/bin/cypher-shell" """, verbose=False, save_mem=True)
        # queries
        start = time.time()
        for j in range(0, iterations):
            _ = exec_and_time2(containers[i], commands[i].replace("{{id}}", str(random.randrange(0, 9999, 1))), verbose=False, save_mem=True)
        end = time.time()
        data_import_commands[i] = app1
        commands[i] = app2
        times[i] = end - start
        ratios[i] = iterations / times[i]  # queries per second
    _ = run_command(r'D: && cd "D:\GitHub\BigData\Project2\Docker" && docker-compose down', verbose=False)
    for i in range(0, len(containers)):
        print(containers[i] + ": " + str(ratios[i]))
    return ratios
In [28]:
indexed_read_throughput = read_throughput(indexed_read_commands, containers, 50)
In [29]:
bar_plot(indexed_read_throughput, containers, colours, "Indexed read throughput")
In [30]:
# each mock dataset has 10,000 rows, so write throughput = rows / average import time
write_throughput = [10000 / x for x in import_times_avg]
bar_plot(write_throughput, containers, colours, "Write throughput")
In [31]:
def cluster_times(commands, dbms, index, iterations=1000, verbose=True, import_data=False):
    _ = run_command('D: && cd "D:\GitHub\BigData\Project2\Docker\cluster\\' + dbms + '" && docker-compose up -d', verbose=True)
    if dbms == "mysql":
        # attach the mysql container to the default bridge network as well
        _ = run_command("docker network connect bridge mysql")
    time.sleep(15)
    # imports
    if verbose:
        print(dbms)
    cockroach_ip = ""
    app1 = data_import_commands[index]
    app2 = commands[index]
    if dbms == "cockroach":
        ip = run_command('docker exec cockroach ip route', verbose=False)
        ip = ip[len(ip) - 2].split(" ")
        ip = ip[len(ip) - 2]
        cockroach_ip = ip
        data_import_commands[index][1] = data_import_commands[index][1].replace("{{cockroach_ip}}", cockroach_ip)
    print("Importing data...")
    start_i = time.time()
    exec_and_time(dbms, data_import_commands[index], verbose=False, save_mem=True)
    import_time = time.time() - start_i
    # queries
    start = time.time()
    for j in range(0, iterations):
        _ = exec_and_time2(containers[index], commands[index].replace("{{id}}", str(random.randrange(0, 9999, 1))), verbose=False, save_mem=True)
    end = time.time()
    data_import_commands[index] = app1
    commands[index] = app2
    time_taken = end - start
    ratio = iterations / time_taken
    _ = run_command('D: && cd "D:\GitHub\BigData\Project2\Docker\cluster\\' + dbms + '" && docker-compose down', verbose=False)
    print(dbms + ": " + str(ratio))
    return (import_time, ratio)
In [50]:
# one slot per clustered DBMS: voltdb, cockroach, cassandra, mysql
cluster_import_times = [0] * 4
cluster_read_ratios = [0] * 4
In [75]:
(cluster_import_times[0], cluster_read_ratios[0]) = cluster_times(indexed_read_commands, "voltdb", 8)
In [76]:
(cluster_import_times[1], cluster_read_ratios[1]) = cluster_times(indexed_read_commands, "cockroach", 9)
In [77]:
(cluster_import_times[2], cluster_read_ratios[2]) = cluster_times(indexed_read_commands, "cassandra", 4)
In [72]:
(cluster_import_times[3], cluster_read_ratios[3]) = cluster_times(indexed_read_commands, "mysql", 6)
In [78]:
bar_plot(cluster_import_times, ["voltdb", "cockroach", "cassandra", "mysql cluster"], [colours[8], colours[9], colours[4], colours[6]], "Average bulk import time (cluster)")
In [79]:
bar_plot(cluster_read_ratios, ["voltdb", "cockroach", "cassandra", "mysql cluster"], [colours[8], colours[9], colours[4], colours[6]], "Average read throughput (cluster)")