This notebook runs the benchmark commands against each database container and generates the comparison graphs.
In [1]:
    
num_of_runs = 10
    
In [2]:
    
import subprocess
import os
import time
import re
import postgres_import as pos_i
import mysql_import as my_i
import memsql_import as mem_i
import random
    
In [3]:
    
import numpy as np
from matplotlib import pyplot as plt
def bar_plot(data, items, colours, title):
    fig = plt.figure()
    fig.set_size_inches(15, 7)
    width = .45
    ind = np.arange(len(data))
    
    plt.bar(0.3+ind, data, width=width, color=colours)
    plt.xticks(0.3+ind, items)
    plt.title(title)
    fig.autofmt_xdate()
    plt.show()
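A quick smoke test of the plotting helper with made-up values (hypothetical data, not a benchmark result):

bar_plot([1.0, 2.5, 0.7], ["demo a", "demo b", "demo c"], ["red", "green", "blue"], "bar_plot smoke test")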
    
In [4]:
    
def run_command(command, verbose=True, save_mem=False):
    # Run a shell command, merging stderr into the captured stdout.
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    # Drain the pipe before waiting so a full pipe buffer cannot deadlock the child.
    raw = process.stdout.read()
    process.wait()
    output = ""
    if not save_mem:
        output = raw.decode('utf-8', 'replace').replace("\r", "").split("\n")
        if verbose:
            for line in output:
                print(line)
    return output
def run_command2(command, verbose=True, save_mem=False):
    # Same as run_command, but stderr is left on the console instead of captured.
    process = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
    raw = process.stdout.read()
    process.wait()
    output = ""
    if not save_mem:
        output = raw.decode('utf-8', 'replace').replace("\r", "").split("\n")
        if verbose:
            for line in output:
                print(line)
    return output
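The two helpers differ only in stderr handling: run_command merges stderr into the captured output, while run_command2 leaves it on the console (below it is used for cassandra's cqlsh, presumably to keep its stderr warnings out of the parsed output). A minimal sanity check, assuming docker is on the PATH:

_ = run_command('docker --version')   # stderr captured alongside stdout
_ = run_command2('docker --version')  # stderr passes through to the console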
    
In [5]:
    
_ = run_command(r'D: && cd "D:\GitHub\BigData\Project2\Docker" && docker-compose up -d')
    
    
In [6]:
    
_ = run_command(r'D: && cd "D:\GitHub\BigData\Project2\Docker" && docker-compose down')
    
    
In [7]:
    
postgres_container = "postgres"
containers = ["mongo", "redis", "neo4j", "orientdb", "cassandra", postgres_container, "mysql", "memsql", "voltdb", "cockroach"]
colours = ["lightgreen", "red", "deepskyblue", "orange", "skyblue", "lightsteelblue", "steelblue", "cornflowerblue", "firebrick", "darkgreen"]
    
In [8]:
    
def exec_and_time(container, command, verbose=True, save_mem=False):
    start = time.time()
    if container == postgres_container or container == "mysql" or container == "memsql":
        # [python_import_function, csv_file]: the import runs host-side through the DB driver.
        command[0](os.path.join(os.path.abspath('./Docker/shared_data'), command[1]))
    elif container == "cockroach":
        # [setup_command, psql_restore]: create the database in the cockroach container,
        # then restore a postgres dump through the postgres container's psql client.
        _ = run_command('docker exec --privileged ' + container + ' sh -c "' + command[0] + '"', verbose, save_mem)
        _ = run_command('docker exec --privileged --user postgres '+ postgres_container +' sh -c "' + command[1] + '"', verbose, save_mem)
    elif container == "cassandra":
        # [shell, script]: cqlsh output goes through run_command2 so stderr stays on the console.
        _ = run_command2('docker exec --privileged ' + container + ' '+ command[0] +' -c "' + command[1] + '"', verbose, save_mem)
    else:
        # Plain string: run it through sh -c inside the container.
        _ = run_command('docker exec --privileged ' + container + ' sh -c "' + command + '"', verbose, save_mem)
    end = time.time()
    return end - start
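exec_and_time dispatches on the container: the SQL stores (postgres, mysql, memsql) import through host-side Python helpers, cockroach runs a two-step command pair, cassandra goes through run_command2, and everything else is a single string run via sh -c. A usage sketch with a hypothetical command (not one of the benchmarks below):

elapsed = exec_and_time("mongo", "mongo testing_db --eval 'db.customers.count();'", verbose=False)
print("count took " + str(elapsed) + " s")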
    
In [9]:
    
def average_time(commands, containers, iterations, verbose=True, import_data=False):
    times = [0] * len(containers)
    peak_times = [0] * len(containers)
    for j in range(0, iterations):
        if verbose:
            print("iteration " + str(j+1))
        _ = run_command(r'D: && cd "D:\GitHub\BigData\Project2\Docker" && docker-compose up -d', verbose=False)
        if verbose:
            print("waiting...")
        time.sleep(25)  # give the containers time to become ready
        for i in range(0, len(containers)):
            if verbose:
                print(containers[i])

            import_cmd = data_import_commands[i]
            query_cmd = commands[i]

            if containers[i] == "cockroach":
                # Read the container's IP from its default route (second-to-last
                # field of the last line of `ip route`).
                route = run_command('docker exec cockroach ip route', verbose=False)
                cockroach_ip = route[-2].split(" ")[-2]
                # Substitute the IP into copies so the templates keep their
                # {{cockroach_ip}} placeholder for later iterations.
                import_cmd = [import_cmd[0], import_cmd[1].replace("{{cockroach_ip}}", cockroach_ip)]
                query_cmd = [query_cmd[0], query_cmd[1].replace("{{cockroach_ip}}", cockroach_ip)]
            if import_data:
                exec_and_time(containers[i], import_cmd, verbose=False, save_mem=True)

            time_taken = exec_and_time(containers[i], query_cmd, verbose=False, save_mem=True)
            times[i] += time_taken
            if peak_times[i] < time_taken:
                peak_times[i] = time_taken
        if j < iterations-1:
            _ = run_command(r'D: && cd "D:\GitHub\BigData\Project2\Docker" && docker-compose down', verbose=False)
    for i in range(0, len(containers)):
        times[i] = times[i]/iterations
        print(containers[i] + ": " + str(times[i]))
    return (times, peak_times)
    
In [10]:
    
# One bulk-import command per container, in the same order as `containers`.
# Plain strings run via sh -c inside the container; ["bash", script] runs cqlsh
# for cassandra; [python_function, csv_file] imports host-side through the DB
# driver; cockroach is a two-step [create-database, psql-restore] pair.
data_import_commands = [
    "mongoimport --db testing_db --collection customers --type csv --headerline --file /shared_data/mock_data_10000_rows.csv && mongo testing_db --eval 'db.customers.createIndex({Id:1});'",
    "cat /shared_data/mock_data_10000_rows2.csv | redis-cli --pipe",
    "cp /shared_data/mock_data_10000_neo.csv /var/lib/neo4j/import && cat /shared_scripts/import_mock_10000.cypher | /var/lib/neo4j/bin/cypher-shell",
    "/orientdb/bin/oetl.sh /shared_scripts/import_orient_10000.json",
    ["bash", "cqlsh -f /shared_scripts/import_mock_10000.cql"],
    [pos_i.sql_import, "mock_data_10000_rows_utf8.csv"],  # postgres
    [my_i.sql_import, "mock_data_10000_rows_utf8.csv"],   # mysql
    [mem_i.sql_import, "mock_data_10000_rows_utf8.csv"],  # memsql
    "cat /shared_scripts/import_voltdb.sql | sqlcmd && /opt/voltdb/bin/csvloader customers -f /shared_data/mock_data_10000_rows.csv",
    ["./cockroach sql --insecure --execute='create database if not exists mock;'", "psql -p 26257 -h {{cockroach_ip}} -d mock -U root < /shared_data/postgres_dump.sql"]
]
    
In [11]:
    
(import_times_avg, import_times_peak) = average_time(data_import_commands, containers, num_of_runs)
    
    
In [12]:
    
bar_plot(import_times_avg, containers, colours, "Average bulk import time")
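The peak import times returned alongside the averages can be rendered the same way; a sketch:

bar_plot(import_times_peak, containers, colours, "Peak bulk import time")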
    
    
In [13]:
    
def exec_and_time2(container, command, verbose=True, save_mem=False, long_output=False):
    start = time.time()
    # cassandra's cqlsh, and anything flagged long_output, goes through run_command2.
    if container == "cassandra" or long_output:
        _ = run_command2(command, verbose, save_mem)
    else:
        _ = run_command(command, verbose, save_mem)
    end = time.time()
    return end - start
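Unlike exec_and_time, this variant takes the complete docker exec string and routes it through run_command2 when the output is expected to be large. A hypothetical example with long_output=True:

t = exec_and_time2("postgres", r"""docker exec --privileged postgres sh -c "psql -q -U postgres mock -c 'select * from customers;'" """, verbose=False, long_output=True)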
    
In [14]:
    
def average_time2(commands, containers, iterations, verbose=True, import_data=False):
    times = [0] * len(containers)
    peak_times = [0] * len(containers)
    for j in range(0, iterations):
        if verbose:
            print("iteration " + str(j+1))
        _ = run_command(r'D: && cd "D:\GitHub\BigData\Project2\Docker" && docker-compose up -d', verbose=False)
        if verbose:
            print("waiting...")
        time.sleep(25)  # give the containers time to become ready
        for i in range(0, len(containers)):
            if verbose:
                print(containers[i])

            import_cmd = data_import_commands[i]

            if containers[i] == "cockroach":
                # Read the container's IP from its default route and substitute it
                # into a copy, leaving the {{cockroach_ip}} template untouched.
                route = run_command('docker exec cockroach ip route', verbose=False)
                cockroach_ip = route[-2].split(" ")[-2]
                import_cmd = [import_cmd[0], import_cmd[1].replace("{{cockroach_ip}}", cockroach_ip)]
            if import_data:
                if containers[i] == "redis":
                    # redis uses a separate import file for these benchmarks
                    exec_and_time2("redis", r"""docker exec --privileged redis sh -c "cat /shared_data/mock_data_10000_set.csv | redis-cli --pipe" """, verbose=False, save_mem=True)
                else:
                    exec_and_time(containers[i], import_cmd, verbose=False, save_mem=True)
            if containers[i] == "neo4j":
                # create an Autoedge self-loop on every node (used by the join benchmark)
                exec_and_time2("neo4j", r"""docker exec --privileged neo4j sh -c "echo \"MATCH (n:Customer) CREATE (n)-[r:Autoedge]->(n) RETURN r;\" | /var/lib/neo4j/bin/cypher-shell" """, verbose=False, save_mem=True)

            # Fill {{id}} with a fresh random id for every run.
            time_taken = exec_and_time2(containers[i], commands[i].replace("{{id}}", str(random.randrange(0, 9999, 1))), verbose=False, save_mem=True)
            times[i] += time_taken
            if peak_times[i] < time_taken:
                peak_times[i] = time_taken
        if j < iterations-1:
            _ = run_command(r'D: && cd "D:\GitHub\BigData\Project2\Docker" && docker-compose down', verbose=False)
    for i in range(0, len(containers)):
        times[i] = times[i]/iterations
        print(containers[i] + ": " + str(times[i]))
    return (times, peak_times)
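Each read command below carries an {{id}} placeholder that average_time2 fills with a fresh random id on every run, so repeated reads do not all hit the same (possibly cached) row. The substitution itself, on a hypothetical template:

cmd = "SELECT * FROM customers WHERE id = {{id}}".replace("{{id}}", str(random.randrange(0, 9999, 1)))
# e.g. "SELECT * FROM customers WHERE id = 4711"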
    
In [15]:
    
unindexed_read_commands = [
    r"""docker exec --privileged mongo mongo testing_db --eval "db.customers.find({first_name:'Hosea'}).pretty().shellPrint();" """,
    "",
    r"""docker exec --privileged neo4j sh -c "echo \"MATCH (n) WHERE n.firstName = 'Hosea' RETURN n;\" | /var/lib/neo4j/bin/cypher-shell" """,
    r"""docker exec --privileged orientdb /orientdb/bin/console.sh "connect plocal:/temp/databases/mock admin admin; select from Customer where first_name = 'Hosea' " """,
    r"""docker exec --privileged cassandra cqlsh -k mock_keyspace -e "select * from customers where first_name = 'Hosea' allow filtering;" """,
    r"""docker exec --privileged postgres sh -c "echo \"SELECT * FROM customers WHERE first_name = 'Hosea'\" | psql -U postgres mock" """, #postgres
    r"""docker exec --privileged mysql mysql -u root --password=password --database mock -e "SELECT * FROM customers where first_name = 'Hosea'" """,
    r"""docker exec --privileged memsql memsql-shell --database mock -e "SELECT * FROM customers where first_name = 'Hosea'" """,
    r"""docker exec --privileged voltdb sh -c "echo \"select * from customers where first_name = 'Hosea';\" | sqlcmd"  """,
    r"""docker exec --privileged cockroach ./cockroach sql --insecure --execute="SET DATABASE = mock; SELECT * FROM customers WHERE first_name = 'Hosea';" """ 
]
    
In [16]:
    
(unindexed_read_times_avg, unindexed_read_times_peak) = average_time2(unindexed_read_commands, containers, num_of_runs)
unindexed_read_times_avg[1] = 0  # redis has no unindexed search
    
    
In [17]:
    
bar_plot(unindexed_read_times_avg, containers, colours, "Average search time")
    
    
In [18]:
    
indexed_read_commands = [
    r"""docker exec --privileged mongo mongo testing_db --eval "db.customers.find({Id:{{id}}}).pretty().shellPrint()" """,
    r"""docker exec --privileged redis sh -c "echo GET {{id}} | redis-cli" """,
    r"""docker exec --privileged neo4j sh -c "echo \"MATCH (n) WHERE ID(n) = {{id}} RETURN n;\" | /var/lib/neo4j/bin/cypher-shell" """,
    r"""docker exec --privileged orientdb /orientdb/bin/console.sh "connect plocal:/temp/databases/mock admin admin; select from Customer where Id = {{id}}" """,
    r"""docker exec --privileged cassandra cqlsh -k mock_keyspace -e "select * from customers where id = {{id}} allow filtering;" """,
    r"""docker exec --privileged postgres sh -c "echo \"SELECT * FROM customers where id = {{id}}\" | psql -U postgres mock" """, #postgres
    r"""docker exec --privileged mysql mysql -u root --password=password --database mock -e "SELECT * FROM customers where id = {{id}}" """,
    r"""docker exec --privileged memsql memsql-shell --database mock -e "SELECT * FROM customers where id = {{id}}" """,
    r"""docker exec --privileged voltdb sh -c "echo \"SELECT * FROM customers where id = {{id}};\" | sqlcmd"  """,
    r"""docker exec --privileged cockroach ./cockroach sql --insecure --execute="SET DATABASE = mock; SELECT * FROM customers where id = {{id}};" """
]
    
In [19]:
    
(indexed_read_times_avg, indexed_read_times_peak) = average_time2(indexed_read_commands, containers, num_of_runs)
    
    
In [20]:
    
bar_plot(indexed_read_times_avg, containers, colours, "Average indexed search time")
    
    
In [21]:
    
aggregate_read_commands = [
    r"""docker exec --privileged mongo mongo testing_db --eval 'db.customers.aggregate(    [      { $group: { "_id": "$first_name", "avg": { $avg: "$income" } } }    ] ).pretty().shellPrint();' """,
    r"""docker exec --privileged redis sh -c "echo zcount mylist -inf +inf | redis-cli" """,
    r"""docker exec --privileged neo4j sh -c "echo \"match (n:Customer) return avg(n.income);\" | /var/lib/neo4j/bin/cypher-shell" """,
    r"""docker exec --privileged orientdb /orientdb/bin/console.sh "connect plocal:/temp/databases/mock admin admin; select avg(income) FROM Customer" """,
    r"""docker exec --privileged cassandra cqlsh -k mock_keyspace -e "select avg(income) as avg_income from customers;" """,
    r"""docker exec --privileged postgres sh -c "echo \"select avg(income) as avg_income from customers\" | psql -U postgres mock" """, #postgres
    r"""docker exec --privileged mysql mysql -u root --password=password --database mock -e "select avg(income) as avg_income from customers" """,
    r"""docker exec --privileged memsql memsql-shell --database mock -e "select avg(income) as avg_income from customers" """,
    r"""docker exec --privileged voltdb sh -c "echo \"select avg(income) as avg_income from customers\" | sqlcmd"  """,
    r"""docker exec --privileged cockroach ./cockroach sql --insecure --execute="SET DATABASE = mock; select avg(income) as avg_income from customers;" """ 
]
    
In [22]:
    
(aggregate_read_times_avg, aggregate_read_times_peak) = average_time2(aggregate_read_commands, containers, num_of_runs)
    
    
In [23]:
    
bar_plot(aggregate_read_times_avg, containers, colours, "Average aggregation time")
    
    
In [24]:
    
join_commands = [
    "",
    "",
    r"""docker exec --privileged neo4j sh -c "echo \"MATCH (n:Customer)-[:Autoedge]->(m:Customer) WHERE n.firstName = m.firstName RETURN n, m;\" | /var/lib/neo4j/bin/cypher-shell" """,
    "",
    "",
    r"""docker exec --privileged postgres sh -c "psql -q -U postgres mock -c 'select * from customers a, customers b where a.first_name = b.first_name;' > join.txt" """,
    r"""docker exec --privileged mysql sh -c "mysql -u root --password=password --database mock -e 'select * from customers a, customers b where a.first_name = b.first_name;' > join.txt " """,
    r"""docker exec --privileged memsql sh -c "memsql-shell --database mock -e 'select * from customers a, customers b where a.first_name = b.first_name;' > join.txt" """,
    r"""docker exec --privileged voltdb sh -c "echo 'select * from customers a, customers b where a.first_name = b.first_name;' | sqlcmd > join.txt"  """,
    r"""docker exec --privileged cockroach sh -c "./cockroach sql --insecure --execute='SET DATABASE = mock; select * from customers a, customers b where a.first_name = b.first_name;' > join.txt" """ 
]
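The relational joins redirect their output to join.txt inside each container, so the measurement covers the join itself rather than streaming the large self-join result back through docker exec; Neo4j's equivalent matches the Autoedge self-loops created during setup.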
    
In [25]:
    
(join_times_avg, join_times_peak) = average_time2(join_commands, containers, num_of_runs)
join_times_avg[0] = 0  # mongodb does not support relational joins
join_times_avg[1] = 0  # redis does not support relational joins
join_times_avg[3] = 0  # orientdb does not support relational joins
join_times_avg[4] = 0  # cassandra does not support relational joins
    
    
In [26]:
    
bar_plot(join_times_avg, containers, colours, "Average join time")
    
    
In [27]:
    
def read_throughput(commands, containers, iterations=1000, verbose=True, import_data=False):
    times = [0] * len(containers)
    ratios = [0] * len(containers)
    _ = run_command(r'D: && cd "D:\GitHub\BigData\Project2\Docker" && docker-compose up -d', verbose=False)
    time.sleep(25)  # give the containers time to become ready
    for i in range(0, len(containers)):
        # imports
        if verbose:
            print(containers[i])

        import_cmd = data_import_commands[i]
        query_cmd = commands[i]

        if containers[i] == "cockroach":
            route = run_command('docker exec cockroach ip route', verbose=False)
            cockroach_ip = route[-2].split(" ")[-2]
            import_cmd = [import_cmd[0], import_cmd[1].replace("{{cockroach_ip}}", cockroach_ip)]
        if import_data:
            if containers[i] == "redis":
                exec_and_time2("redis", r"""docker exec --privileged redis sh -c "cat /shared_data/mock_data_10000_set.csv | redis-cli --pipe" """, verbose=False, save_mem=True)
            else:
                exec_and_time(containers[i], import_cmd, verbose=False, save_mem=True)
        if containers[i] == "neo4j":
            exec_and_time2("neo4j", r"""docker exec --privileged neo4j sh -c "echo \"MATCH (n:Customer) CREATE (n)-[r:Autoedge]->(n) RETURN r;\" | /var/lib/neo4j/bin/cypher-shell" """, verbose=False, save_mem=True)
        # queries
        start = time.time()
        for j in range(0, iterations):
            _ = exec_and_time2(containers[i], query_cmd.replace("{{id}}", str(random.randrange(0, 9999, 1))), verbose=False, save_mem=True)
        end = time.time()
        times[i] = end - start
        ratios[i] = iterations/times[i]  # queries per second
    _ = run_command(r'D: && cd "D:\GitHub\BigData\Project2\Docker" && docker-compose down', verbose=False)
    for i in range(0, len(containers)):
        print(containers[i] + ": " + str(ratios[i]))
    return ratios
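Throughput here is iterations divided by wall-clock time over a single container lifetime, and every query includes a docker exec round-trip, so the ratios are comparable to each other rather than absolute engine query rates.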
    
In [28]:
    
indexed_read_throughput = read_throughput(indexed_read_commands, containers, 50)
    
    
In [29]:
    
bar_plot(indexed_read_throughput, containers, colours, "Indexed read throughput")
    
    
In [30]:
    
write_throughput = [10000/x for x in import_times_avg]  # rows per second, 10,000-row dataset
bar_plot(write_throughput, containers, colours, "Write throughput")
    
    
In [31]:
    
def cluster_times(commands, dbms, index, iterations=1000, verbose=True, import_data=False):
    _ = run_command('D: && cd "D:\\GitHub\\BigData\\Project2\\Docker\\cluster\\' + dbms + '" && docker-compose up -d', verbose=True)
    if dbms == "mysql":
        _ = run_command("docker network connect bridge mysql")
    time.sleep(15)  # give the cluster time to become ready
    # imports
    if verbose:
        print(dbms)
    import_cmd = data_import_commands[index]
    query_cmd = commands[index]
    if dbms == "cockroach":
        route = run_command('docker exec cockroach ip route', verbose=False)
        cockroach_ip = route[-2].split(" ")[-2]
        import_cmd = [import_cmd[0], import_cmd[1].replace("{{cockroach_ip}}", cockroach_ip)]
    print("Importing data...")
    start_i = time.time()
    exec_and_time(dbms, import_cmd, verbose=False, save_mem=True)
    import_time = time.time() - start_i
    # queries
    start = time.time()
    for j in range(0, iterations):
        _ = exec_and_time2(containers[index], query_cmd.replace("{{id}}", str(random.randrange(0, 9999, 1))), verbose=False, save_mem=True)
    end = time.time()
    time_taken = end - start
    ratio = iterations/time_taken
    
    _ = run_command('D: && cd "D:\\GitHub\\BigData\\Project2\\Docker\\cluster\\' + dbms + '" && docker-compose down', verbose=False)
    
    print(dbms + ": " + str(ratio))
    return (import_time, ratio)
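cluster_times reuses the single-node commands: index selects the matching entry in containers and data_import_commands, and the compose file under Docker\cluster\<dbms> brings up the multi-node deployment. The cells below measure bulk import time and indexed-read throughput for the four clustered systems.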
    
In [50]:
    
cluster_import_times = [0]*4
cluster_read_ratios = [0]*4
    
In [75]:
    
(cluster_import_times[0], cluster_read_ratios[0]) = cluster_times(indexed_read_commands, "voltdb", 8)
    
    
In [76]:
    
(cluster_import_times[1], cluster_read_ratios[1]) = cluster_times(indexed_read_commands, "cockroach", 9)
    
    
In [77]:
    
(cluster_import_times[2], cluster_read_ratios[2]) = cluster_times(indexed_read_commands, "cassandra", 4)
    
    
In [72]:
    
(cluster_import_times[3], cluster_read_ratios[3]) = cluster_times(indexed_read_commands, "mysql", 6)
    
    
In [78]:
    
bar_plot(cluster_import_times, ["voltdb", "cockroach", "cassandra", "mysql cluster"], [colours[8], colours[9], colours[4], colours[6]], "Average bulk import time (cluster)")
    
    
In [79]:
    
bar_plot(cluster_read_ratios, ["voltdb", "cockroach", "cassandra", "mysql cluster"], [colours[8], colours[9], colours[4], colours[6]], "Average read throughput (cluster)")