In [1]:
%pylab inline

import sys
import os.path as op
import shutil
# sys.path.insert(0, "/home/mjirik/projects/pyseg_base/")
sys.path.insert(0, op.abspath("../"))
import scipy
import time
import pandas as pd

from imcut import pycut
import sed3
import itertools

latex_dir = "../../papers/iwcia18/"

# sh 155,160, r10, dpoff 3, seeds 3
dataparams_sh =  list(range(44, 195, 10))
dataparams_sh =  list(range(44, 195, 1))
# dataparams_sh =  range(44, 195, 50)
dp_radius = [10]
dp_offset = [3, 5]
dp_seedsz = [3]

fname = "exp062-multiscale.csv"
fnamenew = "msgc_experiment.csv"


rnd_seed=1


Populating the interactive namespace from numpy and matplotlib

In [2]:
%pwd


Out[2]:
'/auto/plzen1/home/mjirik/projects/imcut/examples'

Methods setup


In [3]:
# block size bylo 10
segparams0 = {
    'method':'graphcut',
#     'method':'multiscale_graphcut',
    'use_boundary_penalties': True,
    'boundary_dilatation_distance': 2,
    'boundary_penalties_weight': 1,
    'block_size': 10,
    'tile_zoom_constant': 1
    }

segparams1 = {
    # 'method':'graphcut',
    'method':'multiscale_graphcut_hi2lo',
    'use_boundary_penalties': True,
    'boundary_dilatation_distance': 2,
    'boundary_penalties_weight': 1,
    'block_size': 10,
    'tile_zoom_constant': 1
    }

segparams2 = {
    # 'method':'graphcut',
    'method':'multiscale_graphcut_lo2hi',
    'use_boundary_penalties': True,
    'boundary_dilatation_distance': 2,
    'boundary_penalties_weight': 1,
    'block_size': 10,
    'tile_zoom_constant': 1
    }


labels = [
    "ssgc ",
    "msgc_hi2lo ",
    "msgc_lo2hi ",
]

In [4]:
def make_data(sz=32, offset=0, radius=7, seedsz=3):
    #seedsz= int(sz/10)
    space=2
    seeds = np.zeros([sz, sz+1, sz+2], dtype=np.int8)
    xmin = radius + seedsz + offset + 2
    ymin = radius + seedsz + offset + 6
    seeds[offset + 12, xmin + 3:xmin + 7 + seedsz, ymin:ymin+2] = 1
    seeds[offset + 20, xmin + 7:xmin + 12 + seedsz, ymin+5:ymin+7] = 1
    img = np.ones([sz, sz+1, sz+2])
    img = img - seeds

    seeds[
        2:10 + seedsz, 
        2:9+ seedsz, 
        2:3+ seedsz] = 2
    img = scipy.ndimage.morphology.distance_transform_edt(img)
    segm = img < radius
    img = (100 * segm + 80 * np.random.random(img.shape)).astype(np.uint8)
    return img, segm, seeds

Data screenshots


In [5]:
img, seg, seeds = make_data(45, 3, 10, 3)
plt.figure(figsize=(10,15))
import copy
imgview = copy.copy(img)
imgview[:,:,-1] = 0
imgview[:,-1,:] = 0
imgview[-1,:,:] = 0
sed3.show_slices(imgview, contour=seg, seeds=seeds, show=False, slice_step=3.9)
plt.axis("off")
plt.savefig("../graphics/exp062-imgsample_separated.png")


/storage/plzen1/home/mjirik/miniconda/lib/python3.6/site-packages/numpy/ma/core.py:6385: MaskedArrayFutureWarning: In the future the default for ma.minimum.reduce will be axis=0, not the current None, to match np.minimum.reduce. Explicitly pass 0 or None to silence this warning.
  return self.reduce(a)
/storage/plzen1/home/mjirik/miniconda/lib/python3.6/site-packages/numpy/ma/core.py:6385: MaskedArrayFutureWarning: In the future the default for ma.maximum.reduce will be axis=0, not the current None, to match np.maximum.reduce. Explicitly pass 0 or None to silence this warning.
  return self.reduce(a)

In [6]:
img, seg, seeds = make_data(50, 10, 15)
plt.figure(figsize=(10,15))
sed3.show_slices(img, contour=seg, seeds=seeds)


/storage/plzen1/home/mjirik/miniconda/lib/python3.6/site-packages/numpy/ma/core.py:6385: MaskedArrayFutureWarning: In the future the default for ma.minimum.reduce will be axis=0, not the current None, to match np.minimum.reduce. Explicitly pass 0 or None to silence this warning.
  return self.reduce(a)
/storage/plzen1/home/mjirik/miniconda/lib/python3.6/site-packages/numpy/ma/core.py:6385: MaskedArrayFutureWarning: In the future the default for ma.maximum.reduce will be axis=0, not the current None, to match np.maximum.reduce. Explicitly pass 0 or None to silence this warning.
  return self.reduce(a)

In [7]:
np.unique(seeds)


Out[7]:
array([0, 1, 2], dtype=int8)

LaTeX export functions


In [8]:
def to_latex_file(df, fn):
    with open(fn, "w") as f:
        f.write(df.to_latex())
        
def latex_float(f, precision=4):
    float_str = "{0:." + str(int(precision)) + "g}"
    float_str = float_str.format(f)
    if "e" in float_str:
        base, exponent = float_str.split("e")
        return r"{0} \times 10^{{{1}}}".format(base, int(exponent))
    else:
        return float_str
    
def float_to_latex_file(fl, fn, precision=4):
    string = latex_float(fl, precision=precision)
    with open(fn, "w") as f:
        f.write(string)

def num2latex(num, filename=None, precision=4):
    if type(num) is str:
        float_str = num
    else:
        float_str = "{0:." + str(int(precision)) + "g}"
        float_str = float_str.format(num)
        
    if float_str[:4] == r"\num":
        pass
    else:
        float_str = "\\num{" + float_str + "}" 
    if filename is not None:
        with open(filename, "w") as f:
            f.write(float_str)
    return float_str

def to_file(text, fn):
    with open(fn, "w") as f:
        f.write(text)

Umělá data, opakovaný experiment


In [9]:
def process_gc_stats(stats1, prefix=None):
    if prefix is None:
        prefix = ""
    
        
    outstats = {}
    for key in stats1:
        outstats[prefix + key] = stats1[key]
        
    outstats[prefix + "nlinks number"] = np.sum(np.asarray(outstats[prefix + "nlinks shape"]), axis=0)[0]
    outstats[prefix + "tlinks number"] = np.sum(np.asarray(outstats[prefix + "tlinks shape"]), axis=0)[0]
    outstats.pop(prefix + "tlinks shape")
    outstats.pop(prefix + "nlinks shape")
    outstats[prefix + "edge number"] = outstats[prefix + "nlinks number"] + outstats[prefix + "tlinks number"]

    return outstats

    
def merge_stats(stats0, stats1, stats2, labels=None):
    if labels is None:
        labels = [""] * 3
    
   
    stats0 = process_gc_stats(stats0, labels[0])
    stats1 = process_gc_stats(stats1, labels[1])
    stats2 = process_gc_stats(stats2, labels[2])
    stats = {}
    stats.update(stats0)
    stats.update(stats1)
    stats.update(stats2)

    
    return stats

def run_gc_with_defined_setup(img, segparams):
    
    start = time.time()
    gc = pycut.ImageGraphCut(img, segparams=segparams)
    gc.set_seeds(seeds)
    gc.run()
    sg1 = gc.segmentation
    stats1 = gc.stats
    elapsed1 = (time.time() - start)
    err1 = np.sum(np.abs(seg - (1 - sg1)))
    stats1["time"] = elapsed1
    stats1["error"] = err1
    return stats1


def add_data_and_algoritm_info(stats, data_params, segparams, start):
    #     stats['msgc time'] = elapsed1
#     stats['normal time'] = elapsed2
    stats['data size'] = data_params[0]
    stats['data offset'] = data_params[1]
    stats['data radius'] = data_params[2]
    stats["block size"] = segparams["block_size"]
    stats["data seedsz"] = data_params[3]
#     stats["GC error"] = err2
#     stats["MSGC error"] = err1
    stats['machine hostname'] = machine_hostname
    stats['experiment iteration start time'] = start
    
    return stats

def add_data_seaborn(stats, data_params, segparams, start, i, label):
    stats = process_gc_stats(stats, "")
    stats = add_data_and_algoritm_info(stats, data_params, segparams, start)
    stats["method"] = label
    dfinew = pd.DataFrame(stats, index=[i*3 + 0])
    #dfnew = dfnew.append(dfinew, sort=True)
    
    return dfinew

In [10]:
#for par in it:
#    print par
i = 0

In [11]:
force_rewrite = False
force_rewrite = True

if op.exists(fname) and not force_rewrite:
    df = pd.read_csv(fname)#, index_col=0)
else:
    df = pd.DataFrame([])
    
if op.exists(fnamenew) and not force_rewrite:
    dfnew = pd.read_csv(fnamenew)#, index_col=0)
else:
    dfnew = pd.DataFrame([])


i = 0
np.random.seed(rnd_seed)

import platform
machine_hostname = platform.node()

it = itertools.product(dataparams_sh, dp_offset, dp_radius, dp_seedsz)
for data_params in it:

    start = time.time()
    img, seg, seeds = make_data(data_params[0], data_params[1], data_params[2], data_params[3])
    stats0 = run_gc_with_defined_setup(img, segparams0)
    stats1 = run_gc_with_defined_setup(img, segparams1)
    stats2 = run_gc_with_defined_setup(img, segparams2)

#     stats2 = gc.stats
#     elapsed2 = (time.time() - start)
#     err2 = np.sum(np.abs(seg - (1 - sg2)))
    
    
#     print(err1)
#     print("t1 / t2 =", elapsed1, "/" , elapsed2 ,' = ', elapsed1/elapsed2)
    
    stats = merge_stats(stats0, stats1, stats2, labels)
    
    stats = add_data_and_algoritm_info(stats, data_params, segparams0, start)
#     stats['msgc time'] = elapsed1
#     stats['normal time'] = elapsed2
#     stats['data size'] = data_params[0]
#     stats['data offset'] = data_params[1]
#     stats['data radius'] = data_params[2]
#     stats["block size"] = segparams1["block_size"]
#     stats["data seedsz"] = data_params[3]
# #     stats["GC error"] = err2
# #     stats["MSGC error"] = err1
#     stats['machine hostname'] = machine_hostname
#     stats['start time'] = start
    
    dfi = pd.DataFrame(stats, index=[i])
    
    # display(df)
    df = df.append(dfi, sort=True)
    df.to_csv(fname, index=False)
    
    
    
    
#     stats = process_gc_stats(stats0, "")
    dfinew = add_data_seaborn(stats0, data_params, segparams0, start, i, labels[0])
    dfnew = dfnew.append(dfinew, sort=True)
    dfinew = add_data_seaborn(stats1, data_params, segparams1, start, i, labels[1])
    dfnew = dfnew.append(dfinew, sort=True)
    dfinew = add_data_seaborn(stats2, data_params, segparams2, start, i, labels[2])
    dfnew = dfnew.append(dfinew, sort=True)
    
#     stats = process_gc_stats(stats1, "")
#     stats = add_data_and_algoritm_info(stats, data_params, segparams1, start)
#     dfinew = pd.DataFrame(stats, index=[i*3 + 1])
#     dfnew = dfnew.append(dfinew, sort=True)
    
#     stats = process_gc_stats(stats2, "")
#     stats = add_data_and_algoritm_info(stats, data_params, segparams2, start)
#     dfinew = pd.DataFrame(stats, index=[i*3 + 2])
#     dfnew = dfnew.append(dfinew, sort=True)
    dfnew.to_csv(fnamenew, index=False)
    
    
    
    
    
    i += 1
    

#     plt.figure(figsize=[10,15])
#     sed3.show_slices(img, contour=sg1, seeds=seeds, slice_step=10)
#     plt.figure(figsize=[10,15])
#     sed3.show_slices(img, contour=sg2, seeds=seeds, slice_step=10)


/storage/plzen1/home/mjirik/miniconda/lib/python3.6/site-packages/scipy/ndimage/interpolation.py:583: UserWarning: From scipy 0.13.0, the output shape of zoom() is calculated with round() instead of int() - for these inputs the size of the returned array has changed.
  "the returned array has changed.", UserWarning)

In [12]:
# dfnew.to_csv(fnamenew, index=False)
to_file(str(i), op.join(latex_dir, "msgc_dataset_size.tex"))

In [13]:
len(list(itertools.product(dataparams_sh, dp_offset, dp_radius, dp_seedsz)))


Out[13]:
302

In [14]:
stats


Out[14]:
{'ssgc _create_nlinks time': 3.8474199771881104,
 'ssgc gc time': 35.363853216171265,
 'ssgc time': 42.24955487251282,
 'ssgc error': 0,
 'ssgc nlinks number': 22129966,
 'ssgc tlinks number': 7414680,
 'ssgc edge number': 29544646,
 'msgc_hi2lo t1': 5.4836273193359375e-05,
 'msgc_hi2lo _create_nlinks time': 2.4560203552246094,
 'msgc_hi2lo gc time': 1.33320951461792,
 'msgc_hi2lo t2': 0.27971959114074707,
 'msgc_hi2lo t3': 0.282670259475708,
 'msgc_hi2lo t4': 1.7206535339355469,
 'msgc_hi2lo t5': 4.224681854248047,
 'msgc_hi2lo t6': 15.449214220046997,
 'msgc_hi2lo t7': 16.3068528175354,
 'msgc_hi2lo t8': 26.637968063354492,
 'msgc_hi2lo t9': 26.657761096954346,
 'msgc_hi2lo time': 28.10735011100769,
 'msgc_hi2lo error': 0,
 'msgc_hi2lo nlinks number': 658531,
 'msgc_hi2lo tlinks number': 219789,
 'msgc_hi2lo edge number': 878320,
 'msgc_lo2hi t1': 0.012177467346191406,
 'msgc_lo2hi _create_nlinks time': 0.003682374954223633,
 'msgc_lo2hi gc time': 0.9981119632720947,
 'msgc_lo2hi t2': 0.32938075065612793,
 'msgc_lo2hi t3': 0.3322787284851074,
 'msgc_lo2hi t9': 6.7539942264556885,
 'msgc_lo2hi time': 7.8852763175964355,
 'msgc_lo2hi error': 111013,
 'msgc_lo2hi nlinks number': 735900,
 'msgc_lo2hi tlinks number': 245770,
 'msgc_lo2hi edge number': 981670,
 'data size': 194,
 'data offset': 5,
 'data radius': 10,
 'block size': 10,
 'data seedsz': 3,
 'machine hostname': 'minos47.zcu.cz',
 'experiment iteration start time': 1534199870.2989566}

Data processing graphs, statistics


In [15]:
df = pd.read_csv(fname)
# df.rename(columns={"msgc time": "MSGC time"})
dfs = df[(df["data seedsz"]==3) & (df["data offset"] == 3) & (df["data radius"] == 10)]
dfs_plus = dfs[dfs['data size'] > 160]

Old graphs


In [ ]:


In [16]:
# df["GC total time"] = df["normal time"]
# df["MSGC total time"] = df["msgc time"]
# df["GC time"] = df["normal gc time"]
# df["MSGC time"] = df["gc time"]
# df["MSGC links number"] = df["nlinks number"] + df["tlinks number"]
# df["GC links number"] = df["normal nlinks number"] + df["normal tlinks number"]
# df["time rate"] = df["gc time"] / df["normal gc time"]
# df["MSGC time rate"] = df["gc time"] / df["data size"]
# df["GC time rate"] = df["normal gc time"] / df["data size"]

def func(x, a, c, d):
    return a*np.exp(-c*x)+d

#from scipy.optimize import curve_fit
#popt, pcov = curve_fit(func, df["data size"], df["MSGC time"])
#print popt, pcov

#def msgcp(x):
#    a, c, d = popt
#    return a*np.exp(-c*x)+d
msgcp = np.poly1d(np.polyfit(df["data size"], df[labels[0] + "time"], 2))
df[labels[0] + "time trend"] = msgcp(df["data size"])
msgctp = np.poly1d(np.polyfit(df["data size"], df[labels[1] + "time"], 2))
df[labels[1] + "time trend"] = msgctp(df["data size"])


plt.figure()
df[[labels[0] + 'time', labels[1] + 'time', labels[2] + "time", 'data size']].sort_values("data size").plot(x='data size')
plt.figure(figsize=(10,15))
df[[labels[0] + 'time', labels[1] + 'time', labels[2] + "time",
    labels[0] + 'gc time', labels[1] + 'gc time', labels[2] + "gc time", 'data size',
   ]].sort_values("data size").plot(x='data size', style=["-o", "-^", "-s", ":", ":", ":"], color=["r", "g", "b", "r", "g", "b"])
plt.savefig("../graphics/exp062-msgc_time_size_comparison.pdf")


plt.figure()
#df[['normal nlinks number', "nlinks number", 'data size']].sort("data size").plot(x='data size')
df[[labels[0] + 'edge number', labels[1] + 'edge number', labels[2] + 'edge number', 'data size']].sort_values("data size").plot(
    x='data size')


# plt.figure()
# df[[labels[0] + 'time', labels[1] + "time", labels[1] + "time trend", 'data size']].sort_values("data size").plot(
#     x='data size', style=["-", "x", "-"])
# plt.savefig("../graphics/exp062-multiscale-time-data_size.pdf")
# plt.figure()
# df[['GC total time', "MSGC total time", "MSGC total time trend", 'data size']].sort_values("data size").plot(
#     x='data size', style=["-", "x", "-"])
# plt.savefig("../graphics/exp062-multiscale-total_time-data_size.pdf")

plt.figure()
df[[labels[0] + "time", labels[1] + "time", labels[2] + "time",]].boxplot(showfliers=False)
plt.savefig("../graphics/exp062-multiscale-gc_time.pdf")
plt.figure()
df[[labels[0] + 'time', labels[0] + 'gc time', labels[1] + "time",
    labels[1] + 'gc time', labels[2] + 'time', labels[2] + "gc time",
   
   ]].boxplot(showfliers=False, rot=90)
plt.savefig("../graphics/exp062-multiscale-gc_time-total_time.pdf")
# plt.figure()
# df[["GC total time", "MSGC total time"]].boxplot(showfliers=False)
# plt.savefig("../graphics/exp062-multiscale-total_time.pdf")

# plt.figure()
# df[['MSGC time', "GC links number", 'GC time']].sort_values("GC links number").plot(x='GC links number')



dfs = df[(df["data seedsz"]==3) & (df["data offset"] == 3) & (df["data radius"] == 10)]
# plt.figure()
# dfs[["GC total time", "MSGC total time", 'GC time', 'MSGC time', 'data size']].sort_values("data size").plot(x='data size', style=["-", "-", "--","--"])
# plt.savefig("../graphics/exp062-multiscale-all.pdf")

plt.figure()
dfs[[labels[0] + "edge number", labels[1] + "edge number", 'data size']].sort_values("data size").plot(x='data size', style=["-", "-", "--","--"])
plt.savefig("../graphics/exp062-multiscale-links_number.pdf")

df_mn = df[[labels[0] + "time", labels[1] + "time", labels[2] + "time", labels[0] + "gc time", labels[1] + "gc time", labels[2] + "gc time"]].describe()
display(df_mn)
def to_latex_file(df, fn):
    with open(fn, "w") as f:
        f.write(df.to_latex())
to_latex_file(df_mn, op.abspath("../includes/exp062-all.tex"))


ssgc time msgc_hi2lo time msgc_lo2hi time ssgc gc time msgc_hi2lo gc time msgc_lo2hi gc time
count 302.000000 302.000000 302.000000 302.000000 302.000000 302.000000
mean 13.686866 11.145373 4.245357 11.358913 1.314282 0.955624
std 12.387634 8.123918 1.893358 10.173616 0.232893 0.167319
min 0.464404 1.484253 1.286131 0.387758 0.600486 0.563548
25% 3.192865 4.490379 2.704294 2.719636 1.185286 0.853952
50% 9.933878 8.657265 3.801340 8.268384 1.308124 0.940260
75% 21.010241 16.426572 5.482555 17.604554 1.465036 1.041503
max 46.299754 32.707072 9.641186 38.583321 2.000985 1.680861
<matplotlib.figure.Figure at 0x2ab0627ef908>
<matplotlib.figure.Figure at 0x2ab0627e2b00>
<matplotlib.figure.Figure at 0x2ab0627c95c0>
<matplotlib.figure.Figure at 0x2ab062f473c8>

In [17]:
i


Out[17]:
302

In [ ]:


In [18]:
df.keys()


Out[18]:
Index(['block size', 'data offset', 'data radius', 'data seedsz', 'data size',
       'experiment iteration start time', 'machine hostname',
       'msgc_hi2lo _create_nlinks time', 'msgc_hi2lo edge number',
       'msgc_hi2lo error', 'msgc_hi2lo gc time', 'msgc_hi2lo nlinks number',
       'msgc_hi2lo t1', 'msgc_hi2lo t2', 'msgc_hi2lo t3', 'msgc_hi2lo t4',
       'msgc_hi2lo t5', 'msgc_hi2lo t6', 'msgc_hi2lo t7', 'msgc_hi2lo t8',
       'msgc_hi2lo t9', 'msgc_hi2lo time', 'msgc_hi2lo tlinks number',
       'msgc_lo2hi _create_nlinks time', 'msgc_lo2hi edge number',
       'msgc_lo2hi error', 'msgc_lo2hi gc time', 'msgc_lo2hi nlinks number',
       'msgc_lo2hi t1', 'msgc_lo2hi t2', 'msgc_lo2hi t3', 'msgc_lo2hi t9',
       'msgc_lo2hi time', 'msgc_lo2hi tlinks number',
       'ssgc _create_nlinks time', 'ssgc edge number', 'ssgc error',
       'ssgc gc time', 'ssgc nlinks number', 'ssgc time', 'ssgc tlinks number',
       'ssgc time trend', 'msgc_hi2lo time trend'],
      dtype='object')

Seaborn graphs


In [19]:
df = pd.read_csv(fnamenew)
# df.rename(columns={"msgc time": "MSGC time"})
dfs = df[(df["data seedsz"]==3) & (df["data offset"] == 3) & (df["data radius"] == 10)]
dfs_plus = dfs[dfs['data size'] > 160]

import seaborn as sns
sns.set_context("paper")
sns.set_style("white")

In [21]:
df.keys()


Out[21]:
Index(['_create_nlinks time', 'block size', 'data offset', 'data radius',
       'data seedsz', 'data size', 'edge number', 'error',
       'experiment iteration start time', 'gc time', 'machine hostname',
       'method', 'nlinks number', 't1', 't2', 't3', 't4', 't5', 't6', 't7',
       't8', 't9', 'time', 'tlinks number'],
      dtype='object')

In [23]:
sns.boxplot(data=df, y="time", x="method")


Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x2ab063d8c208>

In [24]:
# df

In [25]:
uu = pd.melt(df.rename(columns={"gc time": "gc", "time": "total"}), value_vars=["gc", "total"], id_vars=["method"], var_name="time type", value_name="time")
# uu = pd.melt(dfs, value_vars=["gc time", "time"], id_vars=["method"], var_name="type", value_name="time")
# uu

In [26]:
sns.boxplot(data=uu, hue="time type",y="time", x="method")
plt.savefig(op.join(latex_dir, "msgc_time_boxplot.pdf"), dpi=1000)



In [27]:
# sns.boxplot(data=dfs, y="error", x="method")

In [28]:
lm = sns.lmplot(data=df, x="data size", y="time", hue="method", order=3, scatter_kws={"s": 3, "marker": "x", "alpha": 0.5})
axes = lm.axes
axes[0,0].set_xlim(30,200)
axes[0,0].set_ylim(0,50)

lines = lm.ax.get_lines()
line = lines[0]
line.set_linestyle("--")
# line.set_marker("s")

plt.savefig(op.join(latex_dir, "msgc_time_datasize_plot.pdf"), dpi=1000)
# axes[0,1].set_ylim(0,)

# lm.ax.get_lines()


Vykreslení dvou lmplotů do jednoho není tak snadné


In [29]:
# better melt
from pandas.core.dtypes.common import is_list_like
from pandas.core.frame import DataFrame
from pandas.core.index import MultiIndex
from pandas import compat
from IPython.display import display
from pandas.core.reshape.concat import concat
import re
from pandas.core.tools.numeric import to_numeric
from pandas.util._decorators import Appender
from pandas.core.frame import _shared_docs
import numpy as np
import pandas as pd
import pandas.util.testing as tm


def _melt(frame, id_vars=None, value_vars=None, var_name=None,
          value_name='value', col_level=None, stubnames=False,
          suffix=r'\d+', sep='', extra_group=0, var_end=None):
    # TODO: what about the existing index?

    def check_vars(frame, var, var_string):
        for v in var:
            if num_col_levels > 1:
                if not isinstance(v, tuple):
                    raise ValueError('{} must be a list of tuples'
                                     ' when columns are a MultiIndex'
                                     .format(var_string))
                elif len(v) != num_col_levels:
                    raise ValueError('all tuples in {} must be length {}'
                                     .format(var_string,
                                             frame.columns.nlevels))
            else:
                if is_list_like(v) and len(v) > 1:
                    raise ValueError('DataFrame has only a single level of '
                                     'columns. {} is not a column'.format(v))

    if len(col_level) == 0:
        num_col_levels = frame.columns.nlevels
    else:
        num_col_levels = len(col_level)

    check_vars(frame, id_vars, 'id_vars')
    check_vars(frame, value_vars, 'value_vars')

    if var_name != [] and len(var_name) != num_col_levels:
        raise ValueError('Length of var_name must match effective number of '
                         'column levels.')
    
    if col_level != []:
        droplevels = list(range(frame.columns.nlevels))
        for level in col_level:
            if isinstance(level, int):
                droplevels.remove(level)
            else:
                droplevels.remove(frame.columns.names.index(level))
        if droplevels != []:
            frame = frame.copy()
            frame.columns = frame.columns.droplevel(droplevels)

    if stubnames and isinstance(frame.columns, MultiIndex):
        raise ValueError('Stubnames only work with single-index DataFrames')
        
    for iv in id_vars:
        if iv not in frame.columns:
            raise KeyError('{} not in columns'.format(iv))

    if value_vars != []:
        for vv in value_vars:
            if vv not in frame.columns:
                raise KeyError('{} not in columns'.format(vv))
                
    if var_name == []:
        names = list(frame.columns.names)
        if len(names) == 1:
            if names[0] is None:
                var_name.append('variable')
            else:
                var_name.append(names[0])
        elif names.count(None) == 1:
            names[names.index(None)] = 'variable'
            var_name = names
        else:
            missing_name_count = 0
            for name in names:
                if name is None:
                    var_name.append('variable_{}'.format(missing_name_count))
                    missing_name_count += 1
                else:
                    var_name.append(name)
    if var_end is not None:
        var_name = [vn + '_' + str(var_end) for vn in var_name]
    
    N = len(frame)
    
    non_id_ilocs = []
    if value_vars != []:
        for v in value_vars:
            for i, v1 in enumerate(frame.columns):
                if v == v1:
                    non_id_ilocs.append(i)
    else:
        if id_vars == []:
            non_id_ilocs = list(range(frame.shape[1]))
        else:
            for i, v in enumerate(frame.columns):
                if v not in id_vars:
                    non_id_ilocs.append(i)
                        
    K = len(non_id_ilocs)

    mdata = {}
    mcolumns = []
    for col in id_vars:
        pandas_obj = frame[col]
        if isinstance(pandas_obj, DataFrame):
            for i in range(pandas_obj.shape[1]):
                col_name = col + '_id_' + str(i)
                mdata[col_name] = np.tile(pandas_obj.iloc[:, i].values, K + extra_group)
                mcolumns.append(col_name)
        else:
            mdata[col] = np.tile(pandas_obj, K + extra_group)
            mcolumns.append(col)

    values = np.concatenate([frame.iloc[:, i] for i in non_id_ilocs])
    if extra_group > 0:
        values = np.concatenate((values, np.full([N * extra_group], np.nan)))
    mdata[value_name[0]] = values
    
    for i, col in enumerate(var_name):
        values = frame.columns[non_id_ilocs]._get_level_values(i)
        if stubnames:
            regex = '^{0}{1}'.format(re.escape(value_name[0]), re.escape(sep))
            values = to_numeric(values.str.replace(regex, ''), errors='ignore')
        if isinstance(values, MultiIndex):
            # asanyarray will keep the columns as an Index
            values = np.asanyarray(values).repeat(N)
        else: 
            data_list = []
            for v in values.tolist():
                data_list.extend([v] * N)
            values = data_list
        if extra_group > 0:
            values = np.concatenate((values, np.full([N * extra_group], np.nan)))
        mdata[col] = values
    mcolumns += var_name + value_name
    
    return mdata, mcolumns


@Appender(_shared_docs['melt'] %
          dict(caller='pd.melt(df, ',
               versionadded="",
               other='DataFrame.melt'))
def melt(frame, id_vars=None, value_vars=None, var_name=None,
         value_name='value', col_level=None, stubnames=False,
         suffix=r'\d+', sep=''):
    def convert_to_list(val):
        if val is None:
            return []
        elif not is_list_like(val):
            return [val]
        else:
            return list(val)

    def get_var_names(df, stub, sep, suffix):
        regex = '^{0}{1}{2}$'.format(re.escape(stub), re.escape(sep), suffix)
        col_return = [col for col in df.columns if re.match(regex, col)]
        if col_return == []:
            raise ValueError('No stubname {}'.format(stub))
        return col_return

    id_vars = convert_to_list(id_vars)
    value_vars = convert_to_list(value_vars)
    var_name = convert_to_list(var_name)
    value_name = convert_to_list(value_name)
    col_level = convert_to_list(col_level)

    if stubnames:
        if value_vars == []:
            raise ValueError('Must provide stubnames as a list to value_vars')
        value_name = value_vars
        value_vars = [get_var_names(frame, stub, sep, suffix)
                      for stub in value_vars]
        if var_name == []:
            var_name = ['variable_' + v for v in value_name]

    if value_vars != [] and isinstance(value_vars[0], list):
        if var_name != []:
            if len(value_vars) != len(var_name):
                raise ValueError('Number of inner lists of value_vars must '
                                 'equal length of var_name '
                                 '{} != {}'.format(len(value_vars),
                                                   len(var_name)))
        else:
            var_name = [[]] * len(value_vars)

        if len(value_name) > 1:
            if len(value_vars) != len(value_name):
                raise ValueError('Number of inner lists of value_vars must '
                                 'equal length of value_name '
                                 '{} != {}'.format(len(value_vars),
                                                   len(value_name)))
        elif not stubnames:
            value_name = [value_name[0] + '_' + str(i) for i in range(len(value_vars))]

        value_vars_length = []
        for vv in value_vars:
            count = 0
            for col in frame.columns.values:
                if col in vv:
                    count += 1
            value_vars_length.append(count)
        max_group_len = max(value_vars_length)  

        mdata_list = []
        mcolumns_list = []
        vars_zipped = zip(value_vars, var_name, value_name, value_vars_length)
        for i, (val_v, var_n, val_n, vvl) in enumerate(vars_zipped):
            var_n = convert_to_list(var_n)
            val_n = convert_to_list(val_n)

            id_vars_ = [] if i > 0 else id_vars
            var_end = i if var_n == [] else None
            
            md, mc = _melt(frame, id_vars=id_vars_, value_vars=val_v,
                       var_name=var_n, value_name=val_n,
                       col_level=col_level, stubnames=stubnames,
                       suffix=suffix, sep=sep, 
                       extra_group=max_group_len - vvl,
                       var_end=var_end)

            mdata_list.append(md)
            mcolumns_list.append(mc)
            
        mdata = {}
        for d in mdata_list:
            mdata.update(d)
            
        mcolumns = [e for lst in mcolumns_list for e in lst]
        return DataFrame(mdata, columns=mcolumns)

    else:   
        mdata, mcolumns =  _melt(frame, id_vars=id_vars, value_vars=value_vars,
                             var_name=var_name, value_name=value_name,
                             col_level=col_level, stubnames=stubnames,
                             suffix=suffix, sep=sep)
        return DataFrame(mdata, columns=mcolumns)

In [30]:
# test better melt

dfm = pd.DataFrame({'City': ['Houston', 'Austin', 'Hoover'],
                   'State': ['Texas', 'Texas', 'Alabama'],
                   'Name':['Aria', 'Penelope', 'Niko'],
                   'Mango':[4, 10, 90],
                   'Orange': [10, 8, 14], 
                   'Watermelon':[40, 99, 43],
                   'Gin':[16, 200, 34],
                   'Vodka':[20, 33, 18]},
                 columns=['City', 'State', 'Name', 'Mango', 'Orange', 'Watermelon', 'Gin', 'Vodka'])
melt(dfm, id_vars=['City', 'State'], value_vars=[['Mango', 'Orange', 'Watermelon'], ['Gin', 'Vodka']], 
     var_name=['Fruit', 'Drink'], value_name=['Pounds', 'Ounces'])


Out[30]:
City State Fruit Pounds Drink Ounces
0 Houston Texas Mango 4 Gin 16.0
1 Austin Texas Mango 10 Gin 200.0
2 Hoover Alabama Mango 90 Gin 34.0
3 Houston Texas Orange 10 Vodka 20.0
4 Austin Texas Orange 8 Vodka 33.0
5 Hoover Alabama Orange 14 Vodka 18.0
6 Houston Texas Watermelon 40 nan NaN
7 Austin Texas Watermelon 99 nan NaN
8 Hoover Alabama Watermelon 43 nan NaN

In [31]:
df.keys()


Out[31]:
Index(['_create_nlinks time', 'block size', 'data offset', 'data radius',
       'data seedsz', 'data size', 'edge number', 'error',
       'experiment iteration start time', 'gc time', 'machine hostname',
       'method', 'nlinks number', 't1', 't2', 't3', 't4', 't5', 't6', 't7',
       't8', 't9', 'time', 'tlinks number'],
      dtype='object')

In [32]:
df.rename(columns={"gc time": "gc", "time": "total"})[["gc", "total", "data size"]]


Out[32]:
gc total data size
0 0.387758 0.464404 44
1 0.600486 1.484253 44
2 0.575705 1.364116 44
3 0.398880 0.511308 44
4 0.631328 1.501349 44
5 0.586955 1.292109 44
6 0.408006 0.510415 45
7 0.703892 1.509023 45
8 0.580516 1.368980 45
9 0.422899 0.509746 45
10 0.732416 1.584651 45
11 0.600608 1.286131 45
12 0.456883 0.542603 46
13 0.898809 1.762383 46
14 0.618423 1.377814 46
15 0.423894 0.504369 46
16 0.737121 1.618654 46
17 0.563548 1.293229 46
18 0.579378 0.673480 47
19 0.633154 1.527280 47
20 0.596221 1.507242 47
21 0.466438 0.562227 47
22 0.758529 1.844235 47
23 0.618097 1.409968 47
24 0.552704 0.675754 48
25 0.839266 1.981161 48
26 0.621476 1.365128 48
27 0.513288 0.611944 48
28 0.888105 1.862755 48
29 0.657072 1.504336 48
... ... ... ...
876 30.796478 37.147485 190
877 1.532676 32.707072 190
878 0.834002 8.025624 190
879 33.111549 39.908661 190
880 1.342993 28.866505 190
881 1.057288 9.641186 190
882 33.081220 41.211395 191
883 1.307421 27.349145 191
884 0.962159 8.315291 191
885 32.641730 39.949749 191
886 1.672588 28.504106 191
887 1.061269 8.619738 191
888 38.583321 46.299754 192
889 1.321576 30.646133 192
890 0.860820 7.827029 192
891 32.928773 40.408750 192
892 1.659828 32.106595 192
893 1.076310 9.570305 192
894 37.169855 45.791534 193
895 1.284505 29.258626 193
896 0.912246 8.053750 193
897 33.687315 40.848926 193
898 1.340688 29.740595 193
899 1.081530 8.506383 193
900 33.464883 41.466649 194
901 1.476419 29.586427 194
902 1.010888 8.595915 194
903 35.363853 42.249555 194
904 1.333210 28.107350 194
905 0.998112 7.885276 194

906 rows × 3 columns


In [33]:
uu = melt(df.rename(columns={"gc time": "gc", "time": "total"}), value_vars=["gc", "total"], id_vars=["method"], var_name=["time type"], value_name=["time"])
uu["mth"] = uu["method"] + " " + uu["time type"]
uu


Out[33]:
method time type time mth
0 ssgc gc 0.387758 ssgc gc
1 msgc_hi2lo gc 0.600486 msgc_hi2lo gc
2 msgc_lo2hi gc 0.575705 msgc_lo2hi gc
3 ssgc gc 0.398880 ssgc gc
4 msgc_hi2lo gc 0.631328 msgc_hi2lo gc
5 msgc_lo2hi gc 0.586955 msgc_lo2hi gc
6 ssgc gc 0.408006 ssgc gc
7 msgc_hi2lo gc 0.703892 msgc_hi2lo gc
8 msgc_lo2hi gc 0.580516 msgc_lo2hi gc
9 ssgc gc 0.422899 ssgc gc
10 msgc_hi2lo gc 0.732416 msgc_hi2lo gc
11 msgc_lo2hi gc 0.600608 msgc_lo2hi gc
12 ssgc gc 0.456883 ssgc gc
13 msgc_hi2lo gc 0.898809 msgc_hi2lo gc
14 msgc_lo2hi gc 0.618423 msgc_lo2hi gc
15 ssgc gc 0.423894 ssgc gc
16 msgc_hi2lo gc 0.737121 msgc_hi2lo gc
17 msgc_lo2hi gc 0.563548 msgc_lo2hi gc
18 ssgc gc 0.579378 ssgc gc
19 msgc_hi2lo gc 0.633154 msgc_hi2lo gc
20 msgc_lo2hi gc 0.596221 msgc_lo2hi gc
21 ssgc gc 0.466438 ssgc gc
22 msgc_hi2lo gc 0.758529 msgc_hi2lo gc
23 msgc_lo2hi gc 0.618097 msgc_lo2hi gc
24 ssgc gc 0.552704 ssgc gc
25 msgc_hi2lo gc 0.839266 msgc_hi2lo gc
26 msgc_lo2hi gc 0.621476 msgc_lo2hi gc
27 ssgc gc 0.513288 ssgc gc
28 msgc_hi2lo gc 0.888105 msgc_hi2lo gc
29 msgc_lo2hi gc 0.657072 msgc_lo2hi gc
... ... ... ... ...
1782 ssgc total 37.147485 ssgc total
1783 msgc_hi2lo total 32.707072 msgc_hi2lo total
1784 msgc_lo2hi total 8.025624 msgc_lo2hi total
1785 ssgc total 39.908661 ssgc total
1786 msgc_hi2lo total 28.866505 msgc_hi2lo total
1787 msgc_lo2hi total 9.641186 msgc_lo2hi total
1788 ssgc total 41.211395 ssgc total
1789 msgc_hi2lo total 27.349145 msgc_hi2lo total
1790 msgc_lo2hi total 8.315291 msgc_lo2hi total
1791 ssgc total 39.949749 ssgc total
1792 msgc_hi2lo total 28.504106 msgc_hi2lo total
1793 msgc_lo2hi total 8.619738 msgc_lo2hi total
1794 ssgc total 46.299754 ssgc total
1795 msgc_hi2lo total 30.646133 msgc_hi2lo total
1796 msgc_lo2hi total 7.827029 msgc_lo2hi total
1797 ssgc total 40.408750 ssgc total
1798 msgc_hi2lo total 32.106595 msgc_hi2lo total
1799 msgc_lo2hi total 9.570305 msgc_lo2hi total
1800 ssgc total 45.791534 ssgc total
1801 msgc_hi2lo total 29.258626 msgc_hi2lo total
1802 msgc_lo2hi total 8.053750 msgc_lo2hi total
1803 ssgc total 40.848926 ssgc total
1804 msgc_hi2lo total 29.740595 msgc_hi2lo total
1805 msgc_lo2hi total 8.506383 msgc_lo2hi total
1806 ssgc total 41.466649 ssgc total
1807 msgc_hi2lo total 29.586427 msgc_hi2lo total
1808 msgc_lo2hi total 8.595915 msgc_lo2hi total
1809 ssgc total 42.249555 ssgc total
1810 msgc_hi2lo total 28.107350 msgc_hi2lo total
1811 msgc_lo2hi total 7.885276 msgc_lo2hi total

1812 rows × 4 columns


In [34]:
lm = sns.lmplot(data=uu, x="data size", y="time", hue="mth", order=3, scatter_kws={"s": 3, "marker": "x", "alpha": 0.5})
axes = lm.axes
axes[0,0].set_xlim(30,200)
axes[0,0].set_ylim(0,50)

lines = lm.ax.get_lines()
line = lines[0]
line.set_linestyle("--")


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-34-5af07ecba190> in <module>()
----> 1 lm = sns.lmplot(data=uu, x="data size", y="time", hue="mth", order=3, scatter_kws={"s": 3, "marker": "x", "alpha": 0.5})
      2 axes = lm.axes
      3 axes[0,0].set_xlim(30,200)
      4 axes[0,0].set_ylim(0,50)
      5 

~/miniconda/lib/python3.6/site-packages/seaborn/regression.py in lmplot(x, y, data, hue, col, row, palette, col_wrap, size, aspect, markers, sharex, sharey, hue_order, col_order, row_order, legend, legend_out, x_estimator, x_bins, x_ci, scatter, fit_reg, ci, n_boot, units, order, logistic, lowess, robust, logx, x_partial, y_partial, truncate, x_jitter, y_jitter, scatter_kws, line_kws)
    550     need_cols = [x, y, hue, col, row, units, x_partial, y_partial]
    551     cols = np.unique([a for a in need_cols if a is not None]).tolist()
--> 552     data = data[cols]
    553 
    554     # Initialize the grid

~/miniconda/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2680         if isinstance(key, (Series, np.ndarray, Index, list)):
   2681             # either boolean or fancy integer index
-> 2682             return self._getitem_array(key)
   2683         elif isinstance(key, DataFrame):
   2684             return self._getitem_frame(key)

~/miniconda/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_array(self, key)
   2724             return self._take(indexer, axis=0)
   2725         else:
-> 2726             indexer = self.loc._convert_to_indexer(key, axis=1)
   2727             return self._take(indexer, axis=1)
   2728 

~/miniconda/lib/python3.6/site-packages/pandas/core/indexing.py in _convert_to_indexer(self, obj, axis, is_setter)
   1325                 if mask.any():
   1326                     raise KeyError('{mask} not in index'
-> 1327                                    .format(mask=objarr[mask]))
   1328 
   1329                 return com._values_from_object(indexer)

KeyError: "['data size'] not in index"

In [ ]:
lm = sns.lmplot(data=uu, x="data size", y="time", hue="time type", order=3, scatter_kws={"s": 3, "marker": "x", "alpha": 0.5})
axes = lm.axes
axes[0,0].set_xlim(30,200)
axes[0,0].set_ylim(0,50)

lines = lm.ax.get_lines()
line = lines[0]
line.set_linestyle("--")
# line.set_marker("s")

plt.savefig(op.join(latex_dir, "msgc_time_datasize_plot.pdf"), dpi=1000)
# axes[0,1].set_ylim(0,)

# lm.ax.get_lines()

In [ ]:


In [ ]:
# this work jus for not duplicit values of data siz
# sns.tsplot(data=df, time="data size", value="time", unit="method", condition="method")
# plt.savefig(op.join(latex_dir, "msgc_size_time.pdf"), dpi=1000)

In [ ]:
line.set_marker("s")

In [ ]:
# df

In [ ]:
# df["method"]

Statistics


In [ ]:
from scipy import stats



dfs_plus_describe = dfs_plus.describe()
display(dfs_plus_describe)

print("pokud je pvalue mensi nez zvolena hladina vyznamnosti (0.01=1%), je vsechno ok")
tt = stats.ttest_rel(dfs_plus.loc[dfs_plus["method"] == "ssgc"]['time'], dfs_plus.loc[dfs_plus["method"] == "msgc_lo2hi"]['time'])
# tt

In [ ]:
ssgc_rows = dfs_plus[dfs_plus["method"].str.contains(labels[0])]
ssgc_hi2lo_rows = dfs_plus[dfs_plus["method"].str.contains(labels[1])]
ssgc_lo2hi_rows = dfs_plus[dfs_plus["method"].str.contains(labels[2])]
pp0 = stats.ttest_rel(ssgc_rows["time"], ssgc_hi2lo_rows["time"])
pp1 = stats.ttest_rel(ssgc_rows["time"], ssgc_lo2hi_rows["time"])
pp2 = stats.ttest_rel(ssgc_hi2lo_rows["time"], ssgc_lo2hi_rows["time"])

print("pokud je pvalue mensi nez zvolena hladina vyznamnosti (0.01=1%), je vsechno ok") 
#mozna staci i dvojnasobek hladiny vyzamnosi
print("statistic musi byt vetsi nez 0")
display(pp0)
display(pp1)
display(pp2)

float_to_latex_file(pp0.pvalue, op.join(latex_dir, "ttest_pvalue_ssgc_msgc_hi2lo.tex"))
float_to_latex_file(pp1.pvalue, op.join(latex_dir, "ttest_pvalue_ssgc_msgc_lo2hi.tex"))
float_to_latex_file(pp2.pvalue, op.join(latex_dir, "ttest_pvalue_msgc_hi2lo_msgc_lo2hi.tex"))

In [ ]:
# dfs_plus["method"] == "ssgc "

Datasets


In [ ]:
dfs_describe = dfs.describe()
display(dfs_describe)
dfs_plus_describe = dfs_plus.describe()
display(dfs_plus_describe)

In [ ]:
dfs_plus_size = int(len(dfs_plus) / len(labels))
to_file(str(dfs_plus_size), op.join(latex_dir, "msgc_dataset_subset_size.tex"))

In [ ]:
df_mn = df[["GC total time", "MSGC total time", "GC time", "MSGC time"]].describe()
display(df_mn)
to_latex_file(df_mn, "../includes/exp062-all2data_size.tex")

dfs_mn = dfs[["GC total time", "MSGC total time", "GC time", "MSGC time"]].describe()
display(dfs_mn)
to_latex_file(dfs_mn, "../includes/exp062-selection2data_size.tex")

In [ ]:
dfs_plus[["method"]]