In [1]:
%pylab inline
import sys
import os.path as op
import shutil
# sys.path.insert(0, "/home/mjirik/projects/pyseg_base/")
sys.path.insert(0, op.abspath("../"))  # prefer the local imcut checkout over any installed copy
import scipy
import time
import pandas as pd
from imcut import pycut
import sed3
import itertools
# Directory where LaTeX snippets for the IWCIA18 paper are written.
latex_dir = "../../papers/iwcia18/"
# Experiment grid: data sizes (sh), radius 10, offsets 3/5, seed size 3.
dataparams_sh = list(range(44, 195, 10))
# NOTE(review): the next line overrides the coarser grid above (step 1 instead of 10).
dataparams_sh = list(range(44, 195, 1))
# dataparams_sh = range(44, 195, 50)
dp_radius = [10]
dp_offset = [3, 5]
dp_seedsz = [3]
fname = "exp062-multiscale.csv"   # wide format: one row per data setup
fnamenew = "msgc_experiment.csv"  # long format: one row per method per setup
rnd_seed=1  # numpy RNG seed so the synthetic noise is reproducible
In [2]:
%pwd
Out[2]:
In [3]:
# block size was 10 in the original experiments
# Baseline: plain single-scale graph cut.
segparams0 = {
    'method':'graphcut',
#    'method':'multiscale_graphcut',
    'use_boundary_penalties': True,
    'boundary_dilatation_distance': 2,
    'boundary_penalties_weight': 1,
    'block_size': 10,
    'tile_zoom_constant': 1
}
# Multiscale graph cut, high-to-low resolution variant.
segparams1 = {
#    'method':'graphcut',
    'method':'multiscale_graphcut_hi2lo',
    'use_boundary_penalties': True,
    'boundary_dilatation_distance': 2,
    'boundary_penalties_weight': 1,
    'block_size': 10,
    'tile_zoom_constant': 1
}
# Multiscale graph cut, low-to-high resolution variant.
segparams2 = {
#    'method':'graphcut',
    'method':'multiscale_graphcut_lo2hi',
    'use_boundary_penalties': True,
    'boundary_dilatation_distance': 2,
    'boundary_penalties_weight': 1,
    'block_size': 10,
    'tile_zoom_constant': 1
}
# Column prefixes for the wide CSV; the trailing spaces separate
# prefix from metric name (e.g. "ssgc time").
labels = [
    "ssgc ",
    "msgc_hi2lo ",
    "msgc_lo2hi ",
]
In [4]:
def make_data(sz=32, offset=0, radius=7, seedsz=3):
    """Build a synthetic test volume with foreground/background seeds.

    Parameters
    ----------
    sz : int
        Base edge size; the volume shape is (sz, sz+1, sz+2) so the
        three axes are distinguishable.
    offset : int
        Shift applied to the seed positions.
    radius : int
        Object radius: the ground truth is `distance-to-seed < radius`.
    seedsz : int
        Extent of the seed markers.

    Returns
    -------
    img : np.ndarray of np.uint8
        Noisy image: 100 inside the object plus uniform noise in [0, 80).
    segm : np.ndarray of bool
        Ground-truth segmentation.
    seeds : np.ndarray of np.int8
        Seed volume: 1 = foreground seeds, 2 = background seeds, 0 = none.
    """
    import scipy.ndimage  # make sure the submodule is loaded
    seeds = np.zeros([sz, sz+1, sz+2], dtype=np.int8)
    xmin = radius + seedsz + offset + 2
    ymin = radius + seedsz + offset + 6
    # Two small foreground stripes define the object skeleton.
    seeds[offset + 12, xmin + 3:xmin + 7 + seedsz, ymin:ymin+2] = 1
    seeds[offset + 20, xmin + 7:xmin + 12 + seedsz, ymin+5:ymin+7] = 1
    # Distance transform source: zeros exactly at the foreground seeds.
    img = np.ones([sz, sz+1, sz+2])
    img = img - seeds
    # Background seed block in the corner (added after img so it does not
    # influence the distance transform).
    seeds[
        2:10 + seedsz,
        2:9 + seedsz,
        2:3 + seedsz] = 2
    # BUG FIX: scipy.ndimage.morphology is deprecated and removed in
    # SciPy >= 1.10; the function lives in the top-level scipy.ndimage.
    img = scipy.ndimage.distance_transform_edt(img)
    segm = img < radius
    img = (100 * segm + 80 * np.random.random(img.shape)).astype(np.uint8)
    return img, segm, seeds
In [5]:
# Render a sample volume for the paper; zero the far faces so the slice
# boundaries are visually separated in the saved figure.
img, seg, seeds = make_data(45, 3, 10, 3)
plt.figure(figsize=(10,15))
import copy
imgview = copy.copy(img)
imgview[:,:,-1] = 0
imgview[:,-1,:] = 0
imgview[-1,:,:] = 0
# NOTE(review): non-integer slice_step (3.9) — presumably intended to spread
# the shown slices over the volume; confirm against sed3 docs.
sed3.show_slices(imgview, contour=seg, seeds=seeds, show=False, slice_step=3.9)
plt.axis("off")
plt.savefig("../graphics/exp062-imgsample_separated.png")
In [6]:
# Second visual check with a larger volume and the default seed size.
img, seg, seeds = make_data(50, 10, 15)
plt.figure(figsize=(10,15))
sed3.show_slices(img, contour=seg, seeds=seeds)
In [7]:
np.unique(seeds)
Out[7]:
In [8]:
def to_latex_file(df, fn):
    """Render ``df`` with DataFrame.to_latex and store the result in ``fn``."""
    latex_source = df.to_latex()
    with open(fn, "w") as output_file:
        output_file.write(latex_source)
def latex_float(f, precision=4):
    """Format a number for LaTeX, turning exponent notation into
    ``base \\times 10^{exp}``.

    Parameters
    ----------
    f : number
        Value to format.
    precision : int
        Significant digits passed to the ``g`` format.
    """
    template = "{0:." + str(int(precision)) + "g}"
    formatted = template.format(f)
    if "e" not in formatted:
        return formatted
    base, exponent = formatted.split("e")
    return r"{0} \times 10^{{{1}}}".format(base, int(exponent))
def float_to_latex_file(fl, fn, precision=4):
    """Write the LaTeX rendering of ``fl`` (see ``latex_float``) into ``fn``."""
    rendered = latex_float(fl, precision=precision)
    with open(fn, "w") as output_file:
        output_file.write(rendered)
def num2latex(num, filename=None, precision=4):
    """Wrap a number (or preformatted string) in a siunitx ``\\num{}`` macro.

    Strings that already begin with ``\\num`` are passed through untouched.
    When ``filename`` is given the result is additionally written there.
    Returns the resulting LaTeX string.
    """
    if type(num) is str:
        rendered = num
    else:
        template = "{0:." + str(int(precision)) + "g}"
        rendered = template.format(num)
    # Avoid double-wrapping already-formatted input.
    if rendered[:4] != r"\num":
        rendered = "\\num{" + rendered + "}"
    if filename is not None:
        with open(filename, "w") as output_file:
            output_file.write(rendered)
    return rendered
def to_file(text, fn):
    """Dump ``text`` verbatim into file ``fn``, overwriting any content."""
    with open(fn, "w") as output_file:
        output_file.write(text)
In [9]:
def process_gc_stats(stats1, prefix=None):
    """Flatten raw graph-cut statistics for tabular storage.

    Every key is prefixed with ``prefix`` (method label) and the per-block
    "nlinks shape"/"tlinks shape" lists are replaced by scalar totals plus
    a combined "edge number".

    Parameters
    ----------
    stats1 : dict
        Raw stats; must contain "nlinks shape" and "tlinks shape".
    prefix : str, optional
        Prepended to every output key; defaults to the empty string.
    """
    if prefix is None:
        prefix = ""
    outstats = {prefix + key: value for key, value in stats1.items()}
    # Total link counts: sum of the first shape component over all blocks.
    nlinks_total = np.sum(np.asarray(outstats[prefix + "nlinks shape"]), axis=0)[0]
    tlinks_total = np.sum(np.asarray(outstats[prefix + "tlinks shape"]), axis=0)[0]
    del outstats[prefix + "nlinks shape"]
    del outstats[prefix + "tlinks shape"]
    outstats[prefix + "nlinks number"] = nlinks_total
    outstats[prefix + "tlinks number"] = tlinks_total
    outstats[prefix + "edge number"] = nlinks_total + tlinks_total
    return outstats
def merge_stats(stats0, stats1, stats2, labels=None):
    """Combine stats of three runs into one flat dict.

    Each run's keys are prefixed with the matching entry of ``labels``
    (empty prefixes when ``labels`` is None) via ``process_gc_stats``.
    """
    if labels is None:
        labels = [""] * 3
    merged = {}
    for raw_stats, label in zip((stats0, stats1, stats2), labels):
        merged.update(process_gc_stats(raw_stats, label))
    return merged
def run_gc_with_defined_setup(img, segparams, seeds_arr=None, seg_ref=None):
    """Run one graph-cut segmentation and collect its statistics.

    Parameters
    ----------
    img : ndarray
        Input image volume.
    segparams : dict
        Parameters forwarded to ``pycut.ImageGraphCut``.
    seeds_arr : ndarray, optional
        Seed volume. Defaults to the notebook-global ``seeds`` for backward
        compatibility (the original implementation read the global directly).
    seg_ref : ndarray, optional
        Ground-truth segmentation for the error metric. Defaults to the
        notebook-global ``seg``.

    Returns
    -------
    dict
        ``gc.stats`` extended with "time" (wall-clock seconds) and
        "error" (number of mismatching voxels).
    """
    if seeds_arr is None:
        seeds_arr = seeds  # notebook global set by the experiment loop
    if seg_ref is None:
        seg_ref = seg  # notebook global set by the experiment loop
    start = time.time()
    gc = pycut.ImageGraphCut(img, segparams=segparams)
    gc.set_seeds(seeds_arr)
    gc.run()
    sg1 = gc.segmentation
    stats1 = gc.stats
    elapsed1 = (time.time() - start)
    # The segmentation labels are inverted w.r.t. the ground truth,
    # hence the (1 - sg1) before comparing.
    err1 = np.sum(np.abs(seg_ref - (1 - sg1)))
    stats1["time"] = elapsed1
    stats1["error"] = err1
    return stats1
def add_data_and_algoritm_info(stats, data_params, segparams, start):
    """Attach experiment metadata to a stats dict and return it.

    ``data_params`` is the (size, offset, radius, seedsz) tuple from the
    experiment grid. Relies on the notebook-global ``machine_hostname``.
    """
    stats.update({
        'data size': data_params[0],
        'data offset': data_params[1],
        'data radius': data_params[2],
        "block size": segparams["block_size"],
        "data seedsz": data_params[3],
        'machine hostname': machine_hostname,
        'experiment iteration start time': start,
    })
    return stats
def add_data_seaborn(stats, data_params, segparams, start, i, label):
    """Build a one-row long-format DataFrame for a single method run.

    The stats are flattened without a prefix, metadata is attached and the
    method label is stored in its own "method" column.
    """
    flat = process_gc_stats(stats, "")
    flat = add_data_and_algoritm_info(flat, data_params, segparams, start)
    flat["method"] = label
    return pd.DataFrame(flat, index=[i * 3 + 0])
In [10]:
# Leftover Python 2 debug loop, kept for reference:
#for par in it:
#    print par
# Global iteration counter used (and reset) by the experiment loop below.
i = 0
In [11]:
# Run the full experiment grid. Results are checkpointed to CSV after
# every iteration so a crashed run can still be inspected/resumed.
force_rewrite = False
force_rewrite = True  # NOTE(review): overrides the line above; set False to resume
if op.exists(fname) and not force_rewrite:
    df = pd.read_csv(fname)  # , index_col=0)
else:
    df = pd.DataFrame([])
if op.exists(fnamenew) and not force_rewrite:
    dfnew = pd.read_csv(fnamenew)  # , index_col=0)
else:
    dfnew = pd.DataFrame([])

i = 0
np.random.seed(rnd_seed)
import platform
machine_hostname = platform.node()

it = itertools.product(dataparams_sh, dp_offset, dp_radius, dp_seedsz)
for data_params in it:
    start = time.time()
    img, seg, seeds = make_data(data_params[0], data_params[1], data_params[2], data_params[3])
    # One run per method (single-scale, msgc hi2lo, msgc lo2hi).
    stats0 = run_gc_with_defined_setup(img, segparams0)
    stats1 = run_gc_with_defined_setup(img, segparams1)
    stats2 = run_gc_with_defined_setup(img, segparams2)

    # Wide format: one row per data setup, per-method column prefixes.
    stats = merge_stats(stats0, stats1, stats2, labels)
    stats = add_data_and_algoritm_info(stats, data_params, segparams0, start)
    dfi = pd.DataFrame(stats, index=[i])
    # BUG FIX: DataFrame.append was removed in pandas 2.0;
    # pd.concat is the supported replacement.
    df = pd.concat([df, dfi], sort=True)
    df.to_csv(fname, index=False)

    # Long (seaborn-friendly) format: one row per method per data setup.
    dfinew = add_data_seaborn(stats0, data_params, segparams0, start, i, labels[0])
    dfnew = pd.concat([dfnew, dfinew], sort=True)
    dfinew = add_data_seaborn(stats1, data_params, segparams1, start, i, labels[1])
    dfnew = pd.concat([dfnew, dfinew], sort=True)
    dfinew = add_data_seaborn(stats2, data_params, segparams2, start, i, labels[2])
    dfnew = pd.concat([dfnew, dfinew], sort=True)
    dfnew.to_csv(fnamenew, index=False)
    i += 1
In [12]:
# dfnew.to_csv(fnamenew, index=False)
# Export the number of completed grid iterations for the paper.
to_file(str(i), op.join(latex_dir, "msgc_dataset_size.tex"))
In [13]:
len(list(itertools.product(dataparams_sh, dp_offset, dp_radius, dp_seedsz)))
Out[13]:
In [14]:
stats
Out[14]:
In [15]:
# Reload the wide-format results and select the subset used in the paper
# (seed size 3, offset 3, radius 10).
df = pd.read_csv(fname)
# df.rename(columns={"msgc time": "MSGC time"})
dfs = df[(df["data seedsz"]==3) & (df["data offset"] == 3) & (df["data radius"] == 10)]
dfs_plus = dfs[dfs['data size'] > 160]  # large volumes only
In [ ]:
In [16]:
# Trend fits and timing/edge-count comparison plots for the wide CSV.
def func(x, a, c, d):
    """Exponential decay model, kept for the (disabled) curve_fit experiment."""
    return a*np.exp(-c*x)+d

# Quadratic polynomial trends of runtime vs. data size.
msgcp = np.poly1d(np.polyfit(df["data size"], df[labels[0] + "time"], 2))
df[labels[0] + "time trend"] = msgcp(df["data size"])
msgctp = np.poly1d(np.polyfit(df["data size"], df[labels[1] + "time"], 2))
df[labels[1] + "time trend"] = msgctp(df["data size"])

plt.figure()
df[[labels[0] + 'time', labels[1] + 'time', labels[2] + "time", 'data size']].sort_values("data size").plot(x='data size')

plt.figure(figsize=(10,15))
df[[labels[0] + 'time', labels[1] + 'time', labels[2] + "time",
    labels[0] + 'gc time', labels[1] + 'gc time', labels[2] + "gc time", 'data size',
    ]].sort_values("data size").plot(x='data size', style=["-o", "-^", "-s", ":", ":", ":"], color=["r", "g", "b", "r", "g", "b"])
plt.savefig("../graphics/exp062-msgc_time_size_comparison.pdf")

plt.figure()
df[[labels[0] + 'edge number', labels[1] + 'edge number', labels[2] + 'edge number', 'data size']].sort_values("data size").plot(
    x='data size')

plt.figure()
df[[labels[0] + "time", labels[1] + "time", labels[2] + "time",]].boxplot(showfliers=False)
plt.savefig("../graphics/exp062-multiscale-gc_time.pdf")

plt.figure()
df[[labels[0] + 'time', labels[0] + 'gc time', labels[1] + "time",
    labels[1] + 'gc time', labels[2] + 'time', labels[2] + "gc time",
    ]].boxplot(showfliers=False, rot=90)
plt.savefig("../graphics/exp062-multiscale-gc_time-total_time.pdf")

dfs = df[(df["data seedsz"]==3) & (df["data offset"] == 3) & (df["data radius"] == 10)]
plt.figure()
dfs[[labels[0] + "edge number", labels[1] + "edge number", 'data size']].sort_values("data size").plot(x='data size', style=["-", "-", "--","--"])
plt.savefig("../graphics/exp062-multiscale-links_number.pdf")

df_mn = df[[labels[0] + "time", labels[1] + "time", labels[2] + "time", labels[0] + "gc time", labels[1] + "gc time", labels[2] + "gc time"]].describe()
display(df_mn)
# BUG FIX: this cell redefined to_latex_file() identically to the version
# in In [8]; the duplicate definition is removed and the earlier one reused.
to_latex_file(df_mn, op.abspath("../includes/exp062-all.tex"))
In [17]:
i
Out[17]:
In [ ]:
In [18]:
df.keys()
Out[18]:
In [19]:
# Reload the long-format results, select the paper subset and configure
# seaborn for print-quality figures.
df = pd.read_csv(fnamenew)
# df.rename(columns={"msgc time": "MSGC time"})
dfs = df[(df["data seedsz"]==3) & (df["data offset"] == 3) & (df["data radius"] == 10)]
dfs_plus = dfs[dfs['data size'] > 160]  # large volumes only
import seaborn as sns
sns.set_context("paper")
sns.set_style("white")
In [21]:
df.keys()
Out[21]:
In [23]:
sns.boxplot(data=df, y="time", x="method")
Out[23]:
In [24]:
# df
In [25]:
# Reshape to long-in-long form: one row per (method, time type), where
# "gc" is the pure graph-cut time and "total" the whole-pipeline time.
uu = pd.melt(df.rename(columns={"gc time": "gc", "time": "total"}), value_vars=["gc", "total"], id_vars=["method"], var_name="time type", value_name="time")
# uu = pd.melt(dfs, value_vars=["gc time", "time"], id_vars=["method"], var_name="type", value_name="time")
# uu
In [26]:
sns.boxplot(data=uu, hue="time type",y="time", x="method")
plt.savefig(op.join(latex_dir, "msgc_time_boxplot.pdf"), dpi=1000)
In [27]:
# sns.boxplot(data=dfs, y="error", x="method")
In [28]:
# Cubic-trend regression of runtime vs. data size, one curve per method.
lm = sns.lmplot(data=df, x="data size", y="time", hue="method", order=3, scatter_kws={"s": 3, "marker": "x", "alpha": 0.5})
axes = lm.axes
axes[0,0].set_xlim(30,200)
axes[0,0].set_ylim(0,50)
# Dash the first regression line so methods stay distinguishable in B/W print.
lines = lm.ax.get_lines()
line = lines[0]
line.set_linestyle("--")
# line.set_marker("s")
plt.savefig(op.join(latex_dir, "msgc_time_datasize_plot.pdf"), dpi=1000)
# axes[0,1].set_ylim(0,)
# lm.ax.get_lines()
In [29]:
# better melt
from pandas.core.dtypes.common import is_list_like
from pandas.core.frame import DataFrame
from pandas.core.index import MultiIndex
from pandas import compat
from IPython.display import display
from pandas.core.reshape.concat import concat
import re
from pandas.core.tools.numeric import to_numeric
from pandas.util._decorators import Appender
from pandas.core.frame import _shared_docs
import numpy as np
import pandas as pd
import pandas.util.testing as tm
def _melt(frame, id_vars=None, value_vars=None, var_name=None,
          value_name='value', col_level=None, stubnames=False,
          suffix=r'\d+', sep='', extra_group=0, var_end=None):
    """Core worker for the extended ``melt`` below.

    Unlike ``pd.melt`` it expects ``id_vars``/``value_vars``/``var_name``/
    ``col_level`` as (possibly empty) lists and ``value_name`` as a
    one-element list.  ``extra_group`` NaN-pads the melted columns so
    value-var groups of unequal length can be laid side by side;
    ``var_end`` is a numeric suffix for auto-generated variable names.

    Returns
    -------
    (mdata, mcolumns) : (dict, list)
        Column-name -> values mapping and the desired column order.
    """
    # TODO: what about the existing index?
    def check_vars(frame, var, var_string):
        # Validate that id/value vars match the column index structure
        # (tuples of the right length for a MultiIndex, scalars otherwise).
        for v in var:
            if num_col_levels > 1:
                if not isinstance(v, tuple):
                    raise ValueError('{} must be a list of tuples'
                                     ' when columns are a MultiIndex'
                                     .format(var_string))
                elif len(v) != num_col_levels:
                    raise ValueError('all tuples in {} must be length {}'
                                     .format(var_string,
                                             frame.columns.nlevels))
            else:
                if is_list_like(v) and len(v) > 1:
                    raise ValueError('DataFrame has only a single level of '
                                     'columns. {} is not a column'.format(v))
    # Effective number of column levels after an optional col_level subset.
    if len(col_level) == 0:
        num_col_levels = frame.columns.nlevels
    else:
        num_col_levels = len(col_level)
    check_vars(frame, id_vars, 'id_vars')
    check_vars(frame, value_vars, 'value_vars')
    if var_name != [] and len(var_name) != num_col_levels:
        raise ValueError('Length of var_name must match effective number of '
                         'column levels.')
    # Drop the column levels that were not selected via col_level.
    if col_level != []:
        droplevels = list(range(frame.columns.nlevels))
        for level in col_level:
            if isinstance(level, int):
                droplevels.remove(level)
            else:
                droplevels.remove(frame.columns.names.index(level))
        if droplevels != []:
            frame = frame.copy()
            frame.columns = frame.columns.droplevel(droplevels)
    if stubnames and isinstance(frame.columns, MultiIndex):
        raise ValueError('Stubnames only work with single-index DataFrames')
    for iv in id_vars:
        if iv not in frame.columns:
            raise KeyError('{} not in columns'.format(iv))
    if value_vars != []:
        for vv in value_vars:
            if vv not in frame.columns:
                raise KeyError('{} not in columns'.format(vv))
    # Derive variable-column names from the column index names when the
    # caller did not provide var_name.
    if var_name == []:
        names = list(frame.columns.names)
        if len(names) == 1:
            if names[0] is None:
                var_name.append('variable')
            else:
                var_name.append(names[0])
        elif names.count(None) == 1:
            names[names.index(None)] = 'variable'
            var_name = names
        else:
            missing_name_count = 0
            for name in names:
                if name is None:
                    var_name.append('variable_{}'.format(missing_name_count))
                    missing_name_count += 1
                else:
                    var_name.append(name)
    if var_end is not None:
        var_name = [vn + '_' + str(var_end) for vn in var_name]
    N = len(frame)
    # Integer positions of the columns that will be melted.
    non_id_ilocs = []
    if value_vars != []:
        for v in value_vars:
            for i, v1 in enumerate(frame.columns):
                if v == v1:
                    non_id_ilocs.append(i)
    else:
        if id_vars == []:
            non_id_ilocs = list(range(frame.shape[1]))
        else:
            for i, v in enumerate(frame.columns):
                if v not in id_vars:
                    non_id_ilocs.append(i)
    K = len(non_id_ilocs)
    mdata = {}
    mcolumns = []
    # id columns are tiled once per melted column (plus NaN padding groups).
    for col in id_vars:
        pandas_obj = frame[col]
        if isinstance(pandas_obj, DataFrame):
            # Duplicate id column labels yield a DataFrame; keep each copy.
            for i in range(pandas_obj.shape[1]):
                col_name = col + '_id_' + str(i)
                mdata[col_name] = np.tile(pandas_obj.iloc[:, i].values, K + extra_group)
                mcolumns.append(col_name)
        else:
            mdata[col] = np.tile(pandas_obj, K + extra_group)
            mcolumns.append(col)
    # Stack the melted values column-by-column, padding with NaN rows when
    # this group is shorter than the longest group.
    values = np.concatenate([frame.iloc[:, i] for i in non_id_ilocs])
    if extra_group > 0:
        values = np.concatenate((values, np.full([N * extra_group], np.nan)))
    mdata[value_name[0]] = values
    # Build one variable column per column-index level.
    for i, col in enumerate(var_name):
        values = frame.columns[non_id_ilocs]._get_level_values(i)
        if stubnames:
            # Strip the stub prefix so only the suffix remains (numeric if possible).
            regex = '^{0}{1}'.format(re.escape(value_name[0]), re.escape(sep))
            values = to_numeric(values.str.replace(regex, ''), errors='ignore')
        if isinstance(values, MultiIndex):
            # asanyarray will keep the columns as an Index
            values = np.asanyarray(values).repeat(N)
        else:
            data_list = []
            for v in values.tolist():
                data_list.extend([v] * N)
            values = data_list
        if extra_group > 0:
            values = np.concatenate((values, np.full([N * extra_group], np.nan)))
        mdata[col] = values
    mcolumns += var_name + value_name
    return mdata, mcolumns
@Appender(_shared_docs['melt'] %
          dict(caller='pd.melt(df, ',
               versionadded="",
               other='DataFrame.melt'))
def melt(frame, id_vars=None, value_vars=None, var_name=None,
         value_name='value', col_level=None, stubnames=False,
         suffix=r'\d+', sep=''):
    # Extended pd.melt: supports several groups of value_vars (a list of
    # lists), each with its own var_name/value_name, plus wide_to_long-style
    # stubname matching.  Groups of unequal length are NaN-padded through
    # _melt's extra_group mechanism.
    def convert_to_list(val):
        # Normalize scalars/None to lists so _melt can treat everything uniformly.
        if val is None:
            return []
        elif not is_list_like(val):
            return [val]
        else:
            return list(val)
    def get_var_names(df, stub, sep, suffix):
        # All columns matching "<stub><sep><suffix>" for stubname melting.
        regex = '^{0}{1}{2}$'.format(re.escape(stub), re.escape(sep), suffix)
        col_return = [col for col in df.columns if re.match(regex, col)]
        if col_return == []:
            raise ValueError('No stubname {}'.format(stub))
        return col_return
    id_vars = convert_to_list(id_vars)
    value_vars = convert_to_list(value_vars)
    var_name = convert_to_list(var_name)
    value_name = convert_to_list(value_name)
    col_level = convert_to_list(col_level)
    if stubnames:
        # In stubname mode value_vars holds the stubs; expand each stub to
        # its matching column group.
        if value_vars == []:
            raise ValueError('Must provide stubnames as a list to value_vars')
        value_name = value_vars
        value_vars = [get_var_names(frame, stub, sep, suffix)
                      for stub in value_vars]
        if var_name == []:
            var_name = ['variable_' + v for v in value_name]
    if value_vars != [] and isinstance(value_vars[0], list):
        # Grouped melt: one _melt call per group, then merge the results.
        if var_name != []:
            if len(value_vars) != len(var_name):
                raise ValueError('Number of inner lists of value_vars must '
                                 'equal length of var_name '
                                 '{} != {}'.format(len(value_vars),
                                                   len(var_name)))
        else:
            var_name = [[]] * len(value_vars)
        if len(value_name) > 1:
            if len(value_vars) != len(value_name):
                raise ValueError('Number of inner lists of value_vars must '
                                 'equal length of value_name '
                                 '{} != {}'.format(len(value_vars),
                                                   len(value_name)))
        elif not stubnames:
            # Auto-number the value columns when a single name was given.
            value_name = [value_name[0] + '_' + str(i) for i in range(len(value_vars))]
        # Per-group column counts determine the NaN padding (extra_group).
        value_vars_length = []
        for vv in value_vars:
            count = 0
            for col in frame.columns.values:
                if col in vv:
                    count += 1
            value_vars_length.append(count)
        max_group_len = max(value_vars_length)
        mdata_list = []
        mcolumns_list = []
        vars_zipped = zip(value_vars, var_name, value_name, value_vars_length)
        for i, (val_v, var_n, val_n, vvl) in enumerate(vars_zipped):
            var_n = convert_to_list(var_n)
            val_n = convert_to_list(val_n)
            # id columns are emitted only for the first group.
            id_vars_ = [] if i > 0 else id_vars
            var_end = i if var_n == [] else None
            md, mc = _melt(frame, id_vars=id_vars_, value_vars=val_v,
                           var_name=var_n, value_name=val_n,
                           col_level=col_level, stubnames=stubnames,
                           suffix=suffix, sep=sep,
                           extra_group=max_group_len - vvl,
                           var_end=var_end)
            mdata_list.append(md)
            mcolumns_list.append(mc)
        mdata = {}
        for d in mdata_list:
            mdata.update(d)
        mcolumns = [e for lst in mcolumns_list for e in lst]
        return DataFrame(mdata, columns=mcolumns)
    else:
        # Simple (single-group) melt delegates straight to the worker.
        mdata, mcolumns = _melt(frame, id_vars=id_vars, value_vars=value_vars,
                                var_name=var_name, value_name=value_name,
                                col_level=col_level, stubnames=stubnames,
                                suffix=suffix, sep=sep)
        return DataFrame(mdata, columns=mcolumns)
In [30]:
# test better melt
# Smoke-test of the custom melt: two value-var groups of different lengths
# melt into parallel (Fruit, Pounds) and (Drink, Ounces) column pairs.
dfm = pd.DataFrame({'City': ['Houston', 'Austin', 'Hoover'],
                    'State': ['Texas', 'Texas', 'Alabama'],
                    'Name':['Aria', 'Penelope', 'Niko'],
                    'Mango':[4, 10, 90],
                    'Orange': [10, 8, 14],
                    'Watermelon':[40, 99, 43],
                    'Gin':[16, 200, 34],
                    'Vodka':[20, 33, 18]},
                   columns=['City', 'State', 'Name', 'Mango', 'Orange', 'Watermelon', 'Gin', 'Vodka'])
melt(dfm, id_vars=['City', 'State'], value_vars=[['Mango', 'Orange', 'Watermelon'], ['Gin', 'Vodka']],
     var_name=['Fruit', 'Drink'], value_name=['Pounds', 'Ounces'])
Out[30]:
In [31]:
df.keys()
Out[31]:
In [32]:
df.rename(columns={"gc time": "gc", "time": "total"})[["gc", "total", "data size"]]
Out[32]:
In [33]:
# Long format via the custom melt; "mth" combines method and time type so
# a single hue column can drive the regression plot below.
uu = melt(df.rename(columns={"gc time": "gc", "time": "total"}), value_vars=["gc", "total"], id_vars=["method"], var_name=["time type"], value_name=["time"])
uu["mth"] = uu["method"] + " " + uu["time type"]
uu
Out[33]:
In [34]:
lm = sns.lmplot(data=uu, x="data size", y="time", hue="mth", order=3, scatter_kws={"s": 3, "marker": "x", "alpha": 0.5})
axes = lm.axes
axes[0,0].set_xlim(30,200)
axes[0,0].set_ylim(0,50)
lines = lm.ax.get_lines()
line = lines[0]
line.set_linestyle("--")
In [ ]:
# Same cubic-trend regression, but colored by time type instead of method.
lm = sns.lmplot(data=uu, x="data size", y="time", hue="time type", order=3, scatter_kws={"s": 3, "marker": "x", "alpha": 0.5})
axes = lm.axes
axes[0,0].set_xlim(30,200)
axes[0,0].set_ylim(0,50)
lines = lm.ax.get_lines()
line = lines[0]
line.set_linestyle("--")
# line.set_marker("s")
# NOTE(review): overwrites the figure saved by the earlier lmplot cell.
plt.savefig(op.join(latex_dir, "msgc_time_datasize_plot.pdf"), dpi=1000)
# axes[0,1].set_ylim(0,)
# lm.ax.get_lines()
In [ ]:
In [ ]:
# this works only for non-duplicated values of "data size"
# sns.tsplot(data=df, time="data size", value="time", unit="method", condition="method")
# plt.savefig(op.join(latex_dir, "msgc_size_time.pdf"), dpi=1000)
In [ ]:
line.set_marker("s")
In [ ]:
# df
In [ ]:
# df["method"]
In [ ]:
# Paired t-test: ssgc vs msgc_lo2hi runtimes on the large-data subset.
from scipy import stats
dfs_plus_describe = dfs_plus.describe()
display(dfs_plus_describe)
print("pokud je pvalue mensi nez zvolena hladina vyznamnosti (0.01=1%), je vsechno ok")
# BUG FIX: the "method" column stores the padded labels ("ssgc ",
# "msgc_lo2hi " — note trailing spaces, see the `labels` list), so an exact
# comparison with "ssgc" selected nothing; strip the padding first.
tt = stats.ttest_rel(
    dfs_plus.loc[dfs_plus["method"].str.strip() == "ssgc"]['time'],
    dfs_plus.loc[dfs_plus["method"].str.strip() == "msgc_lo2hi"]['time'],
)
# tt
In [ ]:
# Pairwise paired t-tests between all three methods on the large-data
# subset; the p-values are exported as LaTeX snippets for the paper.
ssgc_rows = dfs_plus[dfs_plus["method"].str.contains(labels[0])]
ssgc_hi2lo_rows = dfs_plus[dfs_plus["method"].str.contains(labels[1])]
ssgc_lo2hi_rows = dfs_plus[dfs_plus["method"].str.contains(labels[2])]
pp0 = stats.ttest_rel(ssgc_rows["time"], ssgc_hi2lo_rows["time"])
pp1 = stats.ttest_rel(ssgc_rows["time"], ssgc_lo2hi_rows["time"])
pp2 = stats.ttest_rel(ssgc_hi2lo_rows["time"], ssgc_lo2hi_rows["time"])
print("pokud je pvalue mensi nez zvolena hladina vyznamnosti (0.01=1%), je vsechno ok")
# maybe twice the significance level would be sufficient
print("statistic musi byt vetsi nez 0")
display(pp0)
display(pp1)
display(pp2)
float_to_latex_file(pp0.pvalue, op.join(latex_dir, "ttest_pvalue_ssgc_msgc_hi2lo.tex"))
float_to_latex_file(pp1.pvalue, op.join(latex_dir, "ttest_pvalue_ssgc_msgc_lo2hi.tex"))
float_to_latex_file(pp2.pvalue, op.join(latex_dir, "ttest_pvalue_msgc_hi2lo_msgc_lo2hi.tex"))
In [ ]:
# dfs_plus["method"] == "ssgc "
In [ ]:
# Summary statistics of the filtered subset and the large-size subset.
dfs_describe = dfs.describe()
display(dfs_describe)
dfs_plus_describe = dfs_plus.describe()
display(dfs_plus_describe)
In [ ]:
# Number of data setups in the large-size subset (rows divided by the
# number of methods), exported for the paper.
dfs_plus_size = int(len(dfs_plus) / len(labels))
to_file(str(dfs_plus_size), op.join(latex_dir, "msgc_dataset_subset_size.tex"))
In [ ]:
# NOTE(review): these column names ("GC total time", "MSGC time", ...)
# belong to an older result schema (see the commented-out renames in the
# In [16] cell); against the current CSVs this cell most likely raises
# KeyError — verify before relying on its output.
df_mn = df[["GC total time", "MSGC total time", "GC time", "MSGC time"]].describe()
display(df_mn)
to_latex_file(df_mn, "../includes/exp062-all2data_size.tex")
dfs_mn = dfs[["GC total time", "MSGC total time", "GC time", "MSGC time"]].describe()
display(dfs_mn)
to_latex_file(dfs_mn, "../includes/exp062-selection2data_size.tex")
In [ ]:
dfs_plus[["method"]]