In [141]:
import pandas as pd
# Load the page-object usage records and derive two helper columns:
#   id    - 1-based integer code per distinct page_method (order of first appearance)
#   calls - constant 1 per row, used later for counting
tests = pd.read_csv("../notebooks/datasets/test_page_objects.csv").assign(
    id=lambda frame: pd.factorize(frame.page_method)[0] + 1,
    calls=1,
)
tests
Out[141]:
In [142]:
# Sum the method ids per (test_method, page_object, page_method) group,
# keeping the groups in order of first appearance (sort=False).
# NOTE(review): summing a factorized id makes repeated calls look like a
# different id — confirm this is the intended encoding.
grouped = (
    tests
    .groupby(['test_method', 'page_object', 'page_method'], sort=False)[['id']]
    .sum()
)
# Move page_object / page_method (group levels 1 and 2) into the columns;
# combinations that never occur become 0.
test_matrix = grouped.unstack(level=[1, 2]).fillna(0)
test_matrix
Out[142]:
In [143]:
# Stacked area chart with one series per test_matrix row (the matrix is transposed first).
test_matrix.transpose().plot(kind="area", stacked=True, alpha=0.7)
Out[143]:
In [157]:
# Heatmap of the pairwise column correlations of test_matrix.
# NOTE(review): `sns` is not imported in any visible cell — presumably
# `import seaborn as sns` ran earlier; confirm it exists for Restart & Run All.
sns.heatmap(data=test_matrix.corr())
Out[157]:
In [144]:
# Heatmap of the raw test_matrix values.
# NOTE(review): `sns` is not imported in any visible cell — presumably
# `import seaborn as sns` ran earlier; confirm it exists for Restart & Run All.
sns.heatmap(data=test_matrix)
Out[144]:
In [145]:
# Grouped bar chart of the transposed matrix (one bar series per test_matrix row).
test_matrix.transpose().plot(kind="bar", alpha=0.7)
Out[145]:
In [146]:
# Count rows per (test_method, page_object, page_method) group, in order of
# first appearance, then spread page_object / page_method into the columns.
# This rebinds `grouped` and `test_matrix`, replacing the id-sum variant.
grouped = tests.groupby(
    by=['test_method', 'page_object', 'page_method'],
    sort=False,
)[['calls']].count()
test_matrix = grouped.unstack(level=[1, 2]).fillna(value=0)
test_matrix
Out[146]:
In [147]:
# Horizontal stacked bars of the transposed call-count matrix.
test_matrix.transpose().plot(kind="barh", alpha=0.7, stacked=True)
Out[147]:
In [148]:
# Vertical stacked bars of the transposed call-count matrix.
test_matrix.transpose().plot(kind="bar", alpha=0.7, stacked=True)
Out[148]:
In [149]:
# Stacked area chart of the transposed call-count matrix.
test_matrix.transpose().plot(kind="area", alpha=0.7, stacked=True)
Out[149]:
In [161]:
import os
import subprocess
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz
def get_iris_data():
    """Get the iris data, from local csv or pandas repo.

    If ``iris.csv`` exists in the current working directory it is read with
    its first column as the index. Otherwise the file is downloaded from
    GitHub, cached as ``iris.csv`` in the working directory, and returned.

    Returns
    -------
    pandas.DataFrame
        The iris data.

    Raises
    ------
    SystemExit
        If the download fails; the underlying error is chained as the cause.
    """
    if os.path.exists("iris.csv"):
        print("-- iris.csv found locally")
        return pd.read_csv("iris.csv", index_col=0)

    print("-- trying to download from github")
    # NOTE(review): this URL points at the old pydata/pandas layout — confirm
    # it still resolves.
    fn = "https://raw.githubusercontent.com/pydata/pandas/" + \
         "master/pandas/tests/data/iris.csv"
    try:
        df = pd.read_csv(fn)
    except Exception as exc:
        # `except Exception` (not a bare except) lets KeyboardInterrupt
        # through. SystemExit is raised directly because `exit` is an
        # interactive helper, not a guaranteed builtin.
        raise SystemExit("-- Unable to download iris.csv") from exc

    # newline="" stops the csv writer from emitting blank lines between rows
    # on platforms that translate line endings.
    with open("iris.csv", "w", newline="") as f:
        print("-- writing to local iris.csv file")
        df.to_csv(f)
    return df
# Load the iris frame (local cache or download). Note that `df` is rebound
# to a frame derived from test_matrix in a later cell.
df = get_iris_data()
In [163]:
# Preview the first five rows.
df.head(n=5)
Out[163]:
In [184]:
# Turn the row index of test_matrix back into ordinary columns.
# NOTE(review): this rebinds `df`, which an earlier cell assigned from
# get_iris_data() — confirm the reuse of the name is intended.
df = test_matrix.reset_index()
df
Out[184]:
In [185]:
def encode_target(df, target_column):
    """Add an integer-coded ``Target`` column derived from ``target_column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied; the caller's frame is not modified.
    target_column : str
        Name of the column whose distinct values are encoded.

    Returns
    -------
    tuple
        ``(df_mod, targets)``: the copy with the extra ``Target`` column, and
        the distinct values of ``target_column`` as returned by ``unique()``.
        ``targets[n]`` is the value that was encoded as ``n``.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    # `map` does a plain per-value lookup. `replace` with a dict depends on
    # an implicit dtype downcast that recent pandas versions deprecate.
    df_mod["Target"] = df_mod[target_column].map(map_to_int)
    return (df_mod, targets)
# Encode the test_method column as integer class labels and preview the result.
df2, targets = encode_target(df, target_column="test_method")
df2.head(n=5)
Out[185]:
In [196]:
# Feature columns: everything except the first and the last column of df2.
features = df2.columns[1:-1].tolist()
features
Out[196]:
In [218]:
# Fit a decision tree on the full frame (no held-out split in this cell);
# random_state is fixed so repeated runs build the same tree.
X = df2[features]
y = df2["Target"]
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X=X, y=y)
Out[218]:
In [217]:
def visualize_tree(tree, feature_names, class_names=None):
    """Write a decision tree to ``dt.dot`` and render it as ``dt.png``.

    Parameters
    ----------
    tree : fitted decision tree
        Estimator passed to ``export_graphviz``.
    feature_names : sequence
        Names of the features, forwarded to ``export_graphviz``.
    class_names : sequence, optional
        Names of the target classes. When omitted, the notebook-level
        ``df2.test_method`` is used, as in the original cell.

    Raises
    ------
    SystemExit
        If the ``dot`` executable cannot be started or exits with an error;
        the underlying error is chained as the cause.
    """
    if class_names is None:
        # Backward-compatible fallback to the global frame.
        class_names = df2.test_method

    with open("dt.dot", 'w') as f:
        export_graphviz(
            tree,
            out_file=f,
            feature_names=feature_names,
            # The documented options are 'all', 'root' and 'none';
            # None is not a valid value for this parameter.
            label="none",
            leaves_parallel=True,
            impurity=False,
            class_names=class_names,
        )

    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
    try:
        subprocess.check_call(command)
    except (OSError, subprocess.CalledProcessError) as exc:
        # OSError: `dot` missing or not executable.
        # CalledProcessError: `dot` ran but returned a non-zero exit status.
        raise SystemExit("Could not run dot, ie graphviz, to "
                         "produce visualization") from exc
# Render the fitted tree to dt.dot / dt.png.
visualize_tree(tree=dt, feature_names=features)
In [150]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the rows of test_matrix.
test_sim = cosine_similarity(X=test_matrix)
test_sim
Out[150]:
In [151]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
# Regression inputs: the transposed similarity matrix as features and a
# vector of ones (one entry per row of that matrix) as the target.
y = np.ones(test_sim.T.shape[0])
y
X = test_sim.T
X

# #############################################################################
# Compute paths
# Fit one intercept-free ridge model per alpha in [10**0, 10**1] and collect
# the coefficient vectors.
n_alphas = 2000
alphas = np.logspace(0, 1, n_alphas)

coefs = []
for a in alphas:
    ridge = linear_model.Ridge(alpha=a, fit_intercept=False).fit(X, y)
    coefs.append(ridge.coef_)

# #############################################################################
# Display results
ax = plt.figure(figsize=(20, 20)).gca()

ax.plot(alphas, coefs)
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
ax.set_xlabel('alpha')
ax.set_ylabel('weights')
ax.set_title('Ridge coefficients as a function of the regularization')
ax.axis('tight')
plt.show()
In [152]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
# X is the 10x10 Hilbert matrix: X[i, j] = 1 / (i + j + 1) for 0-based i, j.
X = 1.0 / np.add.outer(np.arange(10), np.arange(1, 11))
# Target vector of ten ones.
y = np.ones(10)
pd.DataFrame(X)
Out[152]:
In [153]:
import pandas as pd
# Read the distance matrix; the first two rows form the column labels and
# the first two columns form the row labels.
distance_df = pd.read_excel(
    "../notebooks/datasets/test_distance_matrix.xlsx",
    header=[0, 1],
    index_col=[0, 1],
)
# show only subset of data
distance_df.iloc[:5, :2]
# Complement (1 - value) of the top-left 10x10 block as a plain array —
# presumably turning distances into similarities; confirm.
X = 1 - distance_df.iloc[:10, :10].values
X
Out[153]:
In [ ]:
In [154]:
import pandas as pd
# Load the test -> production-code invocation records.
# NOTE(review): this rebinds `tests`, which an earlier cell used for the
# page-object data — confirm the reuse is intended.
tests = pd.read_csv("../notebooks/datasets/test_code_invocations.csv", sep=";")

# One row per (test_type, test_method), one column per (prod_type, prod_method);
# combinations without data are filled with 0.
invocation_matrix = tests.pivot_table(
    index=['test_type', 'test_method'],
    columns=['prod_type', 'prod_method'],
    values='invocations',
    fill_value=0
)

# Complement of (at most) the top-left 100x100 block of the matrix.
X = 1 - invocation_matrix.iloc[0:100, 0:100].values
# Size the target from X instead of hard-coding 100: the iloc slice yields
# fewer rows when the matrix is smaller, and Ridge.fit needs one target per row.
y = np.ones(X.shape[0])

# #############################################################################
# Compute paths
# NOTE(review): np, linear_model and plt are imported by earlier cells, not here.
n_alphas = 200
alphas = np.logspace(-10, -2, n_alphas)

coefs = []
for a in alphas:
    ridge = linear_model.Ridge(alpha=a, fit_intercept=False)
    ridge.fit(X, y)
    coefs.append(ridge.coef_)

# #############################################################################
# Display results
plt.figure(figsize=[20,20])
ax = plt.gca()

ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.show()