In [141]:
# Load the page-object call records: one row per (test_method, page_method) call.
import pandas as pd

tests = pd.read_csv("../notebooks/datasets/test_page_objects.csv")
# Give every distinct page method a stable 1-based numeric id, and a unit
# call count so the table can be aggregated into matrices later.
tests['id'] = pd.factorize(tests.page_method)[0] + 1
tests['calls'] = 1

tests


Out[141]:
test_method page_object page_method id calls
0 testLogin StartPage goToStartSite 1 1
1 testLogin StartPage clickLogin 2 1
2 testLogin StartPage typeUser 3 1
3 testLogin StartPage typePassword 4 1
4 testLogin StartPage checkLogin 5 1
5 testUserNameChange StartPage goToStartSite 1 1
6 testUserNameChange StartPage clickLogin 2 1
7 testUserNameChange StartPage typeUser 3 1
8 testUserNameChange StartPage typePassword 4 1
9 testUserNameChange StartPage goToSettings 6 1
10 testUserNameChange UserPage changePassword 7 1
11 testUserNameChange UserPage checkChangedPassword 8 1
12 testUserNameChange UserPage logOff 9 1
13 testNewBlogPost StartPage goToStartSite 1 1
14 testNewBlogPost StartPage clickLogin 2 1
15 testNewBlogPost StartPage typeUser 3 1
16 testNewBlogPost StartPage typePassword 4 1
17 testNewBlogPost StartPage clickNewBlogPost 10 1
18 testNewBlogPost BlogPage checkBlogPost 11 1
19 testNewBlogPost BlogPage logOff 9 1
20 testReadLatestBlogPost StartPage goToStartSite 1 1
21 testReadLatestBlogPost StartPage clickLatestBlogPosts 12 1
22 testReadLatestBlogPost BlogPage checkBlogPost 11 1
23 testReadLatestBlogPost BlogPage logOff 9 1

In [142]:
# Pivot into a test_method x (page_object, page_method) matrix of the summed
# ids; combinations a test never calls become 0.
id_sums = tests.groupby(['test_method', 'page_object', 'page_method'], sort=False)[['id']].sum()
test_matrix = id_sums.unstack(level=[1, 2]).fillna(0)
test_matrix


Out[142]:
id
page_object StartPage UserPage StartPage BlogPage StartPage
page_method goToStartSite clickLogin typeUser typePassword checkLogin goToSettings changePassword checkChangedPassword logOff clickNewBlogPost checkBlogPost logOff clickLatestBlogPosts
test_method
testLogin 1.0 2.0 3.0 4.0 5.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
testUserNameChange 1.0 2.0 3.0 4.0 0.0 6.0 7.0 8.0 9.0 0.0 0.0 0.0 0.0
testNewBlogPost 1.0 2.0 3.0 4.0 0.0 0.0 0.0 0.0 0.0 10.0 11.0 9.0 0.0
testReadLatestBlogPost 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 11.0 9.0 12.0

In [143]:
# Area chart: page methods on the x-axis, one stacked layer per test method.
ax = test_matrix.transpose().plot.area(alpha=0.7)
ax


Out[143]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fa3a559e80>

In [157]:
# NOTE(review): seaborn is not imported in any visible cell — import it here
# so this cell survives Restart & Run All (confirm it is not imported in an
# earlier, unseen cell).
import seaborn as sns

# Correlation between page-method columns across the test methods.
sns.heatmap(test_matrix.corr())


Out[157]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fa342bc2b0>

In [144]:
# NOTE(review): seaborn is not imported in any visible cell — import it here
# so this cell survives Restart & Run All (confirm it is not imported in an
# earlier, unseen cell).
import seaborn as sns

# Raw heatmap of the test x page-method matrix.
sns.heatmap(test_matrix)


Out[144]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fa38b199b0>

In [145]:
# Grouped bars: one bar per test method for each page method.
ax = test_matrix.transpose().plot.bar(alpha=0.7)
ax


Out[145]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fa38b19710>

In [146]:
# Same pivot as before, but with plain call counts instead of method ids.
call_counts = tests.groupby(['test_method', 'page_object', 'page_method'], sort=False)[['calls']].count()
test_matrix = call_counts.unstack(level=[1, 2]).fillna(0)
test_matrix


Out[146]:
calls
page_object StartPage UserPage StartPage BlogPage StartPage
page_method goToStartSite clickLogin typeUser typePassword checkLogin goToSettings changePassword checkChangedPassword logOff clickNewBlogPost checkBlogPost logOff clickLatestBlogPosts
test_method
testLogin 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
testUserNameChange 1.0 1.0 1.0 1.0 0.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0
testNewBlogPost 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 0.0
testReadLatestBlogPost 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0

In [147]:
# Horizontal stacked bars: which tests exercise each page method.
ax = test_matrix.transpose().plot.barh(alpha=0.7, stacked=True)
ax


Out[147]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fa32d71c88>

In [148]:
# Vertical stacked bars of the same call-count matrix.
ax = test_matrix.transpose().plot.bar(alpha=0.7, stacked=True)
ax


Out[148]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fa3618b1d0>

In [149]:
# Stacked area chart of the call-count matrix.
ax = test_matrix.transpose().plot.area(alpha=0.7, stacked=True)
ax


Out[149]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fa2e833668>

In [161]:
import os
import subprocess

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz

def get_iris_data():
    """Return the iris dataset as a DataFrame.

    Uses a cached local ``iris.csv`` when present; otherwise downloads it
    from the pandas GitHub repository and caches it locally so later runs
    are offline.
    """
    if os.path.exists("iris.csv"):
        print("-- iris.csv found locally")
        df = pd.read_csv("iris.csv", index_col=0)
    else:
        print("-- trying to download from github")
        # NOTE(review): the pydata/pandas repository moved to pandas-dev/pandas,
        # so this URL may 404 — verify the download path before relying on it.
        fn = "https://raw.githubusercontent.com/pydata/pandas/" + \
             "master/pandas/tests/data/iris.csv"
        try:
            df = pd.read_csv(fn)
        except Exception as exc:
            # A bare `except:` would also swallow KeyboardInterrupt/SystemExit;
            # fail loudly with the underlying reason instead.
            raise SystemExit("-- Unable to download iris.csv: %s" % exc)

        with open("iris.csv", 'w') as f:
            print("-- writing to local iris.csv file")
            df.to_csv(f)

    return df

df = get_iris_data()


-- trying to download from github
-- writing to local iris.csv file

In [163]:
# Sanity check: first rows of the iris frame.
df.head()


Out[163]:
SepalLength SepalWidth PetalLength PetalWidth Name
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa

In [184]:
# NOTE(review): this rebinds `df` — previously the iris frame — to the test
# call-count matrix; the iris data is no longer reachable under this name.
df = test_matrix.reset_index()
df


Out[184]:
test_method calls
page_object StartPage UserPage StartPage BlogPage StartPage
page_method goToStartSite clickLogin typeUser typePassword checkLogin goToSettings changePassword checkChangedPassword logOff clickNewBlogPost checkBlogPost logOff clickLatestBlogPosts
0 testLogin 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 testUserNameChange 1.0 1.0 1.0 1.0 0.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0
2 testNewBlogPost 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 0.0
3 testReadLatestBlogPost 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0

In [185]:
def encode_target(df, target_column):
    """Return (copy of ``df`` with a numeric "Target" column, target names).

    Each distinct value of ``target_column`` is encoded as its position in
    the returned array of unique names; the input frame is not modified.
    """
    encoded = df.copy()
    names = encoded[target_column].unique()
    code_by_name = {name: position for position, name in enumerate(names)}
    encoded["Target"] = encoded[target_column].replace(code_by_name)
    return (encoded, names)

# Encode the test_method names to integer class labels for the classifier.
df2, targets = encode_target(df, "test_method")
df2.head()


Out[185]:
test_method calls Target
page_object StartPage UserPage StartPage BlogPage StartPage
page_method goToStartSite clickLogin typeUser typePassword checkLogin goToSettings changePassword checkChangedPassword logOff clickNewBlogPost checkBlogPost logOff clickLatestBlogPosts
0 testLogin 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0
1 testUserNameChange 1.0 1.0 1.0 1.0 0.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 1
2 testNewBlogPost 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 0.0 2
3 testReadLatestBlogPost 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 3

In [196]:
# Feature columns are all (calls, page_object, page_method) tuples: skip the
# leading test_method column and the trailing Target label column.
features = df2.columns[1:-1].tolist()
features


Out[196]:
[('calls', 'StartPage', 'goToStartSite'),
 ('calls', 'StartPage', 'clickLogin'),
 ('calls', 'StartPage', 'typeUser'),
 ('calls', 'StartPage', 'typePassword'),
 ('calls', 'StartPage', 'checkLogin'),
 ('calls', 'StartPage', 'goToSettings'),
 ('calls', 'UserPage', 'changePassword'),
 ('calls', 'UserPage', 'checkChangedPassword'),
 ('calls', 'UserPage', 'logOff'),
 ('calls', 'StartPage', 'clickNewBlogPost'),
 ('calls', 'BlogPage', 'checkBlogPost'),
 ('calls', 'BlogPage', 'logOff'),
 ('calls', 'StartPage', 'clickLatestBlogPosts')]

In [218]:
# Fit a decision tree that maps page-method call patterns to test methods.
y = df2["Target"]
X = df2[features]
dt = DecisionTreeClassifier(random_state=0)  # fixed seed for reproducible splits
dt.fit(X, y)


Out[218]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [217]:
def visualize_tree(tree, feature_names):
    """Export ``tree`` to dt.dot and render it to dt.png with graphviz.

    Requires the ``dot`` executable on PATH.  Uses the module-level ``df2``
    frame for the class names shown in the rendered tree.
    """
    with open("dt.dot", 'w') as f:
        export_graphviz(tree, out_file=f,
                        feature_names=feature_names,
                        label=None,
                        leaves_parallel=True,
                        impurity=False,
                        class_names=df2.test_method)

    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
    try:
        subprocess.check_call(command)
    except (OSError, subprocess.CalledProcessError) as exc:
        # A bare `except:` would also swallow KeyboardInterrupt; name the
        # expected failures (dot missing, or dot exiting non-zero) and keep
        # the underlying cause in the message.
        raise SystemExit("Could not run dot, ie graphviz, to "
                         "produce visualization: %s" % exc)

visualize_tree(dt, features)

In [150]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the tests' page-method call vectors.
test_sim = cosine_similarity(test_matrix)
test_sim


Out[150]:
array([[ 1.        ,  0.63245553,  0.6761234 ,  0.2236068 ],
       [ 0.63245553,  1.        ,  0.53452248,  0.1767767 ],
       [ 0.6761234 ,  0.53452248,  1.        ,  0.56694671],
       [ 0.2236068 ,  0.1767767 ,  0.56694671,  1.        ]])

In [151]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

# Regress a constant target on the test-similarity matrix and trace how the
# ridge coefficients shrink as the regularization strength grows.
y = np.ones(len(test_sim.T))
y

X = test_sim.T
X

# #############################################################################
# Compute paths

n_alphas = 2000
alphas = np.logspace(0, 1, n_alphas)

# One coefficient vector per regularization strength.
coefs = [
    linear_model.Ridge(alpha=strength, fit_intercept=False).fit(X, y).coef_
    for strength in alphas
]

# #############################################################################
# Display results
plt.figure(figsize=[20, 20])
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.show()



In [152]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

# X is the 10x10 Hilbert matrix: X[i, j] = 1 / (i + j + 1), built by
# broadcasting a row vector against a column vector.
row = np.arange(1, 11)
col = np.arange(0, 10)[:, np.newaxis]
X = 1. / (row + col)
y = np.ones(10)
pd.DataFrame(X)


Out[152]:
0 1 2 3 4 5 6 7 8 9
0 1.000000 0.500000 0.333333 0.250000 0.200000 0.166667 0.142857 0.125000 0.111111 0.100000
1 0.500000 0.333333 0.250000 0.200000 0.166667 0.142857 0.125000 0.111111 0.100000 0.090909
2 0.333333 0.250000 0.200000 0.166667 0.142857 0.125000 0.111111 0.100000 0.090909 0.083333
3 0.250000 0.200000 0.166667 0.142857 0.125000 0.111111 0.100000 0.090909 0.083333 0.076923
4 0.200000 0.166667 0.142857 0.125000 0.111111 0.100000 0.090909 0.083333 0.076923 0.071429
5 0.166667 0.142857 0.125000 0.111111 0.100000 0.090909 0.083333 0.076923 0.071429 0.066667
6 0.142857 0.125000 0.111111 0.100000 0.090909 0.083333 0.076923 0.071429 0.066667 0.062500
7 0.125000 0.111111 0.100000 0.090909 0.083333 0.076923 0.071429 0.066667 0.062500 0.058824
8 0.111111 0.100000 0.090909 0.083333 0.076923 0.071429 0.066667 0.062500 0.058824 0.055556
9 0.100000 0.090909 0.083333 0.076923 0.071429 0.066667 0.062500 0.058824 0.055556 0.052632

In [153]:
import pandas as pd

# Distance matrix with MultiIndex rows and columns:
# (test_type, test_method) x (prod_type, prod_method).
distance_df = pd.read_excel(
    "../notebooks/datasets/test_distance_matrix.xlsx",
    index_col=[0, 1],
    header=[0, 1])
# show only subset of data
distance_df.iloc[:5, :2]

# Convert the first 10x10 sub-block from distances to similarities.
X = 1 - distance_df.iloc[0:10, 0:10].values
X


Out[153]:
array([[ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.95940322,  0.35355339,  0.35355339],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.95940322,  1.        ,  0.60302269,  0.60302269],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.35355339,  0.60302269,  1.        ,  1.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.35355339,  0.60302269,  1.        ,  1.        ]])

In [ ]:


In [154]:
import pandas as pd

# NOTE(review): this cell relies on np, linear_model and plt imported by
# earlier cells — confirm those run first under Restart & Run All.
tests = pd.read_csv("../notebooks/datasets/test_code_invocations.csv", sep=";")
invocation_matrix = tests.pivot_table(
    index=['test_type', 'test_method'],
    columns=['prod_type', 'prod_method'],
    values='invocations',
    fill_value=0
)

# Similarity-style matrix for (at most) the first 100 tests/methods.
X = 1 - invocation_matrix.iloc[0:100, 0:100].values
# Size y from X instead of hard-coding 100: if the pivot has fewer than 100
# rows, the original np.ones(100) would make ridge.fit fail on a shape
# mismatch.
y = np.ones(X.shape[0])
X

# #############################################################################
# Compute paths

n_alphas = 200
alphas = np.logspace(-10, -2, n_alphas)

coefs = []
for a in alphas:
    ridge = linear_model.Ridge(alpha=a, fit_intercept=False)
    ridge.fit(X, y)
    coefs.append(ridge.coef_)

# #############################################################################
# Display results
plt.figure(figsize=[20, 20])
ax = plt.gca()

ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis

plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.show()