In [141]:
# Load the page-object call records: one row per (test_method, page_method) call.
import pandas as pd

tests = pd.read_csv("../notebooks/datasets/test_page_objects.csv")
# Give every distinct page method a stable 1-based numeric id, and a unit
# call count so the table can be aggregated into matrices later.
tests['id'] = pd.factorize(tests.page_method)[0] + 1
tests['calls'] = 1

tests


Out[141]:
test_method page_object page_method id calls
0 testLogin StartPage goToStartSite 1 1
1 testLogin StartPage clickLogin 2 1
2 testLogin StartPage typeUser 3 1
3 testLogin StartPage typePassword 4 1
4 testLogin StartPage checkLogin 5 1
5 testUserNameChange StartPage goToStartSite 1 1
6 testUserNameChange StartPage clickLogin 2 1
7 testUserNameChange StartPage typeUser 3 1
8 testUserNameChange StartPage typePassword 4 1
9 testUserNameChange StartPage goToSettings 6 1
10 testUserNameChange UserPage changePassword 7 1
11 testUserNameChange UserPage checkChangedPassword 8 1
12 testUserNameChange UserPage logOff 9 1
13 testNewBlogPost StartPage goToStartSite 1 1
14 testNewBlogPost StartPage clickLogin 2 1
15 testNewBlogPost StartPage typeUser 3 1
16 testNewBlogPost StartPage typePassword 4 1
17 testNewBlogPost StartPage clickNewBlogPost 10 1
18 testNewBlogPost BlogPage checkBlogPost 11 1
19 testNewBlogPost BlogPage logOff 9 1
20 testReadLatestBlogPost StartPage goToStartSite 1 1
21 testReadLatestBlogPost StartPage clickLatestBlogPosts 12 1
22 testReadLatestBlogPost BlogPage checkBlogPost 11 1
23 testReadLatestBlogPost BlogPage logOff 9 1

In [142]:
# Pivot into a test_method x (page_object, page_method) matrix of the summed
# ids; combinations a test never calls become 0.
id_sums = tests.groupby(['test_method', 'page_object', 'page_method'], sort=False)[['id']].sum()
test_matrix = id_sums.unstack(level=[1, 2]).fillna(0)
test_matrix


Out[142]:
id
page_object StartPage UserPage StartPage BlogPage StartPage
page_method goToStartSite clickLogin typeUser typePassword checkLogin goToSettings changePassword checkChangedPassword logOff clickNewBlogPost checkBlogPost logOff clickLatestBlogPosts
test_method
testLogin 1.0 2.0 3.0 4.0 5.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
testUserNameChange 1.0 2.0 3.0 4.0 0.0 6.0 7.0 8.0 9.0 0.0 0.0 0.0 0.0
testNewBlogPost 1.0 2.0 3.0 4.0 0.0 0.0 0.0 0.0 0.0 10.0 11.0 9.0 0.0
testReadLatestBlogPost 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 11.0 9.0 12.0

In [143]:
# Area chart: page methods on the x-axis, one stacked layer per test method.
ax = test_matrix.transpose().plot.area(alpha=0.7)
ax


Out[143]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fa3a559e80>

In [157]:
# NOTE(review): seaborn is not imported in any visible cell — import it here
# so this cell survives Restart & Run All (confirm it is not imported in an
# earlier, unseen cell).
import seaborn as sns

# Correlation between page-method columns across the test methods.
sns.heatmap(test_matrix.corr())


Out[157]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fa342bc2b0>

In [144]:
# NOTE(review): seaborn is not imported in any visible cell — import it here
# so this cell survives Restart & Run All (confirm it is not imported in an
# earlier, unseen cell).
import seaborn as sns

# Raw heatmap of the test x page-method matrix.
sns.heatmap(test_matrix)


Out[144]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fa38b199b0>

In [145]:
# Grouped bars: one bar per test method for each page method.
ax = test_matrix.transpose().plot.bar(alpha=0.7)
ax


Out[145]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fa38b19710>

In [146]:
# Same pivot as before, but with plain call counts instead of method ids.
call_counts = tests.groupby(['test_method', 'page_object', 'page_method'], sort=False)[['calls']].count()
test_matrix = call_counts.unstack(level=[1, 2]).fillna(0)
test_matrix


Out[146]:
calls
page_object StartPage UserPage StartPage BlogPage StartPage
page_method goToStartSite clickLogin typeUser typePassword checkLogin goToSettings changePassword checkChangedPassword logOff clickNewBlogPost checkBlogPost logOff clickLatestBlogPosts
test_method
testLogin 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
testUserNameChange 1.0 1.0 1.0 1.0 0.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0
testNewBlogPost 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 0.0
testReadLatestBlogPost 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0

In [147]:
# Horizontal stacked bars: which tests exercise each page method.
ax = test_matrix.transpose().plot.barh(alpha=0.7, stacked=True)
ax


Out[147]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fa32d71c88>

In [148]:
# Vertical stacked bars of the same call-count matrix.
ax = test_matrix.transpose().plot.bar(alpha=0.7, stacked=True)
ax


Out[148]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fa3618b1d0>

In [149]:
# Stacked area chart of the call-count matrix.
ax = test_matrix.transpose().plot.area(alpha=0.7, stacked=True)
ax


Out[149]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fa2e833668>

In [161]:
import os
import subprocess

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz

def get_iris_data():
    """Return the iris dataset as a DataFrame.

    Uses a cached local ``iris.csv`` when present; otherwise downloads it
    from the pandas GitHub repository and caches it locally so later runs
    are offline.
    """
    if os.path.exists("iris.csv"):
        print("-- iris.csv found locally")
        df = pd.read_csv("iris.csv", index_col=0)
    else:
        print("-- trying to download from github")
        # NOTE(review): the pydata/pandas repository moved to pandas-dev/pandas,
        # so this URL may 404 — verify the download path before relying on it.
        fn = "https://raw.githubusercontent.com/pydata/pandas/" + \
             "master/pandas/tests/data/iris.csv"
        try:
            df = pd.read_csv(fn)
        except Exception as exc:
            # A bare `except:` would also swallow KeyboardInterrupt/SystemExit;
            # fail loudly with the underlying reason instead.
            raise SystemExit("-- Unable to download iris.csv: %s" % exc)

        with open("iris.csv", 'w') as f:
            print("-- writing to local iris.csv file")
            df.to_csv(f)

    return df

df = get_iris_data()


-- trying to download from github
-- writing to local iris.csv file

In [163]:
# Sanity check: first rows of the iris frame.
df.head()


Out[163]:
SepalLength SepalWidth PetalLength PetalWidth Name
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa

In [184]:
# NOTE(review): this rebinds `df` — previously the iris frame — to the test
# call-count matrix; the iris data is no longer reachable under this name.
df = test_matrix.reset_index()
df


Out[184]:
test_method calls
page_object StartPage UserPage StartPage BlogPage StartPage
page_method goToStartSite clickLogin typeUser typePassword checkLogin goToSettings changePassword checkChangedPassword logOff clickNewBlogPost checkBlogPost logOff clickLatestBlogPosts
0 testLogin 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 testUserNameChange 1.0 1.0 1.0 1.0 0.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0
2 testNewBlogPost 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 0.0
3 testReadLatestBlogPost 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0

In [185]:
def encode_target(df, target_column):
    """Return (copy of ``df`` with a numeric "Target" column, target names).

    Each distinct value of ``target_column`` is encoded as its position in
    the returned array of unique names; the input frame is not modified.
    """
    encoded = df.copy()
    names = encoded[target_column].unique()
    code_by_name = {name: position for position, name in enumerate(names)}
    encoded["Target"] = encoded[target_column].replace(code_by_name)
    return (encoded, names)

# Encode the test_method names to integer class labels for the classifier.
df2, targets = encode_target(df, "test_method")
df2.head()


Out[185]:
test_method calls Target
page_object StartPage UserPage StartPage BlogPage StartPage
page_method goToStartSite clickLogin typeUser typePassword checkLogin goToSettings changePassword checkChangedPassword logOff clickNewBlogPost checkBlogPost logOff clickLatestBlogPosts
0 testLogin 1.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0
1 testUserNameChange 1.0 1.0 1.0 1.0 0.0 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 1
2 testNewBlogPost 1.0 1.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 0.0 2
3 testReadLatestBlogPost 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 3

In [196]:
# Feature columns are all (calls, page_object, page_method) tuples: skip the
# leading test_method column and the trailing Target label column.
features = df2.columns[1:-1].tolist()
features


Out[196]:
[('calls', 'StartPage', 'goToStartSite'),
 ('calls', 'StartPage', 'clickLogin'),
 ('calls', 'StartPage', 'typeUser'),
 ('calls', 'StartPage', 'typePassword'),
 ('calls', 'StartPage', 'checkLogin'),
 ('calls', 'StartPage', 'goToSettings'),
 ('calls', 'UserPage', 'changePassword'),
 ('calls', 'UserPage', 'checkChangedPassword'),
 ('calls', 'UserPage', 'logOff'),
 ('calls', 'StartPage', 'clickNewBlogPost'),
 ('calls', 'BlogPage', 'checkBlogPost'),
 ('calls', 'BlogPage', 'logOff'),
 ('calls', 'StartPage', 'clickLatestBlogPosts')]

In [218]:
# Fit a decision tree that maps page-method call patterns to test methods.
y = df2["Target"]
X = df2[features]
dt = DecisionTreeClassifier(random_state=0)  # fixed seed for reproducible splits
dt.fit(X, y)


Out[218]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [217]:
def visualize_tree(tree, feature_names):
    """Export ``tree`` to dt.dot and render it to dt.png with graphviz.

    Requires the ``dot`` executable on PATH.  Uses the module-level ``df2``
    frame for the class names shown in the rendered tree.
    """
    with open("dt.dot", 'w') as f:
        export_graphviz(tree, out_file=f,
                        feature_names=feature_names,
                        label=None,
                        leaves_parallel=True,
                        impurity=False,
                        class_names=df2.test_method)

    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
    try:
        subprocess.check_call(command)
    except (OSError, subprocess.CalledProcessError) as exc:
        # A bare `except:` would also swallow KeyboardInterrupt; name the
        # expected failures (dot missing, or dot exiting non-zero) and keep
        # the underlying cause in the message.
        raise SystemExit("Could not run dot, ie graphviz, to "
                         "produce visualization: %s" % exc)

visualize_tree(dt, features)

In [150]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the tests' page-method call vectors.
test_sim = cosine_similarity(test_matrix)
test_sim


Out[150]:
array([[ 1.        ,  0.63245553,  0.6761234 ,  0.2236068 ],
       [ 0.63245553,  1.        ,  0.53452248,  0.1767767 ],
       [ 0.6761234 ,  0.53452248,  1.        ,  0.56694671],
       [ 0.2236068 ,  0.1767767 ,  0.56694671,  1.        ]])

In [151]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

# Regress a constant target on the test-similarity matrix and trace how the
# ridge coefficients shrink as the regularization strength grows.
y = np.ones(len(test_sim.T))
y

X = test_sim.T
X

# #############################################################################
# Compute paths

n_alphas = 2000
alphas = np.logspace(0, 1, n_alphas)

# One coefficient vector per regularization strength.
coefs = [
    linear_model.Ridge(alpha=strength, fit_intercept=False).fit(X, y).coef_
    for strength in alphas
]

# #############################################################################
# Display results
plt.figure(figsize=[20, 20])
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.show()



In [152]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

# X is the 10x10 Hilbert matrix: X[i, j] = 1 / (i + j + 1), built by
# broadcasting a row vector against a column vector.
row = np.arange(1, 11)
col = np.arange(0, 10)[:, np.newaxis]
X = 1. / (row + col)
y = np.ones(10)
pd.DataFrame(X)


Out[152]:
0 1 2 3 4 5 6 7 8 9
0 1.000000 0.500000 0.333333 0.250000 0.200000 0.166667 0.142857 0.125000 0.111111 0.100000
1 0.500000 0.333333 0.250000 0.200000 0.166667 0.142857 0.125000 0.111111 0.100000 0.090909
2 0.333333 0.250000 0.200000 0.166667 0.142857 0.125000 0.111111 0.100000 0.090909 0.083333
3 0.250000 0.200000 0.166667 0.142857 0.125000 0.111111 0.100000 0.090909 0.083333 0.076923
4 0.200000 0.166667 0.142857 0.125000 0.111111 0.100000 0.090909 0.083333 0.076923 0.071429
5 0.166667 0.142857 0.125000 0.111111 0.100000 0.090909 0.083333 0.076923 0.071429 0.066667
6 0.142857 0.125000 0.111111 0.100000 0.090909 0.083333 0.076923 0.071429 0.066667 0.062500
7 0.125000 0.111111 0.100000 0.090909 0.083333 0.076923 0.071429 0.066667 0.062500 0.058824
8 0.111111 0.100000 0.090909 0.083333 0.076923 0.071429 0.066667 0.062500 0.058824 0.055556
9 0.100000 0.090909 0.083333 0.076923 0.071429 0.066667 0.062500 0.058824 0.055556 0.052632

In [153]:
import pandas as pd

# Distance matrix with MultiIndex rows and columns:
# (test_type, test_method) x (prod_type, prod_method).
distance_df = pd.read_excel(
    "../notebooks/datasets/test_distance_matrix.xlsx",
    index_col=[0, 1],
    header=[0, 1])
# show only subset of data
distance_df.iloc[:5, :2]

# Convert the first 10x10 sub-block from distances to similarities.
X = 1 - distance_df.iloc[0:10, 0:10].values
X


Out[153]:
array([[ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.95940322,  0.35355339,  0.35355339],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.95940322,  1.        ,  0.60302269,  0.60302269],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.35355339,  0.60302269,  1.        ,  1.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.35355339,  0.60302269,  1.        ,  1.        ]])

In [ ]:


In [154]:
import pandas as pd

# NOTE(review): this cell relies on np, linear_model and plt imported by
# earlier cells — confirm those run first under Restart & Run All.
tests = pd.read_csv("../notebooks/datasets/test_code_invocations.csv", sep=";")
invocation_matrix = tests.pivot_table(
    index=['test_type', 'test_method'],
    columns=['prod_type', 'prod_method'],
    values='invocations',
    fill_value=0
)

# Similarity-style matrix for (at most) the first 100 tests/methods.
X = 1 - invocation_matrix.iloc[0:100, 0:100].values
# Size y from X instead of hard-coding 100: if the pivot has fewer than 100
# rows, the original np.ones(100) would make ridge.fit fail on a shape
# mismatch.
y = np.ones(X.shape[0])
X

# #############################################################################
# Compute paths

n_alphas = 200
alphas = np.logspace(-10, -2, n_alphas)

coefs = []
for a in alphas:
    ridge = linear_model.Ridge(alpha=a, fit_intercept=False)
    ridge.fit(X, y)
    coefs.append(ridge.coef_)

# #############################################################################
# Display results
plt.figure(figsize=[20, 20])
ax = plt.gca()

ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis

plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.show()