In [1]:
%%html
<style>
/* Bob J: no green highlights (matching brackets are drawn in black) */
.cm-s-ipython .CodeMirror-matchingbracket { color: black !important; }
/* Move MultiIndex headers to top of block */
table.dataframe th { vertical-align: top; }
/* thin border around tables */
table.dataframe td, table.dataframe th { border-style: solid; border-width: thin; }
</style>
In [2]:
%%javascript # Prefer to display output instead of scrolling, so it can print
IPython.OutputArea.prototype._should_scroll = function(lines) {
return false;
}
In [3]:
# Standard definitions and options
import pandas as pd
from pandas import DataFrame
import os
import numpy as np
from datetime import datetime
%matplotlib inline
import matplotlib.pyplot as plt
# default figure size for all plots in this notebook
plt.rcParams['figure.figsize'] = (15.0, 10.0)
# widen the page to match the window
# `display` and `HTML` come from the public IPython.display module; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
# full width display of tables (show up to 50 columns before truncating)
pd.options.display.max_columns = 50
# want to display all rows
pd.options.display.max_rows = None
In [4]:
import json

# read log in JSON form, as processed by, i.e., transfer.php
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform's locale default.
with open('log.json', 'r', encoding='utf-8') as logFile:
    samples_dict = json.load(logFile)
In [ ]:
In [5]:
# Maps each downloadable file name to the label of the exercise it belongs to.
# The labels start with a number so that they sort in course order; several
# files can share one label.  Built from (label, files) groups, in the same
# order in which the files were originally listed.
exerciseName = {
    fileName: label
    for label, fileNames in [
        (" 1 Testing 1", ["exercise1.tar"]),
        (" 2 Testing 2 JUnit", ["exercise2.tar"]),
        (" 3 Testing 3 CppUnit", ["exerciseD.tar"]),
        (" 4 Testing 3 PyUnit", ["exerciseP.tar"]),
        (" 5 Profiling 1", ["exercise3.tar"]),
        (" 6 Profiling 2", ["exercise4.tar"]),
        (" 7 Memory", ["exercise5.tar"]),
        (" 8 SVN", ["exerciseC.tar"]),
        (" 9 Git 2", ["exG2script.sh"]),
        ("10 Git 3", ["exG3script.sh", "exG3MasterRepo.tar"]),
        ("11 CMake 1", ["exerciseM1.tar", "exerciseM1A.tar"]),
        ("12 CMake 2", ["exerciseM2.tar", "exerciseM2A.tar", "exerciseM2B.tar"]),
        ("13 CMT 1", ["exercise7.tar", "exercise7extras.tar"]),
        ("14 CMT 2", ["exercise8.tar"]),
        ("15 CMT 3", ["exercise9.tar"]),
        ("16 Dist Git", ["exerciseF.tar"]),
        ("17 CMT Build", ["cmt-dev-all.tgz"]),
    ]
    for fileName in fileNames
}
In [6]:
# Build the table from all log samples in a single step.
# Growing a frame row by row with DataFrame.append copies the whole table on
# every iteration, and that method no longer exists in pandas 2.x.
# Assumes samples_dict is a sequence of per-event dicts -- TODO confirm against log.json.
df = DataFrame(list(samples_dict))
# Fields missing from a sample show up as NaN; turn them into empty strings.
df = df.replace(np.nan, '', regex=True)
# Add column
def lookUpCanFail(array, item, failValue):
    """Return ``array[item]``, or ``failValue`` when the lookup fails.

    Parameters
    ----------
    array : object supporting ``[]`` (dict, list, ...)
    item : key or index to look up
    failValue : value handed back when ``item`` cannot be looked up

    Only lookup failures are absorbed: a missing key or out-of-range index
    (``LookupError``) or a key of an unusable type, e.g. an unhashable one
    (``TypeError``).  Anything else -- notably ``KeyboardInterrupt`` and
    ``SystemExit`` -- propagates to the caller.
    """
    try:
        return array[item]
    except (LookupError, TypeError):
        return failValue
# Label every log row with the exercise its file belongs to; a row whose
# file is not a key of exerciseName gets an empty label.
# Only the 'file' column is needed, so map over that column instead of
# materialising every row with apply(axis=1).
df['Exercise'] = df['file'].map(
    lambda fileName: lookUpCanFail(exerciseName, fileName, ""))
In [7]:
# Show every log row recorded for one particular user.
display(df.loc[df["user"] == "csc_armenuhi"])
In [8]:
# Count of log rows per client IP address (non-null REMOTE_HOST entries).
entriesPerIP = df.groupby("IP")["REMOTE_HOST"].count()
display(entriesPerIP)
In [9]:
# Distinct host names, as reported by the clients themselves.
display(df["host"].unique())
In [10]:
# Distinct user names appearing in the log.
display(df["user"].unique())
In [11]:
# Latest 'setup' event of each user.
# keep='last' retains the final occurrence in table order, i.e. the most
# recent one provided the log is in chronological order -- TODO confirm.
t1 = df.loc[df["item1"] == 'setup']
latestSetup = t1.drop_duplicates(subset=['user'], keep='last')
latestSetup[["DATE", "IP", "host", "user"]]
Out[11]:
In [12]:
# plot that start up time
EXERCISE_START = "2019-09-17 04:00:00"    # start of the exercise period
FIRST_PERIOD_END = "2019-09-17 07:30:00"  # end of the first exercise period


def plotSetupTimes(setups):
    """Plot the running count of 'setup' events against their time stamps.

    setups : DataFrame with a "DATE" column that pd.to_datetime can parse.
        Rows are numbered 0, 1, 2, ... in table order and that number is
        plotted against the row's date.
    """
    times = pd.to_datetime(setups["DATE"])
    count = np.arange(0, len(times))
    # plt.plot_date is deprecated; plt.plot handles datetime values itself and
    # 'o' is the marker format plot_date applied by default.
    plt.plot(times, count, 'o')
    plt.gcf().autofmt_xdate()
    plt.show()


t1 = df[df.item1 == 'setup']
# restrict to exercise period
# NOTE(review): DATE is compared as text here, which is chronological only
# for "YYYY-MM-DD HH:MM:SS"-style strings -- confirm the log format.
t1 = t1[t1.DATE > EXERCISE_START]
plotSetupTimes(t1)
# and focus on first exercise period to see startup
t1 = t1[t1.DATE < FIRST_PERIOD_END]
plotSetupTimes(t1)
In [13]:
# most recent 'Exercise' by user
# Rows without a recognised exercise carry an empty-string label, never NaN,
# so notnull() would keep all of them; compare against "" to drop them.
t1 = df[df["Exercise"] != ""]
# keep='last' relies on table order to pick each user's latest row
t2 = t1.drop_duplicates(['user'], keep='last').sort_values(['user'])
t2[['DATE','user','Exercise']]
Out[13]:
In [14]:
# count most recent by Exercise
# NaN was replaced by '' when the table was built, so "has a file" must be
# tested against the empty string too; notnull() alone keeps every row.
hasFile = df["file"].notnull() & (df["file"] != "")
t1 = df[hasFile]
# last file-carrying row of each user (in table order)
t2 = t1.drop_duplicates(['user'], keep='last')
# groupby orders its result by the Exercise key, so no separate sort_values
# step is needed before counting.
t2.groupby("Exercise").count()
Out[14]:
In [15]:
# how many total times for each Exercise
# Unlabelled rows have Exercise == "" rather than NaN, so filter on the empty
# string; with notnull() every non-exercise log row would be counted in a
# blank group.
t1 = df[df["Exercise"] != ""]
t1.groupby(["Exercise"]).count()
Out[15]:
In [ ]: