This notebook aims at building a Kibana dashboard focused on several aspects of interest to the Xen community. It builds the panels described in the sections below.
In [1]:
# Elastic search library
from elasticsearch import Elasticsearch
# python-mysql database access
import MySQLdb
import logging
import pandas
from ConfigParser import ConfigParser
In [2]:
config = ConfigParser()
config.read("settings")
args = {}
# There are two sections: mysql and elasticsearch
if config.has_section("mysql"):
    if config.has_option("mysql", "user") and \
       config.has_option("mysql", "password") and \
       config.has_option("mysql", "mlstats_db") and \
       config.has_option("mysql", "cvsanaly_db") and \
       config.has_option("mysql", "code_review_db"):
        args["mysql"] = dict(config.items("mysql"))

if config.has_section("elasticsearch"):
    if config.has_option("elasticsearch", "user") and \
       config.has_option("elasticsearch", "password") and \
       config.has_option("elasticsearch", "host") and \
       config.has_option("elasticsearch", "port") and \
       config.has_option("elasticsearch", "path"):
        args["elasticsearch"] = dict(config.items("elasticsearch"))

if not (args.has_key("mysql") and args.has_key("elasticsearch")):
    raise Exception("Section 'mysql' or section 'elasticsearch' not found in the 'settings' file")
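For reference, the checks above expect a settings file with the two sections shown below. This is only a minimal sketch with placeholder values; the real credentials, database names and Elasticsearch location depend on the local deployment.

# settings (placeholder values only)
[mysql]
user = mysql_user
password = secret
mlstats_db = xen_mlstats
cvsanaly_db = xen_cvsanaly
code_review_db = xen_code_review

[elasticsearch]
user = es_user
password = secret
host = localhost
port = 9200
path = xen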
In [4]:
def connect(args):
    user = args["mysql"]["user"]
    password = args["mysql"]["password"]
    host = "localhost"
    db = args["mysql"]["code_review_db"]
    try:
        db = MySQLdb.connect(user=user, passwd=password, host=host, db=db, charset='utf8')
        return db, db.cursor()
    except:
        logging.error("Database connection error")
        raise

def execute_query(connector, query):
    results = int(connector.execute(query))
    if results > 0:
        return connector.fetchall()
    else:
        return []

db, cursor = connect(args)
In [5]:
# Insert data in ElasticSearch
def to_json(row, columns):
# Function that translates from tuple to JSON doc
doc = {}
for column in columns:
value = row[columns.index(column) + 1]
try:
doc[column] = value
except:
doc[column] = ""
return doc
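As a quick illustration (not part of the original notebook), to_json can be exercised against a small throwaway DataFrame; the column names here are made up for the example:

In [ ]:
# Tiny example: every itertuples() row becomes a JSON-ready dict
test_df = pandas.DataFrame([[1, "a"], [2, "b"]], columns=["some_id", "some_value"])
test_columns = test_df.columns.values.tolist()
for test_row in test_df.itertuples():
    # e.g. {'some_id': 1, 'some_value': 'a'}
    print to_json(test_row, test_columns)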
This panel uses the following data:
In [6]:
# Calculating the list of patch series found in the database
# Basic info about a patch series:
# * Patch series id
# * Patch series subject
# * Patch series sender
# * Patch series initial sending date
# * Patch series number of patches
# * Patch series number of versions
# * Patch series number of comments
# * Patch series participants (people commenting)
# * Patch series time to merge
# * Patch series merge time
query_patchserie = """ SELECT ps.id as patchserie_id,
ps.message_id as message_id,
ps.subject as patchserie_subject,
pe.email as patchserie_sender,
SUBSTRING_INDEX(pe.email, '@', -1) as sender_domain,
MIN(psv.date_utc) as patchserie_sent_date,
max(t1.patches) as patchserie_numpatches,
COUNT(DISTINCT(version)) as patchserie_versions,
count(distinct(c.id)) as patchserie_comments,
COUNT(DISTINCT(c.submitter_id)) as patchserie_commenters
FROM patch_series ps,
patch_series_version psv,
patches p,
people pe,
comments c,
(SELECT psv.ps_id,
p.ps_version_id,
count(distinct(p.id)) as patches
FROM patch_series_version psv,
patches p
WHERE psv.id = p.ps_version_id
GROUP BY psv.ps_id,
p.ps_version_id) t1
WHERE ps.id = psv.ps_id AND
psv.id = p.ps_version_id AND
p.ps_version_id = t1.ps_version_id AND
p.id = c.patch_id AND
p.submitter_id = pe.id
GROUP BY ps.id
"""
# Calculating the time to merge. The result is later merged with the previous query.
query_time2merge = """ SELECT psv.ps_id as patch_serie,
TIMESTAMPDIFF(SECOND, MIN(psv.date_utc), MAX(c.committer_date_utc)) as time2merge,
MIN(psv.date_utc) as first_patch_date,
MAX(c.committer_date_utc) as merge_time
FROM patch_series_version psv,
patches p,
commits c
WHERE psv.id=p.ps_version_id AND
p.commit_id = c.id
GROUP BY psv.ps_id; """
# Calculating the time to commit
query_time2commit = """ SELECT psv.ps_id as patch_serie,
TIMESTAMPDIFF(SECOND, MAX(c.date_utc), MAX(commits.committer_date_utc)) as time2commit,
MAX(c.date_utc) as last_comment_date,
MAX(commits.committer_date_utc) as commit_time
FROM patch_series_version psv,
patches p,
comments c,
commits
WHERE psv.id = p.ps_version_id AND
p.commit_id = commits.id AND
p.id=c.patch_id
GROUP BY psv.ps_id
HAVING time2commit >= 0 and time2commit < 2000*3600*24 """
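As a side note, pandas can also run these queries directly into DataFrames through pandas.read_sql; this is only a sketch, assuming the installed pandas version accepts a raw MySQLdb connection (newer versions prefer an SQLAlchemy engine):

In [ ]:
# Optional alternative: let pandas build the DataFrame (column names come from the SQL aliases)
patchserie_alt_df = pandas.read_sql(query_patchserie, db)
patchserie_alt_df.head()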
In [7]:
patchserie_data = execute_query(cursor, query_patchserie)
time2merge_data = execute_query(cursor, query_time2merge)
time2commit_data = execute_query(cursor, query_time2commit)
In [8]:
patchseries_list = list(patchserie_data)
time2merge_list = list(time2merge_data)
time2commit_list = list(time2commit_data)
In [9]:
patchseries_df = pandas.DataFrame(patchseries_list, columns=["patchserie_id", "message_id", "subject", "sender", "sender_domain", "sent_date", "num_patches", "num_versions", "num_comments", "num_commenters"])
time2merge_df = pandas.DataFrame(time2merge_list, columns=["patchserie_id", "time2merge", "sent_date", "mergetime"])
time2commit_df = pandas.DataFrame(time2commit_list, columns=["patchserie_id", "time2commit", "lastcommentdate", "committime"])
In [10]:
patchseries_df = pandas.merge(patchseries_df, time2merge_df, on="patchserie_id", how="left")
patchseries_df = pandas.merge(patchseries_df, time2commit_df, on="patchserie_id", how="left")
In [11]:
patchseries_df = patchseries_df.fillna(-1)
#patchseries_df
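Because the left merges leave NaN for series that were never merged or committed, and the fillna above turns those into -1, a quick optional check shows how many series fall into each case:

In [ ]:
# Optional check: patch series without a detected merge or commit (marked with -1 above)
print (patchseries_df["time2merge"] == -1).sum()
print (patchseries_df["time2commit"] == -1).sum()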
In [12]:
patchseries_df.columns
Out[12]:
In [11]:
patchseries_df
Out[11]:
In [13]:
user = args["elasticsearch"]["user"]
password = args["elasticsearch"]["password"]
host = args["elasticsearch"]["host"]
port = args["elasticsearch"]["port"]
path = args["elasticsearch"]["path"]
connection = "http://" + user + ":" + password + "@" + host + ":" + port + "/" + path
INDEX = 'xen-patchseries-timefocused'
# Building the ES connection
es = Elasticsearch([connection])
#es.indices.create(index=INDEX)
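Before indexing, it may be worth checking that the cluster is reachable and that the index exists. This small sketch only uses the client's ping and indices helpers and can be skipped if the index was created beforehand:

In [ ]:
# Optional sanity check: cluster reachable and index present
print es.ping()
if not es.indices.exists(index=INDEX):
    es.indices.create(index=INDEX)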
In [14]:
# Convert time2commit and time2merge from seconds to days
patchseries_df["time2commit"] = (patchseries_df["time2commit"] / 3600.0) / 24.0
patchseries_df["time2merge"] = (patchseries_df["time2merge"] / 3600.0) / 24.0

columns = patchseries_df.columns.values.tolist()
print columns

for row in patchseries_df.itertuples():
    # Let's insert into ES each tuple found in the dataset
    uniq_id = row[0]
    doc = to_json(row, columns)
    result = es.index(index=INDEX, doc_type='patchserie', id=uniq_id, body=doc)
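The loop above sends one request per patch series. For larger datasets, the bulk helper shipped with the elasticsearch library would be an alternative; the sketch below does the same indexing in a single bulk call and is not part of the original notebook:

In [ ]:
# Optional alternative: index all patch series in one bulk request
from elasticsearch import helpers
actions = []
for row in patchseries_df.itertuples():
    actions.append({
        "_index": INDEX,
        "_type": "patchserie",
        "_id": row[0],
        "_source": to_json(row, columns)
    })
helpers.bulk(es, actions)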
This panel focuses on the analysis of developers, their reviewers, their domains, and the time to merge, time to first review, and time to commit.
In [57]:
people_query = """ SELECT p.subject as patch_subject,
p.message_id as message_id,
p.date_utc as patch_sent_date,
pe.email as patch_sender,
SUBSTRING_INDEX(pe.email, '@', -1) as patch_sender_domain,
ps.id as patchserie_id,
ps.subject as patchserie_subject,
MIN(psv.date_utc) as patchserie_sent_date,
t1.email as patch_comment_sender,
t1.domain as patch_comment_domain,
t1.comment_date as patch_comment_date,
TIMESTAMPDIFF(SECOND, p.date_utc, t1.comment_date) as patch_time2comment
FROM patch_series ps,
patch_series_version psv,
patches p,
people pe,
(select p.id as patch_id,
pe.email as email,
SUBSTRING_INDEX(pe.email, '@', -1) as domain,
c.date_utc as comment_date
from patches p,
comments c,
people pe
where c.patch_id = p.id and
c.submitter_id = pe.id) t1
WHERE ps.id = psv.ps_id AND
psv.id = p.ps_version_id AND
p.submitter_id = pe.id AND
p.id = t1.patch_id
GROUP BY p.id; """
# The issue is in the database: all of the TZ offsets must be subtracted. So the negative ones will be added
# (double subtraction), while the positive ones will simply be subtracted. More info at https://www.bitergia.net/redmine/issues/6699
In [58]:
people_data = execute_query(cursor, people_query)
people_list = list(people_data)
people_df = pandas.DataFrame(people_list, columns=["patch_subject", "patch_message_id", "patch_sent_date", "patch_sender", "patch_sender_domain", "patchserie_id", "patchserie_subject", "patchserie_sent_date", "patch_comment_sender", "patch_comment_domain", "patch_comment_date", "patch_time2comment"])
In [59]:
INDEX = 'xen-patchseries-peoplefocused'
# Building the ES connection (reusing the connection string built above)
es = Elasticsearch([connection])
# Creating the xen-patchseries-peoplefocused index
es.indices.create(index=INDEX)
Out[59]:
In [60]:
# Convert patch_time2comment from seconds to days
people_df["patch_time2comment"] = (people_df["patch_time2comment"] / 3600.0) / 24.0
columns = people_df.columns.values.tolist()
print columns

for row in people_df.itertuples():
    # Let's insert into ES each tuple found in the dataset
    uniq_id = row[0]
    doc = to_json(row, columns)
    result = es.index(index=INDEX, doc_type='patch', id=uniq_id, body=doc)
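As a final, optional check (not in the original notebook), the number of indexed documents can be compared with the size of each DataFrame; note that the counts may lag slightly until Elasticsearch refreshes the indices:

In [ ]:
# Optional check: documents indexed vs. rows in the DataFrames
print len(patchseries_df), es.count(index='xen-patchseries-timefocused')['count']
print len(people_df), es.count(index='xen-patchseries-peoplefocused')['count']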
In [ ]: