This notebook aims to build a Kibana dashboard focused on several aspects of interest to the Xen community.
It builds the panels described below.
In [1]:
    
# Elastic search library
from elasticsearch import Elasticsearch
# python-mysql database access
import MySQLdb
import logging
import pandas
from ConfigParser import ConfigParser
    
In [2]:
    
config = ConfigParser()
config.read("settings")
args = {}
# There are two sections: mysql and elasticsearch
if config.has_section("mysql"):
    if config.has_option("mysql", "user") and \
        config.has_option("mysql", "password") and \
        config.has_option("mysql", "mlstats_db") and \
        config.has_option("mysql", "cvsanaly_db") and \
        config.has_option("mysql", "code_review_db"):
        args["mysql"] = dict(config.items("mysql"))
if config.has_section("elasticsearch"):
    if config.has_option("elasticsearch", "user") and \
       config.has_option("elasticsearch", "password") and \
       config.has_option("elasticsearch", "host") and \
       config.has_option("elasticsearch", "port") and \
       config.has_option("elasticsearch", "path"):
         args["elasticsearch"] = dict(config.items("elasticsearch"))
if not(args.has_key("mysql") and args.has_key("elasticsearch")):
    raise Exception("Section 'mysql' or section 'elasticsearch' not found in the 'settings' file")
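
For reference, a minimal sketch of what the 'settings' file is expected to look like. The section and option names come from the checks above; all values shown here are placeholders:

[mysql]
user = mysql_user
password = mysql_password
mlstats_db = xen_mlstats
cvsanaly_db = xen_cvsanaly
code_review_db = xen_code_review

[elasticsearch]
user = es_user
password = es_password
host = localhost
port = 9200
path = es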
    
In [4]:
    
def connect(args):
    # Open a connection to the code review database and return it with a cursor
    user = args["mysql"]["user"]
    password = args["mysql"]["password"]
    host = "localhost"
    db_name = args["mysql"]["code_review_db"]
    try:
        db = MySQLdb.connect(host=host, user=user, passwd=password, db=db_name, charset='utf8')
        return db, db.cursor()
    except:
        logging.error("Database connection error")
        raise

def execute_query(connector, query):
    # Run the query and return all rows, or an empty list if there are none
    results = int(connector.execute(query))
    if results > 0:
        return connector.fetchall()
    else:
        return []
    
db, cursor = connect(args)
    
In [5]:
    
# Insert data in ElasticSearch
def to_json(row, columns):
    # Translate a row tuple (as produced by itertuples) into a JSON doc.
    # row[0] is the DataFrame index, so data values start at position 1.
    doc = {}
    for i, column in enumerate(columns):
        doc[column] = row[i + 1]
    return doc
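
As a quick illustration of how to_json pairs with itertuples (throwaway data, not the real dataset): position 0 of each row tuple holds the DataFrame index, and the column values follow.

example_df = pandas.DataFrame([[1, "a"]], columns=["num", "letter"])
for row in example_df.itertuples():
    # row is (0, 1, "a"); to_json skips the index and maps columns to values
    print to_json(row, ["num", "letter"])  # {'num': 1, 'letter': 'a'} (key order may vary)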
    
This panel works with the following data:
In [6]:
    
# Calculating the list of patch series found in the database
# Basic info about a patch series:
#   * Patch series id
#   * Patch series subject
#   * Patch series sender
#   * Patch series initial sending date
#   * Patch series number of patches
#   * Patch series number of versions
#   * Patch series number of comments
#   * Patch series participants (people commenting)
#   * Patch series time to merge
#   * Patch series merge time
query_patchserie = """ SELECT ps.id as patchserie_id,
                   ps.message_id as message_id,
                   ps.subject as patchserie_subject,
                   pe.email as patchserie_sender,
                   SUBSTRING_INDEX(pe.email, '@', -1) as sender_domain,
                   MIN(psv.date_utc) as patchserie_sent_date,
                   max(t1.patches) as patchserie_numpatches,
                   COUNT(DISTINCT(version)) as patchserie_versions,
                   count(distinct(c.id)) as patchserie_comments,
                   COUNT(DISTINCT(c.submitter_id)) as patchserie_commenters
            FROM patch_series ps,
                 patch_series_version psv,
                 patches p,
                 people pe,
                 comments c,
                 (SELECT psv.ps_id, 
                             p.ps_version_id, 
                             count(distinct(p.id)) as patches 
                  FROM patch_series_version psv, 
                           patches p 
                  WHERE psv.id = p.ps_version_id  
                  GROUP BY psv.ps_id, 
                           p.ps_version_id) t1
            WHERE ps.id = psv.ps_id AND
                  psv.id = p.ps_version_id AND
                  p.ps_version_id = t1.ps_version_id AND
                  p.id = c.patch_id AND
                  p.submitter_id = pe.id
            GROUP BY ps.id
            """
# Calculating the time to merge. The result is later merged with the previous query.
query_time2merge = """ SELECT psv.ps_id as patch_serie,
                   TIMESTAMPDIFF(SECOND, MIN(psv.date_utc), MAX(c.committer_date_utc)) as time2merge,
                   MIN(psv.date_utc) as first_patch_date,
                   MAX(c.committer_date_utc) as merge_time
            FROM patch_series_version psv,
                 patches p,
                 commits c
            WHERE psv.id=p.ps_version_id AND 
                  p.commit_id = c.id 
            GROUP BY psv.ps_id; """
# Calculating the time to commit (elapsed time from the last comment to the commit).
# The HAVING clause discards negative lapses and outliers taking longer than
# 2000 days (the threshold is expressed in seconds).
query_time2commit = """ SELECT psv.ps_id as patch_serie,
                   TIMESTAMPDIFF(SECOND, MAX(c.date_utc), MAX(commits.committer_date_utc)) as time2commit,
                   MAX(c.date_utc) as last_comment_date, 
                   MAX(commits.committer_date_utc) as commit_time 
            FROM patch_series_version psv, 
                 patches p, 
                 comments c, 
                 commits 
            WHERE psv.id = p.ps_version_id AND 
                  p.commit_id = commits.id AND 
                  p.id=c.patch_id 
            GROUP BY psv.ps_id
            HAVING time2commit >= 0 and time2commit < 2000*3600*24 """
    
In [7]:
    
patchserie_data = execute_query(cursor, query_patchserie)
time2merge_data = execute_query(cursor, query_time2merge)
time2commit_data = execute_query(cursor, query_time2commit)
    
In [8]:
    
patchseries_list = list(patchserie_data)
time2merge_list = list(time2merge_data)
time2commit_list = list(time2commit_data)
    
In [9]:
    
patchseries_df = pandas.DataFrame(patchseries_list, columns=["patchserie_id", "message_id", "subject", "sender", "sender_domain", "sent_date", "num_patches", "num_versions", "num_comments", "num_commenters"])
time2merge_df = pandas.DataFrame(time2merge_list, columns=["patchserie_id", "time2merge", "sent_date", "mergetime"])
time2commit_df = pandas.DataFrame(time2commit_list, columns=["patchserie_id", "time2commit", "lastcommentdate", "committime"])
    
In [10]:
    
patchseries_df = pandas.merge(patchseries_df, time2merge_df, on="patchserie_id", how="left")
patchseries_df = pandas.merge(patchseries_df, time2commit_df, on="patchserie_id", how="left")
    
In [11]:
    
patchseries_df = patchseries_df.fillna(-1)
#patchseries_df
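
A minimal sketch, with made-up values, of what the two left merges plus fillna achieve: series lacking merge or commit data keep their row and end up with -1 as a sentinel.

left = pandas.DataFrame({"patchserie_id": [1, 2]})
right = pandas.DataFrame({"patchserie_id": [1], "time2merge": [42.0]})
# patchserie_id 2 has no match in 'right': its time2merge is NaN, then -1
example = pandas.merge(left, right, on="patchserie_id", how="left").fillna(-1)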
    
In [12]:
    
patchseries_df.columns
    
In [11]:
    
patchseries_df
    
In [13]:
    
user = args["elasticsearch"]["user"]
password = args["elasticsearch"]["password"]
host = args["elasticsearch"]["host"]
port = args["elasticsearch"]["port"]
path = args["elasticsearch"]["path"]
connection = "http://" + user + ":" + password + "@" + host + ":" + port + "/" + path
INDEX = 'xen-patchseries-timefocused'
# Building the ES connection
es = Elasticsearch([connection])
# The index only needs to be created once, hence the call below is commented out
#es.indices.create(index=INDEX)
    
In [14]:
    
patchseries_df["time2commit"] = (patchseries_df["time2commit"] / 3600.0) / 24.0
patchseries_df["time2merge"] = (patchseries_df["time2merge"] / 3600.0) / 24.0
columns = patchseries_df.columns.values.tolist()
print columns
for row in patchseries_df.itertuples():
    # Let's insert into ES each tuple found in the dataset
    uniq_id = row[0]
    doc = to_json(row, columns)
    result = es.index(index=INDEX, doc_type='patchserie', id=uniq_id, body=doc)
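
Since each document is indexed with an explicit id (the DataFrame index), re-running the loop overwrites documents instead of duplicating them. As a quick sanity check, assuming the default integer index so the first document has id 0, one of the documents can be fetched back:

print es.get(index=INDEX, doc_type='patchserie', id=0)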
    
    
This panel focuses on the analysis of the developers, their reviewers, their domains, and the time to merge, the time to first review, and the time to commit.
In [57]:
    
people_query = """ SELECT p.subject as patch_subject,
                          p.message_id as message_id,
                          p.date_utc as patch_sent_date,
                          pe.email as patch_sender,
                          SUBSTRING_INDEX(pe.email, '@', -1) as patch_sender_domain,
                          ps.id as patchserie_id,
                          ps.subject as patchserie_subject,
                          MIN(psv.date_utc) as patchserie_sent_date,
                          t1.email as patch_comment_sender,
                          t1.domain as patch_comment_domain,
                          t1.comment_date as patch_comment_date,
                          TIMESTAMPDIFF(SECOND, p.date_utc, t1.comment_date) as patch_time2comment
                    FROM patch_series ps,
                         patch_series_version psv,
                         patches p,
                         people pe,
                         (select p.id as patch_id,
                                 pe.email as email,
                                 SUBSTRING_INDEX(pe.email, '@', -1) as domain,
                                 c.date_utc as comment_date
                          from patches p,
                               comments c,
                               people pe
                          where c.patch_id = p.id and
                                c.submitter_id = pe.id) t1
                    WHERE ps.id = psv.ps_id AND
                          psv.id = p.ps_version_id AND
                          p.submitter_id = pe.id AND
                          p.id = t1.patch_id
                    GROUP BY p.id; """
# The issue is in the database. All of the TZ offsets must be subtracted, so the negative
# ones will be added (a double subtraction), while the positive ones will simply be
# subtracted. More info at https://www.bitergia.net/redmine/issues/6699
    
In [58]:
    
people_data = execute_query(cursor, people_query)
people_list = list(people_data)
people_df = pandas.DataFrame(people_list, columns=["patch_subject", "patch_message_id", "patch_sent_date", "patch_sender", "patch_sender_domain", "patchserie_id", "patchserie_subject", "patchserie_sent_date", "patch_comment_sender", "patch_comment_domain", "patch_comment_date", "patch_time2comment"])
    
In [59]:
    
INDEX = 'xen-patchseries-peoplefocused'
# Building the ES connection, reusing the connection string defined above
es = Elasticsearch([connection])
# Creating the xen-patchseries-peoplefocused index
es.indices.create(index=INDEX)
    
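Note that es.indices.create raises an error when the index already exists. If the notebook is re-run, the call can be made tolerant of that; a sketch using the client's ignore parameter:

# Ignore the 400 error returned when the index already exists
es.indices.create(index=INDEX, ignore=400)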
In [60]:
    
people_df["patch_time2comment"] = (people_df["patch_time2comment"] / 3600.0) / 24.0
columns = people_df.columns.values.tolist()
print columns
for row in people_df.itertuples():
    # Let's insert into ES each tuple found in the dataset
    uniq_id = row[0]
    doc = to_json(row, columns)
    result = es.index(index=INDEX, doc_type='patch', id=uniq_id, body=doc)
    
    