This notebook aims at providing a first approach to manage the new code review database. The usual code review process is done through the mailing lists. This approach migrates that code review into a database schema closer to how Gerrit or another review system works.
The list of metrics for this notebook are the following ones:
A patch can be divided into several steps:
1---------2----------3---------4--------5---------A--------C
And each version can be divided into several steps:
1----a)review------b)review------c)review-----------d)review--------------2----------------------------------------------3
Where 1, 2, 3, .., are the several iterations, A the point where all of the patches were 'Acked-by' and C the commit action into master. And a, b, c, d comments and reviews.
In [1]:
%pylab inline
# Enabling libraries to manage data and plot
import pandas as pd
import numpy as np
from pandas import Series, DataFrame, Panel
In [2]:
# Working with the xen-database with code review info
import MySQLdb
def connect():
user = "root"
password = ""
host = "localhost"
db = "xen_reports_patchsets3"
try:
db = MySQLdb.connect(user = user, passwd = password, db = db)
return db, db.cursor()
except:
logging.error("Database connection error")
raise
def execute_query(connector, query):
results = int (connector.execute(query))
cont = 0
if results > 0:
result1 = connector.fetchall()
return result1
else:
return []
db, cursor = connect()
In [3]:
def draw_distribution(data_df, key_field, hist, boxplot, boxplot_log=True):
# This function draws as histogram and boxplot the distribution
# of the specified charts.
# data: raw data coming from database
# columns_list: list of columns represented in 'data'
# key_field: key field to be represented in the histogram and boxplot
# hist: contains the xlabel, ylabel and title fields
# boxplot: contains the ylabel and title fields
fig, axs = plt.subplots(1,2, figsize=(12,5))
hist_chart = data_df[key_field].plot(kind="hist", ax=axs[0])
boxplot_chart = data_df[key_field].plot(kind="box", ax=axs[1], logy=boxplot_log)
#chart labels
hist_chart.set_xlabel(hist["xlabel"])
hist_chart.set_ylabel(hist["ylabel"])
hist_chart.set_title(hist["title"])
boxplot_chart.set_ylabel(boxplot["ylabel"])
boxplot_chart.set_title(boxplot["title"])
Metrics to retrieve:
Xen community uses the term patch or patch serie when sending pieces of code to the mailing list. In order to simplify this process, a patch serie is now conformed of one or more patches.
In [4]:
query = """ SELECT year,
month,
count(distinct(ps)) as patch_series
FROM
(SELECT ps_version_id as ps,
year(min(date_utc)) as year,
month(min(date_utc)) as month
FROM patches
GROUP BY ps_version_id
ORDER BY min(date_utc)) t
GROUP BY year, month
"""
data = execute_query(cursor, query)
In [5]:
np_data = np.array(data)
np_data_filtered = np_data[1:-2] # This remove 2001 and 2016 data
#Migrating to use Series in pandas (nicer view and easier to manage series)
dates = pd.date_range('2004-08', '2015-11', freq='M') # data starting in 2004-08 and ending in 2015-08
#Checking size of both arrays: dates.shape and np_data_filtered.shape
patchserieTS = Series(np_data_filtered[:,2], index=dates)
In [6]:
patchserieschart = patchserieTS.plot()
patchserieschart.set_xlabel("Time evolution")
patchserieschart.set_ylabel("Number of patches")
patchserieschart.set_title("Evolution of patch series per month")
Out[6]:
In [7]:
query = """ SELECT year,
month,
count(distinct(submitter)) as submitters
FROM
(SELECT submitter_id as submitter,
year(min(date_utc)) as year,
month(min(date_utc)) as month
FROM patches
GROUP BY ps_version_id
ORDER BY min(date_utc)) t
GROUP BY year, month
"""
data_submitters = execute_query(cursor, query)
In [8]:
np_data = np.array(data_submitters)
np_data_filtered = np_data[1:-2] # This remove 2001 and 2016 data
patchserie_submittersTS = Series(np_data_filtered[:,2], index=dates)
In [9]:
patchserie_submitters_chart = patchserie_submittersTS.plot()
patchserie_submitters_chart.set_xlabel("Time evolution")
patchserie_submitters_chart.set_ylabel("Number of patch submitters")
patchserie_submitters_chart.set_title("Evolution of unique patch submitters per month")
Out[9]:
In [10]:
# Let's calculate evolution of comments in general when they were sent
# We should ignore 1970 emails with wrong sent date
query = """ SELECT year(date_utc) as year,
month(date_utc) as month,
count(distinct(comments.id)) as comments
FROM comments
WHERE year(date_utc) > 2004
GROUP BY year(date_utc),
month(date_utc)
"""
comments = execute_query(cursor, query)
In [11]:
dates = pd.date_range('2005-01', '2015-11', freq='M') # data starting in 2005-01 and ending in 2015-08
np_data = np.array(comments)
commentsTS = Series(np_data[:,2], index=dates)
In [12]:
comments_chart = commentsTS.plot()
comments_chart.set_xlabel("Time evolution")
comments_chart.set_ylabel("Number of comments")
comments_chart.set_title("Evolution of comments per month")
Out[12]:
In [13]:
query = """ SELECT psv.ps_id as patch_serie,
MIN(psv.date_utc) as patch_serie_sent_date,
COUNT(DISTINCT(c.submitter_id)) as commenters
FROM patch_series_version psv,
patches p,
comments c
WHERE psv.id = p.ps_version_id AND
p.id=c.patch_id
GROUP BY psv.ps_id """
peoplecomments_ps = execute_query(cursor, query)
In [14]:
peoplecomments_list = list(peoplecomments_ps)
peoplecomments_df = DataFrame(peoplecomments_list, columns=["patch_serie", "partch_serie_sent_date", "commenters"])
In [15]:
hist = {"xlabel":"People commenting", "ylabel":"Number of patch series", "title":"Histogram: number of reviews"}
boxplot = {"ylabel":"People commenting per patch", "title":"Log scale: Dist. people commenting"}
draw_distribution(peoplecomments_df, "commenters", hist, boxplot)
In [16]:
mean_peoplecomments = peoplecomments_df.set_index("partch_serie_sent_date").resample('M', how={"commenters":np.mean})
median_peoplecomments = peoplecomments_df.set_index("partch_serie_sent_date").resample('M', how={"commenters":np.median})
dates = pd.date_range('2010-10', '2015-11', freq='M') # data starting in 2010-10 and ending in 2015-08
peoplecommentsTS = Series(mean_peoplecomments["commenters"], index=dates)
peoplecomments_medianTS = Series(median_peoplecomments["commenters"], index=dates)
peoplecommentsTS_chart = peoplecommentsTS.plot()
peoplecomments_medianTS.plot()
peoplecommentsTS_chart.set_xlabel("Time evolution")
peoplecommentsTS_chart.set_ylabel("People commenting per patch serie")
peoplecommentsTS_chart.set_title("Evolution of the mean (blue) and median (green) number of people per patch serie")
Out[16]:
In [17]:
#TODO
Metrics to retrieve:
Time to merge needs to be calculated as the time when a patch was initially sent till this is merge into code. As each patch is associated to a commit, this is obtained as the first date when the patch was sent till the last commit for that patch serie is merged into code.
In [18]:
query = """ SELECT psv.ps_id as patch_serie,
TIMESTAMPDIFF(SECOND, MIN(psv.date_utc), MAX(c.committer_date_utc)) as time2merge,
MIN(psv.date_utc) as first_patch_date,
MAX(c.committer_date_utc) as merge_time
FROM patch_series_version psv,
patches p,
commits c
WHERE psv.id=p.ps_version_id AND
p.commit_id = c.id
GROUP BY psv.ps_id
HAVING time2merge >=0 """
time2merge = execute_query(cursor, query)
In [19]:
time2merge_list = list(time2merge)
time2merge_df = DataFrame(time2merge_list, columns=["patch_serie", "time2merge", "first_patch_date", "merge_time"])
fig, axs = plt.subplots(1,2, figsize=(12,5))
#Migrate seconds to days
time2merge_df["time2merge_days"] = (time2merge_df["time2merge"]/3600)/24
hist_chart = time2merge_df["time2merge_days"].plot(kind="hist", ax=axs[0])
boxplot_chart = time2merge_df["time2merge_days"].plot(kind="box", ax=axs[1], logy=True)
#chart labels
hist_chart.set_xlabel("Days to merge")
hist_chart.set_ylabel("Number of reviews")
hist_chart.set_title("Histogram: number of reviews")
boxplot_chart.set_ylabel("Days to merge")
boxplot_chart.set_title("Log scale: Dist. time2merge")
Out[19]:
In [20]:
mean_time2merge = time2merge_df.set_index("merge_time").resample('M', how={"time2merge_days":np.mean})
median_time2merge = time2merge_df.set_index("merge_time").resample('M', how={"time2merge_days":np.median})
dates = pd.date_range('2010-10', '2015-11', freq='M') # data starting in 2010-10 and ending in 2015-08
time2mergeTS = Series(mean_time2merge["time2merge_days"], index=dates)
time2merge_medianTS = Series(median_time2merge["time2merge_days"], index=dates)
time2mergeTS_chart = time2mergeTS.plot()
time2merge_medianTS.plot()
time2mergeTS_chart.set_xlabel("Time evolution")
time2mergeTS_chart.set_ylabel("Number of days")
time2mergeTS_chart.set_title("Evolution of the mean (blue) and median (green) time to merge a patch serie")
Out[20]:
In [21]:
query = """ SELECT psv.ps_id as patch_serie,
TIMESTAMPDIFF(SECOND, MAX(c.date_utc), MAX(commits.committer_date_utc)) as time2commit,
MAX(c.date_utc) as last_comment_date,
MAX(commits.committer_date_utc) as commit_time
FROM patch_series_version psv,
patches p,
comments c,
commits
WHERE psv.id = p.ps_version_id AND
p.commit_id = commits.id AND
p.id=c.patch_id
GROUP BY psv.ps_id
HAVING time2commit >= 0 and time2commit < 2000*3600*24"""
time2commit = execute_query(cursor, query)
In [22]:
time2commit_list = list(time2commit)
time2commit_df = DataFrame(time2commit_list, columns=["patch_serie", "time2commit", "last_comment_date", "commit_time"])
fig, axs = plt.subplots(1,2, figsize=(12,5))
#Migrate seconds to days
time2commit_df["time2commit_days"] = (time2commit_df["time2commit"]/3600)/24
hist_chart = time2commit_df["time2commit_days"].plot(kind="hist", ax=axs[0])
boxplot_chart = time2commit_df["time2commit_days"].plot(kind="box", ax=axs[1], logy=True)
#chart labels
hist_chart.set_xlabel("Days to commit")
hist_chart.set_ylabel("Number of patch series")
hist_chart.set_title("Histogram: days to commit")
boxplot_chart.set_ylabel("Days to commit")
boxplot_chart.set_title("Log scale: Dist. time2commit")
Out[22]:
In [23]:
mean_time2commit = time2commit_df.set_index("commit_time").resample('M', how={"time2commit_days":np.mean})
median_time2commit = time2commit_df.set_index("commit_time").resample('M', how={"time2commit_days":np.median})
dates = pd.date_range('2010-10', '2015-11', freq='M') # data starting in 2010-10 and ending in 2015-08
time2commitTS = Series(mean_time2commit["time2commit_days"], index=dates)
time2commit_medianTS = Series(median_time2commit["time2commit_days"], index=dates)
time2commitTS_chart = time2commitTS.plot()
time2commit_medianTS.plot()
time2commitTS_chart.set_xlabel("Time evolution")
time2commitTS_chart.set_ylabel("Number of days")
time2commitTS_chart.set_title("Evolution of the mean (blue) and median (green) time to commit a patch serie")
Out[23]:
In [24]:
query = """ SELECT psv.ps_id as patch_serie,
psv.version as version,
psv.date_utc as rework_time,
t.last_activity as last_version_activity,
timestampdiff(SECOND, psv.date_utc, t.last_activity) as time2rework
FROM patch_series_version psv,
(SELECT psv.ps_id,
psv.version,
greatest(max(c.date_utc), max(p.date_utc)) as last_activity
FROM patch_series_version psv,
patches p,
comments c
WHERE psv.id = p.ps_version_id AND
c.patch_id=p.id
GROUP BY psv.ps_id, psv.version) t
WHERE psv.ps_id = t.ps_id AND
psv.version = t.version -1
HAVING time2rework >= 0 and time2rework < 1000*3600*24"""
time2rework = execute_query(cursor, query)
In [25]:
time2rework_list = list(time2rework)
time2rework_df = DataFrame(time2rework_list, columns=["patch_serie", "version", "rework_time", "last_version_activity", "time2rework"])
fig, axs = plt.subplots(1,2, figsize=(12,5))
#Migrate seconds to days
time2rework_df["time2rework_days"] = (time2rework_df["time2rework"]/3600)/24
hist_chart = time2rework_df["time2rework_days"].plot(kind="hist", ax=axs[0])
boxplot_chart = time2rework_df["time2rework_days"].plot(kind="box", ax=axs[1], logy=True)
#chart labels
hist_chart.set_xlabel("Days to re-work")
hist_chart.set_ylabel("Number of patch series")
hist_chart.set_title("Histogram: days to patch serie re-work")
boxplot_chart.set_ylabel("Days to re-work")
boxplot_chart.set_title("Log scale: Dist. time2rework")
Out[25]:
In [26]:
mean_time2rework = time2rework_df.set_index("rework_time").resample('M', how={"time2rework_days":np.mean})
median_time2rework = time2rework_df.set_index("rework_time").resample('M', how={"time2rework_days":np.median})
dates = pd.date_range('2010-10', '2015-09', freq='M') # data starting in 2010-10 and ending in 2015-08
time2reworkTS = Series(mean_time2rework["time2rework_days"], index=dates)
time2rework_medianTS = Series(median_time2rework["time2rework_days"], index=dates)
time2reworkTS_chart = time2reworkTS.plot()
time2rework_medianTS.plot()
time2reworkTS_chart.set_xlabel("Time evolution")
time2reworkTS_chart.set_ylabel("Number of days")
time2reworkTS_chart.set_title("Evolution of the mean (blue) and median (green) time to re-work a patch serie")
Out[26]:
In [27]:
query = """ SELECT p1.ps_id as patch_serie,
p1.version as version,
p1.date_utc as cycle1,
p2.date_utc as cycle2,
timestampdiff(SECOND, p1.date_utc, p2.date_utc) as time2newcycle
FROM patch_series_version p1,
patch_series_version p2
WHERE p1.ps_id = p2.ps_id AND
p1.version = p2.version -1
HAVING time2newcycle >= 0 and time2newcycle<1000*3600*24
ORDER BY p1.ps_id """
time2newcycle = execute_query(cursor, query)
In [28]:
time2newcycle_list = list(time2newcycle)
time2newcycle_df = DataFrame(time2newcycle_list, columns=["patch_serie", "version", "cycle1", "cycle2", "time2newcycle"])
fig, axs = plt.subplots(1,2, figsize=(12,5))
#Migrate seconds to days
time2newcycle_df["time2newcycle_days"] = (time2newcycle_df["time2newcycle"]/3600)/24
hist_chart = time2newcycle_df["time2newcycle_days"].plot(kind="hist", ax=axs[0])
boxplot_chart = time2newcycle_df["time2newcycle_days"].plot(kind="box", ax=axs[1], logy=True)
#chart labels
hist_chart.set_xlabel("Days to new cycle")
hist_chart.set_ylabel("Number of patch series")
hist_chart.set_title("Histogram: days to new cycle")
boxplot_chart.set_ylabel("Days to new cycle")
boxplot_chart.set_title("Log scale: Dist. time2newcycle")
Out[28]:
In [29]:
mean_time2newcycle = time2newcycle_df.set_index("cycle2").resample('M', how={"time2newcycle_days":np.mean})
median_time2newcycle = time2newcycle_df.set_index("cycle2").resample('M', how={"time2newcycle_days":np.median})
dates = pd.date_range('2010-10', '2015-09', freq='M') # data starting in 2010-10 and ending in 2015-08
time2newcycleTS = Series(mean_time2newcycle["time2newcycle_days"], index=dates)
time2newcycle_medianTS = Series(median_time2newcycle["time2newcycle_days"], index=dates)
time2newcycleTS_chart = time2newcycleTS.plot()
time2newcycle_medianTS.plot()
time2newcycleTS_chart.set_xlabel("Time evolution")
time2newcycleTS_chart.set_ylabel("Number of days")
time2newcycleTS_chart.set_title("Evolution of the mean (blue) and median (green) time to have a new cycle in a patch serie")
Out[29]:
Either this is a patch or a patch serie, this analysis is focused at the level of patch. This ignores all of the patches that are not reviewed. At the level of 'flags', this counts the time between a patch is sent till a 'Reviewed-by' flag appears.
Some issues found:
Assumption:
In [30]:
query = """ SELECT p.subject as subject,
p.date_utc as sent_date,
f.flag as flag,
f.date_utc as reviewed_date,
TIMESTAMPDIFF(SECOND, p.date_utc, min(f.date_utc)) as time2review
FROM patches p,
flags f
WHERE p.id = f.patch_id and
f.flag='Reviewed-by'
GROUP BY p.subject
HAVING TIMESTAMPDIFF(SECOND, p.date_utc, min(f.date_utc)) > 0 and TIMESTAMPDIFF(SECOND, p.date_utc, min(f.date_utc))
"""
time2review = execute_query(cursor, query)
In [31]:
time2review_list = list(time2review)
time2review_df = DataFrame(time2review_list, columns=["subject", "date", "flag", "review_time", "time2review"])
fig, axs = plt.subplots(1,2, figsize=(12,5))
#Migrate seconds to days
time2review_df["time2review_days"] = (time2review_df["time2review"]/3600)/24
hist_chart = time2review_df["time2review_days"].plot(kind="hist", ax=axs[0])
boxplot_chart = time2review_df["time2review_days"].plot(kind="box", ax=axs[1], logy=True)
#chart labels
hist_chart.set_xlabel("Days to review")
hist_chart.set_ylabel("Number of reviews")
hist_chart.set_title("Histogram: number of reviews")
boxplot_chart.set_ylabel("Days to review")
boxplot_chart.set_title("Log scale: Dist. time2review")
Out[31]:
In [32]:
mean_time2review = time2review_df.set_index("review_time").resample('M', how={"time2review_days":np.mean})
median_time2review = time2review_df.set_index("review_time").resample('M', how={"time2review_days":np.median})
dates = pd.date_range('2010-10', '2015-11', freq='M') # data starting in 2010-10 and ending in 2015-08
time2reviewTS = Series(mean_time2review["time2review_days"], index=dates)
time2review_medianTS = Series(median_time2review["time2review_days"], index=dates)
time2reviewTS_chart = time2reviewTS.plot()
time2review_medianTS.plot()
time2reviewTS_chart.set_xlabel("Time evolution")
time2reviewTS_chart.set_ylabel("Number of days")
time2reviewTS_chart.set_title("Evolution of the mean (blue) and median (green) time to first-review a patch")
Out[32]:
In [33]:
query = """ SELECT t1.ps_id as patch_serie,
t1.first_patch_date,
f.flag as flag,
f.date_utc as acked_date,
TIMESTAMPDIFF(SECOND, t1.first_patch_date, min(f.date_utc)) as time2ack
FROM patches p,
flags f,
(select psv.ps_id,
p.id as patch_id,
min(psv.date_utc) as first_patch_date
from patch_series_version psv,
patches p
where p.ps_version_id=psv.id
group by psv.ps_id) t1
WHERE p.id = f.patch_id and
f.flag='Acked-by' and
p.id = t1.patch_id
GROUP BY t1.ps_id
HAVING TIMESTAMPDIFF(SECOND, t1.first_patch_date, min(f.date_utc)) > 0 and TIMESTAMPDIFF(SECOND, t1.first_patch_date, min(f.date_utc))
"""
time2ack = execute_query(cursor, query)
In [34]:
time2ack_list = list(time2ack)
time2ack_df = DataFrame(time2ack_list, columns=["subject", "date", "flag", "ack_time", "time2ack"])
fig, axs = plt.subplots(1,2, figsize=(12,5))
#Migrate seconds to days
time2ack_df["time2ack_days"] = (time2ack_df["time2ack"]/3600)/24
hist_chart = time2ack_df["time2ack_days"].plot(kind="hist", ax=axs[0])
boxplot_chart = time2ack_df["time2ack_days"].plot(kind="box", ax=axs[1], logy=True)
#chart labels
hist_chart.set_xlabel("Days to ack")
hist_chart.set_ylabel("Number of reviews")
hist_chart.set_title("Histogram: number of reviews")
boxplot_chart.set_ylabel("Days to ack")
boxplot_chart.set_title("Log scale: Dist. time2ack")
Out[34]:
In [35]:
mean_time2ack = time2ack_df.set_index("ack_time").resample('M', how={"time2ack_days":np.mean})
median_time2ack = time2ack_df.set_index("ack_time").resample('M', how={"time2ack_days":np.median})
dates = pd.date_range('2011-10', '2015-11', freq='M') # data starting in 2010-10 and ending in 2015-08
time2ackTS = Series(mean_time2ack["time2ack_days"], index=dates)
time2ack_medianTS = Series(median_time2ack["time2ack_days"], index=dates)
time2ackTS_chart = time2ackTS.plot()
time2ack_medianTS.plot()
time2ackTS_chart.set_xlabel("Time evolution")
time2ackTS_chart.set_ylabel("Number of days")
time2ackTS_chart.set_title("Evolution of the mean (blue) and median (green) time to ack a patch")
Out[35]:
Metrics to retrieve:
In [36]:
# List of the most recent and completed reviews up to the date of this analysis
query = """ SELECT distinct ps.id as patch_serie,
ps.subject,
MAX(c.committer_date_utc) as merge_time
FROM patch_series ps,
patch_series_version psv,
patches p,
commits c
WHERE ps.id = psv.ps_id AND
psv.id = p.ps_version_id AND p.commit_id = c.id
GROUP BY ps.id
ORDER BY merge_time desc """
completed_reviews_list = execute_query(cursor, query)
In [37]:
completed_reviews = list(completed_reviews_list)
completed_reviews_df = DataFrame(completed_reviews, columns=["patch_serie", "subject", "merge_time"])
completed_reviews_df
Out[37]:
In [38]:
# Total number of completed reviews up to the date of this analysis
query = """ SELECT count(distinct(ps.id)) as merged_patch_series
FROM patch_series ps,
patch_series_version psv,
patches p,
commits c
WHERE ps.id = psv.ps_id AND
psv.id = p.ps_version_id AND p.commit_id = c.id
"""
total_completed_reviews = execute_query(cursor, query)
In [39]:
total_completed_reviews
Out[39]:
In [40]:
# List of recent reviewed patch series but not merged.
query = """ SELECT DISTINCT ps.id as reviewed_patch_serie,
ps.subject,
max(f.date_utc) as last_reviewed_date
FROM patch_series ps,
patch_series_version psv,
patches p,
flags f
WHERE ps.id = psv.ps_id AND
psv.id = p.ps_version_id AND
f.patch_id = p.id AND
f.flag='Reviewed-by' AND
ps.id not in
(SELECT COUNT(DISTINCT(ps.id)) as merged_patch_series
FROM patch_series ps,
patch_series_version psv,
patches p,
commits c
WHERE ps.id = psv.ps_id AND
psv.id = p.ps_version_id AND
p.commit_id = c.id) AND
f.date_utc BETWEEN DATE_SUB(NOW(), INTERVAL 60 DAY) AND NOW()
GROUP BY ps.id
ORDER BY f.date_utc desc """ # INITIAL VALUE OF 60 DAYS
recent_reviews = execute_query(cursor, query)
In [41]:
recent_reviews_list = list(recent_reviews)
recent_reviews_df = DataFrame(recent_reviews_list, columns=["patch_serie", "subject", "last_reviewed_date"])
recent_reviews_df
Out[41]:
In [42]:
query = """ SELECT COUNT(DISTINCT(ps.id)) as reviewed_patch_serie
FROM patch_series ps,
patch_series_version psv,
patches p,
flags f
WHERE ps.id = psv.ps_id AND
psv.id = p.ps_version_id AND
f.patch_id = p.id AND
f.flag='Reviewed-by' AND
ps.id not in
(SELECT COUNT(DISTINCT(ps.id)) as merged_patch_series
FROM patch_series ps,
patch_series_version psv,
patches p,
commits c
WHERE ps.id = psv.ps_id AND
psv.id = p.ps_version_id AND
p.commit_id = c.id) AND
f.date_utc BETWEEN DATE_SUB(NOW(), INTERVAL 60 DAY) AND NOW() """ # INITIAL VALUE OF 60 DAYS
total_recent_reviews = execute_query(cursor, query)
In [43]:
total_recent_reviews
Out[43]:
In [44]:
# List of last year reviewed patch series but not merged (and not recent reviewed)
query = """ SELECT DISTINCT ps.id as reviewed_patch_serie,
ps.subject,
max(f.date_utc) as last_reviewed_date
FROM patch_series ps,
patch_series_version psv,
patches p,
flags f
WHERE ps.id = psv.ps_id AND
psv.id = p.ps_version_id AND
f.patch_id = p.id AND
f.flag='Reviewed-by' AND
ps.id not in
(SELECT COUNT(DISTINCT(ps.id)) as merged_patch_series
FROM patch_series ps,
patch_series_version psv,
patches p,
commits c
WHERE ps.id = psv.ps_id AND
psv.id = p.ps_version_id AND
p.commit_id = c.id) AND
f.date_utc BETWEEN DATE_SUB(NOW(), INTERVAL 365 DAY) AND DATE_SUB(NOW(), INTERVAL 60 DAY)
GROUP BY ps.id
ORDER BY f.date_utc desc """ # INITIAL VALUE OF 60 DAYS
last_year_reviews = execute_query(cursor, query)
In [45]:
last_year_reviews_list = list(last_year_reviews)
last_year_reviews_df = DataFrame(last_year_reviews_list, columns=["patch_serie", "subject", "last_reviewed_date"])
last_year_reviews_df
Out[45]:
In [46]:
query = """ SELECT COUNT(DISTINCT(ps.id)) as reviewed_patch_serie
FROM patch_series ps,
patch_series_version psv,
patches p,
flags f
WHERE ps.id = psv.ps_id AND
psv.id = p.ps_version_id AND
f.patch_id = p.id AND
f.flag='Reviewed-by' AND
ps.id not in
(SELECT COUNT(DISTINCT(ps.id)) as merged_patch_series
FROM patch_series ps,
patch_series_version psv,
patches p,
commits c
WHERE ps.id = psv.ps_id AND
psv.id = p.ps_version_id AND
p.commit_id = c.id) AND
f.date_utc BETWEEN DATE_SUB(NOW(), INTERVAL 365 DAY) AND DATE_SUB(NOW(), INTERVAL 60 DAY) """ # INITIAL VALUE OF 60 DAYS
total_last_year_reviews = execute_query(cursor, query)
In [47]:
total_last_year_reviews
Out[47]:
In [48]:
# Stalled list of reviews: those with review activity and older than one year
query = """ SELECT DISTINCT ps.id as reviewed_patch_serie,
ps.subject,
max(f.date_utc) as last_reviewed_date
FROM patch_series ps,
patch_series_version psv,
patches p,
flags f
WHERE ps.id = psv.ps_id AND
psv.id = p.ps_version_id AND
f.patch_id = p.id AND
f.flag='Reviewed-by' AND
ps.id not in
(SELECT COUNT(DISTINCT(ps.id)) as merged_patch_series
FROM patch_series ps,
patch_series_version psv,
patches p,
commits c
WHERE ps.id = psv.ps_id AND
psv.id = p.ps_version_id AND
p.commit_id = c.id) AND
f.date_utc < DATE_SUB(NOW(), INTERVAL 365 DAY)
GROUP BY ps.id
ORDER BY f.date_utc desc """
stalled_reviews = execute_query(cursor, query)
In [49]:
stalled_reviews_list = list(stalled_reviews)
stalled_reviews_df = DataFrame(stalled_reviews_list, columns=["patch_serie", "subject", "last_reviewed_date"])
stalled_reviews_df
Out[49]:
In [50]:
query = """ SELECT COUNT(DISTINCT(ps.id)) as reviewed_patch_serie
FROM patch_series ps,
patch_series_version psv,
patches p,
flags f
WHERE ps.id = psv.ps_id AND
psv.id = p.ps_version_id AND
f.patch_id = p.id AND
f.flag='Reviewed-by' AND
ps.id not in
(SELECT COUNT(DISTINCT(ps.id)) as merged_patch_series
FROM patch_series ps,
patch_series_version psv,
patches p,
commits c
WHERE ps.id = psv.ps_id AND
psv.id = p.ps_version_id AND
p.commit_id = c.id) AND
f.date_utc < DATE_SUB(NOW(), INTERVAL 365 DAY)"""
total_stalled_reviews = execute_query(cursor, query)
In [51]:
int(total_stalled_reviews[0][0])
Out[51]:
Metrics to retrieve:
In [52]:
query = """ SELECT ps_id as patch_serie,
COUNT(DISTINCT(version)) as versions,
MIN(date_utc) as patch_serie_first_date
FROM patch_series_version
GROUP BY ps_id """
iterations = execute_query(cursor, query)
In [53]:
iterations_list = list(iterations)
iterations_df = DataFrame(iterations_list, columns=["patch_serie", "versions", "patch_serie_first_date"])
fig, axs = plt.subplots(1,2, figsize=(12,5))
hist_chart = iterations_df["versions"].plot(kind="hist", ax=axs[0])
boxplot_chart = iterations_df["versions"].plot(kind="box", ax=axs[1])
#chart labels
hist_chart.set_xlabel("Iterations")
hist_chart.set_ylabel("Patches")
hist_chart.set_title("Histogram: number of iterations per patch")
boxplot_chart.set_ylabel("Iterations per patch")
boxplot_chart.set_title("Log scale: Dist. iterations per patch")
Out[53]:
In [54]:
mean_iterations = iterations_df.set_index("patch_serie_first_date").resample('M', how={"versions":np.mean})
median_iterations = iterations_df.set_index("patch_serie_first_date").resample('M', how={"versions":np.median})
dates = pd.date_range('2005-01', '2015-11', freq='M') # data starting in 2010-10 and ending in 2015-08
iterationsTS = Series(mean_iterations["versions"], index=dates)
iterations_medianTS = Series(median_iterations["versions"], index=dates)
iterations_chart = iterationsTS.plot()
iterations_medianTS.plot()
iterations_chart.set_xlabel("Time evolution")
iterations_chart.set_ylabel("iterations")
iterations_chart.set_title("Evolution of the mean (blue) and median (green) iterations per patch")
Out[54]:
In [55]:
query = """ SELECT psv.ps_id as patch_serie,
max(c.committer_date_utc) as merge_time,
COUNT(DISTINCT(a.file_id)) as touched_files,
SUM(cl.added) as added_lines,
SUM(cl.removed) as removed_lines
FROM patch_series_version psv,
patches p,
commits c,
xen_reports_cvsanaly.scmlog s,
xen_reports_cvsanaly.actions a,
xen_reports_cvsanaly.commits_lines cl
WHERE psv.id = p.ps_version_id AND
p.commit_id = c.id AND
c.rev=s.rev AND
s.id=a.commit_id AND
s.id=cl.commit_id
GROUP BY psv.ps_id """
modified_code = execute_query(cursor, query)
In [56]:
modified_code_list = list(modified_code)
modified_code_df = DataFrame(modified_code_list, columns=["patch_serie", "merge_time", "touched_files", "added_lines", "removed_lines"])
fig, axs = plt.subplots(1,2, figsize=(12,5))
hist_chart = modified_code_df["touched_files"].plot(kind="hist", ax=axs[0])
boxplot_chart = modified_code_df["touched_files"].plot(kind="box", ax=axs[1], logy=True)
#chart labels
hist_chart.set_xlabel("Touched files")
hist_chart.set_ylabel("Patch serie")
hist_chart.set_title("Histogram: number of touched files per patch")
boxplot_chart.set_ylabel("Touched files per patch serie")
boxplot_chart.set_title("Log scale: Dist. touched files per patch serie")
Out[56]:
In [57]:
# Touched files per patch serie in mean. This is also time series distributed to check if there are more touched files per patch series in mean and median
mean_modified_code = modified_code_df.set_index("merge_time").resample('M', how={"touched_files":np.mean})
median_modified_code = modified_code_df.set_index("merge_time").resample('M', how={"touched_files":np.median})
dates = pd.date_range('2005-01', '2015-11', freq='M') # data starting in 2010-10 and ending in 2015-08
modified_codeTS = Series(mean_modified_code["touched_files"], index=dates)
modified_code_medianTS = Series(median_modified_code["touched_files"], index=dates)
modified_code_chart = modified_codeTS.plot()
modified_code_medianTS.plot()
modified_code_chart.set_xlabel("Time evolution")
modified_code_chart.set_ylabel("Touched files")
modified_code_chart.set_title("Evolution of the mean (blue) and median (green) touched files per patch")
Out[57]:
In [58]:
# Let's calculate evolution of comments in general when they were sent
# We should ignore 1970 emails with wrong sent date
query = """ SELECT MIN(psv.date_utc) as patch_serie_first_date,
ps_id as patch_serie,
count(distinct(c.id)) as comments
FROM patch_series_version psv,
patches p,
comments c
WHERE psv.id = p.ps_version_id and
p.id = c.patch_id and
year(psv.date_utc) > 2004 and
year(psv.date_utc) < 2016
GROUP BY ps_id;
"""
comments = execute_query(cursor, query)
In [59]:
comments_list = list(comments)
comments_df = DataFrame(comments_list, columns=["patch_serie_first_date", "patch_serie", "comments"])
fig, axs = plt.subplots(1,2, figsize=(12,5))
hist_chart = comments_df["comments"].plot(kind="hist", ax=axs[0])
boxplot_chart = comments_df["comments"].plot(kind="box", ax=axs[1], logy=True)
#chart labels
hist_chart.set_xlabel("Comments")
hist_chart.set_ylabel("Patch series")
hist_chart.set_title("Histogram: number of comments per patch serie")
boxplot_chart.set_ylabel("Comments per patch serie")
boxplot_chart.set_title("Log scale: Dist. comments per patch serie")
Out[59]:
In [60]:
# Comments per patch serie in mean. This is also time series distributed to check if there are more comments per patch series in mean and median
mean_comments = comments_df.set_index("patch_serie_first_date").resample('M', how={"comments":np.mean})
median_comments = comments_df.set_index("patch_serie_first_date").resample('M', how={"comments":np.median})
dates = pd.date_range('2005-01', '2015-11', freq='M') # data starting in 2010-10 and ending in 2015-08
commentsTS = Series(mean_comments["comments"], index=dates)
comments_medianTS = Series(median_comments["comments"], index=dates)
comments_chart = commentsTS.plot()
comments_medianTS.plot()
comments_chart.set_xlabel("Time evolution")
comments_chart.set_ylabel("comments")
comments_chart.set_title("Evolution of the mean (blue) and median (green) comments per patch")
Out[60]:
In [61]:
query = """ SELECT distinct psv.ps_id as patch_serie,
psv.date_utc as first_send,
max(t.patches) as numpatches
FROM patch_series_version psv,
patches p,
(SELECT psv.ps_id,
p.ps_version_id,
count(distinct(p.id)) as patches
FROM patch_series_version psv,
patches p
WHERE psv.id = p.ps_version_id
GROUP BY psv.ps_id,
p.ps_version_id) t
WHERE psv.ps_id = t.ps_id AND
p.ps_version_id=t.ps_version_id AND
psv.id = p.ps_version_id
GROUP BY psv.ps_id
"""
patches_ps = execute_query(cursor, query)
In [62]:
numpatches_list = list(patches_ps)
numpatches_df = DataFrame(numpatches_list, columns=["patch_serie", "first_send", "numpatches"])
fig, axs = plt.subplots(1,2, figsize=(12,5))
hist_chart = numpatches_df["numpatches"].plot(kind="hist", ax=axs[0])
boxplot_chart = numpatches_df["numpatches"].plot(kind="box", ax=axs[1], logy=True)
#chart labels
hist_chart.set_xlabel("numpatches")
hist_chart.set_ylabel("Patches")
hist_chart.set_title("Histogram: number of numpatches per patch serie")
boxplot_chart.set_ylabel("numpatches per patch serie")
boxplot_chart.set_title("Log scale: Dist. numpatches per patch serie")
Out[62]:
In [63]:
# numpatches per patch serie in mean. This is also time series distributed to check if there are more numpatches per patch series in mean and median
mean_numpatches = numpatches_df.set_index("first_send").resample('M', how={"numpatches":np.mean})
median_numpatches = numpatches_df.set_index("first_send").resample('M', how={"numpatches":np.median})
dates = pd.date_range('2005-01', '2015-11', freq='M') # data starting in 2010-10 and ending in 2015-08
numpatchesTS = Series(mean_numpatches["numpatches"], index=dates)
numpatches_medianTS = Series(median_numpatches["numpatches"], index=dates)
numpatches_chart = numpatchesTS.plot()
numpatches_medianTS.plot()
numpatches_chart.set_xlabel("Time evolution")
numpatches_chart.set_ylabel("numpatches")
numpatches_chart.set_title("Evolution of the mean (blue) and median (green) numpatches per patch")
Out[63]:
Metrics to retrieve:
In [64]:
#TODO
In [65]:
#TODO
In [66]:
# Taking time2merge dataframe
time2merge_2015 = time2merge_df[time2merge_df["first_patch_date"] >= '2015-01-01']
time2merge_2014 = time2merge_df[time2merge_df["first_patch_date"] >= '2014-01-01']
time2merge_2014 = time2merge_2014[time2merge_2014["first_patch_date"] < '2015-01-01']
time2merge_2013 = time2merge_df[time2merge_df["first_patch_date"] >= '2013-01-01']
time2merge_2013 = time2merge_2013[time2merge_2013["first_patch_date"] < '2014-01-01']
time2merge_2012 = time2merge_df[time2merge_df["first_patch_date"] >= '2012-01-01']
time2merge_2012 = time2merge_2012[time2merge_2012["first_patch_date"] < '2013-01-01']
data=[time2merge_2012["time2merge_days"], time2merge_2013["time2merge_days"], time2merge_2014["time2merge_days"], time2merge_2015["time2merge_days"]]
figure = pylab.boxplot(data, sym="")
ticks = pylab.xticks([1, 2, 3, 4], ['2012', '2013', '2014', '2015'])
pylab.title("Time to merge divided by year (without outliers)")
pylab.ylabel("Days")
Out[66]:
In [67]:
# Taking time2merge dataframe
time2merge_2015_s1 = time2merge_df[time2merge_df["first_patch_date"] >= '2015-01-01']
time2merge_2015_s1 = time2merge_2015_s1[time2merge_2015_s1["first_patch_date"] < '2015-07-01']
time2merge_2014_s2 = time2merge_df[time2merge_df["first_patch_date"] >= '2014-07-01']
time2merge_2014_s2 = time2merge_2014_s2[time2merge_2014_s2["first_patch_date"] < '2015-01-01']
time2merge_2014_s1 = time2merge_df[time2merge_df["first_patch_date"] >= '2014-01-01']
time2merge_2014_s1 = time2merge_2014_s1[time2merge_2014_s1["first_patch_date"] < '2014-07-01']
time2merge_2013_s2 = time2merge_df[time2merge_df["first_patch_date"] >= '2013-01-01']
time2merge_2013_s2 = time2merge_2013_s2[time2merge_2013_s2["first_patch_date"] < '2014-01-01']
time2merge_2013_s1 = time2merge_df[time2merge_df["first_patch_date"] >= '2013-01-01']
time2merge_2013_s1 = time2merge_2013_s1[time2merge_2013_s1["first_patch_date"] < '2014-01-01']
time2merge_2012_s2 = time2merge_df[time2merge_df["first_patch_date"] >= '2012-07-01']
time2merge_2012_s2 = time2merge_2012_s2[time2merge_2012_s2["first_patch_date"] < '2013-01-01']
time2merge_2012_s1 = time2merge_df[time2merge_df["first_patch_date"] >= '2012-01-01']
time2merge_2012_s1 = time2merge_2012_s1[time2merge_2012_s1["first_patch_date"] < '2012-07-01']
data=[time2merge_2012_s1["time2merge_days"], time2merge_2012_s2["time2merge_days"], time2merge_2013_s1["time2merge_days"], time2merge_2013_s2["time2merge_days"], time2merge_2014_s1["time2merge_days"], time2merge_2014_s2["time2merge_days"], time2merge_2015_s1["time2merge_days"]]
figure = pylab.boxplot(data, sym="")
ticks = pylab.xticks([1, 2, 3, 4, 5, 6, 7], ['2012S1', '2012S2', '2013S1', '2013S2', '2014S1', '2014S2', '2015S1'])
pylab.title("Time to merge divided by year (without outliers)")
pylab.ylabel("Days")
Out[67]:
In [68]:
# Taking time2merge dataframe
time2merge_2015_s1 = time2merge_df[time2merge_df["merge_time"] >= '2015-01-01']
time2merge_2015_s1 = time2merge_2015_s1[time2merge_2015_s1["merge_time"] < '2015-07-01']
time2merge_2014_s2 = time2merge_df[time2merge_df["merge_time"] >= '2014-07-01']
time2merge_2014_s2 = time2merge_2014_s2[time2merge_2014_s2["merge_time"] < '2015-01-01']
time2merge_2014_s1 = time2merge_df[time2merge_df["merge_time"] >= '2014-01-01']
time2merge_2014_s1 = time2merge_2014_s1[time2merge_2014_s1["merge_time"] < '2014-07-01']
time2merge_2013_s2 = time2merge_df[time2merge_df["merge_time"] >= '2013-01-01']
time2merge_2013_s2 = time2merge_2013_s2[time2merge_2013_s2["merge_time"] < '2014-01-01']
time2merge_2013_s1 = time2merge_df[time2merge_df["merge_time"] >= '2013-01-01']
time2merge_2013_s1 = time2merge_2013_s1[time2merge_2013_s1["merge_time"] < '2014-01-01']
time2merge_2012_s2 = time2merge_df[time2merge_df["merge_time"] >= '2012-07-01']
time2merge_2012_s2 = time2merge_2012_s2[time2merge_2012_s2["merge_time"] < '2013-01-01']
time2merge_2012_s1 = time2merge_df[time2merge_df["merge_time"] >= '2012-01-01']
time2merge_2012_s1 = time2merge_2012_s1[time2merge_2012_s1["merge_time"] < '2012-07-01']
data=[time2merge_2012_s1["time2merge_days"], time2merge_2012_s2["time2merge_days"], time2merge_2013_s1["time2merge_days"], time2merge_2013_s2["time2merge_days"], time2merge_2014_s1["time2merge_days"], time2merge_2014_s2["time2merge_days"], time2merge_2015_s1["time2merge_days"]]
figure = pylab.boxplot(data, sym="")
ticks = pylab.xticks([1, 2, 3, 4, 5, 6, 7], ['2012S1', '2012S2', '2013S1', '2013S2', '2014S1', '2014S2', '2015S1'])
pylab.title("Time to merge divided by year (without outliers)")
pylab.ylabel("Days")
Out[68]:
In [69]:
query = """ SELECT psv.ps_id as patch_serie,
TIMESTAMPDIFF(SECOND, MIN(psv.date_utc), MAX(c.committer_date_utc)) as time2merge,
MIN(psv.date_utc) as first_patch_date,
MAX(c.committer_date_utc) as merge_time,
MAX(t.patches) as numpatches
FROM patch_series_version psv,
patches p,
(SELECT psv.ps_id,
p.ps_version_id,
count(*) as patches
FROM patch_series_version psv,
patches p
WHERE psv.id = p.ps_version_id
GROUP BY psv.ps_id,
p.ps_version_id) t,
commits c
WHERE psv.ps_id = t.ps_id AND
p.ps_version_id=t.ps_version_id AND
psv.id = p.ps_version_id AND
p.commit_id = c.id
GROUP BY psv.ps_id
HAVING time2merge > 0 and time2merge < 1000*3600*24"""
time2merge_extra = execute_query(cursor, query)
In [70]:
time2merge_extra_list = list(time2merge_extra)
time2merge_extra_df = DataFrame(time2merge_extra_list, columns=["patch_serie", "time2merge", "first_patch_date", "merge_time", "numpatches"])
time2merge_extra_df["time2merge_days"] = (time2merge_extra_df["time2merge"]/3600)/24
In [71]:
time2merge_extra_df_1patch = time2merge_extra_df[time2merge_extra_df["numpatches"]==1]
time2merge_extra_df_2patch = time2merge_extra_df[time2merge_extra_df["numpatches"]==2]
time2merge_extra_df_3patch = time2merge_extra_df[time2merge_extra_df["numpatches"]==3]
time2merge_extra_df_4patch = time2merge_extra_df[time2merge_extra_df["numpatches"]==4]
time2merge_extra_df_5patch = time2merge_extra_df[time2merge_extra_df["numpatches"]>4]
data=[time2merge_extra_df_1patch["time2merge_days"], time2merge_extra_df_2patch["time2merge_days"], time2merge_extra_df_3patch["time2merge_days"],
time2merge_extra_df_4patch["time2merge_days"], time2merge_extra_df_5patch["time2merge_days"]]
figure = pylab.boxplot(data, sym="")
ticks = pylab.xticks([1, 2, 3, 4, 5], ['1 patch', '2 patches', '3 patches', '4 patches', '> 4 patches'])
pylab.title("Time to merge divided by number of patches (without outliers)")
pylab.ylabel("Days")
Out[71]:
In [72]:
def draw_boxplots(title, data_df, field):
# Taking data dataframe
data_2015_s1 = data_df[data_df[field] >= '2015-01-01']
data_2015_s1 = data_2015_s1[data_2015_s1[field] < '2015-07-01']
data_2014_s2 = data_df[data_df[field] >= '2014-07-01']
data_2014_s2 = data_2014_s2[data_2014_s2[field] < '2015-01-01']
data_2014_s1 = data_df[data_df[field] >= '2014-01-01']
data_2014_s1 = data_2014_s1[data_2014_s1[field] < '2014-07-01']
data_2013_s2 = data_df[data_df[field] >= '2013-01-01']
data_2013_s2 = data_2013_s2[data_2013_s2[field] < '2014-01-01']
data_2013_s1 = data_df[data_df[field] >= '2013-01-01']
data_2013_s1 = data_2013_s1[data_2013_s1[field] < '2014-01-01']
data_2012_s2 = data_df[data_df[field] >= '2012-07-01']
data_2012_s2 = data_2012_s2[data_2012_s2[field] < '2013-01-01']
data_2012_s1 = data_df[data_df[field] >= '2012-01-01']
data_2012_s1 = data_2012_s1[data_2012_s1[field] < '2012-07-01']
data=[data_2012_s1["time2merge_days"], data_2012_s2["time2merge_days"], data_2013_s1["time2merge_days"], data_2013_s2["time2merge_days"], data_2014_s1["time2merge_days"], data_2014_s2["time2merge_days"], data_2015_s1["time2merge_days"]]
figure = pylab.boxplot(data, sym="")
ticks = pylab.xticks([1, 2, 3, 4, 5, 6, 7], ['2012S1', '2012S2', '2013S1', '2013S2', '2014S1', '2014S2', '2015S1'])
pylab.title("Time to merge divided by year (without outliers)" + title)
pylab.ylabel("Days")
In [73]:
draw_boxplots(" 1 patch", time2merge_extra_df_1patch, "first_patch_date")
In [74]:
draw_boxplots(" 2 patches", time2merge_extra_df_2patch, "first_patch_date")