In [1]:
import pandas as pd
# Read the Git log export into a DataFrame. read_csv infers the compression
# from the ".gz" suffix, so no explicit decompression step is needed.
log = pd.read_csv(
    "datasets/git_log_refactoring.gz",
)
# Preview the first rows to see which columns the export provides.
log.head()
Out[1]:
In [2]:
# Print a summary of the raw log: column names, dtypes and non-null counts.
log.info()
In [3]:
# Turn the 'timestamp' column into real datetimes, order the changes
# chronologically, and renumber the rows from 0 so the index follows that order.
log = (
    log
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .sort_values(by='timestamp')
    .reset_index(drop=True)
)
log.head()
Out[3]:
In [4]:
# Tag each change with the database access technology its file path belongs to.
# Order matters: a path containing both folder names ends up as 'jpa', because
# the later assignment overwrites the earlier one.
for technology in ('jdbc', 'jpa'):
    # regex=False: the folder marker is a literal substring, not a pattern.
    # na=False: a row without a file path counts as "no match" instead of
    # putting NaN into the boolean mask, which .loc would reject.
    in_technology = log['file'].str.contains(
        "/" + technology + "/", regex=False, na=False)
    log.loc[in_technology, 'type'] = technology

# Rows that matched neither folder have no 'type'; they are not part of the
# comparison, so drop them.
log = log.dropna(subset=['type'])
log.head()
Out[4]:
In [5]:
# Net size change per row: lines added minus lines removed. A negative value
# means the change removed more lines than it added.
log['lines'] = log['additions'].sub(log['deletions'])
log.head()
Out[5]:
In [6]:
# Total net line change for every (timestamp, type) pair; the result is a
# Series with a two-level index.
log_timed = log.groupby(['timestamp', 'type'])['lines'].sum()
log_timed.head()
Out[6]:
In [7]:
# Pivot the 'type' index level into columns (one per technology), treating
# timestamps where a technology has no entry as 0, then accumulate over time
# to get the running net line count of each technology.
log_progess = (
    log_timed
    .unstack(fill_value=0)
    .cumsum()
)
log_progess.head()
Out[7]:
In [30]:
%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg')
ax = log_progess.plot()
ax.set_title("Reengineering of the database access technology")
ax.set_xlabel("time")
ax.set_ylabel("changes");
In [45]:
# Yearly snapshot: for each calendar year keep the last cumulative value of
# every technology, and label the resulting index "year".
progress_per_year = (
    log_progess
    .groupby(log_progess.index.year)
    .last()
    .rename_axis("year")
)
progress_per_year
Out[45]:
In [17]:
import matplotlib.pyplot as plt
# Save the figure that owns `ax` rather than pyplot's "current" figure: the
# inline backend closes figures at the end of the cell that drew them, so
# plt.savefig() in a later cell would write a new, empty figure.
ax.get_figure().savefig("reengineering.svg", format="svg")