In [1]:
# Read the git log (numstat) of the repository under analysis into a DataFrame.
# NOTE(review): hardcoded absolute Windows path — consider a configurable
# DATA_DIR / relative path so the notebook runs on other machines.
from lib.ozapfdis import git_tc
log = git_tc.log_numstat("C:/dev/repos/buschmais-spring-petclinic")
log.head()
Out[1]:
In [2]:
# Keep only Java source files. Use str.endswith so the "." is taken literally —
# str.contains(".java") interprets the pattern as a regex, where "." matches
# ANY character (so e.g. "myxjava" or "foo.javascript" paths would slip in).
# .copy() avoids pandas' SettingWithCopyWarning on the assignments below.
log = log[log.file.str.endswith(".java")].copy()
# Classify each file by persistence technology based on its path.
log.loc[log.file.str.contains("/jdbc/"), 'type'] = "jdbc"
log.loc[log.file.str.contains("/jpa/"), 'type'] = "jpa"
# Everything not matched above is lumped into "other".
log.loc[log.type.isna(), 'type'] = "other"
log.head()
Out[2]:
In [3]:
import numpy as np
import pandas as pd

np.random.seed(0)

# Draw per-commit line deltas in four phases: one growth period followed by
# three shrinking periods. Each tuple is (mean, std, number of commits);
# values are drawn one at a time to keep the RNG stream order stable.
phases = [(30, 50, 600), (-50, 100, 200), (-2, 20, 200), (-3, 10, 200)]
added_lines = []
for mean, std, n_commits in phases:
    added_lines.extend(int(np.random.normal(mean, std)) for _ in range(n_commits))

df_jdbc = pd.DataFrame({'lines': added_lines})
df_jdbc.head()
Out[3]:
In [4]:
# One entry for every second of a day: 00:00:00 .. 23:59:59 (86,400 values).
# Used later to attach a random time-of-day to each commit date.
second_grid = pd.timedelta_range(start="00:00:00", end="23:59:59", freq="s")
times = pd.Series(second_grid)
times.head()
Out[4]:
In [5]:
# Candidate commit dates: weekdays only (dayofweek 5/6 = Sat/Sun are dropped).
all_days = pd.to_datetime(pd.date_range('2013-05-15', '2017-07-23'))
weekdays = all_days[~all_days.dayofweek.isin([5, 6])]
dates = pd.Series(weekdays)
# Attach a random time-of-day (sampled with replacement) to every date.
dates = dates + times.sample(len(dates), replace=True).values
dates.head()
Out[5]:
In [6]:
# Give every synthetic jdbc commit a timestamp, in chronological order.
sampled_dates = dates.sample(len(df_jdbc), replace=True)
df_jdbc['timestamp'] = sampled_dates.sort_values().reset_index(drop=True)
# Index is already 0..n-1; sort_index keeps the frame in insertion order.
df_jdbc = df_jdbc.sort_index()
df_jdbc.head()
Out[6]:
In [7]:
# Simulate a large initial commit: the very first entry adds 250 lines.
df_jdbc.loc[0, 'lines'] = 250
df_jdbc.head()
Out[7]:
In [8]:
df_jdbc = df_jdbc
In [9]:
df_jdbc['file'] = log[log['type'] == 'jdbc']['file'].sample(len(df_jdbc), replace=True).values
In [10]:
%matplotlib inline
# Distribution of the synthetic jdbc line deltas — a mixture of the
# growth phase (mean 30) and the three shrinking phases.
df_jdbc.lines.hist()
Out[10]:
Sum up the data and check that it was created as intended.
In [11]:
# Cumulative net line count over time ('count' = running sum of the signed deltas).
df_jdbc_timed = df_jdbc.set_index('timestamp')
df_jdbc_timed['count'] = df_jdbc_timed.lines.cumsum()
df_jdbc_timed['count'].plot()
Out[11]:
In [12]:
# Last timestamp at which the cumulative line count is still non-negative —
# beyond this point the synthetic history would delete more lines than exist.
# NOTE(review): the name says "non_zero" but the condition is >= 0 (non-negative);
# not renamed here because the following cell references this name.
last_non_zero_timestamp = df_jdbc_timed[df_jdbc_timed['count'] >= 0].index.max()
last_non_zero_timestamp
Out[12]:
In [13]:
# Truncate the history so the cumulative line count never goes negative.
valid_rows = df_jdbc.timestamp <= last_non_zero_timestamp
df_jdbc = df_jdbc.loc[valid_rows]
df_jdbc.head()
Out[13]:
In [14]:
# Synthesize the jpa history: 600 commits with normally distributed line deltas,
# shifted two years after the jdbc timestamps.
jpa_deltas = [int(np.random.normal(20, 50)) for _ in range(600)]
df_jpa = pd.DataFrame({'lines': jpa_deltas})
# Large initial commit, as for the jdbc data.
df_jpa.loc[0, 'lines'] = 150
df_jpa['timestamp'] = pd.DateOffset(years=2) + dates.sample(len(df_jpa), replace=True).sort_values().reset_index(drop=True)
df_jpa = df_jpa.sort_index()
df_jpa['file'] = log.loc[log['type'] == 'jpa', 'file'].sample(len(df_jpa), replace=True).values
df_jpa.head()
Out[14]:
In [15]:
df_jpa.lines.hist()
Out[15]:
In [16]:
# Cumulative net line count of the jpa code over time.
df_jpa_timed = df_jpa.set_index('timestamp')
df_jpa_timed['count'] = df_jpa_timed.lines.cumsum()
df_jpa_timed['count'].plot()
Out[16]:
In [17]:
# Weekday timestamps spanning from the first jdbc commit to the last jpa commit,
# again with a random time-of-day attached to each date.
span = pd.to_datetime(pd.date_range(df_jdbc.timestamp.min(), df_jpa.timestamp.max()))
span = span[~span.dayofweek.isin([5, 6])]
dates_other = pd.Series(span) + times.sample(len(span), replace=True).values
dates_other.head()
Out[17]:
In [18]:
# Synthesize the "other" (non-persistence) history: 40,000 commits over the
# full time span, roughly zero-centered deltas with high variance.
other_deltas = [int(np.random.normal(5, 100)) for _ in range(40000)]
df_other = pd.DataFrame({'lines': other_deltas})
df_other['timestamp'] = dates_other.sample(len(df_other), replace=True).sort_values().reset_index(drop=True)
df_other = df_other.sort_index()
df_other['file'] = log.loc[log['type'] == 'other', 'file'].sample(len(df_other), replace=True).values
df_other.head()
Out[18]:
In [19]:
df_other.lines.hist()
Out[19]:
In [20]:
# Cumulative net line count of the remaining ("other") code over time.
df_other_timed = df_other.set_index('timestamp')
df_other_timed['count'] = df_other_timed.lines.cumsum()
df_other_timed['count'].plot()
Out[20]:
In [ ]:
In [21]:
# Merge the three synthetic histories into one chronological git log.
df = pd.concat([df_jpa, df_jdbc, df_other], ignore_index=True).sort_values(by='timestamp')
# Split the signed 'lines' delta into separate additions/deletions columns
# (each assignment leaves NaN in the rows it does not match).
df.loc[df.lines > 0, 'additions'] = df.lines
df.loc[df.lines < 0, 'deletions'] = df.lines * -1
# Fill the NaNs from the split (and rows with lines == 0) with 0; reset the
# index so row 0 is the earliest commit.
df = df.fillna(0).reset_index(drop=True)
df = df[['additions', 'deletions', 'file', 'timestamp']]
# The very first commit (all rows sharing the earliest timestamp, df.loc[0])
# cannot delete anything: turn its deletions into additions, then zero them.
df.loc[(df.deletions > 0) & (df.loc[0].timestamp == df.timestamp),'additions'] = df.deletions
df.loc[df.loc[0].timestamp == df.timestamp,'deletions'] = 0
df['additions'] = df.additions.astype(int)
df['deletions'] = df.deletions.astype(int)
# Git logs are conventionally newest-first.
df = df.sort_values(by='timestamp', ascending=False)
df.head()
Out[21]:
In [22]:
# Keep only commits before 2018 so the dataset ends at a clean boundary.
cutoff = pd.Timestamp('2018-01-01')
df = df[df.timestamp < cutoff]
df.head()
Out[22]:
In [23]:
df.to_csv("datasets/git_log_refactoring.gz", index=None, compression='gzip')
In [24]:
# Round-trip check: reload the CSV we just wrote.
# NOTE(review): 'timestamp' presumably comes back as plain strings here unless
# parse_dates=['timestamp'] is passed — verify with the .info() output below.
df_loaded = pd.read_csv("datasets/git_log_refactoring.gz")
df_loaded.head()
Out[24]:
In [25]:
df_loaded.info()