In [75]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import pdb
import numpy as np
import datetime
file_json = open('closed.json')
data_json = json.load(file_json)
data_df = pd.DataFrame(data_json, copy=True)
git_df_temp = data_df[['title', 'created_at', 'labels', 'closed_at', 'id']]
git_user_temp = data_df['user']
value_list = []
for i, row_entry in git_user_temp.iteritems():
    val = row_entry['login']
    value_list.append(val)
user_df = pd.Series(value_list,index=git_user_temp.index)
user_df.name = 'user'
git_df = git_df_temp.join(user_df)
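# Editorial aside (hedged sketch, not from the original session): the
# login-extraction loop above can be collapsed into a single map call; the
# consolidated script later in this notebook does the same thing via
# user_extract. A self-contained illustration with a made-up stand-in for
# data_df['user']:
import pandas as pd
demo_user = pd.Series([{'login': 'user_a'}, {'login': 'user_b'}])
demo_login = demo_user.map(lambda d: d['login'])   # Series of login strings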
In [110]:
data_df.user.head()
Out[110]:
In [90]:
git_df
Out[90]:
In [40]:
git_df_2 = pd.concat([git_df_temp,user_df],axis=1)
In [76]:
git_df.drop_duplicates(cols='id',inplace=True)
git_df['created_at'] = pd.to_datetime(git_df['created_at'])
git_df['closed_at'] = pd.to_datetime(git_df['closed_at'])
git_df.set_index('created_at', inplace=True, drop=False)
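# Editorial aside (hedged): drop_duplicates(cols=...) is the older pandas
# spelling; newer pandas versions use the subset= keyword instead. A minimal,
# self-contained sketch of the equivalent call on a newer version:
import pandas as pd
demo = pd.DataFrame({'id': [1, 1, 2], 'title': ['a', 'a (dup)', 'b']})
demo = demo.drop_duplicates(subset='id')   # keeps the first row per id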
In [82]:
git_df.head()
Out[82]:
In [73]:
def count(created_in):
    return len(created_in)
In [68]:
git_df.set_index('created_at', inplace = True)
In [72]:
git_df.
Out[72]:
In [79]:
issue_df = git_df.title.resample('M', how= count)
issue_df.name = 'Number of issues'
plt.figure()
issue_ax = issue_df.plot()
issue_ax.set_ylabel(issue_df.name)
Out[79]:
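# Editorial aside (hedged): resample('M', how=...) is the older resample API.
# On newer pandas the same monthly issue count is written with a method call
# after resample. Self-contained sketch on synthetic timestamps:
import pandas as pd
demo_titles = pd.Series(['issue A', 'issue B', 'issue C'],
                        index=pd.to_datetime(['2013-01-05', '2013-01-20', '2013-02-03']))
monthly = demo_titles.resample('M').size()   # number of issues opened per month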
In [78]:
%matplotlib
In [85]:
def distinct_user(resample_input):
    #print resample_input.user
    distinct_input = resample_input.drop_duplicates()
    #print distinct_input.user
    return len(distinct_input.index)
user_monthly = git_df.user.resample('M', how=distinct_user)
user_monthly.name = 'Number of distinct users'
plt.figure()
user_ax = user_monthly.plot()
user_ax.set_ylabel(user_monthly.name)
Out[85]:
In [86]:
import datetime
In [87]:
def mean_day(close_at_input):
    try:
        create_at_series = pd.Series(close_at_input.index, index=close_at_input.index, name=close_at_input.index.name)
        diff = close_at_input - create_at_series
        sec_mean = diff.sum()/(len(diff))
        #pdb.set_trace()
        return (datetime.timedelta(microseconds=int(sec_mean/1000))).days
    except:
        return np.nan
open_day = git_df.closed_at.resample('M', mean_day)
open_day.name = 'mean_days'
open_day_nissue = pd.concat([issue_df, open_day], axis=1)
plt.figure()
line1 = open_day_nissue['Number of issues'].plot()
line1.set_ylabel('Number of issues')
plt.figure()
line2 = open_day_nissue['mean_days'].plot()
line2.set_ylabel('mean_days')
Out[87]:
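# Editorial aside (hedged): the microsecond arithmetic in mean_day is only
# needed on older pandas; on newer versions a Series of Timedelta differences
# supports .mean() directly. Self-contained sketch with made-up open/close times:
import pandas as pd
created = pd.to_datetime(['2013-01-01', '2013-01-10'])
closed = pd.Series(pd.to_datetime(['2013-01-04', '2013-01-20']), index=created)
open_time = closed - pd.Series(created, index=created)
print(open_time.mean().days)   # mean days an issue stayed open (here: 6)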
In [88]:
open_day_nissue.head()
Out[88]:
In [79]:
comments_df = data_df[['created_at','comments', 'id' ]]
#print comments_df.head()
comments_df.drop_duplicates(cols='id', inplace=True)
In [50]:
comments_df.comments[100][0]['text']
Out[50]:
In [80]:
def comment_func(comment_list):
    try:
        return comment_list[0]['text']
    except:
        return np.nan
comments_df.comments = comments_df.comments.map(comment_func)
In [81]:
comments_df.head()
Out[81]:
In [2]:
chattiest_df = git_df[['created_at', 'user', 'id']]
In [3]:
chattiest_df.head()
Out[3]:
In [4]:
chattiest_df['created_at'] = pd.to_datetime(chattiest_df['created_at'])
chattiest_df.set_index('created_at', inplace=True)
In [5]:
chattiest_df.head()
Out[5]:
In [100]:
del chattiest_df['id']
In [102]:
chattiest_df.head()
Out[102]:
In [6]:
chattiest_se = chattiest_df['user']
In [8]:
def distinct(in_df):
    return len(in_df.unique())
nDistinct_se = chattiest_se.resample('M', how = distinct)
nDistinct_se.head()
Out[8]:
In [112]:
chattiest_se.head()
Out[112]:
In [113]:
chattiest_se.resample('M', how = [distinct, percent, chattiest_user, count])
Out[113]:
In [29]:
def percent(in_se):
    try:
        counts = in_se.value_counts().order(ascending = False)
        return counts.ix[0]/float(counts.sum())
    except:
        return np.nan
percent_se = 100*chattiest_se.resample('M', how = percent)
In [14]:
def chattiest_user(in_se):
    try:
        counts = in_se.value_counts().order(ascending = False)
        return counts.index[0]
    except:
        return np.nan
chatUser_se = chattiest_se.resample('M', how = chattiest_user)
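# Editorial aside (hedged): since value_counts() already sorts in descending
# order, the chattiest user can also be read off with idxmax(). Self-contained
# sketch on made-up usernames:
import pandas as pd
demo_users = pd.Series(['user_a', 'user_b', 'user_a'])
print(demo_users.value_counts().idxmax())   # -> 'user_a'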
In [15]:
chatUser_se.head()
Out[15]:
In [21]:
nComments_se = chattiest_se.resample('M', how = count)
print nComments_se.head()
In [30]:
user = pd.DataFrame({'nIssue': nComments_se, 'chattiest': chatUser_se,
                     'percentage of the chattiest': percent_se, 'nParticipants': nDistinct_se},
                    columns=['nParticipants', 'nIssue', 'chattiest', 'percentage of the chattiest'])
In [117]:
user.columns
Out[117]:
In [98]:
id_labels_df = data_df[['id', 'labels', 'created_at']].drop_duplicates(cols = 'id', inplace = False)
In [99]:
id_labels_df.columns
Out[99]:
In [101]:
id_labels_list = []
for (idx, Id) in id_labels_df.id.iteritems():
    if len(id_labels_df.labels.ix[idx]):
        for label_dict in id_labels_df.labels.ix[idx]:
            id_labels_list.append((Id, label_dict['name'], id_labels_df.created_at[idx]))
    else:
        id_labels_list.append((Id, np.nan, id_labels_df.created_at[idx]))
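# Editorial aside (hedged): on newer pandas (0.25+) the same id/label expansion
# can be done without an explicit loop by exploding the list-valued labels
# column and then pulling out each dict's 'name'. Self-contained sketch with a
# made-up labels column shaped like the GitHub payload:
import numpy as np
import pandas as pd
demo = pd.DataFrame({'id': [1, 2],
                     'labels': [[{'name': 'Bug'}, {'name': 'Docs'}], []],
                     'created_at': pd.to_datetime(['2013-01-05', '2013-01-06'])})
exploded = demo.explode('labels')   # empty label lists become NaN rows
exploded['label'] = exploded['labels'].map(lambda d: d['name'] if isinstance(d, dict) else np.nan)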
In [102]:
id_labels_df = pd.DataFrame(id_labels_list, columns=['id', 'label', 'created_at'])
In [104]:
id_labels_df['created_at'] = pd.to_datetime(id_labels_df['created_at'])
In [106]:
id_labels_df.set_index('created_at', inplace=True)
In [107]:
id_labels_df.head()
Out[107]:
In [108]:
issue_df
Out[108]:
In [56]:
git_df.labels.ix[2956]
Out[56]:
In [54]:
data_df.columns
Out[54]:
In [81]:
user
Out[81]:
In [111]:
data_df.columns
Out[111]:
In [115]:
import numpy as np
from numpy import nan as NaN
In [116]:
NaN
Out[116]:
In [203]:
import pdb
import datetime
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from numpy import nan as NaN
# Functions defined
def user_extract(dict_in):
    '''Extract the value of the 'login' key.'''
    return dict_in['login']

def count(resample_in):
    '''Count the number of samples in the resampled data.'''
    return len(resample_in)

def distinct_user(resample_in):
    '''Count the number of distinct users in the resampled data.'''
    unique_in = resample_in.drop_duplicates()
    return len(unique_in)

def mean_day(resample_in):
    '''Calculate the mean number of days issues stay open in the resampled data.'''
    try:
        # Create a Series whose values equal the created_at index
        created_at_series = pd.Series(
            resample_in.index, index=resample_in.index)
        diff = resample_in - created_at_series
        sec_mean = diff.sum()/float(len(diff))
        # Convert to a timedelta object
        td = datetime.timedelta(microseconds=int(sec_mean/1000.))
        # Return the day attribute of the time difference
        return td.days
    except Exception:
        return NaN

def comment_func(comment_list):
    '''Extract the comment string from the comments column.'''
    try:
        return comment_list[0]['text']
    except Exception:
        return NaN

def chattiest_user(resample_se):
    '''Find the chattiest user in a month.'''
    try:
        # Sort descending so the most frequent commenter comes first
        counts = resample_se.value_counts().order(ascending=False)
        return counts.index[0]
    except Exception:
        return NaN

def distinct(resample_se):
    '''Find the number of distinct users who commented on pandas issues.'''
    return len(resample_se.unique())

def percent(resample_se):
    '''Find the percentage of comments provided by the chattiest user.'''
    try:
        counts = resample_se.value_counts().order(ascending=False)
        return int(counts.ix[0]/float(counts.sum())*100)
    except Exception:
        return NaN
# Read and load data
json_file = open('closed.json')
json_data = json.load(json_file)
all_data_df = pd.DataFrame(json_data)
###########
# part(1) #
###########
# p1 is the dataframe with title, created_at, labels, closed_at, user,
# and id as columns
p1 = all_data_df[['title', 'created_at', 'labels', 'closed_at',
                  'user', 'id']]
# Convert the user dicts to username strings
p1.user = p1.user.map(user_extract)
###########
# part(2) #
###########
# Drop the duplicate rows using id inplace
p1.drop_duplicates(cols='id', inplace=True)
###########
# part(4) #
###########
# Convert created_at and closed_at columns from string to datetime
p1['created_at'] = pd.to_datetime(p1['created_at'])
p1['closed_at'] = pd.to_datetime(p1['closed_at'])
###########
# part(5) #
###########
# Set 'created_at' as index
p1.set_index('created_at', inplace=True)
# Make the monthly number of issue plot
issue_month = p1.title.resample('M', how=count)
issue_month.name = 'Number of Issues'
plt.figure()
issue_ax = issue_month.plot()
issue_ax.set_ylabel(issue_month.name)
# Make the monthly distinct user number plot
distinct_month = p1.user.resample('M', how=distinct_user)
distinct_month.name = 'Number of Distinct Users'
plt.figure()
distinct_ax = distinct_month.plot()
distinct_ax.set_ylabel(distinct_month.name)
###########
# part(6) #
###########
# Resample the closed_at and return a series of mean open day
open_day = p1.closed_at.resample('M', how=mean_day)
open_day.name = 'Mean Open Day'
# Concatenate monthly issue number with mean open day
open_day = pd.concat([issue_month, open_day], axis=1)
open_day.columns = ['nIssues', 'mean_days']
plt.figure()
line1 = open_day['nIssues'].plot()
line1.set_ylabel('Number of issues')
plt.figure()
line2 = open_day['mean_days'].plot()
line2.set_ylabel('Mean Open Day')
open_day.to_pickle('mean_day.pkl')
print '_'*80
print open_day.head(20)
###########
# part(7) #
###########
# Create the comments dataframe
comm_df = all_data_df[['created_at', 'comments', 'id']]
comm_df.drop_duplicates(cols='id', inplace=True)
comm_df.comments = comm_df.comments.map(comment_func)
comm_df.created_at = pd.to_datetime(comm_df.created_at)
comm_df.set_index('created_at', inplace=True)
print '_'*80
print comm_df.head(20)
comm_df.to_pickle('comments.pkl')
###########
# part(8) #
###########
# Create user dataframe
user_df = all_data_df[['created_at', 'user', 'id']]
user_df.user = user_df.user.map(user_extract)
user_df.created_at = pd.to_datetime(user_df.created_at)
user_df.set_index('created_at', inplace=True)
user_se = user_df['user']
chattiest_df = user_se.resample('M', how=[count, chattiest_user, percent, distinct])
chattiest_df.columns = ['Number of comments', 'The chattiest', 'Percentage of the chattiest(%)',
                        'Number of participants']
print '_'*80
print chattiest_df.head(20)
chattiest_df.to_pickle('chattiest.pkl')
###########
# part(9) #
###########
# Create the id_label dataframe with the creation time as index
id_labels_temp = all_data_df[['id', 'labels', 'created_at']]
id_labels_temp.drop_duplicates(cols='id', inplace=True)
id_labels_list = []
for (idx, Id) in id_labels_temp.id.iteritems():
    if len(id_labels_temp.labels.ix[idx]):
        # One row per (issue id, label name) pair
        for label_dict in id_labels_temp.labels.ix[idx]:
            id_labels_list.append((Id, label_dict['name'], id_labels_temp.created_at[idx]))
    else:
        # Issues without labels get a NaN label
        id_labels_list.append((Id, np.nan, id_labels_temp.created_at[idx]))
id_labels_df = pd.DataFrame(id_labels_list, columns=['id', 'label', 'created_at'])
id_labels_df['created_at'] = pd.to_datetime(id_labels_df['created_at'])
id_labels_df.set_index('created_at', inplace=True)
In [124]:
id_labels_df.head()
Out[124]:
In [129]:
open_day.head()
Out[129]:
In [132]:
id_labels_df.head()
Out[132]:
In [133]:
p1.closed_at.head()
Out[133]:
In [139]:
id_labels_df.ix[60:100]
Out[139]:
In [135]:
p1.head()
Out[135]:
In [141]:
def open_day_per_issue(series_in):
    ''' Calculate the days an issue takes to be solved '''
    created_at_series = pd.Series(series_in.index, index=series_in.index)
    diff = series_in - created_at_series
    return float(diff.days)
In [147]:
open_day_per_issue_se = p1.closed_at.apply(open_day_per_issue)
In [143]:
type(p1.closed_at)
Out[143]:
In [144]:
p1.closed_at.index
Out[144]:
In [145]:
%pdb
In [149]:
user.ix[0][1].index
In [151]:
p1_reset = p1.reset_index()
In [152]:
p1_reset.head()
Out[152]:
In [161]:
(p1_reset['created_at']-p1_reset['closed_at']).apply(datetime.timedelta.total_seconds())
In [169]:
a = -(p1_reset['created_at']-p1_reset['closed_at'])
In [164]:
a.map(datetime.timedelta.total_seconds)
In [167]:
a.ix[0][1]
In [171]:
a.ix[0]
Out[171]:
In [173]:
p1_reset
Out[173]:
In [174]:
created_at = p1_reset['created_at']
closed_at = p1_reset['closed_at']
In [175]:
time_diff = closed_at - created_at
In [176]:
time_diff.head()
Out[176]:
In [177]:
time_diff.map(datetime.timedelta.total_seconds)
In [178]:
x = np.timedelta64(2069211000000000, 'ns')
days = x.astype('timedelta64[D]')
In [179]:
days
Out[179]:
In [181]:
a.map(lambda td : td.days)
In [189]:
t = a[0]
In [190]:
t
Out[190]:
In [191]:
24*60*60*1e9
Out[191]:
In [192]:
created = pd.Series(p1.index, index=p1.index)
closed = p1.closed_at
In [195]:
time_diff = closed - created
In [196]:
time_diff.head()
Out[196]:
In [204]:
created = pd.Series(p1.index, index=p1.index)
closed = p1.closed_at
time_diff = closed - created
SEC_DAY = 24*60*60*1e9
open_day_per_issue = time_diff.map(lambda td: td/SEC_DAY)
open_day_per_issue.name = 'open_day'
print open_day_per_issue.head()
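# Editorial aside (hedged): the 24*60*60*1e9 divisor works because the
# timedelta64 differences here are stored in nanoseconds; on newer pandas the
# same open-day values are available through the .dt accessor. Self-contained
# sketch on made-up timedeltas:
import pandas as pd
demo_diff = pd.Series(pd.to_timedelta(['3 days 12:00:00', '10 days']))
print(demo_diff.dt.days)                     # whole days: 3 and 10
print(demo_diff.dt.total_seconds() / 86400)  # fractional days: 3.5 and 10.0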
In [ ]:
open_day_per_issue
In [199]:
id_labels_df.head()
Out[199]:
In [219]:
label_open_df = pd.merge(id_labels_df.reset_index(), open_day_per_issue.reset_index(), on = 'created_at')
In [236]:
label_open_df.ix[label_open_df['label']=='Bug']['open_day']
Out[236]:
In [231]:
a = label_open_df['label'].drop_duplicates()
In [235]:
a.values
Out[235]:
In [212]:
open_day_per_issue.reset_index()
Out[212]:
In [269]:
label_open_df = pd.merge(id_labels_df.reset_index(), open_day_per_issue.reset_index(),
                         on='created_at')
label_open_df.set_index('created_at', inplace=True)
bug_df = label_open_df.ix[label_open_df['label']=='Bug']
LABELS = label_open_df['label'].drop_duplicates().values[1:]
time_per_label = pd.DataFrame([])
for label_str in LABELS:
    temp = label_open_df.ix[label_open_df['label']==label_str]['open_day']
    label_month = temp.resample('M', how='mean')
    label_month.name = label_str
    time_per_label = time_per_label.join(label_month, how='outer')
print time_per_label.head()
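# Editorial aside (hedged): the per-label monthly means built up by the join
# loop above can also be produced in one call with pivot_table, grouping the
# open days by month and label. A sketch against the same label_open_df,
# assuming a newer pandas where pd.Grouper is accepted in pivot_table:
time_per_label_alt = label_open_df.reset_index().pivot_table(
    values='open_day', index=pd.Grouper(key='created_at', freq='M'),
    columns='label', aggfunc='mean')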
In [273]:
time_per_label.ix[:5,:5]
Out[273]:
In [281]:
time_per_label[['Bug', 'Enhancement']].plot(label=['a','b'])
Out[281]:
In [277]:
f1 = plt.figure()
In [279]:
ax1 = f1.add_axes
In [245]:
label_month
Out[245]:
In [240]:
time_per_label = pd.DataFrame([], index=label_open_df.index)
In [268]:
for label_str in LABELS:
    temp = label_open_df.ix[label_open_df['label']==label_str]['open_day']
    label_month = temp.resample('M', how='mean')
    label_month.name = label_str
    time_per_label = time_per_label.join(label_month, how='outer')
print time_per_label.head()
In [282]:
!README.md
In [283]:
ls
In [ ]: