In [1]:
import networkx as nx
import numpy as np
import pandas as pd
%matplotlib inline
In [8]:
# Load the bus-factor table. Column names are supplied here, so pandas treats
# every line of the file as data. The third column is named 'ratio' because
# the histogram cells further down select it with column='ratio'.
df = pd.read_csv(
    'bus_factor.csv',
    parse_dates=True,
    names=['repo', 'bus_factor', 'ratio'],
)
In [4]:
# Preview the first rows rather than rendering the entire frame.
df.head()
Out[4]:
In [ ]:
# Summary statistics of the numeric columns; left as the cell's last
# expression so the notebook renders it as a table instead of plain text.
df.describe()
In [4]:
# Histogram of the 'ratio' column with pandas' default binning.
df.hist(column='ratio')
In [11]:
# Histogram of 'ratio' limited to the interval 0 to 0.5, split into 10 bins.
df.hist(column='ratio', bins=10, range=(0,0.5))
Out[11]:
In [20]:
# Histogram of 'ratio' via the Series plotting accessor, using 20 bins.
df['ratio'].plot.hist(bins=20)
Out[20]:
In [86]:
# Per-author churn table for apache/mesos, indexed by author name.
# parse_dates is intentionally not passed: combined with index_col it would
# make pandas try to interpret the author names in the index as dates.
authors = pd.read_csv(
    'data/repos/apache/mesos/author-churn.csv',
    index_col='author',
)
# Rescale every column so that its values sum to 1 (each author's share).
authors = authors / authors.sum()
In [87]:
# Repository summary table and the main-developer table for apache/mesos.
summary = pd.read_csv('data/repos/apache/mesos/summary.csv', parse_dates=True)
main_dev = pd.read_csv('data/repos/apache/mesos/main-dev.csv', parse_dates=True)

# Sum 'ownership' per main developer and divide by the 'value' in summary row 2
# (NOTE(review): which statistic sits in row 2 is not visible here -- confirm).
ownership_totals = main_dev[['main-dev', 'ownership']].groupby('main-dev').sum()
dev = ownership_totals / summary['value'][2]

# Normalise so the column sums to 1, then key the frame by 'author'.
dev = dev / dev.sum()
dev = (
    dev
    .reset_index()
    .rename(columns={'main-dev': 'author'})
    .set_index('author')
)
In [140]:
# Build the author/peer graph; every edge carries the 'shared' column.
communication = pd.read_csv('data/repos/apache/mesos/communication.csv')
G = nx.from_pandas_dataframe(communication, 'author', 'peer', ['shared'])

# PageRank weighted by 'shared', stored as a one-column frame indexed by node.
rank_by_node = nx.pagerank(G, weight='shared')
page_rank = pd.DataFrame.from_dict(rank_by_node, orient='index')
page_rank.columns = ['page_rank']
In [89]:
# Same ownership computation as for main-dev.csv, applied to the
# refactoring main-developer table.
summary = pd.read_csv('data/repos/apache/mesos/summary.csv', parse_dates=True)
refactoring_main_dev = pd.read_csv(
    'data/repos/apache/mesos/refactoring-main-dev.csv',
    parse_dates=True,
)

# Per-developer ownership totals, divided by the 'value' in summary row 2
# (NOTE(review): confirm which statistic that row holds).
refactoring_totals = (
    refactoring_main_dev[['main-dev', 'ownership']].groupby('main-dev').sum()
)
refactoring_dev = refactoring_totals / summary['value'][2]

# Normalise to shares, then expose the result as 'refactoring_ownership'
# indexed by 'author'.
refactoring_dev = refactoring_dev / refactoring_dev.sum()
refactoring_dev = (
    refactoring_dev
    .reset_index()
    .rename(columns={'main-dev': 'author', 'ownership': 'refactoring_ownership'})
    .set_index('author')
)
In [90]:
# Combine all per-author tables column-wise, aligned on the author index.
# axis=1 is required: 'author' is the index *name*, not a valid axis argument,
# and current pandas raises on it. Authors missing from a table get 0.
authors_full = pd.concat(
    [dev, refactoring_dev, page_rank, authors],
    axis=1,
).fillna(0)
In [91]:
# Pairwise correlation between the columns of the combined per-author table.
authors_full.corr()
Out[91]:
In [62]:
# Display the summary table keyed by statistic name; set_index returns a new
# frame, so `summary` itself is left unchanged.
summary.set_index('statistic')
Out[62]:
In [301]:
# Summary statistics transposed so that each statistic becomes a column.
x = summary.set_index('statistic').T.reset_index(drop=True)
x.columns.name = None

# Repository metadata. 'contributors_url' is selected as well because the
# following cell reads repo['contributors_url'].
repo = pd.read_json('data/repos/apache/mesos/description.json', orient='records')
repo = repo[['name', 'full_name', 'language', 'forks', 'watchers', 'created_at',
             'size', 'fork', 'description', 'owner', 'contributors_url']]

# Keep only the row labelled 'id' and turn it back into a one-row frame.
# NOTE(review): the row labels presumably come from the nested 'owner'
# object in the JSON -- confirm against description.json.
repo = repo.transpose()['id']
repo = pd.DataFrame(repo)
repo = repo.transpose()
repo = repo.reset_index(drop=True)

# Stack both transposed frames and transpose back, which places the summary
# statistics and the repository metadata side by side.
repo = pd.concat([x.T, repo.T]).T
In [ ]:
In [180]:
# urlopen lives in urllib2 on Python 2 and urllib.request on Python 3;
# support both so the cell runs on either interpreter.
try:
    from urllib.request import urlopen
except ImportError:
    from urllib2 import urlopen
from contextlib import closing

# Download the raw body behind the repository's contributors URL.
# closing() guarantees the connection is released even if read() fails.
with closing(urlopen(repo['contributors_url'][0])) as response:
    html = response.read()
In [239]:
# Spot-check the 'commits' value recorded for a single author.
commits_by_author = authors_full['commits'].to_dict()
commits_by_author['ayouwei']
Out[239]:
In [142]:
# Communication graph whose edges carry the 'strength' column; authors that
# never appear in the communication table are added as isolated nodes.
G = nx.from_pandas_dataframe(communication, 'author', 'peer', ['strength'])
G.add_nodes_from(authors.index)

# (node attribute name, source column in authors_full)
node_attributes = [
    ('commits', 'commits'),
    ('pagerank', 'page_rank'),
    ('added', 'added'),
    ('deleted', 'deleted'),
    ('ownership', 'ownership'),
]
for attribute, column in node_attributes:
    nx.set_node_attributes(G, attribute, authors_full[column].to_dict())
In [141]:
# Keep only communication rows whose author has more commits than the
# median (50% quantile) of authors['commits'].
median_commits = authors.quantile(.50)['commits']
above_median_authors = authors[authors['commits'] > median_commits].index.tolist()
communication = communication[communication['author'].isin(above_median_authors)]
In [143]:
# Export the annotated graph in GML format.
nx.write_gml(G, "mesos2.gml")
# Preview the communication table instead of rendering every row.
communication.head()
Out[143]:
In [132]:
# Authors with more commits than the median, computed once and reused.
busy_authors = authors[authors['commits'] > authors.quantile(.50)['commits']].index.tolist()
# Communication rows that belong to those authors (displayed, not stored).
communication[communication['author'].isin(busy_authors)]
Out[132]:
In [117]:
# Summary statistics for every numeric column of the `authors` frame.
authors.describe()
Out[117]:
In [84]:
Out[84]:
In [83]:
# Reload the communication table from the same data/repos/... location that
# the other cells in this notebook read from, then rebuild the graph with the
# 'average' column on every edge.
communication = pd.read_csv('data/repos/apache/mesos/communication.csv')
G = nx.from_pandas_dataframe(communication, 'author', 'peer', ['average'])

# PageRank weighted by 'average', as a one-column frame indexed by node.
page_rank = pd.DataFrame.from_dict(nx.pagerank(G, weight='average'), orient='index')
page_rank.columns = ['page_rank']
In [72]:
# Look up the `added` entry for a single author.
# NOTE(review): `added` is only assigned in a later cell of this notebook, so
# this cell fails on a fresh top-to-bottom run -- confirm the intended order.
added['Victor Quinn']
In [66]:
# Natural log of authors['added'] scaled by 10E7 (= 1e8), transposed so that
# each author is a column and the single row is labelled 'added'.
added = np.log(authors[['added']] * 10E7).T

# One weight per communication row: the author's log-scaled value, or 0 when
# the author has no column in `added`. `.iloc[0]` makes the positional lookup
# explicit; a bare `[0]` on this label-indexed Series relies on a deprecated
# integer fallback.
weights = [
    added[author].iloc[0] if author in added.columns else 0
    for author in communication['author']
]
communication['weights'] = weights
In [76]:
# Summary statistics for the numeric columns of the communication table.
communication.describe()
Out[76]:
In [82]:
Out[82]:
In [ ]: