In [2]:
import pandas as pd
import networkx as nx
In [3]:
pd.read_csv('../data/summary.csv', parse_dates=True)
Out[3]:
statistic
value
0
number-of-commits
7472
1
number-of-entities
32005
2
number-of-entities-changed
58612
3
number-of-authors
170
In [4]:
authors_churn = pd.read_csv('../data/authors.csv')
In [47]:
authors_churn
Out[47]:
entity
n-authors
n-revs
0
src/Makefile.am
55
526
1
src/slave/slave.cpp
54
450
2
src/master/master.cpp
50
585
3
include/mesos/mesos.proto
50
188
4
src/master/http.cpp
42
203
5
src/tests/master_tests.cpp
40
214
6
src/master/master.hpp
39
329
7
src/tests/slave_tests.cpp
39
134
8
src/tests/mesos.hpp
36
156
9
src/slave/slave.hpp
34
207
10
src/slave/flags.hpp
34
131
11
docs/configuration.md
34
75
12
src/slave/flags.cpp
34
74
13
src/tests/mesos.cpp
32
103
14
configure.ac
31
187
15
3rdparty/libprocess/src/process.cpp
29
172
16
3rdparty/libprocess/3rdparty/stout/include/sto...
28
84
17
CHANGELOG
27
111
18
src/common/resources.cpp
26
82
19
src/tests/resources_tests.cpp
26
73
20
src/tests/slave_recovery_tests.cpp
25
144
21
src/tests/environment.cpp
25
73
22
include/mesos/v1/mesos.proto
25
48
23
src/sched/sched.cpp
24
171
24
src/tests/fault_tolerance_tests.cpp
24
140
25
src/slave/http.cpp
24
85
26
3rdparty/libprocess/include/process/http.hpp
23
77
27
docs/powered-by-mesos.md
23
74
28
docs/getting-started.md
23
54
29
src/slave/containerizer/docker.cpp
22
99
...
...
...
...
31975
src/third_party/zookeeper-3.2.2/src/java/main/...
1
1
31976
third_party/libprocess/third_party/stout/inclu...
1
1
31977
frameworks/hadoop-0.20.0/src/mapred/org/apache...
1
1
31978
{src => include/mesos}/module/authenticatee.hpp
1
1
31979
{src/third_party/libprocess/third_party/boost-...
1
1
31980
third_party/libprocess/third_party/boost-1.37....
1
1
31981
third_party/libprocess/third_party/boost-1.37....
1
1
31982
third_party/{boto-1.8d => boto-1.9b}/README
1
1
31983
third_party/libprocess/third_party/boost-1.37....
1
1
31984
third_party/glog-0.3.1/packages/deb/libgoogle-...
1
1
31985
frameworks/hadoop-0.20.0/src/test/org/apache/h...
1
1
31986
{src => third_party/libprocess}/third_party/bo...
1
1
31987
frameworks/{hadoop-0.20.0 => hadoop-0.20.2}/sr...
1
1
31988
frameworks/{hadoop-0.20.0 => hadoop-0.20.2}/sr...
1
1
31989
third_party/boost-1.37.0/boost/mpl/aux_/prepro...
1
1
31990
third_party/zookeeper-3.3.1/docs/images/built-...
1
1
31991
frameworks/hadoop-0.20.0/src/contrib/index/src...
1
1
31992
src/third_party/libprocess/third_party/boost-1...
1
1
31993
third_party/libprocess/third_party/boost-1.37....
1
1
31994
src/third_party/boost-1.37.0/boost/mpl/count.hpp
1
1
31995
third_party/boost-1.37.0/boost/mpl/aux_/prepro...
1
1
31996
third_party/libprocess/third_party/stout/tests...
1
1
31997
third_party/boost-1.37.0/boost/mpl/map/aux_/co...
1
1
31998
src/third_party/boost-1.37.0/boost/mpl/pop_bac...
1
1
31999
src/third_party/boost-1.37.0/boost/mpl/map/aux...
1
1
32000
third_party/boost-1.37.0/boost/detail/lwm_pthr...
1
1
32001
{src => third_party/libprocess}/third_party/bo...
1
1
32002
frameworks/hadoop-0.20.2/src/test/org/apache/h...
1
1
32003
{src/third_party/libprocess/third_party => thi...
1
1
32004
frameworks/hadoop-0.20.2/src/test/org/apache/h...
1
1
32005 rows × 3 columns
In [ ]:
authors = pd.read_csv('authors.csv')
In [41]:
authors.head()
Out[41]:
entity
n-authors
n-revs
0
main.go
3
31
1
web.go
3
29
2
README.md
3
26
3
marathon/marathon.go
3
22
4
web_test.go
3
19
In [43]:
authors.describe()
Out[43]:
n-authors
n-revs
count
105.000000
105.000000
mean
1.514286
4.142857
std
0.735295
5.570231
min
1.000000
1.000000
25%
1.000000
1.000000
50%
1.000000
2.000000
75%
2.000000
5.000000
max
3.000000
31.000000
In [24]:
%matplotlib inline
authors.plot(kind='scatter', x='n-authors', y='n-revs');
In [32]:
authors.groupby('n-authors').describe()
Out[32]:
n-revs
n-authors
1
count
66.000000
mean
2.045455
std
1.884608
min
1.000000
25%
1.000000
50%
1.000000
75%
2.000000
max
10.000000
2
count
24.000000
mean
3.791667
std
1.587428
min
2.000000
25%
2.750000
50%
3.500000
75%
5.000000
max
7.000000
3
count
15.000000
mean
13.933333
std
9.284908
min
4.000000
25%
7.000000
50%
9.000000
75%
20.500000
max
31.000000
In [5]:
communication = pd.read_csv('../data/communication.csv', parse_dates=True)
In [6]:
strength = communication['strength']
communication['normal_strength'] = strength.apply(lambda x: (x - strength.min()) / (strength.max() - strength.min()))
In [7]:
communication['strength'].quantile(.9)
Out[7]:
20.0
In [32]:
communication[communication['author'].isin(data[0].head(50))].groupby(['author']).apply(lambda x: list(x.peer)).to_json('../data/data.json')
In [64]:
communication[communication['strength'] > 20].describe()
Out[64]:
shared
average
strength
count
832.000000
832.000000
832.000000
mean
32.850962
116.199519
41.920673
std
47.252115
146.288642
29.275585
min
1.000000
1.000000
21.000000
25%
1.000000
3.000000
23.000000
50%
14.000000
61.000000
27.000000
75%
46.000000
187.500000
43.250000
max
303.000000
746.000000
100.000000
In [10]:
G=nx.from_pandas_dataframe(communication, 'author', 'peer', ['strength'])
In [11]:
pos=nx.spring_layout(G)
edgewidth = [ d['strength'] for (u,v,d) in G.edges(data=True)]
nx.draw_networkx_edge_labels(G, pos)
nx.draw_networkx_nodes(G,pos)
nx.draw_networkx_labels(G,pos)
nx.draw_networkx_edges(G, pos, edge_color=edgewidth)
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-11-13d0e9e352a2> in <module>()
3 nx.draw_networkx_edge_labels(G, pos)
4 nx.draw_networkx_nodes(G,pos)
----> 5 nx.draw_networkx_labels(G,pos)
6 nx.draw_networkx_edges(G, pos, edge_color=edgewidth)
7
/home/janisz/anaconda2/lib/python2.7/site-packages/networkx/drawing/nx_pylab.pyc in draw_networkx_labels(G, pos, labels, font_size, font_color, font_family, font_weight, alpha, bbox, ax, **kwds)
732 transform=ax.transData,
733 bbox=bbox,
--> 734 clip_on=True,
735 )
736 text_items[n] = t
/home/janisz/anaconda2/lib/python2.7/site-packages/matplotlib/axes/_axes.pyc in text(self, x, y, s, fontdict, withdash, **kwargs)
609 else:
610 t = mtext.Text(
--> 611 x=x, y=y, text=s)
612 self._set_artist_props(t)
613
/home/janisz/anaconda2/lib/python2.7/site-packages/matplotlib/text.pyc in __init__(self, x, y, text, color, verticalalignment, horizontalalignment, multialignment, fontproperties, rotation, linespacing, rotation_mode, usetex, wrap, **kwargs)
218 fontproperties = FontProperties(fontproperties)
219
--> 220 self.set_text(text)
221 self.set_color(color)
222 self.set_usetex(usetex)
/home/janisz/anaconda2/lib/python2.7/site-packages/matplotlib/text.pyc in set_text(self, s)
1204 ACCEPTS: string or anything printable with '%s' conversion.
1205 """
-> 1206 self._text = '%s' % (s,)
1207 self.stale = True
1208
UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 11: ordinal not in range(128)
In [179]:
nx.k_nearest_neighbors(G, weight='strength')
Out[179]:
{2: 2.0}
In [12]:
nx.pagerank(G, weight='strength')
/home/janisz/anaconda2/lib/python2.7/site-packages/networkx/generators/stochastic.py:56: UserWarning: zero out-degree for node Gabriel Monroy
warnings.warn('zero out-degree for node %s' % u)
/home/janisz/anaconda2/lib/python2.7/site-packages/networkx/generators/stochastic.py:56: UserWarning: zero out-degree for node weitao zhou
warnings.warn('zero out-degree for node %s' % u)
Out[12]:
{'ASHUTOSH JAIN': 0.0012838029972018364,
'Aaron Bell': 0.004254588728016053,
'Abhishek Dasgupta': 0.008520728379951112,
'Adam B': 0.013197228118714495,
'Aditi Dixit': 0.00884223171503763,
'Akanksha Agrawal': 0.0018034049460139898,
'Alex Clemmer': 0.005961979576862273,
'Alex Naparu': 0.003686009338910594,
'Alexander Rojas': 0.011225508898713826,
'Alexander Rukletsov': 0.01259506128695875,
'Alexandra Sava': 0.01098858911881367,
'Anand Mazumdar': 0.0106505100433494,
'Andrey Dyatlov': 0.0013466757570183999,
'Andy Konwinski': 0.0085089062961433,
'Andy Pang': 0.002133366491370539,
'Anindya Sinha': 0.006330753040892335,
'Ankur Chauhan': 0.002901644769446062,
'Anton Lindstr\xc3\xb6m': 0.004473770960312722,
'Archana kumari': 0.0024611194207013534,
'Artem Harutyunyan': 0.009197483004702672,
'Avinash sridharan': 0.012199789369165558,
'Bartek Plotka': 0.007425818916875776,
'Ben Mahler': 0.006878674034457637,
'Benjamin Bannier': 0.009285553721712022,
'Benjamin Hindman': 0.0010066741277596305,
'Benjamin Mahler': 0.00962422325485775,
'Bernardo Gomez Palacio': 0.005435539252276146,
'Bernd Mathiske': 0.012124946194804297,
'Bhuvan Arumugam': 0.003636085805605665,
'Bill Farner': 0.00176541057093896,
'Brendan Chang': 0.004523716633155074,
'Brenden Matthews': 0.009173495098556023,
'Brian Wickman': 0.0020683286395719244,
'BrickXu': 0.007723247183879334,
'Charles Reiss': 0.003219290238946392,
'Charlie Carson': 0.0086215433723444,
'Chengwei Yang': 0.007847530982943053,
'Chi Zhang': 0.00891014831133859,
'Chris Heller': 0.0029328276193848835,
'Chris Mattmann': 0.0021702134984966487,
'Christos Kozyrakis': 0.0073844310856464495,
'Cody Maloney': 0.012425468393759362,
'Cong Wang': 0.007411369540308226,
'Connor Doyle': 0.008087378948268305,
'Craig Hansen-Sturm': 0.0017558784686732827,
'Daniel Pravat': 0.0029079081550070812,
'Dario Bazan': 0.002591680298292253,
'Dario Rexin': 0.0035949057024263404,
'Dave Lester': 0.0026463477323144715,
'David Forsythe': 0.00588803988290364,
'David Robinson': 0.008945073533382048,
'Diana Arroyo': 0.005034399849985174,
'Diogo Gomes': 0.0010268432002036125,
'Disha Singh': 0.0009898678225114818,
'Dominic Hamon': 0.01163478573165955,
'Eijsermans': 0.0077232471838793336,
'Elsmore': 0.0028136318796650713,
'Eren G\xc3\xbcven': 0.007723247183879334,
'Evelina Dumitrescu': 0.00978949004524934,
'Felix Abecassis': 0.009072695838518953,
'Gabriel Monroy': 0.0008965929468021519,
'Gajewski': 0.007723247183879334,
'Gast\xc3\xb3n Kleiman': 0.00540484982196651,
'Gaudio': 0.007723247183879334,
'Gilbert Song': 0.004592250921540975,
'Greg Mann': 0.012550676270760917,
'Guangya Liu': 0.011135357408737931,
'Ian Babrou': 0.0010333185732198963,
'Ian Downes': 0.011475636237015937,
'Isabel Jimenez': 0.010533178526112168,
'Itamar Ostricher': 0.0014847979729118845,
'Jake Farrell': 0.001130541543742436,
'Jameel Al-Aziz': 0.001635741628076291,
'James DeFelice': 0.007986130285933538,
'James Peach': 0.008187863038018859,
'Jan Schlicht': 0.007969975925070923,
'Jay Buffington': 0.004716719839082663,
'Jian Qiu': 0.003373806726423023,
'Jiang Yan Xu': 0.0059743446389542544,
'Jie Yu': 0.009723125910085255,
'Jihun Kang': 0.0009678321645912403,
'Jiri Simsa': 0.001233276665796395,
'Jocelyn De La Rosa': 0.0010796785990298413,
'Joe Gordon': 0.005741086642582453,
'Joerg Schad': 0.012720300605325745,
'Jojy Varghese': 0.006061979594123831,
'Jonathon Rossi': 0.0014669597962681453,
'Joris Van Remoortere': 0.01064219466234101,
'Joseph Wu': 0.010290320217668788,
'Kakadia': 0.005602408196065427,
'Kamil Doma?ski': 0.002924726952449014,
'Kamil Domanski': 0.004058127275937309,
'Kapil Arya': 0.011355095568101737,
'Ken Sipe': 0.004231915562532143,
'Kevin Devroede': 0.0011937513588480134,
'Kevin Klues': 0.007345760347729772,
'Kiyonari Harigae': 0.0010587980330164872,
'Klaus Ma': 0.010668246831636431,
'Lily Chen': 0.0058409461425199035,
'M Bauer': 0.0077232471838793336,
'M Lawindi': 0.001373441226737924,
'Mandeep Chadha': 0.002607111677406404,
'Marco Massenzio': 0.011374227174329453,
'Mark Wang': 0.0044204487523337825,
'Martin Weindel': 0.005142954221509686,
'Matei Alexandru Zaharia': 0.0021250883051744055,
'Michael Browning': 0.003990851945589962,
'Michael Lun\xc3\xb8e': 0.001840123375495814,
'Michael Park': 0.011276355701317268,
'Michael Schenck': 0.0012352175817742924,
'Nancy Ko': 0.005602408196065427,
'Neil Conway': 0.011983823160901269,
'Nikita Vetoshkin': 0.005670153075235871,
'Niklas Nielsen': 0.012992692418412327,
'Niklas Q. Nielsen': 0.012902674387417509,
'Oliver Nicholas': 0.0017200442828007932,
'Olivier Sallou': 0.005638067378085849,
'Palak Choudhary': 0.003593994509340104,
'Paul Brett': 0.007673046668137133,
'Qian Zhang': 0.005964715807990473,
'R.B. Boyer': 0.005926662645456753,
'Ricardo Cervera-Navarro': 0.002053823588640451,
'Ritwik Yadav': 0.005056745898468907,
'Ross Allen': 0.003984766646168125,
'Rossi': 0.0014342178861444226,
'Ryan Thomas': 0.00462625010881168,
'Ryuichi Okumura': 0.00451312796760603,
'Samuel': 0.0035939945093401036,
'Shuai Lin': 0.007287448951680762,
'Spike Curtis': 0.005849090984905685,
'Stan Teresen': 0.0016774729077413514,
'Stephan Erb': 0.0077232471838793336,
'Steve Hoffman': 0.007723247183879334,
'Steve Niemitz': 0.0032991516696290026,
'Steven Phung': 0.002378693908879859,
'TILL TOENSHOFF': 0.002466566678039087,
'Thomas Marshall': 0.011737289058217153,
'Thomas Rampelberg': 0.004734488462011918,
'Till Toenshoff': 0.011321423426759646,
'Tim Anderegg': 0.0009387782442315632,
'Timothy Chen': 0.010151519972597807,
'Timothy St. Clair': 0.007241978506733577,
'Tobi Knaup': 0.00969680227574801,
'Tobias Weingartner': 0.0009288121777364256,
'Tom Arnfeld': 0.005286981637169214,
'Tom Galloway': 0.0009178586703806422,
'Tomasz Janiszewski': 0.0009691356230434973,
'Vaibhav Khanduja': 0.009913342314333341,
'Vinod Kone': 0.010173186640961642,
'Vinson Lee': 0.0036750548727362,
'Vivek Juneja': 0.0077232471838793336,
'Weitao': 0.007723247183879334,
'Wojciech Sielski': 0.0017194939971123644,
'Yifan Gu': 0.010015733136145363,
'Yong Qiao Wang': 0.00906553568124986,
'Yong Tang': 0.005311534781505988,
'Yongqiao Wang': 0.010490707623101532,
'Zhitao Li': 0.007452301692620768,
'Zhiwei Chen': 0.0034645485707145447,
'Zuyu Zhang': 0.00442932898604305,
'adam-mesos': 0.0025278495289744254,
'ayouwei': 0.0077232471838793336,
'farukd': 0.0077232471838793336,
'grandlogic': 0.001007721250944405,
'haosdent huang': 0.012181853870332791,
'mlawindi': 0.0013853236987425546,
'niklas': 0.001185472692934757,
'usultrared': 0.0077232471838793336,
'weitao zhou': 0.0008965929468021519}
In [15]:
G=nx.from_pandas_dataframe(communication, 'author', 'peer', ['strength'])
d = nx.pagerank(G, weight='strength')
data = pd.DataFrame(d.items()).sort_values(by=1, ascending=False)
In [30]:
communication[communication['author'].isin(data[0].head(50))]
Out[30]:
author
peer
shared
average
strength
normal_strength
165
Abhishek Dasgupta
Kevin Klues
55
102
53
0
200
Michael Park
Isabel Jimenez
123
257
47
0
201
Isabel Jimenez
Michael Park
123
257
47
0
202
Vinod Kone
Benjamin Mahler
255
553
46
0
203
Benjamin Mahler
Vinod Kone
255
553
46
0
204
Tobi Knaup
Bernardo Gomez Palacio
9
20
45
0
208
Kapil Arya
Jie Yu
183
418
43
0
209
Kapil Arya
Benjamin Bannier
270
617
43
0
210
Jie Yu
Kapil Arya
183
418
43
0
211
Benjamin Bannier
Kapil Arya
270
617
43
0
212
Yong Qiao Wang
Michael Browning
3
7
42
0
216
Joris Van Remoortere
Isabel Jimenez
110
259
42
0
217
Isabel Jimenez
Joris Van Remoortere
110
259
42
0
221
Vinod Kone
Kapil Arya
171
424
40
0
223
Neil Conway
Joerg Schad
85
209
40
0
224
Michael Park
Joris Van Remoortere
107
263
40
0
225
Kapil Arya
Vinod Kone
171
424
40
0
226
Joris Van Remoortere
Michael Park
107
263
40
0
227
Joerg Schad
Neil Conway
85
209
40
0
228
Benjamin Mahler
Benjamin Bannier
303
746
40
0
229
Benjamin Bannier
Benjamin Mahler
303
746
40
0
232
Jie Yu
Benjamin Bannier
262
660
39
0
233
Benjamin Bannier
Jie Yu
262
660
39
0
235
Neil Conway
Alexander Rukletsov
84
221
38
0
236
Isabel Jimenez
Benjamin Bannier
215
556
38
0
237
Connor Doyle
Christos Kozyrakis
5
13
38
0
238
Connor Doyle
Qian Zhang
5
13
38
0
240
Benjamin Bannier
Isabel Jimenez
215
556
38
0
241
Alexander Rukletsov
Neil Conway
84
221
38
0
242
Kapil Arya
Ian Downes
99
261
37
0
...
...
...
...
...
...
...
9789
Benjamin Bannier
Dave Lester
1
466
0
0
9790
Benjamin Bannier
Akanksha Agrawal
2
431
0
0
9791
Benjamin Bannier
Gastón Kleiman
4
433
0
0
9792
Benjamin Bannier
Ritwik Yadav
4
432
0
0
9793
Benjamin Bannier
Brian Wickman
2
431
0
0
9794
Benjamin Bannier
Spike Curtis
2
431
0
0
9795
Benjamin Bannier
Kevin Devroede
2
431
0
0
9796
Benjamin Bannier
niklas
3
432
0
0
9797
Benjamin Bannier
Steven Phung
1
432
0
0
9798
Benjamin Bannier
Andy Pang
1
430
0
0
9799
Benjamin Bannier
Steve Niemitz
1
441
0
0
9800
Benjamin Bannier
Jocelyn De La Rosa
1
430
0
0
9801
Benjamin Bannier
Archana kumari
3
432
0
0
9802
Benjamin Bannier
Anton Lindström
4
433
0
0
9803
Benjamin Bannier
Vinson Lee
1
431
0
0
9804
Benjamin Bannier
Joe Gordon
3
433
0
0
9805
Benjamin Bannier
Zhiwei Chen
1
431
0
0
9810
Avinash sridharan
Benjamin Hindman
22
14916
0
0
9811
Artem Harutyunyan
Benjamin Hindman
72
14964
0
0
9812
Artem Harutyunyan
Rossi
1
119
0
0
9842
Andy Konwinski
Benjamin Hindman
17
14904
0
0
9845
Anand Mazumdar
Benjamin Hindman
43
14931
0
0
9846
Alexandra Sava
Benjamin Hindman
26
14909
0
0
9847
Alexander Rukletsov
Ross Allen
1
106
0
0
9848
Alexander Rukletsov
Dave Lester
1
119
0
0
9849
Alexander Rukletsov
Benjamin Hindman
79
14977
0
0
9850
Alexander Rojas
Benjamin Hindman
47
14948
0
0
9860
Aditi Dixit
Benjamin Hindman
16
14908
0
0
9861
Adam B
Benjamin Hindman
63
14943
0
0
9862
Abhishek Dasgupta
Benjamin Hindman
14
14958
0
0
5259 rows × 6 columns
In [ ]:
Content source: janisz/bus-factor-calculator
Similar notebooks: