In [2]:
import pandas as pd
import networkx as nx

In [3]:
pd.read_csv('../data/summary.csv', parse_dates=True)


Out[3]:
statistic value
0 number-of-commits 7472
1 number-of-entities 32005
2 number-of-entities-changed 58612
3 number-of-authors 170

In [4]:
authors_churn = pd.read_csv('../data/authors.csv')

In [47]:
authors_churn


Out[47]:
entity n-authors n-revs
0 src/Makefile.am 55 526
1 src/slave/slave.cpp 54 450
2 src/master/master.cpp 50 585
3 include/mesos/mesos.proto 50 188
4 src/master/http.cpp 42 203
5 src/tests/master_tests.cpp 40 214
6 src/master/master.hpp 39 329
7 src/tests/slave_tests.cpp 39 134
8 src/tests/mesos.hpp 36 156
9 src/slave/slave.hpp 34 207
10 src/slave/flags.hpp 34 131
11 docs/configuration.md 34 75
12 src/slave/flags.cpp 34 74
13 src/tests/mesos.cpp 32 103
14 configure.ac 31 187
15 3rdparty/libprocess/src/process.cpp 29 172
16 3rdparty/libprocess/3rdparty/stout/include/sto... 28 84
17 CHANGELOG 27 111
18 src/common/resources.cpp 26 82
19 src/tests/resources_tests.cpp 26 73
20 src/tests/slave_recovery_tests.cpp 25 144
21 src/tests/environment.cpp 25 73
22 include/mesos/v1/mesos.proto 25 48
23 src/sched/sched.cpp 24 171
24 src/tests/fault_tolerance_tests.cpp 24 140
25 src/slave/http.cpp 24 85
26 3rdparty/libprocess/include/process/http.hpp 23 77
27 docs/powered-by-mesos.md 23 74
28 docs/getting-started.md 23 54
29 src/slave/containerizer/docker.cpp 22 99
... ... ... ...
31975 src/third_party/zookeeper-3.2.2/src/java/main/... 1 1
31976 third_party/libprocess/third_party/stout/inclu... 1 1
31977 frameworks/hadoop-0.20.0/src/mapred/org/apache... 1 1
31978 {src => include/mesos}/module/authenticatee.hpp 1 1
31979 {src/third_party/libprocess/third_party/boost-... 1 1
31980 third_party/libprocess/third_party/boost-1.37.... 1 1
31981 third_party/libprocess/third_party/boost-1.37.... 1 1
31982 third_party/{boto-1.8d => boto-1.9b}/README 1 1
31983 third_party/libprocess/third_party/boost-1.37.... 1 1
31984 third_party/glog-0.3.1/packages/deb/libgoogle-... 1 1
31985 frameworks/hadoop-0.20.0/src/test/org/apache/h... 1 1
31986 {src => third_party/libprocess}/third_party/bo... 1 1
31987 frameworks/{hadoop-0.20.0 => hadoop-0.20.2}/sr... 1 1
31988 frameworks/{hadoop-0.20.0 => hadoop-0.20.2}/sr... 1 1
31989 third_party/boost-1.37.0/boost/mpl/aux_/prepro... 1 1
31990 third_party/zookeeper-3.3.1/docs/images/built-... 1 1
31991 frameworks/hadoop-0.20.0/src/contrib/index/src... 1 1
31992 src/third_party/libprocess/third_party/boost-1... 1 1
31993 third_party/libprocess/third_party/boost-1.37.... 1 1
31994 src/third_party/boost-1.37.0/boost/mpl/count.hpp 1 1
31995 third_party/boost-1.37.0/boost/mpl/aux_/prepro... 1 1
31996 third_party/libprocess/third_party/stout/tests... 1 1
31997 third_party/boost-1.37.0/boost/mpl/map/aux_/co... 1 1
31998 src/third_party/boost-1.37.0/boost/mpl/pop_bac... 1 1
31999 src/third_party/boost-1.37.0/boost/mpl/map/aux... 1 1
32000 third_party/boost-1.37.0/boost/detail/lwm_pthr... 1 1
32001 {src => third_party/libprocess}/third_party/bo... 1 1
32002 frameworks/hadoop-0.20.2/src/test/org/apache/h... 1 1
32003 {src/third_party/libprocess/third_party => thi... 1 1
32004 frameworks/hadoop-0.20.2/src/test/org/apache/h... 1 1

32005 rows × 3 columns


In [ ]:
authors = pd.read_csv('authors.csv')

In [41]:
authors.head()


Out[41]:
entity n-authors n-revs
0 main.go 3 31
1 web.go 3 29
2 README.md 3 26
3 marathon/marathon.go 3 22
4 web_test.go 3 19

In [43]:
authors.describe()


Out[43]:
n-authors n-revs
count 105.000000 105.000000
mean 1.514286 4.142857
std 0.735295 5.570231
min 1.000000 1.000000
25% 1.000000 1.000000
50% 1.000000 2.000000
75% 2.000000 5.000000
max 3.000000 31.000000

In [24]:
%matplotlib inline
authors.plot(kind='scatter', x='n-authors', y='n-revs');



In [32]:
authors.groupby('n-authors').describe()


Out[32]:
n-revs
n-authors
1 count 66.000000
mean 2.045455
std 1.884608
min 1.000000
25% 1.000000
50% 1.000000
75% 2.000000
max 10.000000
2 count 24.000000
mean 3.791667
std 1.587428
min 2.000000
25% 2.750000
50% 3.500000
75% 5.000000
max 7.000000
3 count 15.000000
mean 13.933333
std 9.284908
min 4.000000
25% 7.000000
50% 9.000000
75% 20.500000
max 31.000000

In [5]:
communication = pd.read_csv('../data/communication.csv', parse_dates=True)

In [6]:
strength = communication['strength']
communication['normal_strength'] = strength.apply(lambda x: (x - strength.min()) / (strength.max() - strength.min()))

In [7]:
communication['strength'].quantile(.9)


Out[7]:
20.0

In [32]:
communication[communication['author'].isin(data[0].head(50))].groupby(['author']).apply(lambda x: list(x.peer)).to_json('../data/data.json')

In [64]:
communication[communication['strength'] > 20].describe()


Out[64]:
shared average strength
count 832.000000 832.000000 832.000000
mean 32.850962 116.199519 41.920673
std 47.252115 146.288642 29.275585
min 1.000000 1.000000 21.000000
25% 1.000000 3.000000 23.000000
50% 14.000000 61.000000 27.000000
75% 46.000000 187.500000 43.250000
max 303.000000 746.000000 100.000000

In [10]:
G=nx.from_pandas_dataframe(communication, 'author', 'peer', ['strength'])

In [11]:
pos=nx.spring_layout(G)
edgewidth = [ d['strength'] for (u,v,d) in G.edges(data=True)]
nx.draw_networkx_edge_labels(G, pos)
nx.draw_networkx_nodes(G,pos)
nx.draw_networkx_labels(G,pos)
nx.draw_networkx_edges(G, pos, edge_color=edgewidth)


---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-11-13d0e9e352a2> in <module>()
      3 nx.draw_networkx_edge_labels(G, pos)
      4 nx.draw_networkx_nodes(G,pos)
----> 5 nx.draw_networkx_labels(G,pos)
      6 nx.draw_networkx_edges(G, pos, edge_color=edgewidth)
      7 

/home/janisz/anaconda2/lib/python2.7/site-packages/networkx/drawing/nx_pylab.pyc in draw_networkx_labels(G, pos, labels, font_size, font_color, font_family, font_weight, alpha, bbox, ax, **kwds)
    732                   transform=ax.transData,
    733                   bbox=bbox,
--> 734                   clip_on=True,
    735                   )
    736         text_items[n] = t

/home/janisz/anaconda2/lib/python2.7/site-packages/matplotlib/axes/_axes.pyc in text(self, x, y, s, fontdict, withdash, **kwargs)
    609         else:
    610             t = mtext.Text(
--> 611                 x=x, y=y, text=s)
    612         self._set_artist_props(t)
    613 

/home/janisz/anaconda2/lib/python2.7/site-packages/matplotlib/text.pyc in __init__(self, x, y, text, color, verticalalignment, horizontalalignment, multialignment, fontproperties, rotation, linespacing, rotation_mode, usetex, wrap, **kwargs)
    218             fontproperties = FontProperties(fontproperties)
    219 
--> 220         self.set_text(text)
    221         self.set_color(color)
    222         self.set_usetex(usetex)

/home/janisz/anaconda2/lib/python2.7/site-packages/matplotlib/text.pyc in set_text(self, s)
   1204         ACCEPTS: string or anything printable with '%s' conversion.
   1205         """
-> 1206         self._text = '%s' % (s,)
   1207         self.stale = True
   1208 

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc3 in position 11: ordinal not in range(128)

In [179]:
nx.k_nearest_neighbors(G, weight='strength')


Out[179]:
{2: 2.0}

In [12]:
nx.pagerank(G, weight='strength')


/home/janisz/anaconda2/lib/python2.7/site-packages/networkx/generators/stochastic.py:56: UserWarning: zero out-degree for node Gabriel Monroy
  warnings.warn('zero out-degree for node %s' % u)
/home/janisz/anaconda2/lib/python2.7/site-packages/networkx/generators/stochastic.py:56: UserWarning: zero out-degree for node weitao zhou
  warnings.warn('zero out-degree for node %s' % u)
Out[12]:
{'ASHUTOSH JAIN': 0.0012838029972018364,
 'Aaron Bell': 0.004254588728016053,
 'Abhishek Dasgupta': 0.008520728379951112,
 'Adam B': 0.013197228118714495,
 'Aditi Dixit': 0.00884223171503763,
 'Akanksha Agrawal': 0.0018034049460139898,
 'Alex Clemmer': 0.005961979576862273,
 'Alex Naparu': 0.003686009338910594,
 'Alexander Rojas': 0.011225508898713826,
 'Alexander Rukletsov': 0.01259506128695875,
 'Alexandra Sava': 0.01098858911881367,
 'Anand Mazumdar': 0.0106505100433494,
 'Andrey Dyatlov': 0.0013466757570183999,
 'Andy Konwinski': 0.0085089062961433,
 'Andy Pang': 0.002133366491370539,
 'Anindya Sinha': 0.006330753040892335,
 'Ankur Chauhan': 0.002901644769446062,
 'Anton Lindstr\xc3\xb6m': 0.004473770960312722,
 'Archana kumari': 0.0024611194207013534,
 'Artem Harutyunyan': 0.009197483004702672,
 'Avinash sridharan': 0.012199789369165558,
 'Bartek Plotka': 0.007425818916875776,
 'Ben Mahler': 0.006878674034457637,
 'Benjamin Bannier': 0.009285553721712022,
 'Benjamin Hindman': 0.0010066741277596305,
 'Benjamin Mahler': 0.00962422325485775,
 'Bernardo Gomez Palacio': 0.005435539252276146,
 'Bernd Mathiske': 0.012124946194804297,
 'Bhuvan Arumugam': 0.003636085805605665,
 'Bill Farner': 0.00176541057093896,
 'Brendan Chang': 0.004523716633155074,
 'Brenden Matthews': 0.009173495098556023,
 'Brian Wickman': 0.0020683286395719244,
 'BrickXu': 0.007723247183879334,
 'Charles Reiss': 0.003219290238946392,
 'Charlie Carson': 0.0086215433723444,
 'Chengwei Yang': 0.007847530982943053,
 'Chi Zhang': 0.00891014831133859,
 'Chris Heller': 0.0029328276193848835,
 'Chris Mattmann': 0.0021702134984966487,
 'Christos Kozyrakis': 0.0073844310856464495,
 'Cody Maloney': 0.012425468393759362,
 'Cong Wang': 0.007411369540308226,
 'Connor Doyle': 0.008087378948268305,
 'Craig Hansen-Sturm': 0.0017558784686732827,
 'Daniel Pravat': 0.0029079081550070812,
 'Dario Bazan': 0.002591680298292253,
 'Dario Rexin': 0.0035949057024263404,
 'Dave Lester': 0.0026463477323144715,
 'David Forsythe': 0.00588803988290364,
 'David Robinson': 0.008945073533382048,
 'Diana Arroyo': 0.005034399849985174,
 'Diogo Gomes': 0.0010268432002036125,
 'Disha  Singh': 0.0009898678225114818,
 'Dominic Hamon': 0.01163478573165955,
 'Eijsermans': 0.0077232471838793336,
 'Elsmore': 0.0028136318796650713,
 'Eren G\xc3\xbcven': 0.007723247183879334,
 'Evelina Dumitrescu': 0.00978949004524934,
 'Felix Abecassis': 0.009072695838518953,
 'Gabriel Monroy': 0.0008965929468021519,
 'Gajewski': 0.007723247183879334,
 'Gast\xc3\xb3n Kleiman': 0.00540484982196651,
 'Gaudio': 0.007723247183879334,
 'Gilbert Song': 0.004592250921540975,
 'Greg Mann': 0.012550676270760917,
 'Guangya Liu': 0.011135357408737931,
 'Ian Babrou': 0.0010333185732198963,
 'Ian Downes': 0.011475636237015937,
 'Isabel Jimenez': 0.010533178526112168,
 'Itamar Ostricher': 0.0014847979729118845,
 'Jake Farrell': 0.001130541543742436,
 'Jameel Al-Aziz': 0.001635741628076291,
 'James DeFelice': 0.007986130285933538,
 'James Peach': 0.008187863038018859,
 'Jan Schlicht': 0.007969975925070923,
 'Jay Buffington': 0.004716719839082663,
 'Jian Qiu': 0.003373806726423023,
 'Jiang Yan Xu': 0.0059743446389542544,
 'Jie Yu': 0.009723125910085255,
 'Jihun Kang': 0.0009678321645912403,
 'Jiri Simsa': 0.001233276665796395,
 'Jocelyn De La Rosa': 0.0010796785990298413,
 'Joe Gordon': 0.005741086642582453,
 'Joerg Schad': 0.012720300605325745,
 'Jojy Varghese': 0.006061979594123831,
 'Jonathon Rossi': 0.0014669597962681453,
 'Joris Van Remoortere': 0.01064219466234101,
 'Joseph Wu': 0.010290320217668788,
 'Kakadia': 0.005602408196065427,
 'Kamil Doma?ski': 0.002924726952449014,
 'Kamil Domanski': 0.004058127275937309,
 'Kapil Arya': 0.011355095568101737,
 'Ken Sipe': 0.004231915562532143,
 'Kevin Devroede': 0.0011937513588480134,
 'Kevin Klues': 0.007345760347729772,
 'Kiyonari Harigae': 0.0010587980330164872,
 'Klaus Ma': 0.010668246831636431,
 'Lily Chen': 0.0058409461425199035,
 'M Bauer': 0.0077232471838793336,
 'M Lawindi': 0.001373441226737924,
 'Mandeep Chadha': 0.002607111677406404,
 'Marco Massenzio': 0.011374227174329453,
 'Mark Wang': 0.0044204487523337825,
 'Martin Weindel': 0.005142954221509686,
 'Matei Alexandru Zaharia': 0.0021250883051744055,
 'Michael Browning': 0.003990851945589962,
 'Michael Lun\xc3\xb8e': 0.001840123375495814,
 'Michael Park': 0.011276355701317268,
 'Michael Schenck': 0.0012352175817742924,
 'Nancy  Ko': 0.005602408196065427,
 'Neil Conway': 0.011983823160901269,
 'Nikita Vetoshkin': 0.005670153075235871,
 'Niklas Nielsen': 0.012992692418412327,
 'Niklas Q. Nielsen': 0.012902674387417509,
 'Oliver Nicholas': 0.0017200442828007932,
 'Olivier Sallou': 0.005638067378085849,
 'Palak Choudhary': 0.003593994509340104,
 'Paul Brett': 0.007673046668137133,
 'Qian Zhang': 0.005964715807990473,
 'R.B. Boyer': 0.005926662645456753,
 'Ricardo Cervera-Navarro': 0.002053823588640451,
 'Ritwik Yadav': 0.005056745898468907,
 'Ross Allen': 0.003984766646168125,
 'Rossi': 0.0014342178861444226,
 'Ryan Thomas': 0.00462625010881168,
 'Ryuichi Okumura': 0.00451312796760603,
 'Samuel': 0.0035939945093401036,
 'Shuai Lin': 0.007287448951680762,
 'Spike Curtis': 0.005849090984905685,
 'Stan Teresen': 0.0016774729077413514,
 'Stephan Erb': 0.0077232471838793336,
 'Steve Hoffman': 0.007723247183879334,
 'Steve Niemitz': 0.0032991516696290026,
 'Steven Phung': 0.002378693908879859,
 'TILL TOENSHOFF': 0.002466566678039087,
 'Thomas Marshall': 0.011737289058217153,
 'Thomas Rampelberg': 0.004734488462011918,
 'Till Toenshoff': 0.011321423426759646,
 'Tim Anderegg': 0.0009387782442315632,
 'Timothy Chen': 0.010151519972597807,
 'Timothy St. Clair': 0.007241978506733577,
 'Tobi Knaup': 0.00969680227574801,
 'Tobias Weingartner': 0.0009288121777364256,
 'Tom Arnfeld': 0.005286981637169214,
 'Tom Galloway': 0.0009178586703806422,
 'Tomasz Janiszewski': 0.0009691356230434973,
 'Vaibhav Khanduja': 0.009913342314333341,
 'Vinod Kone': 0.010173186640961642,
 'Vinson Lee': 0.0036750548727362,
 'Vivek Juneja': 0.0077232471838793336,
 'Weitao': 0.007723247183879334,
 'Wojciech Sielski': 0.0017194939971123644,
 'Yifan Gu': 0.010015733136145363,
 'Yong Qiao Wang': 0.00906553568124986,
 'Yong Tang': 0.005311534781505988,
 'Yongqiao Wang': 0.010490707623101532,
 'Zhitao Li': 0.007452301692620768,
 'Zhiwei Chen': 0.0034645485707145447,
 'Zuyu Zhang': 0.00442932898604305,
 'adam-mesos': 0.0025278495289744254,
 'ayouwei': 0.0077232471838793336,
 'farukd': 0.0077232471838793336,
 'grandlogic': 0.001007721250944405,
 'haosdent huang': 0.012181853870332791,
 'mlawindi': 0.0013853236987425546,
 'niklas': 0.001185472692934757,
 'usultrared': 0.0077232471838793336,
 'weitao zhou': 0.0008965929468021519}

In [15]:
G=nx.from_pandas_dataframe(communication, 'author', 'peer', ['strength'])
d = nx.pagerank(G, weight='strength')

data = pd.DataFrame(d.items()).sort_values(by=1, ascending=False)

In [30]:
communication[communication['author'].isin(data[0].head(50))]


Out[30]:
author peer shared average strength normal_strength
165 Abhishek Dasgupta Kevin Klues 55 102 53 0
200 Michael Park Isabel Jimenez 123 257 47 0
201 Isabel Jimenez Michael Park 123 257 47 0
202 Vinod Kone Benjamin Mahler 255 553 46 0
203 Benjamin Mahler Vinod Kone 255 553 46 0
204 Tobi Knaup Bernardo Gomez Palacio 9 20 45 0
208 Kapil Arya Jie Yu 183 418 43 0
209 Kapil Arya Benjamin Bannier 270 617 43 0
210 Jie Yu Kapil Arya 183 418 43 0
211 Benjamin Bannier Kapil Arya 270 617 43 0
212 Yong Qiao Wang Michael Browning 3 7 42 0
216 Joris Van Remoortere Isabel Jimenez 110 259 42 0
217 Isabel Jimenez Joris Van Remoortere 110 259 42 0
221 Vinod Kone Kapil Arya 171 424 40 0
223 Neil Conway Joerg Schad 85 209 40 0
224 Michael Park Joris Van Remoortere 107 263 40 0
225 Kapil Arya Vinod Kone 171 424 40 0
226 Joris Van Remoortere Michael Park 107 263 40 0
227 Joerg Schad Neil Conway 85 209 40 0
228 Benjamin Mahler Benjamin Bannier 303 746 40 0
229 Benjamin Bannier Benjamin Mahler 303 746 40 0
232 Jie Yu Benjamin Bannier 262 660 39 0
233 Benjamin Bannier Jie Yu 262 660 39 0
235 Neil Conway Alexander Rukletsov 84 221 38 0
236 Isabel Jimenez Benjamin Bannier 215 556 38 0
237 Connor Doyle Christos Kozyrakis 5 13 38 0
238 Connor Doyle Qian Zhang 5 13 38 0
240 Benjamin Bannier Isabel Jimenez 215 556 38 0
241 Alexander Rukletsov Neil Conway 84 221 38 0
242 Kapil Arya Ian Downes 99 261 37 0
... ... ... ... ... ... ...
9789 Benjamin Bannier Dave Lester 1 466 0 0
9790 Benjamin Bannier Akanksha Agrawal 2 431 0 0
9791 Benjamin Bannier Gastón Kleiman 4 433 0 0
9792 Benjamin Bannier Ritwik Yadav 4 432 0 0
9793 Benjamin Bannier Brian Wickman 2 431 0 0
9794 Benjamin Bannier Spike Curtis 2 431 0 0
9795 Benjamin Bannier Kevin Devroede 2 431 0 0
9796 Benjamin Bannier niklas 3 432 0 0
9797 Benjamin Bannier Steven Phung 1 432 0 0
9798 Benjamin Bannier Andy Pang 1 430 0 0
9799 Benjamin Bannier Steve Niemitz 1 441 0 0
9800 Benjamin Bannier Jocelyn De La Rosa 1 430 0 0
9801 Benjamin Bannier Archana kumari 3 432 0 0
9802 Benjamin Bannier Anton Lindström 4 433 0 0
9803 Benjamin Bannier Vinson Lee 1 431 0 0
9804 Benjamin Bannier Joe Gordon 3 433 0 0
9805 Benjamin Bannier Zhiwei Chen 1 431 0 0
9810 Avinash sridharan Benjamin Hindman 22 14916 0 0
9811 Artem Harutyunyan Benjamin Hindman 72 14964 0 0
9812 Artem Harutyunyan Rossi 1 119 0 0
9842 Andy Konwinski Benjamin Hindman 17 14904 0 0
9845 Anand Mazumdar Benjamin Hindman 43 14931 0 0
9846 Alexandra Sava Benjamin Hindman 26 14909 0 0
9847 Alexander Rukletsov Ross Allen 1 106 0 0
9848 Alexander Rukletsov Dave Lester 1 119 0 0
9849 Alexander Rukletsov Benjamin Hindman 79 14977 0 0
9850 Alexander Rojas Benjamin Hindman 47 14948 0 0
9860 Aditi Dixit Benjamin Hindman 16 14908 0 0
9861 Adam B Benjamin Hindman 63 14943 0 0
9862 Abhishek Dasgupta Benjamin Hindman 14 14958 0 0

5259 rows × 6 columns


In [ ]: