In [1]:
import pandas as pd
import networkx as nx
%matplotlib inline

In [8]:
df = pd.read_csv('bus_factor.csv', parse_dates=True, names=['repo', 'bus_factor', ''])

In [4]:
df


Out[4]:
activeadmin/activeadmin 4 38
0 6to5/6to5 159 10
1 6to5/6to5 8 531
2 6to5/6to5 8 19
3 adobe/brackets 9 26
4 afaqurk/linux-dash 9 212
5 aFarkas/html5shiv 13 89
6 aFarkas/lazysizes 13 34
7 aFarkas/lazysizes 13 34
8 AFNetworking/AFNetworking 108 25
9 airbnb/javascript 108 129
10 airbnb/javascript 108 321
11 adobe-fonts/source-code-pro 96 225
12 ajaxorg/ace 93 319
13 ajaxorg/cloud9 23 78
14 Alamofire/Alamofire 94 92
15 alebcay/awesome-shell 94 100
16 alexreisner/geocoder 42 366
17 alex/what-happens-when 74 87
18 alexwolfe/Buttons 4 232
19 alexwolfe/Buttons 4 63
20 alexwolfe/Buttons 5 12
21 alrra/browser-logos 5 33
22 alvarotrigo/fullPage.js 5 15
23 alvarotrigo/fullPage.js 5 15
24 amix/vimrc 10 92
25 amsul/pickadate.js 10 113
26 andreafabrizi/Dropbox-Uploader 10 25
27 andris9/Nodemailer 29 64
28 andris9/Nodemailer 29 15
29 andymccurdy/redis-py 29 87
... ... ... ...
53 astuetz/PagerSlidingTabStrip 2 183
54 aterrien/jQuery-Knob 2 6
55 aterrien/jQuery-Knob 9 21
56 atom/atom-shell 102 358
57 atom/atom 9 19
58 Aufree/trip-to-iOS 109 11
59 aurajs/aura 14 47
60 Automattic/_s 14 341
61 Automattic/socket.io 43 341
62 avelino/awesome-go 43 114
63 avelino/awesome-go 212 427
64 baconjs/bacon.js 35 97
65 balderdashy/sails 66 238
66 b4winckler/macvim 19 43
67 basecamp/pow 8 23
68 bayandin/awesome-awesomeness 8 44
69 bbatsov/rails-style-guide 42 89
70 bbatsov/rails-style-guide 42 84
71 bbatsov/rubocop 92 150
72 BBC-News/Imager.js 92 256
73 BBC-News/wraith 10 25
74 BBC-News/wraith 20 62
75 bcit-ci/CodeIgniter 10 27
76 bebraw/jswiki 128 3
77 benpickles/peity 128 510
78 benpickles/peity 3 7
79 benweet/stackedit 7 39
80 binarylogic/authlogic 7 25
81 binux/pyspider 10 98
82 binux/pyspider 10 32

83 rows × 3 columns


In [ ]:
print(df.describe())

In [4]:
df.hist(column='ratio')


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-4-1b0a677d99ff> in <module>()
----> 1 df.hist(column='ratio')

/home/janisz/anaconda2/lib/python2.7/site-packages/pandas/tools/plotting.pyc in hist_frame(data, column, by, grid, xlabelsize, xrot, ylabelsize, yrot, ax, sharex, sharey, figsize, layout, bins, **kwds)
   2826         if not isinstance(column, (list, np.ndarray, Index)):
   2827             column = [column]
-> 2828         data = data[column]
   2829     data = data._get_numeric_data()
   2830     naxes = len(data.columns)

/home/janisz/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1961         if isinstance(key, (Series, np.ndarray, Index, list)):
   1962             # either boolean or fancy integer index
-> 1963             return self._getitem_array(key)
   1964         elif isinstance(key, DataFrame):
   1965             return self._getitem_frame(key)

/home/janisz/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_array(self, key)
   2005             return self.take(indexer, axis=0, convert=False)
   2006         else:
-> 2007             indexer = self.ix._convert_to_indexer(key, axis=1)
   2008             return self.take(indexer, axis=1, convert=True)
   2009 

/home/janisz/anaconda2/lib/python2.7/site-packages/pandas/core/indexing.pyc in _convert_to_indexer(self, obj, axis, is_setter)
   1148                 mask = check == -1
   1149                 if mask.any():
-> 1150                     raise KeyError('%s not in index' % objarr[mask])
   1151 
   1152                 return _values_from_object(indexer)

KeyError: "['ratio'] not in index"

In [11]:
df.hist(column='ratio', bins=10, range=(0,0.5))


Out[11]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fc91d7c5d10>]], dtype=object)

In [20]:
df['ratio'].plot.hist(bins=20)


Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fc9198e72d0>

In [86]:
authors = pd.read_csv('data/repos/apache/mesos/author-churn.csv', parse_dates=True, index_col='author')
authors = authors / authors.sum()

In [87]:
summary = pd.read_csv('data/repos/apache/mesos/summary.csv', parse_dates=True)
main_dev = pd.read_csv('data/repos/apache/mesos/main-dev.csv', parse_dates=True)
dev = main_dev[['main-dev', 'ownership']].groupby('main-dev').sum()  / summary['value'][2]
dev = dev / dev.sum()
dev = dev.reset_index()
dev.columns = ['author', 'ownership']
dev = dev.set_index('author')

In [140]:
communication = pd.read_csv('data/repos/apache/mesos/communication.csv')
G=nx.from_pandas_dataframe(communication, 'author', 'peer', ['shared'])
page_rank = pd.DataFrame.from_dict(nx.pagerank(G, weight='shared'), orient='index')
page_rank.columns = ['page_rank']

In [89]:
summary = pd.read_csv('data/repos/apache/mesos/summary.csv', parse_dates=True)
refactoring_main_dev = pd.read_csv('data/repos/apache/mesos/refactoring-main-dev.csv', parse_dates=True)
refactoring_dev = refactoring_main_dev[['main-dev', 'ownership']].groupby('main-dev').sum()  / summary['value'][2]
refactoring_dev = refactoring_dev / refactoring_dev.sum()
refactoring_dev = refactoring_dev.reset_index()
refactoring_dev.columns = ['author', 'refactoring_ownership']
refactoring_dev = refactoring_dev.set_index('author')

In [90]:
authors_full = pd.concat([dev, refactoring_dev, page_rank, authors], axis='author').fillna(0)

In [91]:
authors_full.corr()


Out[91]:
ownership refactoring_ownership page_rank added deleted commits
ownership 1.000000 0.999670 0.366070 0.999555 0.999526 0.997076
refactoring_ownership 0.999670 1.000000 0.374196 0.999128 0.999171 0.997069
page_rank 0.366070 0.374196 1.000000 0.368540 0.361584 0.425089
added 0.999555 0.999128 0.368540 1.000000 0.999934 0.996621
deleted 0.999526 0.999171 0.361584 0.999934 1.000000 0.995903
commits 0.997076 0.997069 0.425089 0.996621 0.995903 1.000000

In [62]:
summary.set_index('statistic')


Out[62]:
value
statistic
number-of-commits 7502
number-of-entities 32202
number-of-entities-changed 57830
number-of-authors 170

In [301]:
x = summary.set_index('statistic').T.reset_index(drop=True)
x.columns.name = None
repo = pd.read_json('data/repos/apache/mesos/description.json', orient='records')
repo = repo[['name', 'full_name', 'language', 'forks', 'watchers', 'created_at', 'size', 'fork', 'description', 'owner']]
repo = repo.transpose()['id']
repo = pd.DataFrame(repo)
repo = repo.transpose()
repo = repo.reset_index(drop=True)
repo = pd.concat([x.T, repo.T]).T

In [ ]:


In [180]:
import urllib2
response = urllib2.urlopen(repo['contributors_url'][0])
html = response.read()

In [239]:
authors_full['commits'].T.to_dict()['ayouwei']


Out[239]:
1.7292062943109112e-05

In [142]:
G=nx.from_pandas_dataframe(communication, 'author', 'peer', ['strength'])
G.add_nodes_from(authors.index)
nx.set_node_attributes(G, 'commits', authors_full['commits'].T.to_dict())
nx.set_node_attributes(G, 'pagerank', authors_full['page_rank'].T.to_dict())
nx.set_node_attributes(G, 'added', authors_full['added'].T.to_dict())
nx.set_node_attributes(G, 'deleted', authors_full['deleted'].T.to_dict())
nx.set_node_attributes(G, 'ownership', authors_full['ownership'].T.to_dict())

In [141]:
communication = communication[communication['author'].isin(authors[authors['commits'] > authors.quantile(.50)['commits']].index.tolist())]

In [143]:
nx.write_gml(G, "mesos2.gml")
communication


Out[143]:
author peer shared average strength
168 Kevin Klues Abhishek Dasgupta 55 102 53
169 Abhishek Dasgupta Kevin Klues 55 102 53
192 Nikita Vetoshkin Ken Sipe 4 8 50
204 Michael Park Isabel Jimenez 123 258 47
205 Isabel Jimenez Michael Park 123 258 47
206 Vinod Kone Benjamin Mahler 255 554 46
207 Benjamin Mahler Vinod Kone 255 554 46
208 Tobi Knaup Bernardo Gomez Palacio 9 20 45
209 Bernardo Gomez Palacio Tobi Knaup 9 20 45
211 Bernardo Gomez Palacio Bill Farner 4 9 44
212 Kapil Arya Jie Yu 183 418 43
213 Kapil Arya Benjamin Bannier 270 617 43
214 Jie Yu Kapil Arya 183 418 43
215 Benjamin Bannier Kapil Arya 270 617 43
217 Nikita Vetoshkin Anindya Sinha 3 7 42
218 Joris Van Remoortere Isabel Jimenez 110 259 42
219 Isabel Jimenez Joris Van Remoortere 110 259 42
220 Christos Kozyrakis Qian Zhang 3 7 42
222 Joerg Schad Abhishek Dasgupta 61 148 41
223 Abhishek Dasgupta Joerg Schad 61 148 41
224 Zhitao Li Ryuichi Okumura 4 10 40
225 Vinod Kone Kapil Arya 172 426 40
227 Michael Park Joris Van Remoortere 108 264 40
228 Kapil Arya Vinod Kone 172 426 40
229 Joris Van Remoortere Michael Park 108 264 40
230 Benjamin Mahler Benjamin Bannier 303 746 40
231 Benjamin Bannier Benjamin Mahler 303 746 40
234 Neil Conway Joerg Schad 90 227 39
235 Joerg Schad Neil Conway 90 227 39
236 Jie Yu Benjamin Bannier 262 660 39
... ... ... ... ... ...
9922 Benjamin Bannier Andy Pang 1 430 0
9923 Benjamin Bannier Steve Niemitz 3 442 0
9924 Benjamin Bannier Jocelyn De La Rosa 1 430 0
9925 Benjamin Bannier Archana kumari 3 432 0
9926 Benjamin Bannier Anton Lindström 4 433 0
9927 Benjamin Bannier Vinson Lee 1 431 0
9928 Benjamin Bannier Joe Gordon 3 433 0
9929 Benjamin Bannier Zhiwei Chen 1 431 0
9930 Ben Mahler Matei Alexandru Zaharia 3 391 0
9931 Ben Mahler Benjamin Hindman 14 15002 0
9932 Bartek Plotka Matei Alexandru Zaharia 2 393 0
9933 Bartek Plotka Benjamin Hindman 9 15005 0
9934 Avinash sridharan Benjamin Hindman 22 15013 0
9935 Artem Harutyunyan Benjamin Hindman 72 15061 0
9936 Artem Harutyunyan Rossi 1 119 0
9966 Andy Konwinski Benjamin Hindman 17 15001 0
9969 Anand Mazumdar Benjamin Hindman 43 15028 0
9970 Alexandra Sava Benjamin Hindman 26 15006 0
9971 Alexander Rukletsov Ross Allen 1 106 0
9972 Alexander Rukletsov Dave Lester 1 119 0
9973 Alexander Rukletsov Benjamin Hindman 79 15074 0
9974 Alexander Rojas Benjamin Hindman 47 15045 0
9975 Alex Naparu Matei Alexandru Zaharia 1 390 0
9976 Alex Naparu Benjamin Hindman 5 15002 0
9977 Alex Clemmer Kevin Klues 1 112 0
9978 Alex Clemmer Matei Alexandru Zaharia 2 453 0
9979 Alex Clemmer Benjamin Hindman 36 15065 0
9984 Aditi Dixit Benjamin Hindman 16 15005 0
9985 Adam B Benjamin Hindman 64 15040 0
9986 Abhishek Dasgupta Benjamin Hindman 14 15055 0

7789 rows × 5 columns


In [132]:
authors[authors['commits'] > authors.quantile(.50)['commits']].index.tolist()
communication[communication['author'].isin(authors[authors['commits'] > authors.quantile(.50)['commits']].index.tolist())]


Out[132]:
author peer shared average strength
168 Kevin Klues Abhishek Dasgupta 55 102 53
169 Abhishek Dasgupta Kevin Klues 55 102 53
192 Nikita Vetoshkin Ken Sipe 4 8 50
204 Michael Park Isabel Jimenez 123 258 47
205 Isabel Jimenez Michael Park 123 258 47
206 Vinod Kone Benjamin Mahler 255 554 46
207 Benjamin Mahler Vinod Kone 255 554 46
208 Tobi Knaup Bernardo Gomez Palacio 9 20 45
209 Bernardo Gomez Palacio Tobi Knaup 9 20 45
211 Bernardo Gomez Palacio Bill Farner 4 9 44
212 Kapil Arya Jie Yu 183 418 43
213 Kapil Arya Benjamin Bannier 270 617 43
214 Jie Yu Kapil Arya 183 418 43
215 Benjamin Bannier Kapil Arya 270 617 43
217 Nikita Vetoshkin Anindya Sinha 3 7 42
218 Joris Van Remoortere Isabel Jimenez 110 259 42
219 Isabel Jimenez Joris Van Remoortere 110 259 42
220 Christos Kozyrakis Qian Zhang 3 7 42
222 Joerg Schad Abhishek Dasgupta 61 148 41
223 Abhishek Dasgupta Joerg Schad 61 148 41
224 Zhitao Li Ryuichi Okumura 4 10 40
225 Vinod Kone Kapil Arya 172 426 40
227 Michael Park Joris Van Remoortere 108 264 40
228 Kapil Arya Vinod Kone 172 426 40
229 Joris Van Remoortere Michael Park 108 264 40
230 Benjamin Mahler Benjamin Bannier 303 746 40
231 Benjamin Bannier Benjamin Mahler 303 746 40
234 Neil Conway Joerg Schad 90 227 39
235 Joerg Schad Neil Conway 90 227 39
236 Jie Yu Benjamin Bannier 262 660 39
... ... ... ... ... ...
9922 Benjamin Bannier Andy Pang 1 430 0
9923 Benjamin Bannier Steve Niemitz 3 442 0
9924 Benjamin Bannier Jocelyn De La Rosa 1 430 0
9925 Benjamin Bannier Archana kumari 3 432 0
9926 Benjamin Bannier Anton Lindström 4 433 0
9927 Benjamin Bannier Vinson Lee 1 431 0
9928 Benjamin Bannier Joe Gordon 3 433 0
9929 Benjamin Bannier Zhiwei Chen 1 431 0
9930 Ben Mahler Matei Alexandru Zaharia 3 391 0
9931 Ben Mahler Benjamin Hindman 14 15002 0
9932 Bartek Plotka Matei Alexandru Zaharia 2 393 0
9933 Bartek Plotka Benjamin Hindman 9 15005 0
9934 Avinash sridharan Benjamin Hindman 22 15013 0
9935 Artem Harutyunyan Benjamin Hindman 72 15061 0
9936 Artem Harutyunyan Rossi 1 119 0
9966 Andy Konwinski Benjamin Hindman 17 15001 0
9969 Anand Mazumdar Benjamin Hindman 43 15028 0
9970 Alexandra Sava Benjamin Hindman 26 15006 0
9971 Alexander Rukletsov Ross Allen 1 106 0
9972 Alexander Rukletsov Dave Lester 1 119 0
9973 Alexander Rukletsov Benjamin Hindman 79 15074 0
9974 Alexander Rojas Benjamin Hindman 47 15045 0
9975 Alex Naparu Matei Alexandru Zaharia 1 390 0
9976 Alex Naparu Benjamin Hindman 5 15002 0
9977 Alex Clemmer Kevin Klues 1 112 0
9978 Alex Clemmer Matei Alexandru Zaharia 2 453 0
9979 Alex Clemmer Benjamin Hindman 36 15065 0
9984 Aditi Dixit Benjamin Hindman 16 15005 0
9985 Adam B Benjamin Hindman 64 15040 0
9986 Abhishek Dasgupta Benjamin Hindman 14 15055 0

7789 rows × 5 columns


In [117]:
authors.describe()


Out[117]:
added deleted commits
count 170.000000 1.700000e+02 170.000000
mean 0.005882 5.882353e-03 0.005882
std 0.064830 6.786453e-02 0.051363
min 0.000000 0.000000e+00 0.000017
25% 0.000002 4.435283e-07 0.000035
50% 0.000034 5.876750e-06 0.000147
75% 0.000404 1.065022e-04 0.001366
max 0.843722 8.830542e-01 0.668511

In [84]:



Out[84]:
page_rank
Gabriel Monroy 0.000896
weitao zhou 0.000912
Ian Babrou 0.000921
Tom Galloway 0.000924
Diogo Gomes 0.000924
Tim Anderegg 0.000924
Tobias Weingartner 0.000932
grandlogic 0.000940
Michael Schenck 0.000947
mlawindi 0.000950
Disha Singh 0.000959
Jihun Kang 0.000962
Itamar Ostricher 0.000962
Jocelyn De La Rosa 0.000973
Andrey Dyatlov 0.000980
Jiri Simsa 0.000988
Kevin Devroede 0.000990
Jake Farrell 0.001004
Kiyonari Harigae 0.001005
Kamil Doma?ski 0.001009
M Lawindi 0.001026
ASHUTOSH JAIN 0.001032
Oliver Nicholas 0.001062
Wojciech Sielski 0.001072
Craig Hansen-Sturm 0.001087
Bill Farner 0.001100
Andy Pang 0.001104
Akanksha Agrawal 0.001124
Jonathon Rossi 0.001149
niklas 0.001154
... ...
Anand Mazumdar 0.010000
Niklas Nielsen 0.010808
Guangya Liu 0.010835
Niklas Q. Nielsen 0.011353
Abhishek Dasgupta 0.011786
Greg Mann 0.012134
Cody Maloney 0.012288
Alexander Rojas 0.012592
Jiang Yan Xu 0.014008
Artem Harutyunyan 0.014209
Adam B 0.015143
Bernd Mathiske 0.015507
Till Toenshoff 0.015945
Ian Downes 0.016292
Dominic Hamon 0.016529
haosdent huang 0.017402
Joerg Schad 0.018139
Alexander Rukletsov 0.018570
Timothy Chen 0.019393
Joseph Wu 0.020566
Isabel Jimenez 0.022506
Joris Van Remoortere 0.023240
Michael Park 0.024305
Neil Conway 0.027533
Jie Yu 0.030604
Kapil Arya 0.030834
Vinod Kone 0.034903
Benjamin Mahler 0.041376
Benjamin Hindman 0.043383
Benjamin Bannier 0.046414

169 rows × 1 columns


In [83]:
communication = pd.read_csv('repo/data/apache/mesos/communication.csv')
G=nx.from_pandas_dataframe(communication, 'author', 'peer', ['average'])
page_rank = pd.DataFrame.from_dict(nx.pagerank(G, weight='average'), orient='index')
page_rank.columns = ['page_rank']


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-83-19f143fc536a> in <module>()
----> 1 communication = pd.read_csv('repo/data/apache/mesos/communication.csv')
      2 G=nx.from_pandas_dataframe(communication, 'author', 'peer', ['average'])
      3 page_rank = pd.DataFrame.from_dict(nx.pagerank(G, weight='average'), orient='index')
      4 page_rank.columns = ['page_rank']

/home/janisz/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc in parser_f(filepath_or_buffer, sep, dialect, compression, doublequote, escapechar, quotechar, quoting, skipinitialspace, lineterminator, header, index_col, names, prefix, skiprows, skipfooter, skip_footer, na_values, true_values, false_values, delimiter, converters, dtype, usecols, engine, delim_whitespace, as_recarray, na_filter, compact_ints, use_unsigned, low_memory, buffer_lines, warn_bad_lines, error_bad_lines, keep_default_na, thousands, comment, decimal, parse_dates, keep_date_col, dayfirst, date_parser, memory_map, float_precision, nrows, iterator, chunksize, verbose, encoding, squeeze, mangle_dupe_cols, tupleize_cols, infer_datetime_format, skip_blank_lines)
    496                     skip_blank_lines=skip_blank_lines)
    497 
--> 498         return _read(filepath_or_buffer, kwds)
    499 
    500     parser_f.__name__ = name

/home/janisz/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc in _read(filepath_or_buffer, kwds)
    273 
    274     # Create the parser.
--> 275     parser = TextFileReader(filepath_or_buffer, **kwds)
    276 
    277     if (nrows is not None) and (chunksize is not None):

/home/janisz/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc in __init__(self, f, engine, **kwds)
    588             self.options['has_index_names'] = kwds['has_index_names']
    589 
--> 590         self._make_engine(self.engine)
    591 
    592     def _get_options_with_defaults(self, engine):

/home/janisz/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc in _make_engine(self, engine)
    729     def _make_engine(self, engine='c'):
    730         if engine == 'c':
--> 731             self._engine = CParserWrapper(self.f, **self.options)
    732         else:
    733             if engine == 'python':

/home/janisz/anaconda2/lib/python2.7/site-packages/pandas/io/parsers.pyc in __init__(self, src, **kwds)
   1101         kwds['allow_leading_cols'] = self.index_col is not False
   1102 
-> 1103         self._reader = _parser.TextReader(src, **kwds)
   1104 
   1105         # XXX

pandas/parser.pyx in pandas.parser.TextReader.__cinit__ (pandas/parser.c:3246)()

pandas/parser.pyx in pandas.parser.TextReader._setup_parser_source (pandas/parser.c:6111)()

IOError: File repo/data/apache/mesos/communication.csv does not exist

In [72]:
added['Victor Quinn']


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-72-88ce287dee8d> in <module>()
----> 1 added['Victor Quinn']

/home/janisz/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1967             return self._getitem_multilevel(key)
   1968         else:
-> 1969             return self._getitem_column(key)
   1970 
   1971     def _getitem_column(self, key):

/home/janisz/anaconda2/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   1974         # get column
   1975         if self.columns.is_unique:
-> 1976             return self._get_item_cache(key)
   1977 
   1978         # duplicate columns & possible reduce dimensionality

/home/janisz/anaconda2/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1089         res = cache.get(item)
   1090         if res is None:
-> 1091             values = self._data.get(item)
   1092             res = self._box_item_values(item, values)
   1093             cache[item] = res

/home/janisz/anaconda2/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item, fastpath)
   3209 
   3210             if not isnull(item):
-> 3211                 loc = self.items.get_loc(item)
   3212             else:
   3213                 indexer = np.arange(len(self.items))[isnull(self.items)]

/home/janisz/anaconda2/lib/python2.7/site-packages/pandas/core/index.pyc in get_loc(self, key, method, tolerance)
   1757                                  'backfill or nearest lookups')
   1758             key = _values_from_object(key)
-> 1759             return self._engine.get_loc(key)
   1760 
   1761         indexer = self.get_indexer([key], method=method,

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3979)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3843)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12265)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12216)()

KeyError: 'Victor Quinn'

In [66]:
import numpy as np

added = np.log(authors[['added']]*10E7).T
# Create a list to store the data
weights = []

# For each row in the column,
for author in communication['author']:
    weights.append(added[author][0] if author in added.columns else 0)

# Create a column from the list
communication['weights'] = weights

In [76]:
communication.describe()


Out[76]:
shared average strength weights
count 864.000000 864.000000 864.000000 864
mean 3.372685 57.125000 17.412037 0
std 7.779254 86.882898 20.888861 0
min 1.000000 1.000000 0.000000 0
25% 1.000000 8.000000 4.000000 0
50% 1.000000 17.000000 12.000000 0
75% 3.000000 73.000000 22.000000 0
max 131.000000 328.000000 100.000000 0

In [82]:



Out[82]:
shared average strength
count 10004.000000 10004.000000 10004.000000
mean 8.868653 495.041184 8.717713
std 21.432479 2376.565457 14.043615
min 1.000000 1.000000 0.000000
25% 1.000000 33.000000 2.000000
50% 2.000000 75.000000 5.000000
75% 7.000000 146.000000 10.000000
max 370.000000 15421.000000 100.000000

In [ ]: