Finding Key Connectors


In [1]:
users = [
    {'id': 0, 'name': 'Hero'},
    {'id': 1, 'name': 'Dunn'},
    {'id': 2, 'name': 'Sue'},
    {'id': 3, 'name': 'Chi'},
    {'id': 4, 'name': 'Thor'},
    {'id': 5, 'name': 'Clive'},
    {'id': 6, 'name': 'Hicks'},
    {'id': 7, 'name': 'Devin'},
    {'id': 8, 'name': 'Kate'},
    {'id': 9, 'name': 'Klein'}
]

In [2]:
friendship = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4),
              (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9)]

In [3]:
users


Out[3]:
[{'id': 0, 'name': 'Hero'},
 {'id': 1, 'name': 'Dunn'},
 {'id': 2, 'name': 'Sue'},
 {'id': 3, 'name': 'Chi'},
 {'id': 4, 'name': 'Thor'},
 {'id': 5, 'name': 'Clive'},
 {'id': 6, 'name': 'Hicks'},
 {'id': 7, 'name': 'Devin'},
 {'id': 8, 'name': 'Kate'},
 {'id': 9, 'name': 'Klein'}]

In [4]:
friendship


Out[4]:
[(0, 1),
 (0, 2),
 (1, 2),
 (1, 3),
 (2, 3),
 (3, 4),
 (4, 5),
 (5, 6),
 (5, 7),
 (6, 8),
 (7, 8),
 (8, 9)]

In [5]:
for user in users:
    user['friends'] = []

In [6]:
users


Out[6]:
[{'friends': [], 'id': 0, 'name': 'Hero'},
 {'friends': [], 'id': 1, 'name': 'Dunn'},
 {'friends': [], 'id': 2, 'name': 'Sue'},
 {'friends': [], 'id': 3, 'name': 'Chi'},
 {'friends': [], 'id': 4, 'name': 'Thor'},
 {'friends': [], 'id': 5, 'name': 'Clive'},
 {'friends': [], 'id': 6, 'name': 'Hicks'},
 {'friends': [], 'id': 7, 'name': 'Devin'},
 {'friends': [], 'id': 8, 'name': 'Kate'},
 {'friends': [], 'id': 9, 'name': 'Klein'}]

In [7]:
for i, j in friendship:
    users[i]['friends'].append(users[j])
    users[j]['friends'].append(users[i])

In [8]:
def number_of_friends(user):
    '''how many friends do _user_ have'''
    return len(user['friends'])

In [9]:
help(number_of_friends)


Help on function number_of_friends in module __main__:

number_of_friends(user)
    how many friends do _user_ have


In [10]:
avg = sum(number_of_friends(user) for user in users) / len(users)

In [11]:
num_friends_by_id = [
    (user['id'], number_of_friends(user))
    for user in users
]

In [12]:
num_friends_by_id


Out[12]:
[(0, 2),
 (1, 3),
 (2, 3),
 (3, 3),
 (4, 2),
 (5, 3),
 (6, 2),
 (7, 2),
 (8, 3),
 (9, 1)]

In [13]:
sorted(num_friends_by_id, key = lambda user: user[1], reverse = True)


Out[13]:
[(1, 3),
 (2, 3),
 (3, 3),
 (5, 3),
 (8, 3),
 (0, 2),
 (4, 2),
 (6, 2),
 (7, 2),
 (9, 1)]

Data Scientists You May Know


In [14]:
def friends_of_friend_id_in(user):
    return [foaf['id']
            for friend in user['friends']
            for foaf in friend['friends']]

In [15]:
friends_of_friend_id_in(users[0])


Out[15]:
[0, 2, 3, 0, 1, 3]

In [16]:
from collections import Counter

In [17]:
help(Counter)


Help on class Counter in module collections:

class Counter(builtins.dict)
 |  Dict subclass for counting hashable items.  Sometimes called a bag
 |  or multiset.  Elements are stored as dictionary keys and their counts
 |  are stored as dictionary values.
 |  
 |  >>> c = Counter('abcdeabcdabcaba')  # count elements from a string
 |  
 |  >>> c.most_common(3)                # three most common elements
 |  [('a', 5), ('b', 4), ('c', 3)]
 |  >>> sorted(c)                       # list all unique elements
 |  ['a', 'b', 'c', 'd', 'e']
 |  >>> ''.join(sorted(c.elements()))   # list elements with repetitions
 |  'aaaaabbbbcccdde'
 |  >>> sum(c.values())                 # total of all counts
 |  15
 |  
 |  >>> c['a']                          # count of letter 'a'
 |  5
 |  >>> for elem in 'shazam':           # update counts from an iterable
 |  ...     c[elem] += 1                # by adding 1 to each element's count
 |  >>> c['a']                          # now there are seven 'a'
 |  7
 |  >>> del c['b']                      # remove all 'b'
 |  >>> c['b']                          # now there are zero 'b'
 |  0
 |  
 |  >>> d = Counter('simsalabim')       # make another counter
 |  >>> c.update(d)                     # add in the second counter
 |  >>> c['a']                          # now there are nine 'a'
 |  9
 |  
 |  >>> c.clear()                       # empty the counter
 |  >>> c
 |  Counter()
 |  
 |  Note:  If a count is set to zero or reduced to zero, it will remain
 |  in the counter until the entry is deleted or the counter is cleared:
 |  
 |  >>> c = Counter('aaabbc')
 |  >>> c['b'] -= 2                     # reduce the count of 'b' by two
 |  >>> c.most_common()                 # 'b' is still in, but its count is zero
 |  [('a', 3), ('c', 1), ('b', 0)]
 |  
 |  Method resolution order:
 |      Counter
 |      builtins.dict
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __add__(self, other)
 |      Add counts from two counters.
 |      
 |      >>> Counter('abbb') + Counter('bcc')
 |      Counter({'b': 4, 'c': 2, 'a': 1})
 |  
 |  __and__(self, other)
 |      Intersection is the minimum of corresponding counts.
 |      
 |      >>> Counter('abbb') & Counter('bcc')
 |      Counter({'b': 1})
 |  
 |  __delitem__(self, elem)
 |      Like dict.__delitem__() but does not raise KeyError for missing values.
 |  
 |  __iadd__(self, other)
 |      Inplace add from another counter, keeping only positive counts.
 |      
 |      >>> c = Counter('abbb')
 |      >>> c += Counter('bcc')
 |      >>> c
 |      Counter({'b': 4, 'c': 2, 'a': 1})
 |  
 |  __iand__(self, other)
 |      Inplace intersection is the minimum of corresponding counts.
 |      
 |      >>> c = Counter('abbb')
 |      >>> c &= Counter('bcc')
 |      >>> c
 |      Counter({'b': 1})
 |  
 |  __init__(*args, **kwds)
 |      Create a new, empty Counter object.  And if given, count elements
 |      from an input iterable.  Or, initialize the count from another mapping
 |      of elements to their counts.
 |      
 |      >>> c = Counter()                           # a new, empty counter
 |      >>> c = Counter('gallahad')                 # a new counter from an iterable
 |      >>> c = Counter({'a': 4, 'b': 2})           # a new counter from a mapping
 |      >>> c = Counter(a=4, b=2)                   # a new counter from keyword args
 |  
 |  __ior__(self, other)
 |      Inplace union is the maximum of value from either counter.
 |      
 |      >>> c = Counter('abbb')
 |      >>> c |= Counter('bcc')
 |      >>> c
 |      Counter({'b': 3, 'c': 2, 'a': 1})
 |  
 |  __isub__(self, other)
 |      Inplace subtract counter, but keep only results with positive counts.
 |      
 |      >>> c = Counter('abbbc')
 |      >>> c -= Counter('bccd')
 |      >>> c
 |      Counter({'b': 2, 'a': 1})
 |  
 |  __missing__(self, key)
 |      The count of elements not in the Counter is zero.
 |  
 |  __neg__(self)
 |      Subtracts from an empty counter.  Strips positive and zero counts,
 |      and flips the sign on negative counts.
 |  
 |  __or__(self, other)
 |      Union is the maximum of value in either of the input counters.
 |      
 |      >>> Counter('abbb') | Counter('bcc')
 |      Counter({'b': 3, 'c': 2, 'a': 1})
 |  
 |  __pos__(self)
 |      Adds an empty counter, effectively stripping negative and zero counts
 |  
 |  __reduce__(self)
 |      helper for pickle
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  __sub__(self, other)
 |      Subtract count, but keep only results with positive counts.
 |      
 |      >>> Counter('abbbc') - Counter('bccd')
 |      Counter({'b': 2, 'a': 1})
 |  
 |  copy(self)
 |      Return a shallow copy.
 |  
 |  elements(self)
 |      Iterator over elements repeating each as many times as its count.
 |      
 |      >>> c = Counter('ABCABC')
 |      >>> sorted(c.elements())
 |      ['A', 'A', 'B', 'B', 'C', 'C']
 |      
 |      # Knuth's example for prime factors of 1836:  2**2 * 3**3 * 17**1
 |      >>> prime_factors = Counter({2: 2, 3: 3, 17: 1})
 |      >>> product = 1
 |      >>> for factor in prime_factors.elements():     # loop over factors
 |      ...     product *= factor                       # and multiply them
 |      >>> product
 |      1836
 |      
 |      Note, if an element's count has been set to zero or is a negative
 |      number, elements() will ignore it.
 |  
 |  most_common(self, n=None)
 |      List the n most common elements and their counts from the most
 |      common to the least.  If n is None, then list all element counts.
 |      
 |      >>> Counter('abcdeabcdabcaba').most_common(3)
 |      [('a', 5), ('b', 4), ('c', 3)]
 |  
 |  subtract(*args, **kwds)
 |      Like dict.update() but subtracts counts instead of replacing them.
 |      Counts can be reduced below zero.  Both the inputs and outputs are
 |      allowed to contain zero and negative counts.
 |      
 |      Source can be an iterable, a dictionary, or another Counter instance.
 |      
 |      >>> c = Counter('which')
 |      >>> c.subtract('witch')             # subtract elements from another iterable
 |      >>> c.subtract(Counter('watch'))    # subtract elements from another counter
 |      >>> c['h']                          # 2 in which, minus 1 in witch, minus 1 in watch
 |      0
 |      >>> c['w']                          # 1 in which, minus 1 in witch, minus 1 in watch
 |      -1
 |  
 |  update(*args, **kwds)
 |      Like dict.update() but add counts instead of replacing them.
 |      
 |      Source can be an iterable, a dictionary, or another Counter instance.
 |      
 |      >>> c = Counter('which')
 |      >>> c.update('witch')           # add elements from another iterable
 |      >>> d = Counter('watch')
 |      >>> c.update(d)                 # add elements from another counter
 |      >>> c['h']                      # four 'h' in which, witch, and watch
 |      4
 |  
 |  ----------------------------------------------------------------------
 |  Class methods defined here:
 |  
 |  fromkeys(iterable, v=None) from builtins.type
 |      Returns a new dict with keys from iterable and values equal to value.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from builtins.dict:
 |  
 |  __contains__(self, key, /)
 |      True if D has a key k, else False.
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __getitem__(...)
 |      x.__getitem__(y) <==> x[y]
 |  
 |  __gt__(self, value, /)
 |      Return self>value.
 |  
 |  __iter__(self, /)
 |      Implement iter(self).
 |  
 |  __le__(self, value, /)
 |      Return self<=value.
 |  
 |  __len__(self, /)
 |      Return len(self).
 |  
 |  __lt__(self, value, /)
 |      Return self<value.
 |  
 |  __ne__(self, value, /)
 |      Return self!=value.
 |  
 |  __new__(*args, **kwargs) from builtins.type
 |      Create and return a new object.  See help(type) for accurate signature.
 |  
 |  __setitem__(self, key, value, /)
 |      Set self[key] to value.
 |  
 |  __sizeof__(...)
 |      D.__sizeof__() -> size of D in memory, in bytes
 |  
 |  clear(...)
 |      D.clear() -> None.  Remove all items from D.
 |  
 |  get(...)
 |      D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None.
 |  
 |  items(...)
 |      D.items() -> a set-like object providing a view on D's items
 |  
 |  keys(...)
 |      D.keys() -> a set-like object providing a view on D's keys
 |  
 |  pop(...)
 |      D.pop(k[,d]) -> v, remove specified key and return the corresponding value.
 |      If key is not found, d is returned if given, otherwise KeyError is raised
 |  
 |  popitem(...)
 |      D.popitem() -> (k, v), remove and return some (key, value) pair as a
 |      2-tuple; but raise KeyError if D is empty.
 |  
 |  setdefault(...)
 |      D.setdefault(k[,d]) -> D.get(k,d), also set D[k]=d if k not in D
 |  
 |  values(...)
 |      D.values() -> an object providing a view on D's values
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes inherited from builtins.dict:
 |  
 |  __hash__ = None


In [18]:
def not_same(user1, user2):
    '''whether _user1_ is not the same as _user2_'''
    return user1 != user2

In [19]:
def not_friend(user1, user2):
    '''whether _user1_ is a friend of _user2_'''
    return user1 not in user2['friends']

In [20]:
def friends_of_friend_ids(user):
    return Counter([foaf['id'] 
                   for friend in user['friends']
                   for foaf in friend['friends']
                   if not_same(foaf, user)
                   if not_friend(foaf, user)])

In [21]:
friends_of_friend_ids(users[3])


Out[21]:
Counter({0: 2, 5: 1})

In [22]:
interests = [
    (0, 'Hadoop'), (0, 'Big Data'), (0, 'HBase'), (0, 'Java'),
    (0, 'Spark'), (0, 'Storm'), (0, 'Cassandra'),
    (1, 'NoSQL'), (1, 'MongoDB'), (1, 'Cassandra'), (1, 'HBase'), 
    (1, 'Postgres'), (2, 'Python'), (2, 'scikit-learn'), (2, 'scipy'),
    (2, 'numpy'), (2, 'statsmodels'), (2, 'pandas'), (3, 'R'), (3, 'Python'),
    (3, 'statistics'), (3, 'regression'), (3, 'probability'),
    (4, 'machine learning'), (4, 'regression'), (4, 'decision trees'),
    (4, 'libsvm'), (5, 'Python'), (5, 'R'), (5, 'Java'), (5, 'C++'),
    (5, 'Haskell'), (5, 'programming languages'), (6, 'statistics'),
    (6, 'probability'), (6, 'mathematics'), (6, 'theory'),
    (7, 'machine learning'), (7, 'scikit-learn'), (7, 'Mahout'),
    (7, 'neural networks'), (8, 'neural networks'), (8, 'deep learning'),
    (8, 'Big Data'), (8, 'artificial intelligence'), (9, 'Hadoop'),
    (9, 'Java'), (9, 'MapReduce'), (9, 'Big Data')
]

In [24]:
def data_scientists_who_like(target_interest, interestsets):
    '''find data scientists via interests'''
    return [user_id
           for user_id, user_interest in interestsets
           if user_interest == target_interest]

In [26]:
data_scientists_who_like('Big Data', interests)


Out[26]:
[0, 8, 9]

In [27]:
from collections import defaultdict

In [28]:
help(defaultdict)


Help on class defaultdict in module collections:

class defaultdict(builtins.dict)
 |  defaultdict(default_factory[, ...]) --> dict with default factory
 |  
 |  The default factory is called without arguments to produce
 |  a new value when a key is not present, in __getitem__ only.
 |  A defaultdict compares equal to a dict with the same items.
 |  All remaining arguments are treated the same as if they were
 |  passed to the dict constructor, including keyword arguments.
 |  
 |  Method resolution order:
 |      defaultdict
 |      builtins.dict
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __copy__(...)
 |      D.copy() -> a shallow copy of D.
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __init__(self, /, *args, **kwargs)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  __missing__(...)
 |      __missing__(key) # Called by __getitem__ for missing key; pseudo-code:
 |      if self.default_factory is None: raise KeyError((key,))
 |      self[key] = value = self.default_factory()
 |      return value
 |  
 |  __reduce__(...)
 |      Return state information for pickling.
 |  
 |  __repr__(self, /)
 |      Return repr(self).
 |  
 |  copy(...)
 |      D.copy() -> a shallow copy of D.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  default_factory
 |      Factory for default value called by __missing__().
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from builtins.dict:
 |  
 |  __contains__(self, key, /)
 |      True if D has a key k, else False.
 |  
 |  __delitem__(self, key, /)
 |      Delete self[key].
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __getitem__(...)
 |      x.__getitem__(y) <==> x[y]
 |  
 |  __gt__(self, value, /)
 |      Return self>value.
 |  
 |  __iter__(self, /)
 |      Implement iter(self).
 |  
 |  __le__(self, value, /)
 |      Return self<=value.
 |  
 |  __len__(self, /)
 |      Return len(self).
 |  
 |  __lt__(self, value, /)
 |      Return self<value.
 |  
 |  __ne__(self, value, /)
 |      Return self!=value.
 |  
 |  __new__(*args, **kwargs) from builtins.type
 |      Create and return a new object.  See help(type) for accurate signature.
 |  
 |  __setitem__(self, key, value, /)
 |      Set self[key] to value.
 |  
 |  __sizeof__(...)
 |      D.__sizeof__() -> size of D in memory, in bytes
 |  
 |  clear(...)
 |      D.clear() -> None.  Remove all items from D.
 |  
 |  fromkeys(iterable, value=None, /) from builtins.type
 |      Returns a new dict with keys from iterable and values equal to value.
 |  
 |  get(...)
 |      D.get(k[,d]) -> D[k] if k in D, else d.  d defaults to None.
 |  
 |  items(...)
 |      D.items() -> a set-like object providing a view on D's items
 |  
 |  keys(...)
 |      D.keys() -> a set-like object providing a view on D's keys
 |  
 |  pop(...)
 |      D.pop(k[,d]) -> v, remove specified key and return the corresponding value.
 |      If key is not found, d is returned if given, otherwise KeyError is raised
 |  
 |  popitem(...)
 |      D.popitem() -> (k, v), remove and return some (key, value) pair as a
 |      2-tuple; but raise KeyError if D is empty.
 |  
 |  setdefault(...)
 |      D.setdefault(k[,d]) -> D.get(k,d), also set D[k]=d if k not in D
 |  
 |  update(...)
 |      D.update([E, ]**F) -> None.  Update D from dict/iterable E and F.
 |      If E is present and has a .keys() method, then does:  for k in E: D[k] = E[k]
 |      If E is present and lacks a .keys() method, then does:  for k, v in E: D[k] = v
 |      In either case, this is followed by: for k in F:  D[k] = F[k]
 |  
 |  values(...)
 |      D.values() -> an object providing a view on D's values
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes inherited from builtins.dict:
 |  
 |  __hash__ = None


In [36]:
user_ids_by_interest = defaultdict(list)

In [37]:
user_ids_by_interest


Out[37]:
defaultdict(list, {})

In [38]:
for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)

In [39]:
user_ids_by_interest


Out[39]:
defaultdict(list,
            {'Big Data': [0, 8, 9],
             'C++': [5],
             'Cassandra': [0, 1],
             'HBase': [0, 1],
             'Hadoop': [0, 9],
             'Haskell': [5],
             'Java': [0, 5, 9],
             'Mahout': [7],
             'MapReduce': [9],
             'MongoDB': [1],
             'NoSQL': [1],
             'Postgres': [1],
             'Python': [2, 3, 5],
             'R': [3, 5],
             'Spark': [0],
             'Storm': [0],
             'artificial intelligence': [8],
             'decision trees': [4],
             'deep learning': [8],
             'libsvm': [4],
             'machine learning': [4, 7],
             'mathematics': [6],
             'neural networks': [7, 8],
             'numpy': [2],
             'pandas': [2],
             'probability': [3, 6],
             'programming languages': [5],
             'regression': [3, 4],
             'scikit-learn': [2, 7],
             'scipy': [2],
             'statistics': [3, 6],
             'statsmodels': [2],
             'theory': [6]})

In [40]:
interests_by_user_id = defaultdict(list)
for user_id, user_interest in interests:
    interests_by_user_id[user_id].append(user_interest)

In [44]:
interests_by_user_id


Out[44]:
defaultdict(list,
            {0: ['Hadoop',
              'Big Data',
              'HBase',
              'Java',
              'Spark',
              'Storm',
              'Cassandra'],
             1: ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres'],
             2: ['Python',
              'scikit-learn',
              'scipy',
              'numpy',
              'statsmodels',
              'pandas'],
             3: ['R', 'Python', 'statistics', 'regression', 'probability'],
             4: ['machine learning', 'regression', 'decision trees', 'libsvm'],
             5: ['Python',
              'R',
              'Java',
              'C++',
              'Haskell',
              'programming languages'],
             6: ['statistics', 'probability', 'mathematics', 'theory'],
             7: ['machine learning',
              'scikit-learn',
              'Mahout',
              'neural networks'],
             8: ['neural networks',
              'deep learning',
              'Big Data',
              'artificial intelligence'],
             9: ['Hadoop', 'Java', 'MapReduce', 'Big Data']})

In [45]:
def common_interest_users(user):
    return Counter([interested_user_id
                   for interest in interests_by_user_id[user['id']]
                   for interested_user_id in user_ids_by_interest[interest]
                   if interested_user_id != user['id']])

In [46]:
common_interest_users(users[0])


Out[46]:
Counter({1: 2, 5: 1, 8: 1, 9: 3})

Salaries and Experience


In [47]:
salaries_and_tenures = [(83000, 8.7), (88000, 8.1),
                        (48000, 0.7), (76000, 6),
                        (69000, 6.5), (76000, 7.5),
                        (60000, 2.5), (83000, 10),
                        (48000, 1.9), (63000, 4.2)]

In [48]:
salary_by_tenure = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    salary_by_tenure[tenure].append(salary)

In [49]:
salary_by_tenure


Out[49]:
defaultdict(list,
            {0.7: [48000],
             1.9: [48000],
             2.5: [60000],
             4.2: [63000],
             6: [76000],
             6.5: [69000],
             7.5: [76000],
             8.1: [88000],
             8.7: [83000],
             10: [83000]})

In [50]:
average_salary_by_tenure = {
    tenure: sum(salaries) / len(salaries)
    for tenure, salaries in salary_by_tenure.items()
}

In [51]:
average_salary_by_tenure


Out[51]:
{0.7: 48000.0,
 1.9: 48000.0,
 2.5: 60000.0,
 4.2: 63000.0,
 6: 76000.0,
 6.5: 69000.0,
 7.5: 76000.0,
 8.1: 88000.0,
 8.7: 83000.0,
 10: 83000.0}

In [52]:
def tenure_bucket(tenure):
    if tenure < 2:
        return 'less than two'
    elif tenure <= 5:
        return 'between two and five'
    else:
        return 'more than five'

In [53]:
salary_by_tenure_bucket = defaultdict(list)

for salary, tenure in salaries_and_tenures:
    bucket = tenure_bucket(tenure)
    salary_by_tenure_bucket[bucket].append(salary)

In [54]:
salary_by_tenure_bucket


Out[54]:
defaultdict(list,
            {'between two and five': [60000, 63000],
             'less than two': [48000, 48000],
             'more than five': [83000, 88000, 76000, 69000, 76000, 83000]})

In [55]:
average_salary_by_bucket = {
    bucket: sum(salaries) / len(salaries)
    for bucket, salaries in salary_by_tenure_bucket.items()
}

In [56]:
average_salary_by_bucket


Out[56]:
{'between two and five': 61500.0,
 'less than two': 48000.0,
 'more than five': 79166.66666666667}

In [57]:
def predict_paid_or_unpaid(years_experience):
    if years_experience < 3.0:
        return 'paid'
    elif years_experience < 8.5:
        return 'unpaid'
    else:
        return 'paid'

Topics of Interest


In [59]:
words_and_counts = Counter(word
                         for user, interest in interests
                         for word in interest.lower().split())

In [65]:
words_and_counts


Out[65]:
Counter({'artificial': 1,
         'big': 3,
         'c++': 1,
         'cassandra': 2,
         'data': 3,
         'decision': 1,
         'deep': 1,
         'hadoop': 2,
         'haskell': 1,
         'hbase': 2,
         'intelligence': 1,
         'java': 3,
         'languages': 1,
         'learning': 3,
         'libsvm': 1,
         'machine': 2,
         'mahout': 1,
         'mapreduce': 1,
         'mathematics': 1,
         'mongodb': 1,
         'networks': 2,
         'neural': 2,
         'nosql': 1,
         'numpy': 1,
         'pandas': 1,
         'postgres': 1,
         'probability': 2,
         'programming': 1,
         'python': 3,
         'r': 2,
         'regression': 2,
         'scikit-learn': 2,
         'scipy': 1,
         'spark': 1,
         'statistics': 2,
         'statsmodels': 1,
         'storm': 1,
         'theory': 1,
         'trees': 1})

In [62]:
for word, count in words_and_counts.most_common():
    if count > 1:
        print(word, count)


java 3
big 3
data 3
python 3
learning 3
r 2
scikit-learn 2
hbase 2
neural 2
machine 2
regression 2
statistics 2
cassandra 2
hadoop 2
networks 2
probability 2

In [ ]: