Find "key connectors" in the following graph


In [35]:
from __future__ import division

In [22]:
users = [
        { "id": 0, "name": "Hero" },
        { "id": 1, "name": "Dunn" },
        { "id": 2, "name": "Sue" },
        { "id": 3, "name": "Chi" },
        { "id": 4, "name": "Thor" },
        { "id": 5, "name": "Clive" },
        { "id": 6, "name": "Hicks" },
        { "id": 7, "name": "Devin" },
        { "id": 8, "name": "Kate" },
        { "id": 9, "name": "Klein" }
]

In [23]:
friendships = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4), (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9)]

In [24]:
# Add a list of friends to each user for reference
for user in users:
    user['friends'] = []
    
# Populate with friends
for i,j in friendships:
    users[i]['friends'].append(users[j]) # adding j as friend to i
    users[j]['friends'].append(users[i]) # reciprocate
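
As a quick sanity check (not in the original notebook), we can confirm the friend lists were populated: Hero (id 0) should now be friends with Dunn and Sue.

In [ ]:
# names of user 0's friends after populating the graph
[friend['name'] for friend in users[0]['friends']]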

In [28]:
# How many friends?
def number_of_friends(user):
    return len(user['friends'])

In [32]:
# each user's friend count
[number_of_friends(u) for u in users]


Out[32]:
[2, 3, 3, 3, 2, 3, 2, 2, 3, 1]

In [33]:
# total count
total_connections = sum(number_of_friends(u) for u in users)

In [34]:
total_connections


Out[34]:
24

In [38]:
# Need the total connections and the number of users to compute the average
num_users = len(users)

avg_connections = total_connections / num_users

In [39]:
avg_connections


Out[39]:
2.4

Find the people with the most friends


In [40]:
num_friends_by_id = [(u['id'], number_of_friends(u)) for u in users]

In [43]:
sorted(num_friends_by_id, key=lambda x: x[1], reverse=True)


Out[43]:
[(1, 3),
 (2, 3),
 (3, 3),
 (5, 3),
 (8, 3),
 (0, 2),
 (4, 2),
 (6, 2),
 (7, 2),
 (9, 1)]
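
To make the ranking easier to read, here is an illustrative mapping of the top ids back to names (this helper is not part of the original code):

In [ ]:
# names of the three most-connected users (ids happen to match list positions)
top_three = sorted(num_friends_by_id, key=lambda pair: pair[1], reverse=True)[:3]
[(users[user_id]['name'], count) for user_id, count in top_three]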

People you may know


In [48]:
# Brute force first try: this will include duplicates and the user themselves
def friends_of_friends_ids_bad(user):
    return [foaf['id']
            for friend in user['friends'] # for all the friends
            for foaf in friend['friends']] # get their friends

In [49]:
friends_of_friends_ids_bad(users[0])


Out[49]:
[0, 2, 3, 0, 1, 3]

In [60]:
# redo this accounting for dupes and self

from collections import Counter

def not_the_same(user, other_user):
    # True if the two users are different people
    return user['id'] != other_user['id']

def not_friends(user, other_user):
    # True if other_user is not already a friend of user
    return all(not_the_same(friend, other_user)
               for friend in user['friends'])

def friends_of_friend_ids(user):
    return Counter(foaf['id']
                   for friend in user['friends'] # for each of the user's friends
                   for foaf in friend['friends'] # collect *their* friends
                   if not_the_same(user, foaf) # who aren't the user
                   and not_friends(user, foaf)) # and aren't already friends

In [62]:
[friends_of_friend_ids(u) for u in users]


Out[62]:
[Counter({3: 2}),
 Counter({4: 1}),
 Counter({4: 1}),
 Counter({0: 2, 5: 1}),
 Counter({1: 1, 2: 1, 6: 1, 7: 1}),
 Counter({8: 2, 3: 1}),
 Counter({7: 2, 9: 1, 4: 1}),
 Counter({6: 2, 9: 1, 4: 1}),
 Counter({5: 2}),
 Counter({6: 1, 7: 1})]
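
Since these are Counters, most_common already gives a ranked suggestion list; for example, for Chi (id 3):

In [ ]:
# ranked "people you may know" for Chi, ordered by number of mutual friends
friends_of_friend_ids(users[3]).most_common()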

Let's try recommending via interests


In [63]:
interests = [
        (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
        (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
        (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
        (1, "Postgres"), (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
        (2, "numpy"), (2, "statsmodels"), (2, "pandas"), (3, "R"), (3, "Python"),
        (3, "statistics"), (3, "regression"), (3, "probability"),
        (4, "machine learning"), (4, "regression"), (4, "decision trees"),
        (4, "libsvm"), (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
        (5, "Haskell"), (5, "programming languages"), (6, "statistics"),
        (6, "probability"), (6, "mathematics"), (6, "theory"),
        (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"),
        (7, "neural networks"), (8, "neural networks"), (8, "deep learning"),
        (8, "Big Data"), (8, "artificial intelligence"), (9, "Hadoop"),
        (9, "Java"), (9, "MapReduce"), (9, "Big Data")
]

In [66]:
# Find people who like a given interest
def users_who_like(target_interest):
    return [user_id
           for user_id, user_interest in interests
           if user_interest == target_interest]

# Slow since it searches all interests

In [69]:
[users_who_like(i) for _,i in interests]


Out[69]:
[[0, 9],
 [0, 8, 9],
 [0, 1],
 [0, 5, 9],
 [0],
 [0],
 [0, 1],
 [1],
 [1],
 [0, 1],
 [0, 1],
 [1],
 [2, 3, 5],
 [2, 7],
 [2],
 [2],
 [2],
 [2],
 [3, 5],
 [2, 3, 5],
 [3, 6],
 [3, 4],
 [3, 6],
 [4, 7],
 [3, 4],
 [4],
 [4],
 [2, 3, 5],
 [3, 5],
 [0, 5, 9],
 [5],
 [5],
 [5],
 [3, 6],
 [3, 6],
 [6],
 [6],
 [4, 7],
 [2, 7],
 [7],
 [7, 8],
 [7, 8],
 [8],
 [0, 8, 9],
 [8],
 [0, 9],
 [0, 5, 9],
 [9],
 [0, 8, 9]]

In [72]:
# Instead let's index users by their interests and simply retrieve
from collections import defaultdict

user_ids_by_interest = defaultdict(list)
# and the reverse mapping: interests keyed by user id
interests_by_user_id = defaultdict(list)

# For each (user, interest) pair, populate both mappings
for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)
    interests_by_user_id[user_id].append(interest)

In [75]:
user_ids_by_interest.items()


Out[75]:
[('Java', [0, 5, 9]),
 ('neural networks', [7, 8]),
 ('NoSQL', [1]),
 ('Hadoop', [0, 9]),
 ('Mahout', [7]),
 ('Storm', [0]),
 ('regression', [3, 4]),
 ('statistics', [3, 6]),
 ('probability', [3, 6]),
 ('programming languages', [5]),
 ('Python', [2, 3, 5]),
 ('deep learning', [8]),
 ('Haskell', [5]),
 ('mathematics', [6]),
 ('Spark', [0]),
 ('numpy', [2]),
 ('pandas', [2]),
 ('artificial intelligence', [8]),
 ('theory', [6]),
 ('libsvm', [4]),
 ('C++', [5]),
 ('R', [3, 5]),
 ('HBase', [0, 1]),
 ('Postgres', [1]),
 ('decision trees', [4]),
 ('Big Data', [0, 8, 9]),
 ('MongoDB', [1]),
 ('scikit-learn', [2, 7]),
 ('MapReduce', [9]),
 ('machine learning', [4, 7]),
 ('scipy', [2]),
 ('statsmodels', [2]),
 ('Cassandra', [0, 1])]
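
With the indexes built, lookups become direct dictionary accesses instead of scans over all pairs; for example (added for illustration):

In [ ]:
# who likes Python, and what Hero (id 0) is interested in
user_ids_by_interest["Python"], interests_by_user_id[0]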

In [80]:
# now make the recommender
def most_common_interests_with(user):
    return Counter(interested_user_id
                  for interest in interests_by_user_id[user['id']] # for all of given user's interests
                  for interested_user_id in user_ids_by_interest[interest] # find everyone who shares that interest
                  if interested_user_id != user['id']) # as long as they are not that user

In [82]:
[(u['id'], most_common_interests_with(u)) for u in users]


Out[82]:
[(0, Counter({9: 3, 1: 2, 8: 1, 5: 1})),
 (1, Counter({0: 2})),
 (2, Counter({3: 1, 5: 1, 7: 1})),
 (3, Counter({5: 2, 6: 2, 2: 1, 4: 1})),
 (4, Counter({3: 1, 7: 1})),
 (5, Counter({3: 2, 0: 1, 9: 1, 2: 1})),
 (6, Counter({3: 2})),
 (7, Counter({8: 1, 2: 1, 4: 1})),
 (8, Counter({0: 1, 9: 1, 7: 1})),
 (9, Counter({0: 3, 8: 1, 5: 1}))]
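
Again, most_common turns each Counter into a ranked recommendation; for example, the three users sharing the most interests with Hero (id 0):

In [ ]:
# top matches for user 0 by number of shared interests
most_common_interests_with(users[0]).most_common(3)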

Salaries


In [83]:
salaries_and_tenures = [(83000, 8.7), (88000, 8.1),
                            (48000, 0.7), (76000, 6),
                            (69000, 6.5), (76000, 7.5),
                            (60000, 2.5), (83000, 10),
                            (48000, 1.9), (63000, 4.2)]

In [87]:
# Find avg salary by tenure
salary_by_tenure = defaultdict(list)

# Bucket tenure
def tenure_bucket(tenure):
    if tenure < 2:
        return "less than two"
    elif tenure < 5:
        return "between two and five"
    else:
        return "more than five"

for salary, tenure in salaries_and_tenures:
    bucket = tenure_bucket(tenure)
    salary_by_tenure[bucket].append(salary)
    
average_salary_by_tenure = {
    bucket : sum(salaries) / len(salaries)
    for bucket, salaries in salary_by_tenure.items()
}

In [88]:
average_salary_by_tenure


Out[88]:
{'between two and five': 61500.0,
 'less than two': 48000.0,
 'more than five': 79166.66666666667}
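
One obvious (if crude) use of these buckets is as a lookup-table predictor; the helper below is a sketch added here, not part of the original code.

In [ ]:
# predict a salary as the average salary of the matching tenure bucket
def predict_salary(tenure):
    return average_salary_by_tenure[tenure_bucket(tenure)]

predict_salary(3.0)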

Topics of interest


In [89]:
interests = [
    (0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
    (0, "Spark"), (0, "Storm"), (0, "Cassandra"),
    (1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
    (1, "Postgres"), (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
    (2, "numpy"), (2, "statsmodels"), (2, "pandas"), (3, "R"), (3, "Python"),
    (3, "statistics"), (3, "regression"), (3, "probability"),
    (4, "machine learning"), (4, "regression"), (4, "decision trees"),
    (4, "libsvm"), (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
    (5, "Haskell"), (5, "programming languages"), (6, "statistics"),
    (6, "probability"), (6, "mathematics"), (6, "theory"),
    (7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"),
    (7, "neural networks"), (8, "neural networks"), (8, "deep learning"),
    (8, "Big Data"), (8, "artificial intelligence"), (9, "Hadoop"),
    (9, "Java"), (9, "MapReduce"), (9, "Big Data")
]

In [90]:
words_and_counts = Counter(word
                          for user, interest in interests
                          for word in interest.lower().split())

In [92]:
for word, count in words_and_counts.most_common():
    if count > 1:
        print word, count


learning 3
java 3
python 3
big 3
data 3
hbase 2
regression 2
cassandra 2
statistics 2
probability 2
hadoop 2
networks 2
machine 2
neural 2
scikit-learn 2
r 2
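
If we only want the most frequent words, most_common accepts a limit:

In [ ]:
# just the five most common words across all interests
words_and_counts.most_common(5)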

In [ ]: