In [35]:
from __future__ import division  # make / return a float even for int operands under Python 2
In [22]:
users = [
    { "id": 0, "name": "Hero" },
    { "id": 1, "name": "Dunn" },
    { "id": 2, "name": "Sue" },
    { "id": 3, "name": "Chi" },
    { "id": 4, "name": "Thor" },
    { "id": 5, "name": "Clive" },
    { "id": 6, "name": "Hicks" },
    { "id": 7, "name": "Devin" },
    { "id": 8, "name": "Kate" },
    { "id": 9, "name": "Klein" }
]
In [23]:
friendships = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (3, 4), (4, 5), (5, 6), (5, 7), (6, 8), (7, 8), (8, 9)]
In [24]:
# Add a list of friends to each user for reference
for user in users:
    user['friends'] = []

# Populate with friends
for i, j in friendships:
    users[i]['friends'].append(users[j])  # adding j as friend to i
    users[j]['friends'].append(users[i])  # reciprocate
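As a quick sanity check, one way to confirm the friendship graph came out symmetric is to print each user's friend ids:
In [ ]:
# each line: a user's id followed by the ids of that user's friends
for user in users:
    print user['id'], [friend['id'] for friend in user['friends']]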
In [28]:
# How many friends?
def number_of_friends(user):
    return len(user['friends'])
In [32]:
# each user's friend count
[number_of_friends(u) for u in users]
Out[32]:
In [33]:
# total count
total_connections = sum(number_of_friends(u) for u in users)
In [34]:
total_connections
Out[34]:
In [38]:
# need the number of users to compute the average
num_users = len(users)
avg_connections = total_connections / num_users
In [39]:
avg_connections
Out[39]:
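Each of the 12 friendship pairs adds one connection for both endpoints, so total_connections is 24 and the average over 10 users is 24 / 10 = 2.4 (the __future__ division import keeps this from truncating to 2 under Python 2).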
In [40]:
num_friends_by_id = [(u['id'], number_of_friends(u)) for u in users]
In [43]:
sorted(num_friends_by_id, key=lambda x: x[1], reverse=True)
Out[43]:
In [48]:
# Brute force try
def friends_of_friends_ids_bad(user):
    return [foaf['id']
            for friend in user['friends']   # for all the friends
            for foaf in friend['friends']]  # get their friends
In [49]:
friends_of_friends_ids_bad(users[0])
Out[49]:
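For users[0] this comes back as [0, 2, 3, 0, 1, 3]: user 0 appears via both of their own friends, user 3 is double-counted, and users 1 and 2 are already direct friends, which is what the rework below fixes.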
In [60]:
# redo this, accounting for dupes, the user themselves, and existing friends
from collections import Counter

def not_the_same(user, other_user):
    # True when the two users are different people
    return user['id'] != other_user['id']

def not_friends(user, other_user):
    # True when other_user is not in user's friends list
    return all(not_the_same(friend, other_user)
               for friend in user['friends'])

def friends_of_friend_ids(user):
    return Counter(foaf['id']
                   for friend in user['friends']   # for each of the user's friends
                   for foaf in friend['friends']   # count their friends
                   if not_the_same(user, foaf)     # as long as they aren't the user
                   and not_friends(user, foaf))    # and aren't already a friend
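Spot-checking a single user first: for users[3] (Chi) this should come out as Counter({0: 2, 5: 1}), i.e. two mutual friends with Hero and one with Clive.
In [ ]:
friends_of_friend_ids(users[3])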
In [62]:
[friends_of_friend_ids(u) for u in users]
Out[62]:
In [63]:
interests = [
(0, "Hadoop"), (0, "Big Data"), (0, "HBase"), (0, "Java"),
(0, "Spark"), (0, "Storm"), (0, "Cassandra"),
(1, "NoSQL"), (1, "MongoDB"), (1, "Cassandra"), (1, "HBase"),
(1, "Postgres"), (2, "Python"), (2, "scikit-learn"), (2, "scipy"),
(2, "numpy"), (2, "statsmodels"), (2, "pandas"), (3, "R"), (3, "Python"),
(3, "statistics"), (3, "regression"), (3, "probability"),
(4, "machine learning"), (4, "regression"), (4, "decision trees"),
(4, "libsvm"), (5, "Python"), (5, "R"), (5, "Java"), (5, "C++"),
(5, "Haskell"), (5, "programming languages"), (6, "statistics"),
(6, "probability"), (6, "mathematics"), (6, "theory"),
(7, "machine learning"), (7, "scikit-learn"), (7, "Mahout"),
(7, "neural networks"), (8, "neural networks"), (8, "deep learning"),
(8, "Big Data"), (8, "artificial intelligence"), (9, "Hadoop"),
(9, "Java"), (9, "MapReduce"), (9, "Big Data")
]
In [66]:
# Find people who like a given interest
def users_who_like(target_interest):
    return [user_id
            for user_id, user_interest in interests
            if user_interest == target_interest]
# Slow since it searches all interests on every call
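For example, asking who likes "Python" should return the ids [2, 3, 5]:
In [ ]:
users_who_like("Python")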
In [69]:
[users_who_like(i) for _,i in interests]
Out[69]:
In [72]:
# Instead, index users by their interests so lookups don't scan the whole list
from collections import defaultdict

# keys are interests, values are lists of user_ids with that interest
user_ids_by_interest = defaultdict(list)

# the reverse: keys are user_ids, values are lists of that user's interests
interests_by_user_id = defaultdict(list)

# populate both dicts from the (user_id, interest) pairs
for user_id, interest in interests:
    user_ids_by_interest[interest].append(user_id)
    interests_by_user_id[user_id].append(interest)
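With the index in place, the same question is just a dictionary lookup instead of a scan; "Python" should again map to [2, 3, 5].
In [ ]:
user_ids_by_interest["Python"]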
In [75]:
user_ids_by_interest.items()
Out[75]:
In [80]:
# now make the recommender
def most_common_interests_with(user):
    return Counter(interested_user_id
                   for interest in interests_by_user_id[user['id']]          # for all of the given user's interests
                   for interested_user_id in user_ids_by_interest[interest]  # find everyone who shares that interest
                   if interested_user_id != user['id'])                      # as long as they are not that user
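For Hero (users[0]) this should come out as Counter({9: 3, 1: 2, 8: 1, 5: 1}): three interests shared with Klein, two with Dunn, and one each with Kate and Clive.
In [ ]:
most_common_interests_with(users[0])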
In [82]:
[(u['id'], most_common_interests_with(u)) for u in users]
Out[82]:
In [83]:
salaries_and_tenures = [(83000, 8.7), (88000, 8.1),
(48000, 0.7), (76000, 6),
(69000, 6.5), (76000, 7.5),
(60000, 2.5), (83000, 10),
(48000, 1.9), (63000, 4.2)]
In [87]:
# Find avg salary by tenure
salary_by_tenure = defaultdict(list)

# Bucket tenure
def tenure_bucket(tenure):
    if tenure < 2:
        return "less than two"
    elif tenure < 5:
        return "between two and five"
    else:
        return "more than five"

for salary, tenure in salaries_and_tenures:
    bucket = tenure_bucket(tenure)
    salary_by_tenure[bucket].append(salary)

average_salary_by_tenure = {
    bucket: sum(salaries) / len(salaries)
    for bucket, salaries in salary_by_tenure.items()
}
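A minimal sketch of how these bucket averages could serve as a crude estimator (the helper name predict_salary is made up here): map a tenure to its bucket and return that bucket's average salary.
In [ ]:
# estimate a salary as the average salary of the matching tenure bucket
def predict_salary(tenure):
    return average_salary_by_tenure[tenure_bucket(tenure)]

predict_salary(3.0)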
In [88]:
average_salary_by_tenure
Out[88]:
In [90]:
# count how often each word appears across all the interests
words_and_counts = Counter(word
                           for user, interest in interests
                           for word in interest.lower().split())
In [92]:
for word, count in words_and_counts.most_common():
    if count > 1:
        print word, count
In [ ]: