In [ ]:
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from itertools import combinations
In [67]:
# Build two lookup tables from the brand/follower file.
# Each row is whitespace-separated: the first field is the brand, the
# remaining fields are integer follower ids.
unique_followers = {}   # follower id -> list of brands that follower follows
brand_followers = {}    # brand -> list of follower ids
limit = 100             # stop after this many rows have been processed
min_followers = 1000    # a brand is kept only if it has MORE than this many followers
row_id = 0
with open('data/brand_followers_final.tsv') as data_file:
    for row in data_file:
        fields = row.split()
        if not fields:
            # Blank line: there is no brand field to read, so skip it instead
            # of failing on an empty list.
            continue
        brand, followers = fields[0], fields[1:]
        if len(followers) > min_followers:
            # Register the brand only when it passes the filter, so that
            # filtered-out brands do not linger as empty entries and do not
            # inflate the brand count printed below.
            follower_ids = [int(x) for x in followers]
            brand_followers[brand] = follower_ids
            for f in follower_ids:
                unique_followers.setdefault(f, []).append(brand)
        # Every non-blank row counts towards the limit, kept or not.
        row_id += 1
        if row_id == limit:
            break
print('No. of Unique Followers', len(unique_followers))
print('No. of Brands', len(brand_followers))
In [68]:
# Count how many brands each follower follows, then bucket the counts into 50 bins.
brands_per_follower = np.asarray([len(followed) for followed in unique_followers.values()])
hist, bins = np.histogram(brands_per_follower, bins=50)
# Bars sit on the bin midpoints and are 90% as wide as the first bin.
center = (bins[1:] + bins[:-1]) / 2
width = (bins[1] - bins[0]) * 0.9
plt.yscale("log")  # log-scaled y axis
plt.bar(center, hist, width=width, align='center')
plt.title('Followers Count Vs Brands Followed')
Out[68]:
In [76]:
# Draw a random rank. The draw is bounded by the number of brands (not the
# number of followers) -- presumably to land on one of the most active followers.
n_brands = len(brand_followers)
choice = random.choice(range(n_brands))
#choice = 100
# Order followers from most to fewest brands followed and take the one at that rank.
ranked_followers = sorted(unique_followers.items(), key=lambda kv: len(kv[1]), reverse=True)
follower_id = ranked_followers[choice][0]
n_followed = len(unique_followers[follower_id])
print('Chosen follower %d who follows %d brands' % (follower_id, n_followed))
In [77]:
# Display the list of brands followed by the chosen follower.
unique_followers[follower_id]
Out[77]:
In [78]:
# Score every brand the chosen follower does NOT follow by summing its Jaccard
# similarity (|A & B| / |A | B| over follower sets) with each brand the
# follower DOES follow, then show the 20 best-scoring candidates.
scores = {}
brands = unique_followers[follower_id]
followed = set(brands)
# Follower sets of the followed brands, built once and kept in dict order so
# the floating-point sums below are reproducible.
followed_sets = [set(brand_followers[b]) for b in brand_followers if b in followed]
for candidate, candidate_followers in brand_followers.items():
    if candidate in followed:
        continue
    neighbors2 = set(candidate_followers)
    total = 0.
    # Every (followed, candidate) pair is scored explicitly, and the
    # similarity is recomputed for each pair. Relying on combinations() would
    # only pair a followed brand with candidates that come later in the dict.
    for neighbors in followed_sets:
        union_size = len(neighbors | neighbors2)
        if union_size:  # two empty follower sets have no defined similarity
            total += 1. * len(neighbors & neighbors2) / union_size
    scores[candidate] = total
# Ranking by the summed similarity is equivalent to ranking by the mean,
# because every candidate is divided by the same number of followed brands.
sorted(scores.items(), key=lambda x: x[1], reverse=True)[:20]
Out[78]:
In [79]:
# Hold out a random 75% (rounded down) of the chosen follower's brands.
pct = 0.75
all_followed = unique_followers[follower_id]
n_holdout = int(len(all_followed) * pct)
to_remove = random.sample(all_followed, n_holdout)
In [80]:
# Report how many brands were held out, followed by the brands themselves.
n_removed = len(to_remove)
print('Removed %d brands' % n_removed, to_remove)
In [81]:
# Brands the follower is still treated as following once the held-out ones are dropped.
brands_followed = set(unique_followers[follower_id]).difference(to_remove)
In [82]:
# Hold-out evaluation: using only the brands the follower still follows, score
# every other brand by summed Jaccard similarity of follower sets, predict the
# top len(to_remove) brands, and count how many held-out brands were recovered.
scores = {}
# Follower sets of the retained brands. The target follower is removed from
# every set so that their own membership does not influence the similarity.
retained_sets = [set(brand_followers[b]) - {follower_id}
                 for b in brand_followers if b in brands_followed]
for candidate, candidate_followers in brand_followers.items():
    if candidate in brands_followed:
        continue
    neighbors2 = set(candidate_followers) - {follower_id}
    total = 0.
    # Every (retained, candidate) pair is scored explicitly, and the
    # similarity is recomputed for each pair. Relying on combinations() would
    # only pair a retained brand with candidates that come later in the dict.
    for neighbors in retained_sets:
        union_size = len(neighbors | neighbors2)
        if union_size:  # two empty follower sets have no defined similarity
            total += 1. * len(neighbors & neighbors2) / union_size
    scores[candidate] = total
# Ranking by the summed similarity is equivalent to ranking by the mean,
# because every candidate is divided by the same number of retained brands.
prediction = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:len(to_remove)]
predicted_brands = [x[0] for x in prediction]
print('Predicted:\n', predicted_brands)
print('Actual:\n', to_remove)
# A hit is a predicted brand that was actually held out.
n_correct = len(set(to_remove) & set(predicted_brands))
# Nothing held out (e.g. the follower follows a single brand): report 0
# instead of dividing by zero.
accuracy = 1. * n_correct / len(to_remove) if to_remove else 0.
print('Jaccard finds %d/%d for accuracy of %.3f' % (n_correct, len(to_remove),
                                                    accuracy))
In [ ]: