In [2]:
from datetime import datetime
start = datetime.now()
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from itertools import combinations
from collections import defaultdict, Counter
from sklearn.feature_extraction import DictVectorizer
In [4]:
BRANDS_LIST = list()
N_BRANDS = int()
N_FOLLOWERS = int()
DV = DictVectorizer(dtype=int)
In [5]:
def load_data(in_file, brands_to_load=None, max_limit=1404, verbose=False):
    """Loads data from the given data file.
    If the number of brands to load is less than the number of rows in the file,
    the rows to load are chosen at random.
    Args:
        in_file .......... Path of the data file to be loaded.
        brands_to_load ... Number of brands to load (defaults to max_limit).
        max_limit ........ Total number of rows in the data file.
        verbose .......... If True, print a line for every brand loaded or skipped.
    Returns:
        A sparse brand-by-follower matrix built with the global DictVectorizer DV.
        Brand names are appended to the global BRANDS_LIST as a side effect."""
    if brands_to_load is None:
        brands_to_load = max_limit
    # Randomize loading
    if brands_to_load != max_limit:
        choices = sorted(random.sample(range(max_limit), brands_to_load))
        max_row_id = max(choices)
    else:
        choices = range(max_limit)
        max_row_id = max_limit - 1
    brand_no = 0
    cur_idx = 0
    brand_followers = list()
    with open(in_file) as data_file:
        for row in data_file:
            if brand_no > max_row_id:
                break
            if brand_no != choices[cur_idx]:
                brand_no += 1
                continue
            else:
                followers = row.split()
                brand = followers.pop(0)
                followers_count = len(followers)
                # Ignore brands with 1000 or fewer followers
                if followers_count > 1000:
                    followers_dict = Counter(followers)
                    brand_followers.append(followers_dict)
                    BRANDS_LIST.append(brand)
                    if verbose:
                        print('Loaded', brand, '-', followers_count, 'followers.')
                else:
                    if verbose:
                        print('Skipped', brand, '-', followers_count, 'followers.')
                brand_no += 1
                cur_idx += 1
    data = DV.fit_transform(brand_followers)
    return data
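As a minimal sketch of the row format the loader assumes (the handle and IDs below are made up), this reproduces the parsing done inside load_data: each line is split on whitespace, the first token is the brand handle, and the remaining tokens are follower IDs counted into a Counter.
In [ ]:
# Hypothetical sample row, parsed the same way load_data() parses each line:
sample_row = 'nike 100234 877120 554907 100234'
tokens = sample_row.split()
sample_brand = tokens.pop(0)         # first token is the brand handle
sample_followers = Counter(tokens)   # remaining tokens are follower IDs (counted)
print(sample_brand, dict(sample_followers))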
In [6]:
brands_to_load = 100
max_rows = 1404
verbose = True
BF_DATA = load_data('data/brand_followers_final.tsv', brands_to_load, max_rows, verbose)
N_BRANDS,N_FOLLOWERS = BF_DATA.get_shape()
print('\nNo. of Brands', N_BRANDS)
print('No. of Unique Followers', N_FOLLOWERS)
In [16]:
hist, bins = np.histogram(sum(BF_DATA).toarray(),bins=50)
width = 0.9 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2
plt.yscale("log")
plt.bar(center, hist, align='center', width=width)
plt.title('Brands Followed Vs Followers Count')
Out[16]:
In [6]:
# Candidate pool: followers who follow at least 5 of the loaded brands
x = sum(BF_DATA).toarray()
followers = np.where(x >= 5)[1]
print('Unique Followers Pool', len(followers))
print('\nSample Values:')
print('\tFollower Idx:', followers[0], 'Follower ID:', DV.feature_names_[followers[0]])
print('\tBrands Followed:', len(BF_DATA.getcol(followers[0]).nonzero()[0]))
In [7]:
pct = 0.005
follower_ind = random.sample(list(followers), int(len(followers) * pct))
print('Sampled',len(follower_ind),'followers')
print('Indices:',follower_ind)
In [8]:
brands_to_remove = {}
for follower_idx in follower_ind:
    # Hold out one randomly chosen followed brand per sampled follower
    brand_idx = np.random.choice(BF_DATA.getcol(follower_idx).nonzero()[0])
    brands_to_remove[follower_idx] = brand_idx
    BF_DATA[brand_idx, follower_idx] = 0
    print('Removed edge', (brand_idx, follower_idx))
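Each sampled follower now has exactly one followed brand hidden from the matrix. An optional check, using only the objects defined above, confirms the held-out edges are really gone:
In [ ]:
# Optional sanity check: none of the held-out (brand, follower) edges should
# remain among the follower's nonzero brand rows.
for f_idx, b_idx in brands_to_remove.items():
    assert b_idx not in BF_DATA.getcol(f_idx).nonzero()[0]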
In [9]:
def get_similarity_matrix(data='BF_DATA', similarity='Jaccard'):
    if similarity == 'Jaccard' and data == 'BF_DATA':
        sim_mat = np.zeros((N_BRANDS, N_BRANDS))
        for brand_pair in combinations(range(N_BRANDS), 2):
            b1 = BF_DATA.getrow(brand_pair[0])
            b2 = BF_DATA.getrow(brand_pair[1])
            # Jaccard: shared followers / distinct followers of the pair combined
            common_connections = b1.multiply(b2).sum()
            total_connections = b1.sum() + b2.sum() - common_connections
            sim = common_connections / total_connections
            sim_mat[brand_pair[0], brand_pair[1]] = sim_mat[brand_pair[1], brand_pair[0]] = sim
        return sim_mat
In [10]:
SIM_MAT = get_similarity_matrix()
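The pairwise loop above performs sparse row operations for every one of the N_BRANDS-choose-2 pairs. For reference, a vectorized sketch (assuming the matrix entries are 0/1; binarize first if the Counter values can exceed 1) produces the same Jaccard matrix from a single sparse product:
In [ ]:
# Hypothetical vectorized alternative to the pairwise loop (not used below).
# Assumes a binary scipy CSR matrix; intersections come from one sparse product.
def jaccard_matrix(binary_csr):
    inter = (binary_csr @ binary_csr.T).toarray().astype(float)   # |F(i) & F(j)|
    sizes = np.asarray(binary_csr.sum(axis=1)).ravel()            # |F(i)|
    union = sizes[:, None] + sizes[None, :] - inter               # |F(i) or F(j)|
    sim = np.divide(inter, union, out=np.zeros_like(inter), where=union > 0)
    np.fill_diagonal(sim, 0)                                      # match the loop: diagonal stays 0
    return sim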
In [11]:
def predict_rank(scores, norm_factor, removed_brand):
    # Sort candidates by normalized score (descending) and return the
    # 1-based rank of the brand whose edge was removed.
    rank = 0
    for tup in sorted(scores.items(), key=lambda x: x[1] / norm_factor, reverse=True):
        rank += 1
        if tup[0] == removed_brand:
            return rank
    return None
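A quick sanity check with made-up scores (the brand indices and values below are hypothetical) shows how the rank is read off: the held-out brand has the second-highest normalized score, so its rank is 2.
In [ ]:
# Toy example with hypothetical scores: brand 7 scores 2nd-highest, so rank 2.
predict_rank({3: 0.9, 7: 0.6, 11: 0.1}, norm_factor=2, removed_brand=7)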
In [12]:
def check_overlap(user_idx, brand_followed, brand_not_followed, brand_removed):
    if brand_removed == brand_not_followed:
        if SIM_MAT[brand_followed, brand_not_followed] == 0:
            print('[BR]No Overlap between', BRANDS_LIST[brand_followed], BRANDS_LIST[brand_not_followed])
            return True
    else:
        b1 = BF_DATA.getrow(brand_followed)
        b2 = BF_DATA.getrow(brand_not_followed)
        if b1.multiply(b2).sum() == 0:
            print('[AR]No Overlap between', BRANDS_LIST[brand_followed], BRANDS_LIST[brand_not_followed])
            return True
    return False
In [13]:
prediction_rank = dict()
# For every sampled follower: score each unfollowed brand by its total similarity
# to the brands the follower still follows, then record the rank of the held-out brand.
for follower_idx in follower_ind:
    brands_followed = BF_DATA.getcol(follower_idx).nonzero()[0]
    brands_not_followed = np.delete(np.arange(N_BRANDS), brands_followed)
    scores = {}
    no_overlaps = {}
    for brand_not_followed in brands_not_followed:
        scores[brand_not_followed] = 0
        for brand_followed in brands_followed:
            scores[brand_not_followed] += SIM_MAT[brand_followed, brand_not_followed]
            #if check_overlap(follower_idx, brand_followed, brand_not_followed, brands_to_remove[follower_idx]):
                #print('Yikes!', brand_followed, brand_not_followed)
    prediction_rank[follower_idx] = predict_rank(scores, len(brands_followed), brands_to_remove[follower_idx])
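The nested loops implement a simple item-based score: each unfollowed brand is scored by the sum of its Jaccard similarities to the follower's brands, normalized by the number of brands followed (the norm_factor passed to predict_rank). As a sketch, the same scores for one follower can be read straight out of SIM_MAT with NumPy indexing (hypothetical helper, equivalent to the loop above):
In [ ]:
# Hypothetical vectorized form of the scoring step for a single follower.
def score_candidates(brands_followed, brands_not_followed):
    col_scores = SIM_MAT[brands_followed, :].sum(axis=0)   # summed similarity to every brand
    return {b: col_scores[b] for b in brands_not_followed}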
In [14]:
list(prediction_rank.items())
Out[14]:
In [15]:
rank_sum = sum(prediction_rank.values())
print('Mean Rank', rank_sum / len(prediction_rank))
In [16]:
hist, bins = np.histogram(list(prediction_rank.values()),bins=50)
width = 0.9 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2
#plt.yscale("log")
plt.bar(center, hist, align='center', width=width)
plt.title('Rank Distribution')
Out[16]:
In [17]:
mean_rank = {}
brands_followed_count = {}
for k in prediction_rank:
    # Accumulate rank totals grouped by how many brands the follower follows
    count = len(BF_DATA.getcol(k).nonzero()[0])
    if count in mean_rank:
        mean_rank[count] += prediction_rank[k]
        brands_followed_count[count] += 1
    else:
        mean_rank[count] = prediction_rank[k]
        brands_followed_count[count] = 1
In [18]:
brands_followed_count
Out[18]:
In [19]:
for k in mean_rank:
    # Convert the rank totals into (integer) averages per group
    mean_rank[k] //= brands_followed_count[k]
mean_rank
Out[19]:
In [20]:
data_points = list(mean_rank.items())
x = [p[0] for p in data_points]
y = [p[1] for p in data_points]
plt.bar(x, y)
plt.title('Average Rank by Number of Brands Followed')
Out[20]:
In [21]:
good_predictions = []
bad_predictions = []
max_rank = max(p[1] for p in data_points)
for k in prediction_rank:
    # A held-out brand ranked at or above half the worst group-average rank counts as a bad prediction
    if prediction_rank[k] >= max_rank * 0.5:
        bad_predictions.append(brands_to_remove[k])
    else:
        good_predictions.append(brands_to_remove[k])
final_list = list(set(bad_predictions) - set(good_predictions))
final_list.sort()
for b in final_list:
    print(BRANDS_LIST[b])
In [22]:
stop = datetime.now()
print('Time taken',stop-start)
Time taken 0:02:27.579176
In [ ]: