In [2]:
    
from datetime import datetime
start = datetime.now()
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from itertools import combinations
from collections import defaultdict, Counter
from sklearn.feature_extraction import DictVectorizer
    
In [4]:
    
# globals shared across the cells below
BRANDS_LIST = []
N_BRANDS = 0
N_FOLLOWERS = 0
DV = DictVectorizer(dtype=int)
    
In [5]:
    
def load_data(in_file, brands_to_load=None, max_limit=1404, verbose=False):
    """Loads brand-follower data from the given data file.
    If fewer brands are requested than the file holds, a random subset
    of rows is loaded.
    Args:
      in_file ........... Path of the data file to load.
      brands_to_load .... Number of brands to load; defaults to max_limit.
      max_limit ......... Total number of rows in the data file.
      verbose ........... If True, report each brand as it is loaded or skipped.
    Returns:
      A sparse brand-by-follower matrix built by the global DictVectorizer.
      Brand names are appended to the global BRANDS_LIST."""
    if brands_to_load is None:
        brands_to_load = max_limit 
    
    # Randomize loading: pick a sorted random subset of row indices
    if brands_to_load != max_limit:
        choices = sorted(random.sample(range(max_limit), brands_to_load))
        max_row_id = max(choices)
    else:
        choices = range(max_limit)
        max_row_id = max_limit-1
        
    brand_no = 0
    cur_idx = 0
    brand_followers = list()
    
    with open(in_file) as data_file:
        for row in data_file:
            if brand_no > max_row_id:
                break
            if brand_no != choices[cur_idx]:
                brand_no += 1
                continue
            else:
                followers = row.split()
                brand = followers.pop(0)
                followers_count = len(followers)
            
                # Skip brands with 1000 or fewer followers
                if followers_count > 1000:  
                    
                    followers_dict = Counter(followers)
                    brand_followers.append(followers_dict)
                    BRANDS_LIST.append(brand)
                    
                    if verbose:
                        print('Loaded',brand,'-',followers_count,'followers.')
                else:
                    if verbose:
                        print('Skipped',brand,'-',followers_count,'followers.')
            
                brand_no += 1
                cur_idx += 1
    
    data = DV.fit_transform(brand_followers)
    
    return data
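
For reference, DV.fit_transform turns the list of per-brand Counter dicts into one sparse row per brand, with one column per unique follower ID. A minimal sketch (the follower IDs u1, u2, u3 are made up for illustration):

toy_counters = [Counter(['u1', 'u2']), Counter(['u2', 'u3'])]
toy_dv = DictVectorizer(dtype=int)
toy_mat = toy_dv.fit_transform(toy_counters)
print(toy_dv.feature_names_)   # ['u1', 'u2', 'u3']: column index -> follower ID
print(toy_mat.toarray())       # [[1 1 0]
                               #  [0 1 1]]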
    
In [6]:
    
brands_to_load = 100
max_rows = 1404
verbose = True
BF_DATA = load_data('data/brand_followers_final.tsv', brands_to_load, max_rows, verbose)
N_BRANDS,N_FOLLOWERS = BF_DATA.get_shape()
print('\nNo. of Brands', N_BRANDS)
print('No. of Unique Followers', N_FOLLOWERS)
    
    
In [16]:
    
# summing the brand rows gives, per follower, how many brands they follow
hist, bins = np.histogram(sum(BF_DATA).toarray(), bins=50)
width = 0.9 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2
plt.yscale("log")
plt.bar(center, hist, align='center', width=width)
plt.title('Brands Followed vs. Followers Count')
    
    Out[16]: [bar chart: Brands Followed vs. Followers Count, log-scale y-axis]
    
In [6]:
    
x = sum(BF_DATA).toarray()
# keep only followers who follow at least 5 brands
followers = np.where(x >= 5)[1]
print('Unique Followers Pool', len(followers))
print('\nSample Values:')
print('\tFollower Idx:',followers[0], 'Follower ID:', DV.feature_names_[followers[0]])
print('\tBrands Followed:', len(BF_DATA.getcol(followers[0]).nonzero()[0]))
    
    
In [7]:
    
# sample 0.5% of the qualifying followers as test users
pct = 0.005
follower_ind = random.sample(list(followers), int(len(followers) * pct))
print('Sampled',len(follower_ind),'followers')
print('Indices:',follower_ind)
    
    
In [8]:
    
# hold out one (brand, follower) edge per sampled follower for later evaluation
brands_to_remove = {}
for follower_idx in follower_ind:
    brand_idx = np.random.choice(BF_DATA.getcol(follower_idx).nonzero()[0])
    brands_to_remove[follower_idx] = brand_idx
    BF_DATA[brand_idx, follower_idx] = 0
    print('Removed edge', (brand_idx,follower_idx))
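
Note: item assignment on the CSR matrix returned by DictVectorizer works but emits a SparseEfficiencyWarning. A sketch of a common workaround, collecting the edits first and applying them in LIL format:

# assumes brands_to_remove was filled as above, without the assignments
BF_DATA = BF_DATA.tolil()           # LIL format supports cheap item assignment
for follower_idx, brand_idx in brands_to_remove.items():
    BF_DATA[brand_idx, follower_idx] = 0
BF_DATA = BF_DATA.tocsr()           # back to CSR for fast row/column slicing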
    
    
In [9]:
    
def get_similarity_matrix(data='BF_DATA', similarity='Jaccard'):
    """Builds a symmetric brand-by-brand similarity matrix.
    Only Jaccard similarity over the global BF_DATA is implemented."""
    if similarity != 'Jaccard' or data != 'BF_DATA':
        raise NotImplementedError('Only Jaccard similarity over BF_DATA is supported.')

    sim_mat = np.zeros((N_BRANDS, N_BRANDS))

    for brand_pair in combinations(range(N_BRANDS), 2):

        b1 = BF_DATA.getrow(brand_pair[0])
        b2 = BF_DATA.getrow(brand_pair[1])

        # Jaccard: |intersection| / |union| of the two follower sets
        common_connections = b1.multiply(b2).sum()
        total_connections = b1.sum() + b2.sum() - common_connections
        sim = common_connections / total_connections

        sim_mat[brand_pair[0], brand_pair[1]] = sim_mat[brand_pair[1], brand_pair[0]] = sim

    return sim_mat
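
The pairwise loop makes O(N_BRANDS^2) sparse row fetches. For reference, an equivalent vectorized sketch is shown below; it assumes binary (0/1) entries and, unlike the loop, fills the diagonal with 1 rather than leaving it at 0:

M = (BF_DATA > 0).astype(int)
inter = M.dot(M.T).toarray()                  # pairwise intersection sizes
sizes = np.asarray(M.sum(axis=1)).ravel()     # follower-set size per brand
union = sizes[:, None] + sizes[None, :] - inter
sim_mat_fast = inter / union                  # full Jaccard matrix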
    
In [10]:
    
SIM_MAT = get_similarity_matrix()
    
In [11]:
    
def predict_rank(scores, norm_factor, removed_brand):
    """Returns the 1-based rank of removed_brand when candidates are
    sorted by normalized score, highest first."""
    rank = 0
    # dividing every score by the same positive norm_factor does not change
    # the ordering; it is kept so the sort key matches the normalized score
    for tup in sorted(scores.items(), key=lambda x: x[1]/norm_factor, reverse=True):
        rank += 1
        if tup[0] == removed_brand:
            return rank
    return None
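
For example, with scores {0: 0.3, 1: 0.9, 2: 0.6}, any positive norm_factor, and removed_brand 2, the descending order is brand 1, brand 2, brand 0, so the function returns rank 2.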
    
In [12]:
    
def check_overlap(user_idx, brand_followed, brand_not_followed, brand_removed):
    """Debug helper: flags a removed brand whose follower set does not
    overlap with that of a brand the user still follows."""
    if brand_removed == brand_not_followed:
        if SIM_MAT[brand_followed,brand_not_followed] == 0:
            print('[BR]No Overlap between',BRANDS_LIST[brand_followed], BRANDS_LIST[brand_not_followed])
            return True
        else:
            b1 = BF_DATA.getrow(brand_followed)
            b2 = BF_DATA.getrow(brand_not_followed)
            if b1.multiply(b2).sum() == 0:
                print('[AR]No Overlap between',BRANDS_LIST[brand_followed], BRANDS_LIST[brand_not_followed])
                return True
    return False
    
In [13]:
    
# score every non-followed brand by its summed similarity to the followed
# brands, then record the rank of the held-out brand
prediction_rank = dict()
for follower_idx in follower_ind:
    brands_followed = BF_DATA.getcol(follower_idx).nonzero()[0]
    brands_not_followed = np.delete(np.arange(N_BRANDS),brands_followed)
    scores = {}
    no_overlaps = {}
    for brand_not_followed in brands_not_followed:
        scores[brand_not_followed] = 0
        for brand_followed in brands_followed:
            scores[brand_not_followed] += SIM_MAT[brand_followed,brand_not_followed]
            #if check_overlap(follower_idx, brand_followed, brand_not_followed,brands_to_remove[follower_idx]):
                #print('Yikes!',brand_followed, brand_not_followed)
    prediction_rank[follower_idx] = predict_rank(scores, len(brands_followed), brands_to_remove[follower_idx])
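
For reference, the inner double loop collapses to one NumPy operation per follower; a sketch of the equivalent scoring step:

# summing the similarity rows of the followed brands scores all brands at once
score_vec = SIM_MAT[brands_followed].sum(axis=0)        # shape: (N_BRANDS,)
scores = {b: score_vec[b] for b in brands_not_followed}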
    
In [14]:
    
list(prediction_rank.items())
    
    Out[14]:
In [15]:
    
rank_sum = sum(prediction_rank.values())
print('Mean Rank', rank_sum/len(prediction_rank))
    
    
In [16]:
    
hist, bins = np.histogram(list(prediction_rank.values()),bins=50)    
width = 0.9 * (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2
#plt.yscale("log")
plt.bar(center, hist, align='center', width=width)
plt.title('Rank Distribution')
    
    Out[16]: [bar chart: Rank Distribution]
    
In [17]:
    
# accumulate rank totals, bucketed by the number of brands each follower follows
mean_rank = {}
brands_followed_count = {}
for k in prediction_rank:
    count = len(BF_DATA.getcol(k).nonzero()[0])
    if count in mean_rank:
        mean_rank[count] += prediction_rank[k]
        brands_followed_count[count] += 1
    else:
        mean_rank[count] = prediction_rank[k]
        brands_followed_count[count] = 1
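
Since defaultdict is already imported, the same bucketing can be written without the membership check; an equivalent sketch:

mean_rank = defaultdict(int)
brands_followed_count = defaultdict(int)
for k, rank in prediction_rank.items():
    count = len(BF_DATA.getcol(k).nonzero()[0])
    mean_rank[count] += rank
    brands_followed_count[count] += 1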
    
In [18]:
    
brands_followed_count
    
    Out[18]:
In [19]:
    
for k in mean_rank:
    # floor division keeps the average rank a whole number
    mean_rank[k] //= brands_followed_count[k]
mean_rank
    
    Out[19]:
In [20]:
    
data_points = mean_rank.items()
x = [p[0] for p in data_points]
y = [p[1] for p in data_points]
plt.bar(x,y)
plt.title('Average Rank by number of brands followed')
    
    Out[20]: [bar chart: Average Rank by number of brands followed]
    
In [21]:
    
good_predictions = []
bad_predictions = []
# threshold: half of the largest per-bucket mean rank
max_rank = max(p[1] for p in data_points)
for k in prediction_rank:
    if prediction_rank[k] >= max_rank * 0.5:
        bad_predictions.append(brands_to_remove[k])
    else:
        good_predictions.append(brands_to_remove[k])
# keep only brands that were never predicted well
final_list = list(set(bad_predictions) - set(good_predictions))
final_list.sort()
for b in final_list:
    print(BRANDS_LIST[b])
    
    
In [22]:
    
stop = datetime.now()
print('Time taken',stop-start)
    
    
Time taken 0:02:27.579176