In [1]:

    
import numpy as np
import pandas as pd
import networkx as nx
import json
import sys

Step 1: build the initial state of the entire user network, as well as the purchase history of the users

Input: sample_dataset/batch_log.json



In [2]:

    
# Location of the batch log used to build the initial state.
batchlogfile = 'sample_dataset/batch_log.json'
# lines=True: the file is parsed as one JSON object per line.
df_batch = pd.read_json(batchlogfile, lines=True)



In [3]:

    
# Columns selected for purchase events (used to slice both the batch and the stream log).
index_purchase = ['event_type','id','timestamp','amount']
# Columns selected for befriend/unfriend events.
index_friend = ['event_type','id1','id2','timestamp']



In [4]:

    
#df_batch.head()



In [5]:

    
#df_batch.describe()



In [6]:

    
# Read D and T
# Keep only the rows of the batch log where 'D' is set, restricted to the
# 'D' and 'T' columns; the first such row supplies both parameters.
# D is later used as the ego-graph radius and T as the maximum number of
# purchases considered per network.
df_DT = df_batch.loc[df_batch['D'].notnull(), ['D', 'T']]
D, T = df_DT.values[0]



In [7]:

    
# check D and T values
# Abort early when the parameters are unusable. A bare sys.exit() would end
# the program with status 0 (success); exit with status 1 so that callers
# (shell scripts, CI) can detect the failure. The message is still printed
# to stdout as before.
if D < 1:
    print('Program terminated because of D < 1')
    sys.exit(1)
if T < 2:
    print('Program terminated because of T < 2')
    sys.exit(1)



In [8]:

    
#for possible_value in set(df['event_type'].tolist()):
#    print(possible_value)



In [9]:

    
# Purchase history: purchase events only, restricted to the purchase columns,
# with any row missing one of those columns discarded.
df_purchase = (
    df_batch
    .loc[df_batch['event_type'] == 'purchase', index_purchase]
    .dropna(how='any')
)
# Rows keep the order of the batch log. If sorting on the timestamp is
# needed, uncomment the following line.
# df_purchase = df_purchase.sort_values('timestamp')



In [10]:

    
# Friendship history: befriend/unfriend events only, restricted to the
# friendship columns, with any row missing one of those columns discarded.
df_friend = (
    df_batch
    .loc[df_batch['event_type'].isin(['befriend', 'unfriend']), index_friend]
    .dropna(how='any')
)
# Rows keep the order of the batch log. If sorting on the timestamp is
# needed, uncomment the following line.
# df_friend = df_friend.sort_values('timestamp')



In [11]:

    
# Undirected graph holding the social network: nodes are user ids, edges are
# friendships. It is mutated in place by Add_edges and by the stream loop.
G = nx.Graph()



In [12]:

    
# Seed the graph with one node per distinct purchaser found in the batch log.
# Users that only appear in friendship events get their node when the
# corresponding edge is added.
idlist = set(df_purchase['id'].tolist())
G.add_nodes_from(idlist)



In [13]:

    
def Add_edges(data):
    """Replay friendship events onto the global graph ``G``.

    ``data`` is iterated row by row, in order. Each row must expose the
    attributes ``id1``, ``id2`` and ``event_type``.

    * ``'befriend'`` adds the edge (id1, id2).
    * ``'unfriend'`` removes the edge (id1, id2) if it currently exists;
      otherwise the event is ignored.
    * Any other event type is ignored.
    """
    for event in data.itertuples():
        pair = (event.id1, event.id2)
        kind = event.event_type
        if kind == 'befriend':
            G.add_edge(*pair)
        elif kind == 'unfriend' and G.has_edge(*pair):
            G.remove_edge(*pair)



In [14]:

    
# Build the initial friendship edges from the batch-log befriend/unfriend events.
Add_edges(df_friend)



In [15]:

    
#len(list(G.edges()))



In [16]:

    
#G[10.0]



In [17]:

    
#G.number_of_nodes()



In [18]:

    
#G.number_of_edges()



In [19]:

    
# define a function to calculate the mean and sd for userid's network
def Get_Mean_SD(userid):
    """Return ``(mean, sd)`` of the latest purchases in ``userid``'s network.

    The network is every node within ``D`` hops of ``userid`` in the global
    graph ``G``, excluding ``userid`` itself (``center=False``). Purchases by
    those nodes are looked up in the global ``df_purchase``; when more than
    ``T`` are found, only the ``T`` with the latest timestamps are used.

    Parameters
    ----------
    userid : hashable
        Node label in ``G``; also matched against ``df_purchase['id']``.

    Returns
    -------
    tuple of (float, float)
        Mean and population standard deviation (``ddof=0``) of the selected
        amounts, each rounded to two decimals. Both are ``np.nan`` when
        ``userid`` is not in ``G`` or the network has fewer than two purchases.
    """
    # A user that is not in the graph has no network; ego_graph would raise.
    if userid not in G:
        return np.nan, np.nan

    Nodes = list(nx.ego_graph(G, userid, D, center=False))
    df_Nodes = df_purchase.loc[df_purchase['id'].isin(Nodes)]
    if len(df_Nodes) < 2:
        return np.nan, np.nan

    if len(df_Nodes) > T:
        # A stable sort keeps purchases that share a timestamp in their
        # original (log) order, so the last T rows really are the T most
        # recent ones. The default sort algorithm gives no such guarantee.
        df_Nodes = df_Nodes.sort_values('timestamp', kind='mergesort').iloc[-int(T):]

    amounts = df_Nodes['amount']
    # ddof=0: population standard deviation (pandas defaults to ddof=1).
    mean = float("{0:.2f}".format(amounts.mean()))
    sd = float("{0:.2f}".format(amounts.std(ddof=0)))
    return mean, sd



In [20]:

    
#Get_Mean_SD(0.0)



In [21]:

    
#df_purchase.head()



In [22]:

    
#df_purchase.tail()



In [23]:

    
#df_purchase.shape

Step2: Determine whether a purchase is anomalous

input file: sample_dataset/stream_log.json



In [24]:

    
# read in the stream_log.json (one JSON object per line)
streamlogfile = 'sample_dataset/stream_log.json'
df_stream = pd.read_json(streamlogfile, lines=True)
# Rows keep the order of the stream log. If sorting on the timestamp is
# needed, uncomment the following line.
#df_stream = df_stream.sort_values('timestamp')

# open output file flagged_purchases.json
# NOTE: the handle stays open across cells; it is written to by the stream
# loop and closed in a later cell, so an error in between leaves it open.
flaggedfile = 'log_output/flagged_purchases.json'
f = open(flaggedfile, 'w')



In [25]:

    
# Determine whether a purchase is anomalous; update purchase history; update social network
def _append_row(frame, row):
    """Return ``frame`` with the Series ``row`` added as one extra row.

    Replacement for ``DataFrame.append``, which was removed in pandas 2.0.
    The row keeps its Series name as index label and its dtypes are
    re-inferred so the columns do not degrade to ``object``.
    """
    return pd.concat([frame, row.to_frame().T.infer_objects()])


for i in range(len(df_stream)):
    datai = df_stream.iloc[i]
    event_type = datai['event_type']

    if event_type == 'purchase' and not datai[index_purchase].isnull().any():
        # update purchase history
        df_purchase = _append_row(df_purchase, datai[index_purchase])
        timestamp = str(datai['timestamp'])
        userid = datai['id']
        # A first-time buyer may not be part of the graph yet.
        if not G.has_node(userid):
            G.add_node(userid)
        amount = datai['amount']
        mean, sd = Get_Mean_SD(userid)
        # NaN never compares equal (or unequal) in a useful way, so the
        # "no statistics available" case must be tested with np.isnan.
        if not np.isnan(mean):
            mean_3sd = mean + (3*sd)
            if amount > mean_3sd:
                f.write('{{"event_type":"{0:s}", "timestamp":"{1:s}", "id": "{2:.0f}", "amount": "{3:.2f}", "mean": "{4:.2f}", "sd": "{5:.2f}"}}\n'.format(event_type, timestamp, userid, amount, mean, sd))

    # update social network
    elif event_type == 'befriend' and not datai[index_friend].isnull().any():
        df_friend = _append_row(df_friend, datai[index_friend])
        G.add_edge(datai['id1'], datai['id2'])

    elif event_type == 'unfriend' and not datai[index_friend].isnull().any():
        df_friend = _append_row(df_friend, datai[index_friend])
        id1 = datai['id1']
        id2 = datai['id2']
        # Unfriending a pair that is not connected is silently ignored.
        if G.has_edge(id1, id2):
            G.remove_edge(id1, id2)



In [26]:

    
# Flush and close the flagged-purchases output file opened before the stream loop.
f.close()



In [ ]: