In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import json
import sys

Step 1: Build the initial state of the entire user network, as well as the purchase history of the users

Input: sample_dataset/batch_log.json


In [2]:
# Path (relative to the working directory) of the batch log used to build the initial state.
batchlogfile = 'sample_dataset/batch_log.json'
# lines=True: the file is read as line-delimited JSON, one record per line.
df_batch = pd.read_json(batchlogfile, lines=True)

In [3]:
# Columns kept for purchase events; a purchase row must have all of them non-null to be used.
index_purchase = ['event_type','id','timestamp','amount']
# Columns kept for befriend/unfriend events; id1 and id2 are the two endpoints of the edge.
index_friend = ['event_type','id1','id2','timestamp']

In [4]:
#df_batch.head()

In [5]:
#df_batch.describe()

In [6]:
# Extract the two run parameters from the batch log:
#   D - radius passed to nx.ego_graph when collecting a user's network
#   T - maximum number of most recent network purchases used for the statistics
# They are taken from the first row whose 'D' field is populated.
df_DT = df_batch.loc[df_batch['D'].notnull(), ['D', 'T']]
D, T = df_DT.values[0]

In [7]:
# Validate D and T before any further processing.
# D is used as the ego-network radius and T as the size of the purchase
# window, so D must be at least 1 and T at least 2.
# A non-zero exit status is used so that callers (shell scripts, CI, nbconvert
# runs) can tell that the run was aborted rather than completed successfully.
if D < 1:
    print('Program terminated because of D < 1')
    sys.exit(1)
if T < 2:
    print('Program terminated because of T < 2')
    sys.exit(1)

In [8]:
#for possible_value in set(df['event_type'].tolist()):
#    print(possible_value)

In [9]:
# Purchase history: batch rows whose event_type is 'purchase', restricted to the
# purchase columns, with any row missing one of those fields discarded.
df_purchase = df_batch.loc[df_batch['event_type'] == 'purchase', index_purchase].dropna(how='any')
# Rows stay in file order. If a sort on the timestamp is needed, uncomment the following line.
# df_purchase = df_purchase.sort_values('timestamp')

In [10]:
# Friendship events: batch rows that are either 'befriend' or 'unfriend', restricted
# to the friendship columns, with any row missing one of those fields discarded.
df_friend = df_batch.loc[df_batch['event_type'].isin(['befriend', 'unfriend']), index_friend].dropna(how='any')
# Rows stay in file order. If a sort on the timestamp is needed, uncomment the following line.
#df_friend = df_friend.sort_values('timestamp')

In [11]:
# Undirected graph of users; an edge between two ids means they are currently friends.
G = nx.Graph()

In [12]:
# Register every user id that appears in the batch purchase history as a node,
# so users who have purchases but no friendship events are still in the graph.
idlist = set(df_purchase['id'].tolist())
G.add_nodes_from(idlist)

In [13]:
def Add_edges(data):
    """Replay friendship events onto the module-level graph ``G``.

    ``data`` is a DataFrame providing ``event_type``, ``id1`` and ``id2``
    columns. Rows are applied in the order they appear: a 'befriend' row adds
    the edge (id1, id2); an 'unfriend' row removes that edge if it is present.
    Rows with any other event type are ignored. Nothing is returned.
    """
    for kind, left, right in zip(data['event_type'], data['id1'], data['id2']):
        if kind == 'befriend':
            G.add_edge(left, right)
        elif kind == 'unfriend' and G.has_edge(left, right):
            # Only remove edges that exist; unfriending non-friends is a no-op.
            G.remove_edge(left, right)

In [14]:
# Build the initial friendship graph by replaying the batch befriend/unfriend events in order.
Add_edges(df_friend)

In [15]:
#len(list(G.edges()))

In [16]:
#G[10.0]

In [17]:
#G.number_of_nodes()

In [18]:
#G.number_of_edges()

In [19]:
# define a function to calculate the mean and sd for userid's network
def Get_Mean_SD(userid):
    """Return (mean, sd) of recent purchase amounts in ``userid``'s network.

    The network is every node within ``D`` hops of ``userid`` in the global
    graph ``G``, excluding ``userid`` itself (``center=False``). Purchases made
    by those nodes are looked up in the global ``df_purchase``; when there are
    more than ``T`` of them only the ``T`` most recent by timestamp are used.

    Returns
    -------
    (mean, sd) : tuple of float
        Mean and population standard deviation (ddof=0), each rounded to two
        decimals. ``(nan, nan)`` when the network has fewer than 2 purchases.
    """
    Nodes = list(nx.ego_graph(G, userid, D, center=False))
    df_Nodes = df_purchase.loc[df_purchase['id'].isin(Nodes)]

    # Not enough data to compute meaningful statistics.
    if len(df_Nodes) < 2:
        return np.nan, np.nan

    if len(df_Nodes) > T:
        # A stable sort keeps purchases that share a timestamp in their
        # existing (file) order, so the choice of the "latest T" rows is
        # deterministic; the default quicksort gives no such guarantee.
        df_Nodes = df_Nodes.sort_values('timestamp', kind='mergesort').iloc[-int(T):]

    # Work on a plain float ndarray so np.std is unambiguously the population
    # standard deviation (pandas' own .std() defaults to the sample version).
    amounts = np.asarray(df_Nodes['amount'], dtype=float)
    mean = float("{0:.2f}".format(np.mean(amounts)))
    sd = float("{0:.2f}".format(np.std(amounts)))

    return mean, sd

In [20]:
#Get_Mean_SD(0.0)

In [21]:
#df_purchase.head()

In [22]:
#df_purchase.tail()

In [23]:
#df_purchase.shape

Step2: Determine whether a purchase is anomalous

input file: sample_dataset/stream_log.json


In [24]:
# read in the stream_log.json (line-delimited JSON, one event per line)
streamlogfile = 'sample_dataset/stream_log.json'
df_stream = pd.read_json(streamlogfile, lines=True)
# Events are processed in file order. If a sort on the timestamp is needed, uncomment the following line
#df_stream = df_stream.sort_values('timestamp')

# open output file flagged_purchases.json for writing (mode 'w' truncates an existing file).
# The handle is left open here: the stream-processing loop writes to it and a later cell closes it.
flaggedfile = 'log_output/flagged_purchases.json'
f = open(flaggedfile, 'w')

In [25]:
# Determine whether a purchase is anomalous; update purchase history; update social network
#
# Each stream event is handled in file order:
#   * purchase           -> appended to df_purchase, then flagged (written to `f`)
#                           when amount > mean + 3*sd of the buyer's network
#   * befriend/unfriend  -> appended to df_friend and applied to the graph G
# Rows missing any required field for their event type are skipped.
# Column lookups happen only inside the matching branch, so a stream that
# contains no friendship events (or no purchases) does not raise KeyError on
# the columns it lacks.
for i in range(len(df_stream)):
    datai = df_stream.iloc[i]
    event_type = datai['event_type']

    if event_type == 'purchase':
        if datai[index_purchase].isnull().any():
            continue
        # update purchase history (pd.concat replaces DataFrame.append, which
        # was removed in pandas 2.0); the one-row slice keeps the column dtypes
        df_purchase = pd.concat([df_purchase, df_stream.iloc[[i]][index_purchase]])
        timestamp = str(datai['timestamp'])
        userid = datai['id']
        if not G.has_node(userid):
            G.add_node(userid)
        amount = datai['amount']
        # The buyer's own purchases are excluded by Get_Mean_SD (center=False),
        # so appending before computing the statistics does not bias them.
        mean, sd = Get_Mean_SD(userid)
        # NaN never compares equal to anything, so it must be tested with isnan.
        if not np.isnan(mean):
            mean_3sd = mean + (3*sd)
            if amount > mean_3sd:
                f.write('{{"event_type":"{0:s}", "timestamp":"{1:s}", "id": "{2:.0f}", "amount": "{3:.2f}", "mean": "{4:.2f}", "sd": "{5:.2f}"}}\n'.format(event_type, timestamp, userid, amount, mean, sd))
    elif event_type in ('befriend', 'unfriend'):
        if datai[index_friend].isnull().any():
            continue
        # update social network
        df_friend = pd.concat([df_friend, df_stream.iloc[[i]][index_friend]])
        id1 = datai['id1']
        id2 = datai['id2']
        if event_type == 'befriend':
            G.add_edge(id1, id2)
        elif G.has_edge(id1, id2):
            # unfriend: only remove an edge that actually exists
            G.remove_edge(id1, id2)

In [26]:
# Close the flagged-purchases output file opened before the stream loop so buffered writes reach disk.
f.close()

In [ ]: