In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import json
import sys
In [2]:
# Path to the historical batch log (one JSON object per line).
batchlogfile = 'sample_dataset/batch_log.json'
df_batch = pd.read_json(batchlogfile, lines=True)
In [3]:
# Columns expected for each event type in the log files.
index_purchase = ['event_type', 'id', 'timestamp', 'amount']
index_friend = ['event_type', 'id1', 'id2', 'timestamp']
In [4]:
#df_batch.head()
In [5]:
#df_batch.describe()
In [6]:
# Extract the parameters D (network depth) and T (number of tracked
# purchases) from the one batch-log record that carries them.
df_DT = df_batch[df_batch['D'].notnull()][['D', 'T']]
D = df_DT.iloc[0]['D']
T = df_DT.iloc[0]['T']
In [7]:
# Validate the parameters: D must be at least 1 and T at least 2,
# otherwise the anomaly definition is meaningless — abort the run.
if D < 1:
    print('Program terminated because of D < 1')
    sys.exit()
if T < 2:
    print('Program terminated because of T < 2')
    sys.exit()
In [8]:
#for possible_value in set(df['event_type'].tolist()):
# print(possible_value)
In [9]:
# Keep only complete purchase events, restricted to the expected columns.
purchase_mask = df_batch['event_type'] == 'purchase'
df_purchase = df_batch.loc[purchase_mask, index_purchase].dropna(how='any')
# Uncomment the following line if sorting on the timestamp is needed:
# df_purchase = df_purchase.sort_values('timestamp')
In [10]:
# Keep only complete friendship events (befriend / unfriend).
friend_mask = df_batch['event_type'].isin(['befriend', 'unfriend'])
df_friend = df_batch.loc[friend_mask, index_friend].dropna(how='any')
# Uncomment the following line if sorting on the timestamp is needed:
# df_friend = df_friend.sort_values('timestamp')
In [11]:
# Undirected social graph: nodes are user ids, edges are friendships.
G = nx.Graph()
In [12]:
# Seed the graph with every user that appears in the purchase history.
idlist = set(df_purchase['id'].tolist())
G.add_nodes_from(idlist)
In [13]:
def Add_edges(data):
    """Replay friendship events onto the global graph G, in row order.

    data : DataFrame with columns 'event_type', 'id1', 'id2', where
           event_type is 'befriend' (add an edge) or 'unfriend'
           (remove the edge, if present).
    """
    for row in data.itertuples():
        id10 = row.id1
        id20 = row.id2
        if row.event_type == 'befriend':
            G.add_edge(id10, id20)
        # The two event types are mutually exclusive, so elif (the original
        # re-tested with an independent if).
        elif row.event_type == 'unfriend':
            # remove_edge raises NetworkXError on a missing edge; guard first.
            if G.has_edge(id10, id20):
                G.remove_edge(id10, id20)
In [14]:
# Build the initial friendship graph from the batch-log events.
Add_edges(df_friend)
In [15]:
#len(list(G.edges()))
In [16]:
#G[10.0]
In [17]:
#G.number_of_nodes()
In [18]:
#G.number_of_edges()
In [19]:
# Function to calculate the mean and standard deviation of purchase
# amounts within a user's degree-D social network.
def Get_Mean_SD(userid):
    """Return (mean, sd) of purchase amounts in userid's network.

    The network is every user within distance D of userid in G
    (userid itself excluded).  Only the T most recent purchases are
    considered.  Both values are rounded to 2 decimal places.
    Returns (nan, nan) when the network has fewer than 2 purchases.
    Note: np.std (population std, ddof=0) is used deliberately; the
    pandas default (ddof=1) gives a different value.
    """
    network = list(nx.ego_graph(G, userid, D, center=False))
    network_purchases = df_purchase.loc[df_purchase['id'].isin(network)]
    if len(network_purchases) < 2:
        return np.nan, np.nan
    # Only the T most recent purchases count toward the statistics.
    if len(network_purchases) > T:
        network_purchases = network_purchases.sort_values('timestamp').iloc[-int(T):]
    mean = float("{0:.2f}".format(np.mean(network_purchases['amount'])))
    sd = float("{0:.2f}".format(np.std(network_purchases['amount'])))
    return mean, sd
In [20]:
#Get_Mean_SD(0.0)
In [21]:
#df_purchase.head()
In [22]:
#df_purchase.tail()
In [23]:
#df_purchase.shape
In [24]:
# Read in the stream_log.json (one JSON object per line).
streamlogfile = 'sample_dataset/stream_log.json'
df_stream = pd.read_json(streamlogfile, lines=True)
# Uncomment the following line if sorting on the timestamp is needed:
#df_stream = df_stream.sort_values('timestamp')
# Open the output file flagged_purchases.json (closed in a later cell).
flaggedfile = 'log_output/flagged_purchases.json'
f = open(flaggedfile, 'w')
In [25]:
# Process the stream: flag anomalous purchases (amount > mean + 3*sd of the
# user's network) and keep the purchase history / social network up to date.
for i in range(0, len(df_stream)):
    datai = df_stream.iloc[i]
    event_type = datai['event_type']
    if event_type == 'purchase' and not datai[index_purchase].isnull().any():
        # Update purchase history.  DataFrame.append was removed in pandas
        # 2.0; pd.concat with a one-row frame is the supported equivalent.
        df_purchase = pd.concat([df_purchase, datai[index_purchase].to_frame().T])
        timestamp = str(datai['timestamp'])
        userid = datai['id']
        if not G.has_node(userid):
            G.add_node(userid)
        amount = datai['amount']
        mean, sd = Get_Mean_SD(userid)
        # NaN means the network had fewer than 2 purchases, so there is no
        # baseline to compare against.  (The original `mean != np.nan` test
        # was always True because NaN != NaN.)
        if not np.isnan(mean):
            mean_3sd = mean + (3 * sd)
            if amount > mean_3sd:
                f.write('{{"event_type":"{0:s}", "timestamp":"{1:s}", "id": "{2:.0f}", "amount": "{3:.2f}", "mean": "{4:.2f}", "sd": "{5:.2f}"}}\n'.format(event_type, timestamp, userid, amount, mean, sd))
    # The event types are mutually exclusive, so elif chains are safe here.
    elif event_type == 'befriend' and not datai[index_friend].isnull().any():
        # Update social network and its history.
        df_friend = pd.concat([df_friend, datai[index_friend].to_frame().T])
        G.add_edge(datai['id1'], datai['id2'])
    elif event_type == 'unfriend' and not datai[index_friend].isnull().any():
        df_friend = pd.concat([df_friend, datai[index_friend].to_frame().T])
        if G.has_edge(datai['id1'], datai['id2']):
            G.remove_edge(datai['id1'], datai['id2'])
In [26]:
# Flush and close the flagged-purchases output file.
f.close()
In [ ]: