# In[1]:
import random
import numpy as np
import pandas as pd
import datetime
import math
# In[2]:
def sample_login_count(mean=25, sd=8):
    """Draw a login count from a Gaussian, truncated to an int and floored at 0.

    random.gauss() can return a negative number; passing a negative count to
    np.repeat raises ValueError, so negative draws are clamped to zero.
    """
    return max(0, int(random.gauss(mean, sd)))


# Choose range of user IDs
ids = list(range(5000, 6001))

# Repeat each user ID a random number of times based on a Gaussian distribution
# (one draw per ID, in ID order)
login_counts = [sample_login_count() for _ in ids]
id_list = np.repeat(ids, login_counts).tolist()

# Add a random 2016 timestamp to each user ID. Days are limited to 1-28 so
# that every (month, day) combination is a valid date.
date_list = [
    datetime.datetime(
        2016,
        random.randint(1, 12),   # month
        random.randint(1, 28),   # day
        random.randint(0, 23),   # hour
        random.randint(0, 59),   # minute
        random.randint(0, 59),   # second
    )
    for _ in id_list
]

# Convert each list to Series/DataFrame
id_series = pd.Series(id_list, name='user_id')
date_series = pd.Series(date_list, name='event_timestamp')
df = pd.concat([id_series, date_series], axis=1)
df = df.sort_values('event_timestamp').reset_index(drop=True)

# Create additional columns in DF, with each having the 'login' event_type.
# lat/lon are placeholders at this point.
df['lat'] = 0.0
df['lon'] = 0.0
df['event_type'] = "login"
# Create random latitude and longitude coordinates for each event, centered on Chicago
def lat_lon(group):
    """Give every row of ``group`` a random coordinate scattered around Chicago.

    Parameters
    ----------
    group : pandas.DataFrame
        Events (typically all events of one user). Its 'lat' and 'lon'
        columns are overwritten; other columns and the index are kept.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with 'lat' and 'lon' filled in. The input frame
        is left untouched.
    """
    # Chicago: 41.8781 N, 87.6298 W. West longitudes are negative.
    center_lat = 41.8781
    center_lon = -87.6298
    # Divisor that converts the radius into degrees (presumably metres per
    # degree). Kept as a float so the division is never truncated to 0.
    units_per_degree = 111300.0

    lats = []
    lons = []
    for _ in range(len(group)):
        # Ensure that each point does not deviate too far from the centre:
        # pick a maximum radius, then a point inside the disc of that radius.
        radius = random.randint(2000, 50000)
        max_offset = radius / units_per_degree
        u = random.uniform(0.0, 1.0)
        v = random.uniform(0.0, 1.0)
        # sqrt(u) spreads points evenly over the disc's area instead of
        # clustering them at the centre.
        distance = max_offset * math.sqrt(u)
        angle = 2 * math.pi * v
        lats.append(center_lat + distance * math.cos(angle))
        lons.append(center_lon + distance * math.sin(angle))
    # NOTE(review): the same degree offset is used for latitude and longitude;
    # on the ground a degree of longitude is shorter at this latitude.

    # Assign whole columns on a copy; element-wise chained assignment
    # (group['lat'].iloc[i] = ...) is not guaranteed to write to the frame.
    result = group.copy()
    result['lat'] = lats
    result['lon'] = lons
    return result
# Generate coordinates one user at a time. The groups are iterated explicitly
# (rather than via groupby.apply) so that each group keeps its 'user_id'
# column and the result does not gain a 'user_id' index level, which would
# make the sort below ambiguous.
df = pd.concat([lat_lon(user_events) for _, user_events in df.groupby('user_id')])
df = df.sort_values(by=['user_id', 'event_timestamp']).reset_index(drop=True)
# Function for creating a random number of events for every 'login' event
def add_events(group):
    """Append a random number of follow-up events to a 'login' event.

    Parameters
    ----------
    group : pandas.DataFrame
        The row(s) of one login; needs 'user_id', 'event_timestamp', 'lat'
        and 'lon' columns. The first row is used as the template for the
        generated events.

    Returns
    -------
    pandas.DataFrame
        The rows of ``group`` followed by 1-10 generated rows, with a fresh
        0..n-1 index. An empty ``group`` is returned unchanged.
    """
    if group.empty:
        return group

    event_num = random.randint(1, 10)

    # Take scalars from the first row so every generated column has exactly
    # event_num entries, even if the group holds more than one row.
    user_id = group['user_id'].iloc[0]
    lat = group['lat'].iloc[0]
    lon = group['lon'].iloc[0]

    # Increment timestamp randomly, with 5-15 min as boundaries; each new
    # event follows the previous one.
    timestamp = group['event_timestamp'].iloc[0]
    timestamplist = []
    for _ in range(event_num):
        timestamp += datetime.timedelta(minutes=random.randint(5, 15))
        timestamplist.append(timestamp)

    # For each new event, probabilistically choose one of the 3 additional events
    events = ['level', 'buy_coins', 'megapack']
    eventlist = np.random.choice(events, event_num, p=[0.85, 0.1, 0.05])

    # Create new dataframe from the new data, reusing the login's user and
    # coordinates for every generated event
    eventdf = pd.DataFrame({
        'user_id': [user_id] * event_num,
        'event_timestamp': timestamplist,
        'lat': [lat] * event_num,
        'lon': [lon] * event_num,
        'event_type': eventlist,
    })

    # Append the newly-generated data to the initial group
    # (pd.concat replaces DataFrame.append, which no longer exists in pandas 2)
    return pd.concat([group, eventdf], ignore_index=True)
# Expand every login (one group per user_id/event_timestamp pair) with its
# follow-up events. The groups are iterated explicitly so add_events always
# receives the 'user_id' and 'event_timestamp' columns, and the combined
# result gets a plain 0..n-1 index instead of carrying the group keys as
# index levels.
df = pd.concat(
    [add_events(login) for _, login in df.groupby(['user_id', 'event_timestamp'])],
    ignore_index=True,
)
# Function to increment each occurrence of the 'level' event_type
def level_up(group):
    """Number the 'level' events of ``group`` as level_1, level_2, ...

    Parameters
    ----------
    group : pandas.DataFrame
        Events (typically of a single user) with an 'event_type' column.
        Rows are numbered in the order they appear.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` in which the n-th row whose event_type is exactly
        "level" becomes "level_<n>"; every other row is unchanged.
    """
    labels = []
    level = 0
    for event in group['event_type']:
        if event == "level":
            level += 1
            event = "level_" + str(level)
        labels.append(event)

    # Replace the whole column on a copy; element-wise chained assignment
    # (group['event_type'].iloc[i] = ...) is not guaranteed to write through.
    result = group.copy()
    result['event_type'] = labels
    return result
# Number each user's 'level' events. The index is reset first so 'user_id'
# can only refer to the column, even if an earlier step left group keys in
# the index; the groups are then iterated explicitly so the result keeps all
# columns and gains no extra index levels.
df = df.reset_index(drop=True)
df = pd.concat([level_up(user_events) for _, user_events in df.groupby('user_id')])
# Fix the column order for the export and write the file without the index
df = df[['user_id', 'event_timestamp', 'lat', 'lon', 'event_type']]
df.to_csv("test_user_data.csv", index=False)
# In[ ]: