In [11]:
import calendar
import json
import random
from datetime import date, datetime, timedelta

import numpy as np
import pandas as pd
from delorean import parse
from faker import Faker
from pandas import DataFrame
In [15]:
# Seed both random sources used by this notebook so the stream of generated
# values is repeatable. NOTE: results can still vary between kernels because
# some values depend on date.today() and on set iteration order.
SEED = 42
random.seed(SEED)
fake = Faker()
fake.seed_instance(SEED)
In [16]:
# Target number of distinct usernames.
usernames_no = 1000
# A set silently discards duplicates, so keep drawing from Faker until the
# set has reached the target size.
usernames = set()
while len(usernames) < usernames_no:
    usernames.add(fake.user_name())
In [29]:
def get_random_name_and_gender():
    """Return a random ``(name, gender)`` pair.

    A male name (gender ``'M'``) is produced when ``random.random()`` exceeds
    the 0.6 skew threshold; otherwise a female name (gender ``'F'``) is
    returned.
    """
    skew = .6
    if random.random() > skew:
        return fake.name_male(), 'M'
    return fake.name_female(), 'F'
def get_users(usernames):
    """Build one fake user record per username.

    Args:
        usernames: iterable of username strings.

    Returns:
        list: one JSON-encoded string per username, each describing a user
        with the keys ``username``, ``name``, ``gender``, ``email``, ``age``
        and ``address``.
    """
    def _user_json(username):
        # Serialise a single freshly generated user to JSON.
        name, gender = get_random_name_and_gender()
        return json.dumps({
            'username': username,
            'name': name,
            'gender': gender,
            'email': fake.email(),
            'age': fake.random_int(min=18, max=90),
            'address': fake.address(),
        })

    return [_user_json(username) for username in usernames]
# Generate a JSON user record for every username and preview the first three.
users = get_users(usernames)
users[:3]
Out[29]:
In [43]:
# Campaign name format (six fields joined by underscores):
# InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency
# e.g. GRZ_20210131_20210411_30-40_F_GBP
def get_type():
    """Return one of the internal campaign type codes, chosen at random."""
    type_codes = ('ABB', 'BMX', 'GRZ', 'KTR')
    return random.choice(type_codes)
def get_start_end_dates():
    """Return random ``(start, end)`` campaign dates as ``YYYYMMDD`` strings.

    The start date lies within 365 days either side of today, and the
    campaign lasts between 1 and 730 days, so ``end`` is always after
    ``start``.
    """
    duration = random.randint(1, 2 * 365)
    offset = random.randint(-365, 365)
    start = date.today() - timedelta(days=offset)
    end = start + timedelta(days=duration)
    return tuple(day.strftime("%Y%m%d") for day in (start, end))
def get_age():
    """Return a random target age range formatted as ``'<low>-<high>'``.

    Both the lower bound (drawn from 20..45) and the width of the range
    (drawn from 5..25) are rounded down to a multiple of 5.
    """
    lower = random.randint(20, 45)
    lower = 5 * (lower // 5)
    width = random.randint(5, 25)
    width = 5 * (width // 5)
    upper = lower + width
    return '{}-{}'.format(lower, upper)
def get_gender():
    """Return a random target gender code: ``'M'``, ``'F'`` or ``'B'``."""
    gender_codes = ('M', 'F', 'B')
    return random.choice(gender_codes)
def get_currency():
    """Return a random currency code: ``'GBP'``, ``'EUR'`` or ``'USD'``."""
    currency_codes = ('GBP', 'EUR', 'USD')
    return random.choice(currency_codes)
def get_campaign_name():
    """Assemble a random campaign name.

    The six fields (type, start date, end date, age range, gender, currency)
    are generated in that order and joined with underscores.
    """
    separator = '_'
    # Built left to right so the generators are called in field order.
    fields = (
        (get_type(),)
        + tuple(get_start_end_dates())
        + (get_age(), get_gender(), get_currency())
    )
    return separator.join(fields)
In [44]:
def get_campaign_data():
    """Generate one random campaign record.

    Returns:
        dict: with the keys ``cmp_name`` (campaign name string), ``cmp_bgt``
        (budget, 1,000..1,000,000), ``cmp_spent`` (100..budget),
        ``cmp_clicks`` (triangular distribution over 100..100,000 with its
        mode at 20,000) and ``cmp_impr`` (Gaussian with mean 500,000 and
        standard deviation 2), the last two truncated to integers.
    """
    record = {'cmp_name': get_campaign_name()}
    record['cmp_bgt'] = random.randint(10**3, 10**6)
    # The amount spent can never exceed the budget drawn above.
    record['cmp_spent'] = random.randint(10**2, record['cmp_bgt'])
    record['cmp_clicks'] = int(random.triangular(10**2, 10**5, 0.2 * 10**5))
    record['cmp_impr'] = int(random.gauss(0.5 * 10 **6, 2))
    return record
In [45]:
def get_data(users):
    """Pair every user with a random batch of campaigns.

    Args:
        users: iterable of user records.

    Returns:
        list: one dict per user with the keys ``user`` (the record as given)
        and ``campaigns`` (a list of between 2 and 8 campaign dicts).
    """
    return [
        {
            'user': user,
            'campaigns': [
                get_campaign_data() for _ in range(random.randint(2, 8))
            ],
        }
        for user in users
    ]
In [52]:
# Nested structure: one entry per user, each holding that user's campaigns.
raw_data = get_data(users)
In [53]:
# Flatten raw_data into one record per campaign, each carrying the JSON of
# the user it belongs to. dict(campaign, user=...) builds a new dictionary,
# so the campaign dicts stored inside raw_data are left unmodified.
data = [
    dict(campaign, user=datum['user'])
    for datum in raw_data
    for campaign in datum['campaigns']
]
data[:2]
Out[53]:
In [54]:
# One row per campaign record; the dictionary keys become the columns.
df = DataFrame(data)
df.head()
Out[54]:
In [55]:
# Number of non-null values in each column.
df.count()
Out[55]:
In [56]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
df.describe()
Out[56]:
In [58]:
# The three campaigns with the largest budget.
df.sort_values(by=['cmp_bgt'], ascending=False).head(3)
Out[58]:
In [62]:
# Unpack campaign name into individual units
def unpack_campaign_name(name):
    """Split a campaign name into its six components.

    The expected layout is ``Type_Start_End_Age_Gender_Currency`` with both
    dates written as ``YYYYMMDD``, which is the format produced by
    ``get_start_end_dates``.

    Args:
        name: campaign name string.

    Returns:
        tuple: ``(type_, start, end, age, gender, currency)`` where ``start``
        and ``end`` are ``datetime.date`` objects and the other items are
        strings.

    Raises:
        ValueError: if ``name`` does not contain exactly six
        underscore-separated fields, or a date field is not a valid
        ``YYYYMMDD`` date.
    """
    date_format = "%Y%m%d"
    type_, start, end, age, gender, currency = name.split('_')
    # Parse with the exact format the names are written in, rather than
    # relying on a general-purpose parser to guess the field order.
    start = datetime.strptime(start, date_format).date()
    end = datetime.strptime(end, date_format).date()
    return type_, start, end, age, gender, currency
In [63]:
# Labels for the six fields packed into every campaign name.
campaign_cols = ['Type', 'Start', 'End', 'Age', 'Gender', 'Currency']
# Each element is the tuple returned by unpack_campaign_name for one row.
campaign_data = df['cmp_name'].apply(unpack_campaign_name)
# Reuse df's index so the rows of the new frame align with those of df.
campaign_df = DataFrame(
    data=campaign_data.tolist(),
    columns=campaign_cols,
    index=df.index,
)
campaign_df.head(3)
Out[63]:
In [64]:
# Drop any campaign columns left over from a previous run of this cell before
# joining; a plain join would raise on the overlapping column names when the
# cell is executed a second time. On the first run nothing is dropped.
df = df.drop(campaign_df.columns, axis=1, errors='ignore').join(campaign_df)
In [65]:
# Show the original campaign name next to the columns unpacked from it.
df[['cmp_name'] + campaign_cols ].head(3)
Out[65]:
In [ ]: