In [11]:
import json
import calendar
import random
from datetime import date, timedelta

from faker import Faker
import numpy as np
from pandas import DataFrame
from delorean import parse
import pandas as pd

In [15]:
fake = Faker()

In [16]:
usernames = set()
usernames_no = 1000
# populate the set with 1000 unique usernames
while len(usernames) < usernames_no:
    usernames.add(fake.user_name())

In [29]:
def get_random_name_and_gender():
    skew = .6
    male = random.random() > skew
    if male:
        return fake.name_male(), 'M'
    else:
        return fake.name_female(), 'F'

def get_users(usernames):
    users = []
    for username in usernames:
        name, gender = get_random_name_and_gender()
        
        user = {
            'username': username,
            'name': name,
            'gender': gender,
            'email': fake.email(),
            'age': fake.random_int(min=18, max=90),
            'address': fake.address(),
        }
        
        users.append(json.dumps(user))
    return users

users = get_users(usernames)
users[:3]


Out[29]:
['{"username": "bradley28", "name": "Dennis Rodriguez", "gender": "M", "email": "handrews@bullock.com", "age": 20, "address": "316 Rebecca Junction\\nNorth Tracyhaven, NV 88546"}',
 '{"username": "fvega", "name": "Stephanie Elliott", "gender": "F", "email": "katrina22@hotmail.com", "age": 61, "address": "990 Isabel Groves\\nSouth Pamelamouth, CA 03428"}',
 '{"username": "shawn17", "name": "Jennifer Gordon", "gender": "F", "email": "megan56@baker-brown.com", "age": 70, "address": "PSC 3743, Box 9377\\nAPO AA 81631"}']

In [43]:
# Campaign Name format
# e.g. InternalType_StartDate_EndDate_TargetAge_Target_Gender_Currency
def get_type():
    types =['ABB', 'BMX', 'GRZ', 'KTR']
    return random.choice(types)

def get_start_end_dates():
    duration = random.randint(1,2 * 365)
    offset = random.randint(-365, 365)
    start = date.today() - timedelta(days=offset)
    end = start + timedelta(days=duration)
    
    def _format_date(date_):
        return date_.strftime("%Y%m%d")
    
    return _format_date(start), _format_date(end)

def get_age():
    age = random.randint(20,45)
    age -= age % 5
    diff = random.randint(5,25)
    diff -= diff % 5
    return '{}-{}'.format(age, age + diff)

def get_gender():
    return random.choice(('M', 'F', 'B'))

def get_currency():
    return random.choice(('GBP', 'EUR', 'USD'))

def get_campaign_name():
    separator = '_'
    type_ = get_type()
    start_end = separator.join(get_start_end_dates())
    age = get_age()
    gender = get_gender()
    currency = get_currency()
    return separator.join((type_, start_end, age, gender, currency))

In [44]:
def get_campaign_data():
    name = get_campaign_name()
    budget = random.randint(10**3, 10**6)
    spent = random.randint(10**2, budget)
    clicks = int(random.triangular(10**2,10**5, 0.2 * 10**5))
    impressions = int(random.gauss(0.5 * 10 **6, 2))
    return {
        'cmp_name': name,
        'cmp_bgt' : budget,
        'cmp_spent': spent,
        'cmp_clicks': clicks,
        'cmp_impr': impressions
    }

In [45]:
def get_data(users):
    data = []
    for user in users:
        campaigns = [get_campaign_data() for _ in range(random.randint(2,8))]
        data.append({'user': user, 'campaigns': campaigns})
    
    return data

In [52]:
raw_data = get_data(users)

In [53]:
data = []
for datum in raw_data:
    for campaign in datum['campaigns']:
        campaign.update({'user': datum['user']})
        data.append(campaign)
        
        
data[:2]


Out[53]:
[{'cmp_name': 'BMX_20171106_20180721_20-30_B_USD',
  'cmp_bgt': 130777,
  'cmp_spent': 5627,
  'cmp_clicks': 42132,
  'cmp_impr': 499997,
  'user': '{"username": "bradley28", "name": "Dennis Rodriguez", "gender": "M", "email": "handrews@bullock.com", "age": 20, "address": "316 Rebecca Junction\\nNorth Tracyhaven, NV 88546"}'},
 {'cmp_name': 'GRZ_20190118_20191121_35-60_B_EUR',
  'cmp_bgt': 755156,
  'cmp_spent': 507488,
  'cmp_clicks': 11490,
  'cmp_impr': 500000,
  'user': '{"username": "bradley28", "name": "Dennis Rodriguez", "gender": "M", "email": "handrews@bullock.com", "age": 20, "address": "316 Rebecca Junction\\nNorth Tracyhaven, NV 88546"}'}]

In [54]:
df = DataFrame(data)
df.head()


Out[54]:
cmp_bgt cmp_clicks cmp_impr cmp_name cmp_spent user
0 130777 42132 499997 BMX_20171106_20180721_20-30_B_USD 5627 {"username": "bradley28", "name": "Dennis Rodr...
1 755156 11490 500000 GRZ_20190118_20191121_35-60_B_EUR 507488 {"username": "bradley28", "name": "Dennis Rodr...
2 909499 57347 499998 BMX_20190518_20210124_40-55_F_USD 391664 {"username": "bradley28", "name": "Dennis Rodr...
3 440708 28488 500000 BMX_20171121_20180710_30-40_F_GBP 1372 {"username": "bradley28", "name": "Dennis Rodr...
4 469229 19998 499997 KTR_20170718_20190407_40-45_M_GBP 135046 {"username": "bradley28", "name": "Dennis Rodr...

In [55]:
df.count()


Out[55]:
cmp_bgt       4993
cmp_clicks    4993
cmp_impr      4993
cmp_name      4993
cmp_spent     4993
user          4993
dtype: int64

In [56]:
df.describe()


Out[56]:
cmp_bgt cmp_clicks cmp_impr cmp_spent
count 4993.000000 4993.000000 4993.000000 4993.000000
mean 498412.981574 39751.811937 499999.508712 249229.814340
std 288350.371396 21802.315623 2.023592 217212.315455
min 1014.000000 971.000000 499993.000000 117.000000
25% 251301.000000 21793.000000 499998.000000 69057.000000
50% 498640.000000 36434.000000 499999.000000 189142.000000
75% 745990.000000 55456.000000 500001.000000 379566.000000
max 999869.000000 99105.000000 500007.000000 991444.000000

In [58]:
df.sort_values(by=['cmp_bgt'], ascending=False).head(3)


Out[58]:
cmp_bgt cmp_clicks cmp_impr cmp_name cmp_spent user
412 999869 47081 500003 KTR_20190422_20200225_40-55_M_USD 529745 {"username": "kerri56", "name": "Melissa Deleo...
3199 999682 40217 499999 BMX_20180608_20200415_25-45_F_EUR 408362 {"username": "portermargaret", "name": "Charle...
3884 999467 35854 500000 KTR_20181217_20201124_40-60_B_GBP 633906 {"username": "jeffrey24", "name": "Michael Luc...

In [62]:
# Unpack campaign name into individual units
def unpack_campaign_name(name):
    type_,start,end,age,gender,currency = name.split('_')
    start = parse(start).date
    end = parse(end).date
    return type_,start,end,age,gender, currency

In [63]:
campaign_data = df['cmp_name'].apply(unpack_campaign_name)
campaign_cols = ['Type', 'Start', 'End', 'Age', 'Gender', 'Currency']
campaign_df = DataFrame(campaign_data.tolist(), columns=campaign_cols, index=df.index)
campaign_df.head(3)


Out[63]:
Type Start End Age Gender Currency
0 BMX 2017-06-11 2018-07-21 20-30 B USD
1 GRZ 2019-01-18 2019-11-21 35-60 B EUR
2 BMX 2019-05-18 2021-01-24 40-55 F USD

In [64]:
df = df.join(campaign_df)

In [65]:
df[['cmp_name'] + campaign_cols ].head(3)


Out[65]:
cmp_name Type Start End Age Gender Currency
0 BMX_20171106_20180721_20-30_B_USD BMX 2017-06-11 2018-07-21 20-30 B USD
1 GRZ_20190118_20191121_35-60_B_EUR GRZ 2019-01-18 2019-11-21 35-60 B EUR
2 BMX_20190518_20210124_40-55_F_USD BMX 2019-05-18 2021-01-24 40-55 F USD

In [ ]: