In [1]:
# !pip install --upgrade faker delorean

In [4]:
import json
import calendar
import random
from datetime import date, timedelta

import faker
import numpy as np
from pandas import DataFrame
from delorean import parse
import pandas as pd

import matplotlib.pyplot

# make the graphs nicer
# `style.use` has to be *called* with a style sheet name; merely referencing
# the attribute applies nothing and only echoes the function object as the
# cell output. 'ggplot' is one of matplotlib's built-in style sheets.
matplotlib.pyplot.style.use('ggplot')
# pd.set_option('display.mpl_style', 'default')

In [5]:
# Single Faker instance shared by every generator in this notebook
# (usernames, person names, emails, ages and addresses).
fake = faker.Faker()

In [6]:
# A set rather than a list, so a repeated username is stored only once.
usernames = set()

In [7]:
# How many distinct usernames to collect.
usernames_no = 1000

In [19]:
# Keep asking Faker for usernames until the set holds `usernames_no` of
# them; a duplicate does not grow the set, so the loop simply tries again.
while len(usernames) < usernames_no:
    usernames.add(fake.user_name())
    
# dir(fake)

In [14]:
# usernames

In [15]:
def get_random_name_and_gender():
    """Return a ``(name, gender)`` pair for one fake person.

    A single uniform draw decides the gender: a draw at or below the
    female share gives a female name tagged ``'F'``, anything above it
    gives a male name tagged ``'M'``.
    """
    female_share = .6  # roughly 60% of the draws come out female
    if random.random() <= female_share:
        return fake.name_female(), 'F'
    return fake.name_male(), 'M'

def get_users(usernames):
    """Build one JSON-encoded user record per username.

    Args:
        usernames: iterable of username strings.

    Returns:
        A list of JSON strings, one per username, each holding the keys
        ``username``, ``name``, ``gender``, ``email``, ``age`` and
        ``address``.
    """
    def _as_json(username):
        # Name and gender are drawn together so that they stay consistent.
        name, gender = get_random_name_and_gender()
        return json.dumps({
            'username': username,
            'name': name,
            'gender': gender,
            'email': fake.email(),
            'age': fake.random_int(min=18, max=90),
            'address': fake.address(),
        })

    return [_as_json(username) for username in usernames]

# Generate the JSON-encoded user records, then preview the first three.
users = get_users(usernames)
users[:3]


Out[15]:
['{"username": "elizabeth14", "name": "Shane Jones", "gender": "M", "age": 59, "address": "811 Maria Manors Apt. 383\\nWest Nicholas, GU 91147-8066", "email": "linbrandon@yahoo.com"}',
 '{"username": "matthewlee", "name": "Laura Thomas", "gender": "F", "age": 51, "address": "44062 Martinez Path\\nBestburgh, NM 73590-5397", "email": "amandabest@ellis-sanchez.com"}',
 '{"username": "frank51", "name": "Samantha Bryant", "gender": "F", "age": 76, "address": "98909 Fitzpatrick Centers\\nSouth Robertmouth, GA 62869", "email": "mwilson@gmail.com"}']

In [20]:
# campaign name format:
# InternalType_StartDate_EndDate_TargetAge_TargetGender_Currency
def get_type():
    """Pick one of the internal campaign type codes at random."""
    # arbitrary internal codes; they carry no meaning of their own
    internal_codes = ('AKX', 'BYU', 'GRZ', 'KTR')
    return random.choice(internal_codes)

def get_start_end_dates():
    """Return a random campaign period as two ``YYYYMMDD`` strings.

    The campaign lasts between 1 and 730 days, and its start is shifted
    from today by up to 365 days in either direction.

    Returns:
        A ``(start, end)`` tuple of formatted date strings.
    """
    date_format = "%Y%m%d"
    duration_days = random.randint(1, 2 * 365)
    offset_days = random.randint(-365, 365)
    start = date.today() - timedelta(days=offset_days)
    end = start + timedelta(days=duration_days)
    return tuple(day.strftime(date_format) for day in (start, end))

def get_age():
    """Return a random target age range formatted as ``'<low>-<high>'``.

    Both the lower bound (drawn from 20..45) and the width of the range
    (drawn from 5..25) are rounded down to a multiple of 5.
    """
    lower = 5 * (random.randint(20, 45) // 5)
    width = 5 * (random.randint(5, 25) // 5)
    return '{}-{}'.format(lower, lower + width)

def get_gender():
    """Pick the campaign's target gender code at random.

    Returns one of ``'M'``, ``'F'`` or ``'B'``.
    """
    target_genders = ('M', 'F', 'B')
    return random.choice(target_genders)

def get_currency():
    """Pick the campaign's currency code at random.

    Returns one of ``'GBP'``, ``'EUR'`` or ``'USD'``.
    """
    currencies = ('GBP', 'EUR', 'USD')
    return random.choice(currencies)

def get_campaign_name():
    """Assemble a random campaign name.

    The pieces are joined with underscores in the order: internal type,
    start date, end date, target age range, target gender, currency.
    """
    type_ = get_type()
    start, end = get_start_end_dates()
    # The list is evaluated left to right, so the remaining generators are
    # called in the order age, gender, currency.
    parts = [type_, start, end, get_age(), get_gender(), get_currency()]
    return '_'.join(parts)

In [21]:
#6

def get_campaign_data():
    """Generate the figures for one random campaign.

    Returns:
        A dict with the keys ``cmp_name``, ``cmp_bgt``, ``cmp_spent``,
        ``cmp_clicks`` and ``cmp_impr``.
    """
    campaign = {'cmp_name': get_campaign_name()}
    campaign['cmp_bgt'] = random.randint(10**3, 10**6)
    # the amount spent never exceeds the budget drawn just above
    campaign['cmp_spent'] = random.randint(10**2, campaign['cmp_bgt'])
    # triangular draw between 100 and 100,000 with its mode at 20,000
    campaign['cmp_clicks'] = int(
        random.triangular(10**2, 10**5, 0.2 * 10**5))
    # gaussian draw centred on 500,000 with a standard deviation of 2
    campaign['cmp_impr'] = int(random.gauss(0.5 * 10**6, 2))
    return campaign

In [22]:
#7

def get_data(users):
    """Attach a random batch of campaigns to every user.

    Args:
        users: iterable of user records; each one is stored as-is under
            the ``'user'`` key.

    Returns:
        A list of ``{'user': ..., 'campaigns': [...]}`` dicts, where each
        campaign list holds between 2 and 8 generated campaigns.
    """
    def _campaigns_for_one_user():
        how_many = random.randint(2, 8)
        return [get_campaign_data() for _ in range(how_many)]

    return [
        {'user': user, 'campaigns': _campaigns_for_one_user()}
        for user in users
    ]

In [ ]: