Generating synthetic data


In [1]:
import random
import string
from random import choice as ch

import numpy as np
import pandas as pd
from numpy.random import choice
from numpy.random import exponential
from numpy.random import normal

In [2]:
# number of points to use: 
n_points = 200

# Seed every random source used in this notebook so that a fresh
# "Restart & Run All" regenerates exactly the same synthetic data set.
random_seed = 42
np.random.seed(random_seed)  # numpy.random draws and pandas' .sample()
random.seed(random_seed)     # random.choice (imported as `ch`)

Generating age

  • not normally distributed
    • 5% nulls
    • 3% ambiguous outliers
    • 5% clear outliers (outside of p01, p99)

In [3]:
# Ages come from an exponential distribution shifted by 18 (right-skewed,
# minimum 18) and are then deliberately corrupted with nulls and outliers.
ages = pd.Series(exponential(15, size=n_points)) + 18

# Generating 5% nulls
fraction = .05
ages.loc[ages.sample(frac=fraction).index] = np.nan

# Made only .5% "super-outlier", because 5% would affect the means a lot.
# Sample once, so the chosen labels and the number of replacement values
# are guaranteed to agree.
fraction = .005
super_outlier_idx = ages.sample(frac=fraction).index
ages.loc[super_outlier_idx] = normal(180, 50, size=len(super_outlier_idx))

# One single absurd value, to mess up the standard deviation and mean.
# NOTE: label 50 must already exist (n_points > 50); otherwise .loc would
# append a new row instead of overwriting one.
ages.loc[50] = 300000000

# Making 4% outliers under 18. Each selected row gets its own draw; a bare
# normal(5, 3) would broadcast one identical value to every selected row.
fraction = .04
young_idx = ages.sample(frac=fraction).index
ages.loc[young_idx] = normal(5, 3, size=len(young_idx))

# Choosing only positives. NaN > 0 is False, so this also removes the null
# rows from the series; they show up again as NaN once `ages` is aligned by
# index with the other series in the joined DataFrame.
ages = ages[ages > 0]

# Making them integers (safe here: no NaN is left in the series)
ages = ages.astype(int)

Generating the categorical data

  • gender
    • 40% 'male'
    • 55% 'female'
    • 2% 'MALE'
    • 3% null

In [4]:
# np.random.choice converts a plain list into a NumPy *string* array, which
# turns a missing value into the literal text 'nan'. Drawing from an object
# array keeps the missing entries as real NaN values.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
gender_options = np.array(['male', 'female', 'MALE', np.nan], dtype=object)
gender = pd.Series(choice(gender_options, size=n_points, p=[.4, .55, .02, .03]))

Generating the height data

  • normally distributed
    • 3% 2-sigma outliers
    • 2% 3-sigma outliers
    • 2% missing data

In [5]:
# Heights are normally distributed around 170 with a standard deviation of 15.
heights = pd.Series(normal(170, 15, size=n_points))

# make a couple of super outliers (by position, so n_points must exceed 22)
heights.iloc[20] = 252
heights.iloc[21] = 65
heights.iloc[22] = 235

# force to whole numbers, but store them as floats: an int64 series cannot
# hold NaN, and relying on pandas to upcast silently on assignment is
# deprecated. The resulting values are the same as before.
heights = heights.astype(int).astype(float)

# make 2% missing data (np.nan: the np.NaN alias was removed in NumPy 2.0)
fraction = .02
heights.loc[heights.sample(frac=fraction).index] = np.nan

Joining them together:


In [6]:
# Stack the three series as rows, then flip the frame so that each series
# becomes one column.
all_data = pd.DataFrame([ages, heights, gender]).transpose()

In [7]:
# Label the columns in the order the series were stacked.
all_data.columns = pd.Index(['age', 'height', 'gender'])

Preview:


In [8]:
# Show the first five rows.
all_data.head(n=5)


Out[8]:
age height gender
0 88 163 female
1 29 158 female
2 42 159 female
3 25 179 male
4 32 169 male

Generating some duplicated data:


In [9]:
# Take the row at position 4; it is the record that gets duplicated below.
entry = all_data.iloc[4, :]

In [10]:
# Display the row that will be copied over rows 4-7 further down.
entry


Out[10]:
age         32
height     169
gender    male
Name: 4, dtype: object

Making an index:


In [11]:
# Length of each random key.
n = 10

# One random upper-case key per row.
index = [
    "".join([ch(string.ascii_uppercase) for _ in range(n)])
    for _ in range(n_points)
]

# Rows 4 to 7 all become copies of `entry` and share one key, so that both
# the data and the index contain duplicates.
for row in (4, 5, 6, 7):
    index[row] = 'YPUQAPSOYJ'
    all_data.iloc[row] = entry

all_data.index = index

In [12]:
# Show the first 20 rows, which include the duplicated block.
all_data.iloc[:20]


Out[12]:
age height gender
CFLOXRHMDR 88 163 female
FXLJSNLSOG 29 158 female
FWDIVJKGOI 42 159 female
YWEBKQWHRE 25 179 male
YPUQAPSOYJ 32 169 male
YPUQAPSOYJ 32 169 male
YPUQAPSOYJ 32 169 male
YPUQAPSOYJ 32 169 male
SSZQEGTLNK NaN 162 male
PRFEFXNGWN 36 166 female
IIVXDNOAIV 1 165 female
VVQYVNRAGQ 18 134 female
YVEDWPTEEB 31 149 female
LGPQPJXWPI 34 172 female
VURXKXJUTM 22 174 female
CWCFROPRFE 22 NaN male
HAFHEBLLVW 26 161 male
TIHIBNNHAB 45 181 female
OMOVKQLHYO 27 170 male
PFCTYIBBNQ 1 173 female

In [13]:
# Write the data set, including its index, to a CSV file in the working directory.
all_data.to_csv(path_or_buf='all_data.csv')