In [1]:
import pandas as pd
import numpy as np
from numpy.random import normal
from numpy.random import exponential
from numpy.random import choice
import string
from random import choice as ch  # aliased so it doesn't shadow numpy's choice
In [2]:
# number of points to use:
n_points = 200
In [3]:
# Generating exponentially distributed ages, shifted so the minimum is 18
ages = pd.Series([exponential(15) for x in range(n_points)]) + 18
# Generating 5% nulls
fraction = .05
ages.loc[ages.sample(frac=fraction).index] = np.nan
# Make only 0.5% of the points "super-outliers"; 5% would distort the mean too much
fraction = .005
outlier_idx = ages.sample(frac=fraction).index
ages.loc[outlier_idx] = [normal(180, 50) for x in range(len(outlier_idx))]
# Plant a single extreme outlier, to mess up the standard deviation and mean
ages.loc[50] = 300000000
# Making 4% outliers under 18
fraction = .04
under_idx = ages.sample(frac=fraction).index
ages.loc[under_idx] = [normal(5, 3) for x in range(len(under_idx))]
# Keeping only positives; note this also drops the NaN rows,
# since NaN > 0 evaluates to False
ages = ages[ages > 0]
# Cast to integers (safe now that the NaNs are gone)
ages = ages.astype(int)
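A quick sanity check (a sketch added here, not part of the original notebook) shows how strongly the planted outliers distort the summary statistics:

# The extreme outlier dominates the mean and std; the median stays near the bulk
print(ages.mean(), ages.median(), ages.std())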
In [4]:
# ~2% of the labels are inconsistently cased and ~3% are missing
gender = pd.Series([choice(['male', 'female', 'MALE', np.nan],
                           p=[.4, .55, .02, .03]) for x in range(n_points)])
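To confirm the label mix (another check added here), value_counts with dropna=False surfaces the inconsistent 'MALE' entries and the missing values:

# Count every label, including NaN
print(gender.value_counts(dropna=False))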
In [5]:
heights = pd.Series([normal(170, 15) for x in range(n_points)])
# plant a few extreme outliers
heights.iloc[20] = 252
heights.iloc[21] = 65
heights.iloc[22] = 235
# force to integers:
heights = heights.astype(int)
# make 2% missing data (assigning NaN upcasts the int Series back to float)
fraction = .02
heights.loc[heights.sample(frac=fraction).index] = np.nan
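A brief check (added here) confirms the missing fraction and the dtype upcast:

# Assigning NaN forced the int Series back to float64
print(heights.dtype, heights.isna().sum())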
In [6]:
# Combine the three series into one DataFrame; pandas aligns them on the index
all_data = pd.DataFrame([ages, heights, gender]).T
In [7]:
all_data.columns = ['age', 'height', 'gender']
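Because the positive-age filter dropped rows from ages, those positions are absent from its index; the alignment in the DataFrame constructor fills them with NaN, which is how the age nulls reappear. A quick check (added here):

# Rows dropped from `ages` resurface as NaN after alignment
print(all_data['age'].isna().sum())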
Preview:
In [8]:
all_data.head()
Out[8]:
In [9]:
# Grab one row; it will be duplicated into several others below
entry = all_data.iloc[4]
In [10]:
entry
Out[10]:
Making an index:
In [11]:
n = 10
# Random 10-character uppercase labels, one per row
index = ["".join(ch(string.ascii_uppercase) for i in range(n)) for x in range(n_points)]
# Duplicate one entry across rows 4-7 and force those rows to share
# an index label, so the duplicated data looks like genuinely repeated records:
for row in range(4, 8):
    all_data.iloc[row] = entry
    index[row] = 'YPUQAPSOYJ'
all_data.index = index
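A check (added here) that the forced duplicates all share one index label:

# The four duplicated rows carry the same label and values
print(all_data.loc['YPUQAPSOYJ'])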
In [12]:
all_data.head(20)
Out[12]:
In [13]:
all_data.to_csv('all_data.csv')
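As a final sketch (assuming the file path above), re-reading the CSV confirms the messy data round-trips; index_col=0 restores the string index:

# Round-trip check: reload the file and verify the shape
check = pd.read_csv('all_data.csv', index_col=0)
print(check.shape)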