In [1]:
import pandas as pd
import numpy as np
from numpy.random import normal
from numpy.random import exponential
from numpy.random import choice
import string
from random import choice as ch  # aliased so it doesn't shadow numpy's choice
In [2]:
# number of points to use:
n_points = 200
In [3]:
# Generating exponentially distributed ages, shifted so the minimum is 18
ages = pd.Series([exponential(15) for x in range(n_points)]) + 18
# Generating 5% nulls
fraction = .05
ages.loc[ages.sample(frac=fraction).index] = np.nan
# Make only 0.5% of the points "super-outliers"; 5% would distort the mean too much
fraction = .005
outlier_idx = ages.sample(frac=fraction).index
ages.loc[outlier_idx] = [normal(180, 50) for x in range(len(outlier_idx))]
# Plant a single extreme outlier, to mess up the standard deviation and mean
ages.loc[50] = 300000000
# Making 4% outliers under 18
fraction = .04
under_idx = ages.sample(frac=fraction).index
ages.loc[under_idx] = [normal(5, 3) for x in range(len(under_idx))]
# Keeping only positives; note this also drops the NaN rows,
# since NaN > 0 evaluates to False
ages = ages[ages > 0]
# Cast to integers (safe now that the NaNs are gone)
ages = ages.astype(int)
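A quick sanity check (a sketch added here, not part of the original notebook) shows how strongly the planted outliers distort the summary statistics:

# The extreme outlier dominates the mean and std; the median stays near the bulk
print(ages.mean(), ages.median(), ages.std())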
In [4]:
# ~2% of the labels are inconsistently cased and ~3% are missing
gender = pd.Series([choice(['male', 'female', 'MALE', np.nan],
                           p=[.4, .55, .02, .03]) for x in range(n_points)])
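To confirm the label mix (another check added here), value_counts with dropna=False surfaces the inconsistent 'MALE' entries and the missing values:

# Count every label, including NaN
print(gender.value_counts(dropna=False))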
In [5]:
heights = pd.Series([normal(170, 15) for x in range(n_points)])
# plant a few extreme outliers
heights.iloc[20] = 252
heights.iloc[21] = 65
heights.iloc[22] = 235
# force to integers:
heights = heights.astype(int)
# make 2% missing data (assigning NaN upcasts the int Series back to float)
fraction = .02
heights.loc[heights.sample(frac=fraction).index] = np.nan
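A brief check (added here) confirms the missing fraction and the dtype upcast:

# Assigning NaN forced the int Series back to float64
print(heights.dtype, heights.isna().sum())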
In [6]:
# Combine the three series into one DataFrame; pandas aligns them on the index
all_data = pd.DataFrame([ages, heights, gender]).T
In [7]:
all_data.columns = ['age', 'height', 'gender']
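Because the positive-age filter dropped rows from ages, those positions are absent from its index; the alignment in the DataFrame constructor fills them with NaN, which is how the age nulls reappear. A quick check (added here):

# Rows dropped from `ages` resurface as NaN after alignment
print(all_data['age'].isna().sum())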
Preview:
In [8]:
all_data.head()
Out[8]:
In [9]:
# Grab one row; it will be duplicated into several others below
entry = all_data.iloc[4]
In [10]:
entry
Out[10]:
Making an index:
In [11]:
n = 10
# Random 10-character uppercase labels, one per row
index = ["".join(ch(string.ascii_uppercase) for i in range(n)) for x in range(n_points)]
# Duplicate one entry across rows 4-7 and force those rows to share
# an index label, so the duplicated data looks like genuinely repeated records:
for row in range(4, 8):
    all_data.iloc[row] = entry
    index[row] = 'YPUQAPSOYJ'
all_data.index = index
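A check (added here) that the forced duplicates all share one index label:

# The four duplicated rows carry the same label and values
print(all_data.loc['YPUQAPSOYJ'])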
In [12]:
all_data.head(20)
Out[12]:
In [13]:
all_data.to_csv('all_data.csv')
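As a final sketch (assuming the file path above), re-reading the CSV confirms the messy data round-trips; index_col=0 restores the string index:

# Round-trip check: reload the file and verify the shape
check = pd.read_csv('all_data.csv', index_col=0)
print(check.shape)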