In [1]:
    
import random
import string
from random import choice as ch

import numpy as np
import pandas as pd
from numpy.random import choice
from numpy.random import exponential
from numpy.random import normal
    
In [2]:
    
# Number of synthetic records to generate.
n_points = 200

# Fixed seed so that Restart & Run All regenerates the same dataset.
# NumPy's global generator drives normal/exponential/choice (and pandas'
# Series.sample when no random_state is passed); the stdlib generator
# drives the `ch` alias used for the random index strings.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
    
In [3]:
    
# Ages: exponential distribution shifted by 18, so clean values start at 18.
ages = pd.Series(exponential(15, size=n_points)) + 18

# Blank out 5% of the values.
fraction = .05
ages.loc[ages.sample(frac=fraction).index] = np.nan

# Only .5% "super-outliers", because 5% would affect the mean a lot.
# Sample once, so the selected labels and the number of replacement
# values always agree.
fraction = .005
super_outlier_idx = ages.sample(frac=fraction).index
ages.loc[super_outlier_idx] = normal(180, 50, size=len(super_outlier_idx))

# A single absurd value at label 50, to wreck the mean and standard deviation.
ages.loc[50] = 300000000

# 4% outliers under 18. Draw one value per selected row: a bare
# normal(5, 3) returns a single scalar, which would give every one of
# these rows the same age.
fraction = .04
under_18_idx = ages.sample(frac=fraction).index
ages.loc[under_18_idx] = normal(5, 3, size=len(under_18_idx))

# Keep only strictly positive ages. NaN > 0 is False, so this also removes
# the rows blanked out above (along with any non-positive draws); the cast
# to int below relies on that, because NaN cannot be converted to int.
# NOTE(review): the removed labels are expected to come back as missing
# ages once the Series are combined by index label -- confirm when
# changing how the frame is built.
ages = ages[ages > 0]

# Truncate to whole years.
ages = ages.astype(int)
    
In [4]:
    
# Gender labels with deliberate dirt: an inconsistently cased 'MALE' and
# about 3% missing values.
# The options live in an object array: a plain list mixing strings with
# NaN is converted by NumPy to a string array, which turns the missing
# marker into the literal text 'nan' instead of a real NaN.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
gender_options = np.array(['male', 'female', 'MALE', np.nan], dtype=object)
gender = pd.Series(choice(gender_options, size=n_points,
                          p=[.4, .55, .02, .03]))
    
In [5]:
    
# Heights: normally distributed around 170 with a standard deviation of 15.
heights = pd.Series(normal(170, 15, size=n_points))
# make a couple of super outliers
heights.iloc[20] = 252
heights.iloc[21] = 65
heights.iloc[22] = 235
# Truncate to whole numbers, but keep a float dtype: NaN is stored below,
# and writing NaN into an int64 Series is an incompatible-dtype assignment
# that newer pandas versions warn about instead of silently upcasting.
heights = heights.astype(int).astype(float)
# make 2% missing data
fraction = .02
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
heights.loc[heights.sample(frac=fraction).index] = np.nan
    
In [6]:
    
# Stack the three Series as rows (pandas aligns them on their index labels,
# filling labels a Series lacks with NaN), then flip the frame so that each
# Series becomes a column.
all_data = pd.DataFrame(data=[ages, heights, gender]).transpose()
    
In [7]:
    
# Name the columns in the order the Series were stacked.
all_data.columns = pd.Index(['age', 'height', 'gender'])
    
Preview:
In [8]:
    
# Show the first rows of the assembled frame (head() defaults to 5 rows).
all_data.head()
    
    Out[8]:
In [9]:
    
# Keep the row at position 4; it is reused further down as the record
# that gets copied over neighbouring rows to create duplicates.
entry = all_data.iloc[4]
    
In [10]:
    
# Display the row that was picked for duplication.
entry
    
    Out[10]:
Making an index:
In [11]:
    
# Length of each random record ID.
n = 10
# One ID per row, each made of n independently chosen uppercase ASCII letters.
index = [
    "".join(ch(string.ascii_uppercase) for _letter in range(n))
    for _record in range(n_points)
]
# Overwrite the rows at positions 4-7 with the saved `entry`, which
# produces duplicated records ...
for row in range(4, 8):
    all_data.iloc[row] = entry
# ... and give those same positions one shared ID, so the index is
# duplicated as well.
index[4:8] = ['YPUQAPSOYJ'] * 4
all_data.index = index
    
In [12]:
    
# Show the first 20 rows, which include the duplicated records at
# positions 4-7 and their shared index label.
all_data.head(20)
    
    Out[12]:
In [13]:
    
# Write the dataset to 'all_data.csv' (a relative path, so it lands in the
# current working directory). The index is written too by default, so the
# random IDs become the first column of the file.
all_data.to_csv('all_data.csv')