Generating Sample Data from Distributions

The generated data are then treated as if they were samples of real data


In [1]:
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): presumably done to keep cell output tidy; this also hides
# deprecation warnings raised by the libraries used further down.
warnings.filterwarnings('ignore')

In [2]:
# Render matplotlib figures inside the notebook output.
%matplotlib inline
# NOTE(review): %pylab star-imports numpy and matplotlib names into the
# interactive namespace (see the "Populating the interactive namespace"
# message it prints); nothing visible in this notebook uses those bare
# names, numpy is imported explicitly as np later on.
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [3]:
import pandas as pd
print(pd.__version__)


0.23.4

Plot randomly generated classification dataset

http://scikit-learn.org/stable/auto_examples/datasets/plot_random_dataset.html

http://scikit-learn.org/stable/datasets/index.html#sample-generators

http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html

Plot several randomly generated 2D classification datasets. This example illustrates the `datasets.make_classification`, `datasets.make_blobs` and `datasets.make_gaussian_quantiles` functions.

For make_classification, three binary and two multi-class classification datasets are generated, with different numbers of informative features and clusters per class.


In [20]:
import numpy as np

from sklearn.datasets import make_classification
from sklearn.datasets import make_blobs

# Total number of samples drawn across all clusters.
N_SAMPLES = 500

# One fixed seed per data set, so each set can be regenerated exactly.
TRAIN_SEED = 42
TEST_SEED = 13

# Seed used for this run: TEST_SEED produces the test data set.
# Switch to `SEED = TRAIN_SEED` to produce the training data set instead.
SEED = TEST_SEED

# make_blobs reference:
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html#sklearn.datasets.make_blobs
#
# Background for the choice of clusters:
# https://www.welt.de/motor/news/article156991316/Unfallstatistik-2015.html
# http://www.openculture.com/2017/12/why-incompetent-people-think-theyre-amazing.html

# One entry per cluster: (center, standard deviation). The three center
# coordinates end up in the columns 'speed', 'age' and 'miles', in that order.
# The trailing comment gives the cluster id, its meaning and its risk colour.
cluster_specs = [
    ((150, 35, 50), 4),    # 0: young drivers with fast cars: red
    ((110, 50, 25), 9),    # 1: reasonable drivers: green
    ((120, 55, 30), 18),   # 2: a little bit older, more kilometers, general noise: yellow
    ((120, 75, 20), 8),    # 3: really old drivers: red
    ((120, 30, 30), 9),    # 4: young drivers: red
    ((140, 45, 40), 5),    # 5: another green just to have a counter part to all the red ones: green
    ((110, 40, 15), 8),    # 6: people who do not drive a lot: green
    ((130, 50, 45), 12),   # 7: people who drive a lot: yellow
    ((100, 25, 15), 5),    # 8: young people with slow cars: yellow
]

# Split the table back into the two parallel lists make_blobs expects.
centers, cluster_std = (list(column) for column in zip(*cluster_specs))

# X holds the three features per sample, y the id (0..8) of the cluster each
# sample was drawn from.
X, y = make_blobs(
    n_samples=N_SAMPLES,
    n_features=3,
    centers=centers,
    cluster_std=cluster_std,
    random_state=SEED,
)

# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
# X, y = make_classification(n_features=3, n_redundant=0, n_informative=3,
#                              n_clusters_per_class=2, n_classes=3, random_state=42)

feature_names = ['speed', 'age', 'miles']

# Build the frame in one chain: wrap the raw samples, round every value to the
# nearest whole number, then clamp each feature to its allowed range.
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.clip.html
df = (
    pd.DataFrame(X, columns=feature_names)
    .round()
    .assign(
        speed=lambda frame: frame['speed'].clip(60, 200),
        age=lambda frame: frame['age'].clip(16, 100),
        miles=lambda frame: frame['miles'].clip(1, 500),
    )
)

# Merge the nine generated clusters into three groups. The position in the
# lookup table is the cluster id produced by make_blobs, the value is the group
# that cluster is folded into:
#   clusters 0, 3, 4 -> group 0
#   clusters 1, 5, 6 -> group 1
#   clusters 2, 7, 8 -> group 2
cluster_to_group = np.array([0, 1, 2, 0, 0, 1, 1, 2, 2])

# Vectorized replacement for an element-by-element loop. The right-hand side is
# evaluated into a new array first, so assigning through y[...] safely
# overwrites y in place and keeps its dtype.
y[...] = cluster_to_group[y]

In [21]:
# Append the merged group label as the last column, producing a new frame
# instead of mutating the existing one in place.
df = df.assign(group=y)

In [22]:
# Summary statistics of the generated features and the merged group label.
df.describe()


Out[22]:
speed age miles group
count 500.000000 500.000000 500.000000 500.000000
mean 122.220000 45.342000 29.852000 0.996000
std 16.857995 16.724127 15.367186 0.818121
min 87.000000 16.000000 1.000000 0.000000
25% 109.000000 33.000000 18.000000 0.000000
50% 120.000000 42.500000 29.000000 1.000000
75% 135.000000 55.000000 41.000000 2.000000
max 160.000000 96.000000 76.000000 2.000000

In [23]:
# Derive the file name from the run configuration instead of hard-coding it,
# so the sample count in the name always matches N_SAMPLES and data generated
# with TEST_SEED is kept apart from training data by a '-test' suffix
# (e.g. insurance-customers-500-test.csv vs. insurance-customers-500.csv).
csv_path = './insurance-customers-{}{}.csv'.format(
    N_SAMPLES, '-test' if SEED == TEST_SEED else '')

# Semicolon-separated, without the index column.
df.to_csv(csv_path, sep=';', index=False)

In [24]:
# check
# !curl -O https://raw.githubusercontent.com/DJCordhose/data-viz/master/data/insurance-customers-1500.csv
# !curl -O https://raw.githubusercontent.com/DJCordhose/data-viz/master/data/insurance-customers-300.csv

In [25]:
# List the working directory (shell escape), e.g. to see the CSV file written above.
!ls -l


total 6777
-rw-r--r-- 1 olive 197609   15195 Mar 28 17:31 0-generate.ipynb
drwxr-xr-x 1 olive 197609       0 Mar 14 23:44 data
drwxr-xr-x 1 olive 197609       0 Mar 14 23:44 figures
-rw-r--r-- 1 olive 197609  188340 Mar 28 17:27 insurance-customers-10000.csv
-rw-r--r-- 1 olive 197609    9459 Mar 28 17:27 insurance-customers-500.csv
-rw-r--r-- 1 olive 197609    9441 Mar 28 17:31 insurance-customers-500-test.csv
drwxr-xr-x 1 olive 197609       0 Mar 14 23:44 model
-rw-r--r-- 1 olive 197609    5680 Mar 14 23:44 production preparation.ipynb
-rw-r--r-- 1 olive 197609      88 Mar 14 23:44 sample_insurance.json
drwxr-xr-x 1 olive 197609       0 Mar 14 23:44 tf
-rw-r--r-- 1 olive 197609  635656 Mar 14 23:44 U3-M1-example.ipynb
-rw-r--r-- 1 olive 197609 3652988 Mar 14 23:44 U3-M2-nn-intro.ipynb
-rw-r--r-- 1 olive 197609 2016209 Mar 14 23:44 U3-M3-nn-no-bullshit.ipynb
-rw-r--r-- 1 olive 197609  343107 Mar 14 23:44 U3-M3-nn-simplified.ipynb
-rw-r--r-- 1 olive 197609    8412 Mar 14 23:44 U4-M1-Preparing TensorFlow models.ipynb
-rw-r--r-- 1 olive 197609   20381 Mar 14 23:44 U4-M2-Serving TensorFlow models.ipynb
-rw-r--r-- 1 olive 197609    9954 Mar 14 23:44 U4-M3-Deploying to the Browser using TensorFlow.js.ipynb

In [26]:
# Read back the file that belongs to the current run configuration: the name
# is built from N_SAMPLES, with a '-test' suffix when the test seed is active,
# so this check cannot silently load a data set from a different configuration.
csv_path = './insurance-customers-{}{}.csv'.format(
    N_SAMPLES, '-test' if SEED == TEST_SEED else '')

customers = pd.read_csv(csv_path, sep=';')

In [27]:
# Summary statistics of the frame read back from disk, for comparison with
# the df.describe() output further up.
customers.describe()


Out[27]:
speed age miles group
count 500.000000 500.000000 500.000000 500.000000
mean 122.220000 45.342000 29.852000 0.996000
std 16.857995 16.724127 15.367186 0.818121
min 87.000000 16.000000 1.000000 0.000000
25% 109.000000 33.000000 18.000000 0.000000
50% 120.000000 42.500000 29.000000 1.000000
75% 135.000000 55.000000 41.000000 2.000000
max 160.000000 96.000000 76.000000 2.000000

In [ ]: