In [1]:
import warnings
warnings.filterwarnings('ignore')
In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): %pylab additionally star-imports numpy and matplotlib.pyplot
# names into the interactive namespace. No visible cell relies on those names
# (numpy is imported explicitly further down) -- confirm before removing.
%pylab inline
In [3]:
import pandas as pd
# Print the installed pandas version so the environment behind the outputs is recorded.
print(pd.__version__)
http://scikit-learn.org/stable/auto_examples/datasets/plot_random_dataset.html
http://scikit-learn.org/stable/datasets/index.html#sample-generators
http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
Plot several randomly generated 2D classification datasets.
This example illustrates the `datasets.make_classification`,
`datasets.make_blobs` and `datasets.make_gaussian_quantiles`
functions.
For make_classification, three binary and two multi-class classification
datasets are generated, with different numbers of informative features and
clusters per class.
In [20]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.datasets import make_blobs
# Number of rows to generate.
N_SAMPLES = 500

# Separate seeds for the training and the test data set.
TRAIN_SEED, TEST_SEED = 42, 13

# Seed used by this run. It currently selects the test data set; assign
# TRAIN_SEED here instead to regenerate the training data set.
SEED = TEST_SEED
# References:
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html#sklearn.datasets.make_blobs
# https://www.welt.de/motor/news/article156991316/Unfallstatistik-2015.html
# http://www.openculture.com/2017/12/why-incompetent-people-think-theyre-amazing.html

# One entry per cluster: (centre, standard deviation). The position in this
# list is the cluster index; the trailing comment gives the cluster's meaning
# and colour.
_cluster_specs = [
    ((150, 35, 50), 4),    # 0: young drivers with fast cars: red
    ((110, 50, 25), 9),    # 1: reasonable drivers: green
    ((120, 55, 30), 18),   # 2: a little bit older, more kilometers, general noise: yellow
    ((120, 75, 20), 8),    # 3: really old drivers: red
    ((120, 30, 30), 9),    # 4: young drivers: red
    ((140, 45, 40), 5),    # 5: another green just to have a counter part to all the red ones: green
    ((110, 40, 15), 8),    # 6: people who do not drive a lot: green
    ((130, 50, 45), 12),   # 7: people who drive a lot: yellow
    ((100, 25, 15), 5),    # 8: young people with slow cars: yellow
]

# Split the table into the two parallel lists consumed by make_blobs.
centers = [centre for centre, _ in _cluster_specs]
cluster_std = [spread for _, spread in _cluster_specs]
# Draw N_SAMPLES three-feature points around the configured cluster centres.
# X receives the feature matrix and y the cluster index of each sample.
# (Earlier variants of this call hard-coded n_samples=300 with
# random_state=13 or random_state=42.)
X, y = make_blobs(
    n_samples=N_SAMPLES,
    n_features=3,
    centers=centers,
    cluster_std=cluster_std,
    random_state=SEED,
)
# Alternative generator, left disabled:
# http://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html
# X, y = make_classification(n_features=3, n_redundant=0, n_informative=3,
#                            n_clusters_per_class=2, n_classes=3, random_state=42)

feature_names = ['speed', 'age', 'miles']

# Label the columns, round every value to a whole number, then clamp each
# feature to its allowed range.
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.clip.html
df = (
    pd.DataFrame(X, columns=feature_names)
    .round()
    .assign(
        speed=lambda frame: frame['speed'].clip(60, 200),
        age=lambda frame: frame['age'].clip(16, 100),
        miles=lambda frame: frame['miles'].clip(1, 500),
    )
)
# Merge the nine generated clusters into three groups
# (0 = red, 1 = green, 2 = yellow, following the per-cluster colour notes).
# Position i of the lookup table holds the group for cluster i:
#   clusters 0, 3, 4 -> group 0
#   clusters 1, 5, 6 -> group 1
#   clusters 2, 7, 8 -> group 2
# Indexing the table with y relabels all samples in one vectorized step, and
# assigning through y[...] updates the existing label array in place.
cluster_to_group = np.array([0, 1, 2, 0, 0, 1, 1, 2, 2])
y[...] = cluster_to_group[y]
In [21]:
# Attach the merged group label of every sample as the 'group' column.
df = df.assign(group=y)
In [22]:
# Summary statistics of the generated feature columns and the group label.
df.describe()
Out[22]:
In [23]:
# Write the generated data as a semicolon-separated file without the index column.
# File names from the disabled variants of this call (presumably used with
# other N_SAMPLES / SEED settings -- the name is not derived from them):
#   ./insurance-customers-300-2.csv
#   ./insurance-customers-300.csv
#   ./insurance-customers-1500.csv
#   ./insurance-customers-10000.csv
df.to_csv(path_or_buf='./insurance-customers-500-test.csv', sep=';', index=False)
In [24]:
# check: optionally download the published CSV files from GitHub (commands left disabled)
# !curl -O https://raw.githubusercontent.com/DJCordhose/data-viz/master/data/insurance-customers-1500.csv
# !curl -O https://raw.githubusercontent.com/DJCordhose/data-viz/master/data/insurance-customers-300.csv
In [25]:
# List the working directory, presumably to confirm that the CSV file was written.
!ls -l
In [26]:
# Load the semicolon-separated file back from disk.
# File names from the disabled variants of this call:
#   ./insurance-customers-300.csv
#   ./insurance-customers-1500.csv
#   ./insurance-customers-10000.csv
customers = pd.read_csv(filepath_or_buffer='./insurance-customers-500-test.csv', sep=';')
In [27]:
# Summary statistics of the re-loaded file; expected to match the statistics
# shown for df before it was written.
customers.describe()
Out[27]:
In [ ]: