Make Virtual Data


In [14]:
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

1. make_classification


In [15]:
from sklearn.datasets import make_classification

In [16]:
# n_samples : the number of samples to generate (default 100)
# n_features : the total number of features (default 20)
# n_informative : the number of informative features (default 2)
# n_redundant : the number of redundant features, linear combinations of the informative ones (default 2)
# n_repeated : the number of features simply duplicated from the informative/redundant ones (default 0)
# n_classes : the number of classes (default 2)
# n_clusters_per_class : the number of clusters per class (default 2)
# weights : the proportions of samples assigned to each class (default None, i.e. balanced)
# random_state : random seed (default None)

# 100 samples with 20 features each
X, y = make_classification(n_samples=100, n_features=20, n_informative=2, n_redundant=2, 
                           n_repeated=0, n_classes=2, n_clusters_per_class=2,      
                           weights=None, flip_y=0.01, class_sep=1.0, hypercube=True, 
                           shift=0.0, scale=1.0, shuffle=True, random_state=None)
# X : feature, y : target
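
Before plotting anything, it is worth a quick sanity check of what came back: X should be a 100x20 feature matrix and y a roughly balanced binary label vector. A minimal sketch (the exact counts can differ slightly because flip_y=0.01 randomly flips about 1% of the labels):

In [ ]:
print(X.shape)         # (100, 20)
print(np.bincount(y))  # samples per class, e.g. [50 50]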

In [17]:
# c=y : color each point by its class label (0 or 1)
X, y = make_classification(n_features=1, n_redundant=0, n_informative=1, n_clusters_per_class=1, random_state=4)
plt.scatter(X, y, marker='o', c=y, s=100)
len(X[0]), len(y)


Out[17]:
(1, 100)
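
The scatter above plots the single feature directly against its label. Another way to see the separation is to compare the class-conditional distributions of that feature; this is a minimal sketch using the seaborn import from the top of the notebook:

In [ ]:
# distribution of the single feature for each class
sns.kdeplot(X[y == 0, 0], label="class 0")
sns.kdeplot(X[y == 1, 0], label="class 1")
plt.legend()
plt.show()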

In [18]:
# two features & one informative
# in this case, X[:, 1] is the informative feature
plt.title("One informative feature, one cluster per class")
X, y = make_classification(n_features=2, n_redundant=0, n_informative=1, n_clusters_per_class=1, random_state=4)
plt.scatter(X[:, 0], X[:, 1], marker='o', c=y, s=100)
# plt.scatter(X[:, 0], y, marker='o', c="r", s=100)
# plt.scatter(X[:, 1], y, marker='o', c=y, s=100)


Out[18]:
<matplotlib.collections.PathCollection at 0x10a3abf98>
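
Because shuffle=True, the informative column is not guaranteed to land in a fixed position, so it is worth verifying which of the two columns actually carries the signal. A minimal sketch using the correlation of each column with the label:

In [ ]:
# the column with the larger |correlation| is the informative one
for i in range(X.shape[1]):
    print(i, np.corrcoef(X[:, i], y)[0, 1])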

In [19]:
# two features & two informative
# in this case, both X[:, 0] and X[:, 1] are informative features
plt.title("Two informative features, one cluster per class")
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, n_clusters_per_class=1, random_state=6)
plt.scatter(X[:, 0], X[:, 1], marker='o', c=y, s=100)
# plt.scatter(X[:, 0], y, marker='o', c=y, s=100)
# plt.scatter(X[:, 1], y, marker='o', c=y, s=100)


Out[19]:
<matplotlib.collections.PathCollection at 0x10aa28828>

In [20]:
# weights 0.9 : 0.1
plt.title("Two informative features, one cluster per class, different weight")
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, n_clusters_per_class=1, weights=[0.9, 0.1], random_state=6)
plt.scatter(X[:, 0], X[:, 1], marker='o', c=y, s=100)


Out[20]:
<matplotlib.collections.PathCollection at 0x10aa98400>
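
Counting the labels shows the imbalance that weights=[0.9, 0.1] requests; the counts are only approximate because flip_y randomly flips about 1% of the labels. A minimal sketch:

In [ ]:
# roughly 90 samples in class 0 and 10 in class 1
print(np.bincount(y))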

In [21]:
plt.title("Two informative features, two clusters per class")
X2, Y2 = make_classification(n_features=2, n_redundant=0, n_informative=2, n_clusters_per_class=2, random_state=2)
plt.scatter(X2[:, 0], X2[:, 1], marker='o', c=Y2, s=100)


Out[21]:
<matplotlib.collections.PathCollection at 0x10b43c278>

In [22]:
# n_classes=3 : the number of classes (three classes instead of the default two)
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, n_clusters_per_class=1, n_classes=3, random_state=2)
plt.scatter(X[:, 0], X[:, 1], marker='o', c=y, s=100)


Out[22]:
<matplotlib.collections.PathCollection at 0x10b4a6f98>
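
Note that make_classification requires n_classes * n_clusters_per_class to be at most 2**n_informative, which is why three classes need n_informative=2 here. As a quick numeric summary of the three classes, the per-class feature means can be inspected with pandas (already imported); a minimal sketch:

In [ ]:
# mean of each feature within each of the three classes
df = pd.DataFrame(X, columns=["x0", "x1"])
df["y"] = y
print(df.groupby("y").mean())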

2. make_blobs


In [23]:
from sklearn.datasets import make_blobs

In [24]:
X, y = make_blobs(n_samples=100, n_features=2, centers=3, 
                  cluster_std=1.0, center_box=(-10.0, 10.0), 
                  shuffle=True, random_state=None)

In [25]:
plt.title("Three blobs")
X1, Y1 = make_blobs(n_features=2, centers=3)
plt.scatter(X1[:, 0], X1[:, 1], marker='o', c=Y1, s=100)


Out[25]:
<matplotlib.collections.PathCollection at 0x10ba35b00>
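
cluster_std controls how tightly the samples concentrate around each center. A minimal sketch comparing a small and a large value side by side (the specific values and subplot layout are just for illustration):

In [ ]:
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for ax, std in zip(axes, [0.5, 3.0]):
    Xb, yb = make_blobs(n_features=2, centers=3, cluster_std=std, random_state=0)
    ax.scatter(Xb[:, 0], Xb[:, 1], marker='o', c=yb, s=100)
    ax.set_title("cluster_std = {}".format(std))
plt.show()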
