Scikit-learn's `datasets` module provides a number of functions for generating synthetic data. Below are some examples of the options available.
In [1]:
#Import pandas and plotting libraries to visualize data
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
In [2]:
from sklearn.datasets import make_blobs

# Draw 100 two-dimensional points spread over 5 blob centres.
# random_state pins the generator so Restart & Run All reproduces the same
# points (and therefore the same figure) on every run.
blob_data, blob_labels = make_blobs(
    n_samples=100,
    n_features=2,
    centers=5,
    cluster_std=0.8,
    random_state=42,
)

# Colour each point by the blob it was sampled from.
fig, ax = plt.subplots()
ax.scatter(blob_data[:, 0], blob_data[:, 1], c=blob_labels, cmap='viridis')
ax.set(title='make_blobs', xlabel='x', ylabel='y');
In [3]:
# Tabulate the two blob coordinates next to each sample's blob label,
# then preview the first two rows.
blob_df = pd.DataFrame(blob_data, columns=['x', 'y']).assign(Label=blob_labels)
blob_df.head(2)
Out[3]:
Create two concentric circles: a smaller circle nested inside a larger one.
In [4]:
from sklearn.datasets import make_circles

# Two concentric circles: factor=0.5 makes the inner circle half the size of
# the outer one, and noise adds a little Gaussian jitter to every point.
# random_state pins the generator so the figure is reproducible.
circles_data, circles_labels = make_circles(
    n_samples=100,
    noise=0.03,
    factor=0.5,
    random_state=42,
)

# Colour each point by the circle (inner/outer) it belongs to.
fig, ax = plt.subplots()
ax.scatter(circles_data[:, 0], circles_data[:, 1], c=circles_labels, cmap='viridis')
ax.set(title='make_circles', xlabel='x', ylabel='y');
In [5]:
# Tabulate the circle coordinates next to each sample's class,
# then preview the first two rows.
circles_df = pd.DataFrame(circles_data, columns=['x', 'y']).assign(Class=circles_labels)
circles_df.head(2)
Out[5]:
In [6]:
from sklearn.datasets import make_regression

# One informative feature with a noisy linear target.
# random_state pins the generator so the figure is reproducible.
regression_data, regression_values = make_regression(
    n_samples=100,
    n_features=1,
    n_informative=1,
    noise=5,
    random_state=42,
)

# No `cmap` here: without a `c` array there is nothing to colour-map, and
# matplotlib warns that the colormap argument is being ignored.
fig, ax = plt.subplots()
ax.scatter(regression_data[:, 0], regression_values)
ax.set(title='make_regression', xlabel='Feature 1', ylabel='Value');
In [7]:
# Tabulate the single feature next to its regression target,
# then preview the first two rows.
regression_df = pd.DataFrame(regression_data, columns=['Feature 1']).assign(Value=regression_values)
regression_df.head(2)
Out[7]:
In [8]:
from sklearn.datasets import make_biclusters

# A 100 x 2 matrix with 2 planted biclusters.
# random_state pins the generator so the table below is reproducible.
biclusters_data, biclusters_rows, biclusters_cols = make_biclusters(
    shape=(100, 2),
    n_clusters=2,
    random_state=42,
)

# Per the scikit-learn docs, `biclusters_rows` holds one boolean indicator
# array per cluster (shape: n_clusters x n_rows), so each entry lines up with
# the 100 data rows and can be stored as a column.
biclusters_df = pd.DataFrame({
    'x': biclusters_data[:, 0],
    'y': biclusters_data[:, 1],
    'Row Class 1': biclusters_rows[0],
    'Row Class 2': biclusters_rows[1],
})
biclusters_df.head(2)
Out[8]:
In [9]:
# Third value returned by make_biclusters; shown on its own because it is not
# stored in biclusters_df.
# NOTE(review): presumably one indicator row per cluster with one entry per
# data column — confirm against the scikit-learn docs.
biclusters_cols
Out[9]:
In [10]:
from sklearn.datasets import make_classification

# Three-class problem with 4 features: 3 informative plus 1 redundant.
# random_state pins the generator so the table below is reproducible.
classification_data, classification_class = make_classification(
    n_samples=100,
    n_features=4,
    n_informative=3,
    n_redundant=1,
    n_classes=3,
    random_state=42,
)

# One column per feature, plus the class label of each sample.
classification_df = pd.DataFrame({
    'Feature 1': classification_data[:, 0],
    'Feature 2': classification_data[:, 1],
    'Feature 3': classification_data[:, 2],
    'Feature 4': classification_data[:, 3],
    'Class': classification_class,
})
classification_df.head(2)
Out[10]:
In [11]:
from sklearn.datasets import make_multilabel_classification

# Multilabel problem: 4 features, 2 possible classes; n_labels sets the
# average number of labels per sample.
# random_state pins the generator so the table below is reproducible.
multilabel_classification_data, multilabel_classification_classes = make_multilabel_classification(
    n_samples=100,
    n_features=4,
    n_classes=2,
    n_labels=2,
    random_state=42,
)

# One column per feature, followed by one indicator column per class
# (the label array has one column for each class).
multilabel_classification_df = pd.DataFrame({
    'Feature 1': multilabel_classification_data[:, 0],
    'Feature 2': multilabel_classification_data[:, 1],
    'Feature 3': multilabel_classification_data[:, 2],
    'Feature 4': multilabel_classification_data[:, 3],
    'Class 1': multilabel_classification_classes[:, 0],
    'Class 2': multilabel_classification_classes[:, 1],
})
multilabel_classification_df.head(2)
Out[11]:
In [12]:
from sklearn.datasets import make_moons

# Two interleaving half-circles with no added noise.
# Even with noise=0 the samples are shuffled by default, so random_state is
# pinned to keep the sample order stable across runs.
moons_data, moons_labels = make_moons(n_samples=100, noise=0, random_state=42)

# Colour each point by the half-moon it belongs to.
fig, ax = plt.subplots()
ax.scatter(moons_data[:, 0], moons_data[:, 1], c=moons_labels, cmap='viridis')
ax.set(title='make_moons', xlabel='x', ylabel='y');
In [13]:
from sklearn.datasets import make_spd_matrix

# Random 5 x 5 symmetric positive-definite matrix.
# random_state pins the generator so the heatmap is reproducible.
spd_matrix = make_spd_matrix(n_dim=5, random_state=42)

# annot=True prints each cell's value on top of its colour.
ax = sns.heatmap(data=spd_matrix, annot=True, cmap='viridis')
ax.set_title('make_spd_matrix (5 x 5)');
In [14]:
# Show the same matrix as a table (default integer row/column labels).
pd.DataFrame(data=spd_matrix)
Out[14]: