Basic Machine Learning Introduction-checkpoint


Importing our modules


In [34]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import datasets
from sklearn import svm
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.cross_validation import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt

Pandas: Import Data from a CSV File into a Pandas DataFrame


In [2]:
filename_csv = '../datasets/IRIS.csv'

csv_data = pd.read_csv(filename_csv)
sk_data = datasets.load_iris()
#features = sk_data.data[:, :2]  # we only take the first two features.
#targets = sk_data.target


print("Pandas Dataframe Describe method: \n")
print(csv_data.describe())
print("\nPandas Dataframe Mean method: \n")
print(csv_data.mean())

print("\n\nVariable 'sk_data' has a type of: ")
print(type(sk_data))
print("\n sk_data data: ")

print("\nVariable 'csv_type' has a type of: ")
print(type(csv_data))
print("\nCSV Data:")
print(csv_data)


Pandas Dataframe Describe method: 

         column1    column2    column3     column4     column5     column6  \
count  99.000000  99.000000  99.000000  100.000000  100.000000  100.000000   
mean    5.926263   0.451278   3.040404    0.433849    3.851000    0.483319   
std     0.851959   0.237209   0.441685    0.182881    1.785378    0.302445   
min     4.300000   0.010000   2.000000    0.010000    1.000000    0.010000   
25%     5.200000   0.250000   2.800000    0.333333    1.600000    0.101695   
50%     5.900000   0.444444   3.000000    0.416667    4.500000    0.593220   
75%     6.500000   0.611111   3.300000    0.541667    5.100000    0.694915   
max     7.900000   0.999900   4.400000    0.999900    6.900000    0.999900   

         column7     column8  
count  100.00000  100.000000  
mean     1.21300    0.464048  
std      0.74558    0.310207  
min      0.10000    0.010000  
25%      0.37500    0.114583  
50%      1.40000    0.541667  
75%      1.80000    0.708333  
max      2.50000    0.999900  

Pandas Dataframe Mean method: 

column1    5.926263
column2    0.451278
column3    3.040404
column4    0.433849
column5    3.851000
column6    0.483319
column7    1.213000
column8    0.464048
dtype: float64


Variable 'sk_data' has a type of: 
<class 'sklearn.datasets.base.Bunch'>

 sk_data data: 

Variable 'csv_type' has a type of: 
<class 'pandas.core.frame.DataFrame'>

CSV Data:
    column1   column2  column3   column4  column5   column6  column7  \
0       5.1  0.222222      3.5  0.625000      1.4  0.067797      0.2   
1       4.9       NaN      3.0  0.416667      1.4  0.067797      0.2   
2       NaN  0.111111      3.2  0.500000      1.3  0.050847      0.2   
3       4.6  0.083333      NaN  0.458333      1.5  0.084746      0.2   
4       5.0  0.194444      3.6  0.666667      1.4  0.067797      0.2   
5       5.4  0.305556      3.9  0.791667      1.7  0.118644      0.4   
6       4.6  0.083333      3.4  0.583333      1.4  0.067797      0.3   
7       5.0  0.194444      3.4  0.583333      1.5  0.084746      0.2   
8       4.4  0.027778      2.9  0.375000      1.4  0.067797      0.2   
9       4.9  0.166667      3.1  0.458333      1.5  0.084746      0.1   
10      5.4  0.305556      3.7  0.708333      1.5  0.084746      0.2   
11      4.8  0.138889      3.4  0.583333      1.6  0.101695      0.2   
12      4.8  0.138889      3.0  0.416667      1.4  0.067797      0.1   
13      4.3  0.010000      3.0  0.416667      1.1  0.016949      0.1   
14      5.8  0.416667      4.0  0.833333      1.2  0.033898      0.2   
15      5.7  0.388889      4.4  0.999900      1.5  0.084746      0.4   
16      5.4  0.305556      3.9  0.791667      1.3  0.050847      0.4   
17      5.1  0.222222      3.5  0.625000      1.4  0.067797      0.3   
18      5.7  0.388889      3.8  0.750000      1.7  0.118644      0.3   
19      5.1  0.222222      3.8  0.750000      1.5  0.084746      0.3   
20      5.4  0.305556      3.4  0.583333      1.7  0.118644      0.2   
21      5.1  0.222222      3.7  0.708333      1.5  0.084746      0.4   
22      4.6  0.083333      3.6  0.666667      1.0  0.010000      0.2   
23      5.1  0.222222      3.3  0.541667      1.7  0.118644      0.5   
24      4.8  0.138889      3.4  0.583333      1.9  0.152542      0.2   
25      5.0  0.194444      3.0  0.416667      1.6  0.101695      0.2   
26      5.0  0.194444      3.4  0.583333      1.6  0.101695      0.4   
27      5.2  0.250000      3.5  0.625000      1.5  0.084746      0.2   
28      5.2  0.250000      3.4  0.583333      1.4  0.067797      0.2   
29      4.7  0.111111      3.2  0.500000      1.6  0.101695      0.2   
..      ...       ...      ...       ...      ...       ...      ...   
70      6.5  0.611111      3.0  0.416667      5.8  0.813559      2.2   
71      7.6  0.916667      3.0  0.416667      6.6  0.949153      2.1   
72      4.9  0.166667      2.5  0.208333      4.5  0.593220      1.7   
73      7.3  0.833333      2.9  0.375000      6.3  0.898305      1.8   
74      6.7  0.666667      2.5  0.208333      5.8  0.813559      1.8   
75      7.2  0.805556      3.6  0.666667      6.1  0.864407      2.5   
76      6.5  0.611111      3.2  0.500000      5.1  0.694915      2.0   
77      6.4  0.583333      2.7  0.291667      5.3  0.728814      1.9   
78      6.8  0.694444      3.0  0.416667      5.5  0.762712      2.1   
79      5.7  0.388889      2.5  0.208333      5.0  0.677966      2.0   
80      5.8  0.416667      2.8  0.333333      5.1  0.694915      2.4   
81      6.4  0.583333      3.2  0.500000      5.3  0.728814      2.3   
82      6.5  0.611111      3.0  0.416667      5.5  0.762712      1.8   
83      7.7  0.944444      3.8  0.750000      6.7  0.966102      2.2   
84      7.7  0.944444      2.6  0.250000      6.9  0.999900      2.3   
85      6.0  0.472222      2.2  0.083333      5.0  0.677966      1.5   
86      6.9  0.722222      3.2  0.500000      5.7  0.796610      2.3   
87      5.6  0.361111      2.8  0.333333      4.9  0.661017      2.0   
88      7.7  0.944444      2.8  0.333333      6.7  0.966102      2.0   
89      6.3  0.555556      2.7  0.291667      4.9  0.661017      1.8   
90      6.7  0.666667      3.3  0.541667      5.7  0.796610      2.1   
91      7.2  0.805556      3.2  0.500000      6.0  0.847458      1.8   
92      6.2  0.527778      2.8  0.333333      4.8  0.644068      1.8   
93      6.1  0.500000      3.0  0.416667      4.9  0.661017      1.8   
94      6.4  0.583333      2.8  0.333333      5.6  0.779661      2.1   
95      7.2  0.805556      3.0  0.416667      5.8  0.813559      1.6   
96      7.4  0.861111      2.8  0.333333      6.1  0.864407      1.9   
97      7.9  0.999900      3.8  0.750000      6.4  0.915254      2.0   
98      6.4  0.583333      2.8  0.333333      5.6  0.779661      2.2   
99      6.3  0.555556      2.8  0.333333      5.1  0.694915      1.5   

     column8     target  
0   0.041667     setosa  
1   0.041667     setosa  
2   0.041667     setosa  
3   0.041667     setosa  
4   0.041667     setosa  
5   0.125000     setosa  
6   0.083333     setosa  
7   0.041667     setosa  
8   0.041667     setosa  
9   0.010000     setosa  
10  0.041667     setosa  
11  0.041667     setosa  
12  0.010000     setosa  
13  0.010000     setosa  
14  0.041667     setosa  
15  0.125000     setosa  
16  0.125000     setosa  
17  0.083333     setosa  
18  0.083333     setosa  
19  0.083333     setosa  
20  0.041667     setosa  
21  0.125000     setosa  
22  0.041667     setosa  
23  0.166667     setosa  
24  0.041667     setosa  
25  0.041667     setosa  
26  0.125000     setosa  
27  0.041667     setosa  
28  0.041667     setosa  
29  0.041667     setosa  
..       ...        ...  
70  0.875000  virginica  
71  0.833333  virginica  
72  0.666667  virginica  
73  0.708333  virginica  
74  0.708333  virginica  
75  0.999900  virginica  
76  0.791667  virginica  
77  0.750000  virginica  
78  0.833333  virginica  
79  0.791667  virginica  
80  0.958333  virginica  
81  0.916667  virginica  
82  0.708333  virginica  
83  0.875000  virginica  
84  0.916667  virginica  
85  0.583333  virginica  
86  0.916667  virginica  
87  0.791667  virginica  
88  0.791667  virginica  
89  0.708333  virginica  
90  0.833333  virginica  
91  0.708333  virginica  
92  0.708333  virginica  
93  0.708333  virginica  
94  0.833333  virginica  
95  0.625000  virginica  
96  0.750000  virginica  
97  0.791667  virginica  
98  0.875000  virginica  
99  0.583333  virginica  

[100 rows x 9 columns]

Numpy: Simple Numpy NdArray vs Python List


In [3]:
python_list = [1, 2, 3]
numpy_array = np.array([1, 2, 3])
pandas_dataframe = pd.DataFrame(data=[1,2,3])

print("Python List:")
print(python_list)
print(type(python_list))

print("\nNumpy Array:")
print(numpy_array)
print(type(numpy_array))

print("\nPandas DataFrame:")
print(pandas_dataframe)
print(type(pandas_dataframe))


Python List:
[1, 2, 3]
<class 'list'>

Numpy Array:
[1 2 3]
<class 'numpy.ndarray'>

Pandas DataFrame:
   0
0  1
1  2
2  3
<class 'pandas.core.frame.DataFrame'>

Pandas: Subset of columns


In [4]:
subset_columns = csv_data[['column1','column3','target']]

column1_cleaned = subset_columns[['column1']].fillna( subset_columns[['column1']].mean() )
column3_cleaned = subset_columns[['column3']].fillna( subset_columns[['column3']].mean() )

subset_columns.column1 = column1_cleaned
subset_columns.column3 = column3_cleaned

print(subset_columns)


     column1   column3     target
0   5.100000  3.500000     setosa
1   4.900000  3.000000     setosa
2   5.926263  3.200000     setosa
3   4.600000  3.040404     setosa
4   5.000000  3.600000     setosa
5   5.400000  3.900000     setosa
6   4.600000  3.400000     setosa
7   5.000000  3.400000     setosa
8   4.400000  2.900000     setosa
9   4.900000  3.100000     setosa
10  5.400000  3.700000     setosa
11  4.800000  3.400000     setosa
12  4.800000  3.000000     setosa
13  4.300000  3.000000     setosa
14  5.800000  4.000000     setosa
15  5.700000  4.400000     setosa
16  5.400000  3.900000     setosa
17  5.100000  3.500000     setosa
18  5.700000  3.800000     setosa
19  5.100000  3.800000     setosa
20  5.400000  3.400000     setosa
21  5.100000  3.700000     setosa
22  4.600000  3.600000     setosa
23  5.100000  3.300000     setosa
24  4.800000  3.400000     setosa
25  5.000000  3.000000     setosa
26  5.000000  3.400000     setosa
27  5.200000  3.500000     setosa
28  5.200000  3.400000     setosa
29  4.700000  3.200000     setosa
..       ...       ...        ...
70  6.500000  3.000000  virginica
71  7.600000  3.000000  virginica
72  4.900000  2.500000  virginica
73  7.300000  2.900000  virginica
74  6.700000  2.500000  virginica
75  7.200000  3.600000  virginica
76  6.500000  3.200000  virginica
77  6.400000  2.700000  virginica
78  6.800000  3.000000  virginica
79  5.700000  2.500000  virginica
80  5.800000  2.800000  virginica
81  6.400000  3.200000  virginica
82  6.500000  3.000000  virginica
83  7.700000  3.800000  virginica
84  7.700000  2.600000  virginica
85  6.000000  2.200000  virginica
86  6.900000  3.200000  virginica
87  5.600000  2.800000  virginica
88  7.700000  2.800000  virginica
89  6.300000  2.700000  virginica
90  6.700000  3.300000  virginica
91  7.200000  3.200000  virginica
92  6.200000  2.800000  virginica
93  6.100000  3.000000  virginica
94  6.400000  2.800000  virginica
95  7.200000  3.000000  virginica
96  7.400000  2.800000  virginica
97  7.900000  3.800000  virginica
98  6.400000  2.800000  virginica
99  6.300000  2.800000  virginica

[100 rows x 3 columns]
C:\Users\Windows\AppData\Local\Continuum\Anaconda3\lib\site-packages\pandas\core\generic.py:2999: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value

Pandas: Subset of rows


In [5]:
setosa = subset_columns[csv_data.target == 'setosa']
versicolor = subset_columns[csv_data.target == 'versicolor']
virginica = subset_columns[csv_data.target == 'virginica']
print("Subset-rows created.")


Subset-rows created.

In [6]:
setosa_x = setosa['column3'].values
setosa_y = setosa['column1'].values
versicolor_x = versicolor['column3'].values
versicolor_y = versicolor['column1'].values
virginica_x = virginica['column3'].values
virginica_y = virginica['column1'].values

f, axarr = plt.subplots(3, sharex=True, sharey=True)
axarr[0].plot(setosa_x, setosa_y, "bo")
axarr[0].set_title('Setosa')
axarr[1].set_title('Versicolor')
axarr[2].set_title('Virginica')
axarr[1].scatter(versicolor_x, versicolor_y)
axarr[2].scatter(virginica_x, virginica_y)

plt.show()


Building a SVM (Support Vector Machine) Model/Classifier


In [42]:
print("Preparing Data...")
# [5.1 , 3.5]
classifier_x = subset_columns[['column1','column3']].values

#[1,0,0] = setosa
#[0,1,0] = versicolor
#[0,0,1] = virginica7
labels = subset_columns['target'].values
le = preprocessing.LabelEncoder()
le.fit(labels)
classifier_y = le.transform(labels)

print("Data Splitting:")
print("Shape before Split: ",classifier_x.shape,"-",classifier_y.shape)
X_train, X_test, y_train, y_test = train_test_split(classifier_x,
                                                     classifier_y)
print("Shape after Split: ",X_train.shape,"-",X_test.shape)

clf = svm.LinearSVC(max_iter=10)
print("Fitting...")
clf.fit(X=X_train,
        y=y_train)
print(clf.coef_)
print("Predicting...")
y_pred = clf.predict(X_test)

print("#"*50)
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))


Preparing Data...
Data Splitting:
Shape before Split:  (100, 2) - (100,)
Shape after Split:  (75, 2) - (25, 2)
Fitting...
[[-0.94824719  1.53144869]
 [ 0.50655832 -0.73699852]
 [ 0.41090775 -0.75221188]]
Predicting...
##################################################
[[ 4  4  0]
 [ 0  5  0]
 [ 0 12  0]]
             precision    recall  f1-score   support

          0       1.00      0.50      0.67         8
          1       0.24      1.00      0.38         5
          2       0.00      0.00      0.00        12

avg / total       0.37      0.36      0.29        25

C:\Users\Windows\AppData\Local\Continuum\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1113: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

In [8]:
print(__doc__)


# Code source: Gaël Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA

# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
y = iris.target

x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5

plt.figure(2, figsize=(8, 6))
plt.clf()

# Plot the training points
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1,
            edgecolor='k')
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())

# To getter a better understanding of interaction of the dimensions
# plot the first three PCA dimensions
fig = plt.figure(1, figsize=(8, 6))
ax = Axes3D(fig, elev=-150, azim=110)
X_reduced = PCA(n_components=3).fit_transform(iris.data)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=y,
           cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First three PCA directions")
ax.set_xlabel("1st eigenvector")
ax.w_xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.w_yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.w_zaxis.set_ticklabels([])

plt.show()


Automatically created module for IPython interactive environment

In [ ]: