In [3]:
#!/usr/bin/python

""" this example borrows heavily from the example
    shown on the sklearn documentation:

    http://scikit-learn.org/stable/modules/cross_validation.html

"""

from sklearn import datasets
from sklearn.svm import SVC

# Load the iris dataset: `data` is the feature matrix, `target` the labels.
iris = datasets.load_iris()
features = iris.data
labels = iris.target

###############################################################
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; prefer `sklearn.model_selection` and fall back for older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 40% of the samples for testing; the fixed random_state keeps the
# split (and therefore the reported score) reproducible.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.4, random_state=0)
###############################################################

### import the relevant code and make your train/test split
### name the output datasets features_train, features_test,
### labels_train, and labels_test

### set the random_state to 0 and the test_size to 0.4 so
### we can exactly check your result




###############################################################

clf = SVC(kernel="linear", C=1.)
clf.fit(features_train, labels_train)

print clf.score(features_test, labels_test)


##############################################################
def submitAcc():
    """Return the score of the module-level `clf` on the held-out test split."""
    test_score = clf.score(features_test, labels_test)
    return test_score


0.966666666667

In [5]:
# IPython's %load magic pulls the source of the given file into the notebook.
# NOTE(review): the following cell looks like the loaded starter code, edited
# afterwards -- confirm, and consider removing this cell if so.
%load ../ud120-projects/validation/validate_poi.py

In [20]:
#!/usr/bin/python


"""
    starter code for the validation mini-project
    the first step toward building your POI identifier!

    start by loading/formatting the data

    after that, it's not our code anymore--it's yours!
"""

import pickle
import sys
sys.path.append("../ud120-projects/tools/")
from feature_format import featureFormat, targetFeatureSplit

# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load dataset files that come from a trusted source.
# A context manager closes the file handle deterministically instead of
# leaking it. The original "r" mode is kept on purpose.
# NOTE(review): pickle files are normally opened in binary mode ("rb") --
# confirm the dataset's line endings before changing the mode.
with open("../ud120-projects/final_project/final_project_dataset.pkl", "r") as dataset_file:
    data_dict = pickle.load(dataset_file)

### add more features to features_list!
features_list = ["poi", "salary"]

# featureFormat / targetFeatureSplit are the helpers imported from the
# feature_format module found on the tools/ path added above.
# presumably the first entry of features_list ("poi") becomes the label --
# verify against feature_format.py
data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

### Splitting train/test

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; prefer `sklearn.model_selection` and fall back for older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 30% of the samples for testing; the fixed random_state makes the
# split reproducible across runs.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.3, random_state=42)

### Train decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

clf = DecisionTreeClassifier()
clf.fit(train_features, train_labels)

pred = clf.predict(test_features)

print "Confusion Matrix:\n", confusion_matrix(test_labels, pred), "\n"
print "Classification Report:\n", classification_report(test_labels, pred)
print "Accuracy:", accuracy_score(test_labels, pred)


Confusion Matrix:
[[21  3]
 [ 5  0]] 

Classification Report:
             precision    recall  f1-score   support

        0.0       0.81      0.88      0.84        24
        1.0       0.00      0.00      0.00         5

avg / total       0.67      0.72      0.70        29

Accuracy: 0.724137931034

In [19]:
#%load ../ud120-projects/tools/feature_format.py

In [8]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [17]:
# Print the docstring of train_test_split for reference (interactive
# exploration; relies on train_test_split already being imported).
help(train_test_split)


Help on function train_test_split in module sklearn.cross_validation:

train_test_split(*arrays, **options)
    Split arrays or matrices into random train and test subsets
    
    Quick utility that wraps calls to ``check_arrays`` and
    ``next(iter(ShuffleSplit(n_samples)))`` and application to input
    data into a single call for splitting (and optionally subsampling)
    data in a oneliner.
    
    Parameters
    ----------
    *arrays : sequence of arrays or scipy.sparse matrices with same shape[0]
        Python lists or tuples occurring in arrays are converted to 1D numpy
        arrays.
    
    test_size : float, int, or None (default is None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the test split. If
        int, represents the absolute number of test samples. If None,
        the value is automatically set to the complement of the train size.
        If train size is also None, test size is set to 0.25.
    
    train_size : float, int, or None (default is None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.
    
    random_state : int or RandomState
        Pseudo-random number generator state used for random sampling.
    
    dtype : a numpy dtype instance, None by default
        Enforce a specific dtype.
    
    Returns
    -------
    splitting : list of arrays, length=2 * len(arrays)
        List containing train-test split of input array.
    
    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.cross_validation import train_test_split
    >>> a, b = np.arange(10).reshape((5, 2)), range(5)
    >>> a
    array([[0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9]])
    >>> list(b)
    [0, 1, 2, 3, 4]
    
    >>> a_train, a_test, b_train, b_test = train_test_split(
    ...     a, b, test_size=0.33, random_state=42)
    ...
    >>> a_train
    array([[4, 5],
           [0, 1],
           [6, 7]])
    >>> b_train
    [2, 0, 3]
    >>> a_test
    array([[2, 3],
           [8, 9]])
    >>> b_test
    [1, 4]


In [ ]: