In [70]:
import pandas as pd
import numpy as np
import sklearn
import sklearn.preprocessing as pre
import sklearn.pipeline as pipe
import sklearn.decomposition as decomp
import sklearn.svm as svm
import sklearn.cross_validation as crossval
import sklearn.metrics as metrics

In [2]:
cleaned = pd.read_csv('./DD1103_cleandata.csv')

In [4]:
cleaned.set_index('DEPTH', inplace=True)

In [8]:
cols = cleaned.columns.tolist()
cols.remove('Unnamed: 0')
cleaned = cleaned[cols]

In [12]:
pd.unique(cleaned.LABELS)


Out[12]:
array([nan, 'CN', 'LEP', 'FH', 'PU', 'PT', 'PM', 'PTL', 'P', 'AQ', 'AQL',
       'T1UA', 'T1UB', 'T1L', 'T2', 'C1', 'C2'], dtype=object)

In [62]:
#target = [np.isnan(i) for i in cleaned.LABELS]
target = np.logical_not(cleaned.LABELS.isnull())
print(target.sum())
target.shape


281
Out[62]:
(3001,)

In [19]:
cols.remove('LABELS')

In [41]:
svm.SVC?

In [65]:
crossval.train_test_split?

In [83]:
imputer = pre.Imputer()
scalar = pre.StandardScaler()
n_components=20
svc = svm.SVC()

pca = decomp.PCA(n_components=n_components, whiten=True)

tx = pipe.make_pipeline(imputer, pca)

x_train, x_test, y_train, y_test = crossval.train_test_split(cleaned[cols], target, test_size=0.4)

result = tx.fit_transform(x_train)

svc.fit(result, y_train)
pred = svc.predict(tx.transform(x_test))

print('F1 test validation score {}'.format(metrics.f1_score(pred, y_test)))


F1 test validation score 0.8387096774193549

In [84]:
metrics.roc_auc_score(pred, y_test)


Out[84]:
0.96399224045859755

In [20]:
pre.scale(cleaned[cols].fillna)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-20-df262f0c6d9e> in <module>()
----> 1 pre.scale(cleaned[cols])

/Users/jvial/anaconda/envs/py3k/lib/python3.3/site-packages/sklearn/preprocessing/data.py in scale(X, axis, with_mean, with_std, copy)
    115     :class:`sklearn.pipeline.Pipeline`)
    116     """
--> 117     X = check_array(X, accept_sparse='csr', copy=copy, ensure_2d=False)
    118     warn_if_not_float(X, estimator='The scale function')
    119     if sparse.issparse(X):

/Users/jvial/anaconda/envs/py3k/lib/python3.3/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features)
    350                              array.ndim)
    351         if force_all_finite:
--> 352             _assert_all_finite(array)
    353 
    354     shape_repr = _shape_repr(array.shape)

/Users/jvial/anaconda/envs/py3k/lib/python3.3/site-packages/sklearn/utils/validation.py in _assert_all_finite(X)
     50             and not np.isfinite(X).all()):
     51         raise ValueError("Input contains NaN, infinity"
---> 52                          " or a value too large for %r." % X.dtype)
     53 
     54 

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [ ]: