In [15]:
# import
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_score, recall_score

%matplotlib inline

Let's revise binary classification with logistic regression, this time on the UCI Fertility dataset.


In [2]:
# read the UCI Fertility dataset: nine features in columns 0-8,
# diagnosis label in column 9 ('N' = normal, 'O' = altered)
df = pd.read_csv("data/fertility_Diagnosis.txt", delimiter=',', header=None)
df.iloc[:4, 0:9]


Out[2]:
       0     1  2  3  4  5    6  7     8
0  -0.33  0.69  0  1  1  0  0.8  0  0.88
1  -0.33  0.94  1  0  1  0  0.8  1  0.31
2  -0.33  0.50  1  0  0  0  1.0 -1  0.50
3  -0.33  0.75  0  1  1  0  1.0 -1  0.38

In [3]:
# hold out 10% of the rows for testing; columns 0-8 are features, column 9 is the label
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,0:9], df[9], test_size=0.1)

In [4]:
pipeline = Pipeline([('clf', LogisticRegression(solver='liblinear'))])  # liblinear supports the 'l1' penalty below

parameters = {
    'clf__penalty': ('l1', 'l2'),
    'clf__C': (0.01, 0.1, 1, 10),
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=5, verbose=True, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Parallel(n_jobs=5)]: Done  31 out of  40 | elapsed:    5.8s remaining:    1.6s
[Parallel(n_jobs=5)]: Done  40 out of  40 | elapsed:    5.8s finished
Out[4]:
GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('clf', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=5,
       param_grid={'clf__C': (0.01, 0.1, 1, 10), 'clf__penalty': ('l1', 'l2')},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy',
       verbose=True)

In [5]:
print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')

best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))


Best score: 0.933
Best parameters set:
	clf__C: 0.01
	clf__penalty: 'l1'

In [6]:
y_pred = grid_search.predict(X_test)

# map the string labels to integers: 'O' (altered) -> 1, which precision_score
# and recall_score treat as the positive class by default; 'N' (normal) -> 2
y_test = [2 if x == 'N' else 1 for x in y_test]
y_pred = [2 if x == 'N' else 1 for x in y_pred]

print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))


Accuracy: 0.8
Precision: 0.0
Recall: 0.0
C:\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1113: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples.
  'precision', 'predicted', average, warn_for)
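
The zero precision and recall come straight from the warning: class 1 ('O') was never predicted, so every test instance was assigned the majority class 'N'. A quick confusion-matrix check (a sketch, assuming the 1/2 label mapping above) makes this visible:

In [ ]:
# No predicted positives: every test instance got the majority class 'N'
# (mapped to 2), so the first column of this matrix (predictions of
# class 1, i.e. 'O') should be all zeros.
print(confusion_matrix(y_test, y_pred, labels=[1, 2]))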

Multi-class classification

The goal of multi-class classification is to assign an instance to one of a set of classes. scikit-learn uses a strategy called one-vs.-all, or one-vs.-the-rest, to support multi-class classification. One-vs.-all classification trains one binary classifier for each of the possible classes; the class that is predicted with the greatest confidence is assigned to the instance.
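
To make the strategy concrete, here is a minimal sketch of one-vs.-rest on made-up toy data (the toy points are an illustration, not part of the movie task): scikit-learn's OneVsRestClassifier fits one binary LogisticRegression per class, and predict() returns the class whose classifier produces the highest decision score.

In [ ]:
# One-vs.-rest sketch on toy 2-D data: three well-separated classes.
from sklearn.multiclass import OneVsRestClassifier

X_demo = np.array([[0.0, 0.0], [0.2, 0.0],   # class 0
                   [2.0, 0.0], [2.2, 0.0],   # class 1
                   [1.0, 2.0], [1.2, 2.0]])  # class 2
y_demo = np.array([0, 0, 1, 1, 2, 2])

# Three binary classifiers are fit (class k vs. the rest); predict() picks
# the class whose decision score is highest.
ovr = OneVsRestClassifier(LogisticRegression(solver='liblinear')).fit(X_demo, y_demo)
print(ovr.predict([[1.1, 1.9]]))  # should print [2]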


In [7]:
# phrase-level movie review data; Sentiment runs from 0 (negative) to 4 (positive)
movie = pd.read_csv("data/movie_train.tsv", delimiter="\t")
movie[:10]


Out[7]:
   PhraseId  SentenceId  Phrase                                              Sentiment
0         1           1  A series of escapades demonstrating the adage ...          1
1         2           1  A series of escapades demonstrating the adage ...          2
2         3           1  A series                                                   2
3         4           1  A                                                          2
4         5           1  series                                                     2
5         6           1  of escapades demonstrating the adage that what...          2
6         7           1  of                                                         2
7         8           1  escapades demonstrating the adage that what is...          2
8         9           1  escapades                                                  2
9        10           1  demonstrating the adage that what is good for ...          2

In [8]:
print(movie['Sentiment'].describe())


count    156060.000000
mean          2.063578
std           0.893832
min           0.000000
25%           2.000000
50%           2.000000
75%           3.000000
max           4.000000
Name: Sentiment, dtype: float64

In [9]:
print(movie['Sentiment'].value_counts())


2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [16]:
def movie_rank():
    # TF-IDF features feeding a logistic regression classifier
    pipeline = Pipeline([('vect', TfidfVectorizer(stop_words='english')),
                         ('clf', LogisticRegression(solver='liblinear'))])

    parameters = {'vect__max_df': (0.25, 0.5),
                  'vect__ngram_range': ((1, 1), (1, 2)),
                  'vect__use_idf': (True, False),
                  'clf__C': (0.1, 1, 10)}

    movie = pd.read_csv('data/movie_train.tsv', header=0, delimiter='\t')
    X, y = movie['Phrase'], movie['Sentiment'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=19)

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, verbose=1, scoring='accuracy', cv=3)
    grid_search.fit(X_train, y_train)

    print('Best score: %0.3f' % grid_search.best_score_)
    print('Best parameters set:')
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))

    predictions = grid_search.predict(X_test)

    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Confusion Matrix:', confusion_matrix(y_test, predictions))
    print('Classification Report:', classification_report(y_test, predictions))

movie_rank()


Fitting 3 folds for each of 24 candidates, totalling 72 fits
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  2.5min
[Parallel(n_jobs=2)]: Done  72 out of  72 | elapsed:  5.7min finished
Best score: 0.631
Best parameters set:
	clf__C: 10
	vect__max_df: 0.25
	vect__ngram_range: (1, 2)
	vect__use_idf: False
Accuracy: 0.651159810329
Confusion Matrix: [[  740  1022   287    24     2]
 [  526  3854  3568   289    10]
 [  120  1869 19712  2057    82]
 [   10   248  3740  5096   780]
 [    4    11   248  1435  1084]]
Classification Report:              precision    recall  f1-score   support

          0       0.53      0.36      0.43      2075
          1       0.55      0.47      0.51      8247
          2       0.72      0.83      0.77     23840
          3       0.57      0.52      0.54      9874
          4       0.55      0.39      0.46      2782

avg / total       0.64      0.65      0.64     46818

