In [15]:
# import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# NOTE(review): `sklearn.linear_model.logistic` and `sklearn.grid_search` are legacy
# module paths; on newer scikit-learn releases import LogisticRegression from
# `sklearn.linear_model` and GridSearchCV from `sklearn.model_selection` instead.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, precision_score
from sklearn.metrics import recall_score
%matplotlib inline
Let's revise
In [2]:
# Load the comma-separated fertility data set; the file has no header row,
# so the columns are labelled 0..9 by position.
df = pd.read_csv("data/fertility_Diagnosis.txt", sep=',', header=None)
# Preview the first four rows of the nine leading columns.
df.iloc[:4, :9]
Out[2]:
In [3]:
# Hold out 10% of the rows for the final evaluation. Columns 0-8 are used as
# the features and column 9 as the target.
# random_state is fixed so the split (and every score computed from it) is
# identical after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, 0:9], df[9], test_size=0.1, random_state=19
)
In [4]:
# Grid-search the penalty type and regularisation strength C of a logistic
# regression with 5-fold cross-validation, selecting on accuracy.
# The solver is pinned to liblinear because the grid contains the 'l1' penalty,
# which not every LogisticRegression solver supports; relying on the library
# default would make the 'l1' candidates fail where that default differs.
pipeline = Pipeline([('clf', LogisticRegression(solver='liblinear'))])
parameters = {
    'clf__penalty': ('l1', 'l2'),
    'clf__C': (0.01, 0.1, 1, 10),
}
grid_search = GridSearchCV(pipeline, parameters, n_jobs=5, verbose=True, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)
Out[4]:
In [5]:
# Summarise the grid search: the best cross-validated score first, then the
# winning value of each hyper-parameter that was part of the search grid.
print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for name in sorted(parameters):
    print('\t%s: %r' % (name, best_parameters[name]))
In [6]:
# Score the tuned model on the held-out rows.
y_pred = grid_search.predict(X_test)

# Encode the labels as integers: 'N' -> 2, anything else -> 1. With this
# encoding the default positive label of precision_score / recall_score (1)
# refers to the non-'N' class.
# The encoded labels get their own names: overwriting y_test in place would
# make a second run of this cell re-encode already-encoded values and
# silently turn every true label into 1.
y_test_encoded = [2 if label == 'N' else 1 for label in y_test]
y_pred_encoded = [2 if label == 'N' else 1 for label in y_pred]

print('Accuracy:', accuracy_score(y_test_encoded, y_pred_encoded))
print('Precision:', precision_score(y_test_encoded, y_pred_encoded))
print('Recall:', recall_score(y_test_encoded, y_pred_encoded))
The goal of multi-class classification is to assign an instance to one of a set of classes. scikit-learn uses a strategy called one-vs.-all, or one-vs.-the-rest, to support multi-class classification. One-vs.-all classification trains one binary classifier for each of the possible classes. The class that is predicted with the greatest confidence is assigned to the instance.
In [7]:
# Load the tab-separated movie review phrases and preview the first ten rows.
movie = pd.read_csv("data/movie_train.tsv", sep="\t")
movie.iloc[:10]
Out[7]:
In [8]:
# Summary statistics of the Sentiment column.
print(movie.loc[:, 'Sentiment'].describe())
In [9]:
# Number of phrases per Sentiment value, to see how balanced the classes are.
print(movie.loc[:, 'Sentiment'].value_counts())
In [16]:
def movie_rank():
    """Tune and evaluate a TF-IDF + logistic-regression sentiment classifier.

    Reads ``data/movie_train.tsv``, holds out 30% of the phrases, and
    grid-searches the vectorizer and classifier settings on the remaining 70%,
    selecting on accuracy. Prints the best cross-validated score and the
    winning parameters, followed by the accuracy, confusion matrix and
    classification report on the held-out phrases.

    Returns:
        None. All results are printed.
    """
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', LogisticRegression()),
    ])
    parameters = {
        'vect__max_df': (0.25, 0.5),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'clf__C': (0.1, 1, 10),
    }

    movie = pd.read_csv('data/movie_train.tsv', header=0, delimiter='\t')
    # Series.values replaces Series.as_matrix(), which is deprecated and no
    # longer available in newer pandas releases; both yield the same array.
    X, y = movie['Phrase'], movie['Sentiment'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=19)

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    print('Best score: %0.3f' % grid_search.best_score_)
    print('Best parameters set:')
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))

    # Final check on the 30% of phrases the search never saw.
    predictions = grid_search.predict(X_test)
    print('Accuracy:', accuracy_score(y_test, predictions))
    print('Confusion Matrix:', confusion_matrix(y_test, predictions))
    print('Classification Report:', classification_report(y_test, predictions))


movie_rank()
In [ ]:
In [ ]: