In [108]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from nltk.corpus import gutenberg, stopwords
from collections import Counter
from sklearn import ensemble
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, KFold, cross_val_predict, GridSearchCV
from sklearn.svm import SVC
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from nltk.tokenize import sent_tokenize
from sklearn.feature_selection import SelectPercentile
In [2]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'. Better get rid of it now!
    text = re.sub(r'--', ' ', text)
    # Drop anything in square brackets (Gutenberg editorial material).
    text = re.sub(r'\[.*?\]', '', text)
    text = ' '.join(text.split())
    return text
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')
# The chapter headings are formatted differently in each novel, so strip each separately.
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)
In [3]:
# Parse the cleaned novels. This can take a bit.
# 'en' is the spaCy 1.x/2.x shortcut; on spaCy 3+ use 'en_core_web_sm'.
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)
In [4]:
#Tokenize texts into sentences
sent_tokenize_alice = sent_tokenize(alice)
sent_tokenize_persuasion = sent_tokenize(persuasion)
# Group into sentences.
alice_sents = [[sent, "Carroll"] for sent in sent_tokenize_alice]
persuasion_sents = [[sent, "Austen"] for sent in sent_tokenize_persuasion]
# Combine the sentences from the two novels into one data frame.
names = ['Sentences','Author']
sentences = pd.DataFrame(alice_sents + persuasion_sents, columns = names)
sentences.head()
Out[4]:
Challenge 0
Try a support vector classifier with the current features.
Only 90% accuracy has been achieved so far, 7 percentage points short of the goal. Build new features based on grammar, phrases, and part-of-speech tags (one possible sketch follows).
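As a hedged sketch of what such grammar features might look like, the cell below counts spaCy part-of-speech tags per sentence; the pos_features helper is illustrative only, not part of the original pipeline.
In [ ]:
# Sketch: bag-of-POS-tags features built with spaCy (illustrative only).
def pos_features(sents, nlp):
    rows = []
    for sent in sents:
        doc = nlp(sent)
        rows.append(Counter(token.pos_ for token in doc))
    return pd.DataFrame(rows).fillna(0)

# Example: POS-tag counts for the first few sentences.
pos_features(sentences['Sentences'].head(), nlp)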
In [5]:
# Add a target column with 0 = Carroll and 1 = Austen.
sentences.loc[sentences['Author'] == 'Carroll', 'Target'] = 0
sentences.loc[sentences['Author'] == 'Austen', 'Target'] = 1
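Before splitting, it is worth checking how balanced the classes are; Persuasion is a much longer novel than Alice, which is one reason class_weight='balanced' is passed to the SVC later on.
In [ ]:
# Class distribution: 0 = Carroll, 1 = Austen.
sentences['Target'].value_counts()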
In [156]:
#Build the predictors and the predicted variable
X = sentences['Sentences']
y = sentences['Target']
#Split the data set into train and test 67/33
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=135)
#KFold for cross validation analysis
kf = KFold(3)
In [205]:
#Set up the vectorizer. Note that scikit-learn ignores stop_words when a
#character analyzer such as 'char_wb' is used, so it only matters for the
#word-level alternative kept in the comments below.
vectorize = TfidfVectorizer(analyzer='char_wb', ngram_range=(1, 4),
                            stop_words='english',
                            lowercase=True,
                            max_df=0.3,
                            min_df=5,
                            max_features=20000
                            ).fit(X_train)
#Word-level configuration tried earlier:
#analyzer = 'word',
#stop_words = 'english',
#ngram_range = (1, 3),
#max_df = 0.5,
#norm = 'l2',
#min_df = 5,
#use_idf = True,
#sublinear_tf = True
#Vectorize the train and test datasets
X_train_vectorized = vectorize.transform(X_train)
X_test_vectorized = vectorize.transform(X_test)
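A quick sanity check on the fitted vectorizer helps confirm the feature space looks sensible (a sketch; on scikit-learn >= 1.0 the method is spelled get_feature_names_out()).
In [ ]:
# Shape of the document-term matrix and a sample of the learned n-grams.
print(X_train_vectorized.shape)
print(vectorize.get_feature_names()[:10])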
In [189]:
# Set up the model
svc = SVC(class_weight = 'balanced')
#Create ranges of candidate hyperparameter values
Cs = np.arange(20,32)
gammas = [0.001, 0.01, 0.1, 1]
param_grid = {'C': Cs, 'gamma' : gammas}
#Fit parameters
svc1 = GridSearchCV(svc, param_grid=param_grid,n_jobs=-1,iid=False, cv=kf)
#Fit the tunned model
svc1.fit(X_train_vectorized, y_train)
#Print the hyperparameters set
svc1.grid_scores_, svc1.best_params_, svc1.best_score_
Out[189]:
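To see more than the single best combination, cv_results_ can be loaded into a DataFrame; the column selection below is a sketch of one convenient view.
In [ ]:
# Full grid search results, sorted by mean cross-validated accuracy.
results = pd.DataFrame(svc1.cv_results_)
results[['param_C', 'param_gamma', 'mean_test_score']].sort_values(
    'mean_test_score', ascending=False).head()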
In [190]:
#The grid search was already fit on the training set above; refitting on
#the test set would leak the test labels, so only predict here
predtest_y = svc1.predict(X_test_vectorized)
In [191]:
#Test Scores
target_names = ['0.0', '1.0']
#Build confusion matrix
cnf = confusion_matrix(y_test, predtest_y)
#Calculate type I and type II errors
table_test = pd.crosstab(y_test, predtest_y, margins=True)
tI_errors = table_test.loc[0.0,1.0] / table_test.loc['All','All']
tII_errors = table_test.loc[1.0,0.0] / table_test.loc['All','All']
#Print accuracy results
print(classification_report(y_test, predtest_y, target_names=target_names))
print(cnf)
print((
'SVC cross-validated accuracy:{}\n'
'Type I error:{}\n'
'Type II error:{}\n'
).format(cross_val_score(svc1,X_test_vectorized,y_test,cv=kf).mean(), tI_errors, tII_errors))
In [206]:
#Pipeline: re-weight the (already TF-IDF weighted) features with a
#TfidfTransformer, then classify with the tuned SVC
text_clf = Pipeline([
('tfidf', TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)),
('clf', svc1)
])
#Fit the model on the train dataset
text_clf = text_clf.fit(X_train_vectorized, y_train)
In [207]:
#Estimate pipeline accuracy on the test set with cross validation
#(cross_val_score refits the pipeline on each fold, so the extra fit on
#the test data is unnecessary and has been dropped)
cross_val_score(text_clf, X_test_vectorized, y_test, cv=kf).mean()
Out[207]:
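A hedged alternative that avoids touching the test labels during fitting: chain the vectorizer and classifier into one pipeline over raw text, fit on the training sentences only, and score once on the held-out test set. The hyperparameter values reuse the grid search winners above.
In [ ]:
# Sketch: end-to-end pipeline, fit on train, evaluated once on test.
full_clf = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char_wb', ngram_range=(1, 4),
                              lowercase=True, max_df=0.3, min_df=5,
                              max_features=20000)),
    ('clf', SVC(class_weight='balanced',
                C=svc1.best_params_['C'],
                gamma=svc1.best_params_['gamma']))
])
full_clf.fit(X_train, y_train)
full_clf.score(X_test, y_test)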