In [108]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from nltk.corpus import gutenberg, stopwords
from collections import Counter
from sklearn import ensemble
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split,cross_val_score, KFold, cross_val_predict, GridSearchCV
from sklearn.svm import SVC
from nltk.stem import PorterStemmer
from nltk import word_tokenize
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from nltk.tokenize import sent_tokenize
from sklearn.feature_selection import SelectPercentile
from nltk.stem import PorterStemmer

In [2]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice)
persuasion = text_cleaner(persuasion)

In [3]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [4]:
#Tokenize texts into sentences 
sent_tokenize_alice = sent_tokenize(alice)
sent_tokenize_persuasion = sent_tokenize(persuasion)

# Group into sentences.
alice_sents = [[sent, "Carroll"] for sent in sent_tokenize_alice]
persuasion_sents = [[sent, "Austen"] for sent in sent_tokenize_persuasion]

# Combine the sentences from the two novels into one data frame.
names = ['Sentences','Author']
sentences = pd.DataFrame(alice_sents + persuasion_sents, columns = names)
sentences.head()


Out[4]:
Sentences Author
0 Alice was beginning to get very tired of sitti... Carroll
1 So she was considering in her own mind (as wel... Carroll
2 There was nothing so VERY remarkable in that; ... Carroll
3 Oh dear! Carroll
4 I shall be late!' Carroll

Challenge 0

Try support vector classifier with the current features

Only 90% has been achieved so we are 7% from our goal. Build new features based on Grammar, phrases and POS


In [5]:
#add column with 0 = Carroll and 1  = Austen
sentences.loc[sentences['Author'] == 'Carroll', 'Target'] = 0
sentences.loc[sentences['Author'] == 'Austen', 'Target'] = 1

In [156]:
#Build the predictors and the predicted variable
X = sentences['Sentences']
y = sentences['Target']

#Split the data set into train and test 70/30
X_train, X_test, y_train, y_test  = train_test_split(X,y, test_size=0.33, random_state=135)

#KFold for cross validation analysis
kf = KFold(3)

In [205]:
#et up the vectorizer
vectorize = TfidfVectorizer(analyzer = 'char_wb', ngram_range = (1,4),
                                              stop_words = 'english',
                                              lowercase=True,
                                              max_df=0.3,
                            min_df=5,
                            max_features=20000
                       ).fit(X_train)

        
#analyzer = 'word',
 #                       stop_words = 'english',
  #                      ngram_range = (1,3),
   #                     max_df = 0.5,
    #                    norm = 'l2',
     #                   min_df = 5,
      #                  use_idf = True,
       #                 sublinear_tf  = True
                        
#Vectorize the train and test datasets
X_train_vectorized = vectorize.transform(X_train)
X_test_vectorized = vectorize.transform(X_test)

In [189]:
# Set up the model
svc = SVC(class_weight = 'balanced')

#Create range of values to fit parameters
Cs = np.arange(20,32)
gammas = [0.001, 0.01, 0.1, 1]
    
param_grid = {'C': Cs, 'gamma' : gammas}


#Fit parameters
svc1 = GridSearchCV(svc, param_grid=param_grid,n_jobs=-1,iid=False, cv=kf)

#Fit the tunned model
svc1.fit(X_train_vectorized, y_train)

#Print the hyperparameters set
svc1.grid_scores_, svc1.best_params_, svc1.best_score_


/home/borjaregueral/anaconda3/lib/python3.6/site-packages/sklearn/model_selection/_search.py:761: DeprecationWarning: The grid_scores_ attribute was deprecated in version 0.18 in favor of the more elaborate cv_results_ attribute. The grid_scores_ attribute will not be available from 0.20
  DeprecationWarning)
Out[189]:
([mean: 0.91323, std: 0.00594, params: {'C': 20, 'gamma': 0.001},
  mean: 0.96269, std: 0.00118, params: {'C': 20, 'gamma': 0.01},
  mean: 0.96213, std: 0.00243, params: {'C': 20, 'gamma': 0.1},
  mean: 0.96213, std: 0.00105, params: {'C': 20, 'gamma': 1},
  mean: 0.91549, std: 0.00586, params: {'C': 21, 'gamma': 0.001},
  mean: 0.96213, std: 0.00078, params: {'C': 21, 'gamma': 0.01},
  mean: 0.96213, std: 0.00243, params: {'C': 21, 'gamma': 0.1},
  mean: 0.96213, std: 0.00105, params: {'C': 21, 'gamma': 1},
  mean: 0.91775, std: 0.00657, params: {'C': 22, 'gamma': 0.001},
  mean: 0.96241, std: 0.00104, params: {'C': 22, 'gamma': 0.01},
  mean: 0.96269, std: 0.00183, params: {'C': 22, 'gamma': 0.1},
  mean: 0.96213, std: 0.00105, params: {'C': 22, 'gamma': 1},
  mean: 0.91917, std: 0.00717, params: {'C': 23, 'gamma': 0.001},
  mean: 0.96184, std: 0.00118, params: {'C': 23, 'gamma': 0.01},
  mean: 0.96241, std: 0.00211, params: {'C': 23, 'gamma': 0.1},
  mean: 0.96213, std: 0.00105, params: {'C': 23, 'gamma': 1},
  mean: 0.92114, std: 0.00795, params: {'C': 24, 'gamma': 0.001},
  mean: 0.96213, std: 0.00078, params: {'C': 24, 'gamma': 0.01},
  mean: 0.96269, std: 0.00277, params: {'C': 24, 'gamma': 0.1},
  mean: 0.96213, std: 0.00105, params: {'C': 24, 'gamma': 1},
  mean: 0.92228, std: 0.00799, params: {'C': 25, 'gamma': 0.001},
  mean: 0.96297, std: 0.00078, params: {'C': 25, 'gamma': 0.01},
  mean: 0.96269, std: 0.00277, params: {'C': 25, 'gamma': 0.1},
  mean: 0.96213, std: 0.00105, params: {'C': 25, 'gamma': 1},
  mean: 0.92425, std: 0.00724, params: {'C': 26, 'gamma': 0.001},
  mean: 0.96382, std: 0.00105, params: {'C': 26, 'gamma': 0.01},
  mean: 0.96213, std: 0.00212, params: {'C': 26, 'gamma': 0.1},
  mean: 0.96213, std: 0.00105, params: {'C': 26, 'gamma': 1},
  mean: 0.92736, std: 0.00626, params: {'C': 27, 'gamma': 0.001},
  mean: 0.96439, std: 0.00119, params: {'C': 27, 'gamma': 0.01},
  mean: 0.96213, std: 0.00212, params: {'C': 27, 'gamma': 0.1},
  mean: 0.96213, std: 0.00105, params: {'C': 27, 'gamma': 1},
  mean: 0.92793, std: 0.00658, params: {'C': 28, 'gamma': 0.001},
  mean: 0.96467, std: 0.00106, params: {'C': 28, 'gamma': 0.01},
  mean: 0.96213, std: 0.00212, params: {'C': 28, 'gamma': 0.1},
  mean: 0.96213, std: 0.00105, params: {'C': 28, 'gamma': 1},
  mean: 0.92878, std: 0.00632, params: {'C': 29, 'gamma': 0.001},
  mean: 0.96495, std: 0.00079, params: {'C': 29, 'gamma': 0.01},
  mean: 0.96156, std: 0.00174, params: {'C': 29, 'gamma': 0.1},
  mean: 0.96213, std: 0.00105, params: {'C': 29, 'gamma': 1},
  mean: 0.92934, std: 0.00708, params: {'C': 30, 'gamma': 0.001},
  mean: 0.96495, std: 0.00144, params: {'C': 30, 'gamma': 0.01},
  mean: 0.96184, std: 0.00208, params: {'C': 30, 'gamma': 0.1},
  mean: 0.96213, std: 0.00105, params: {'C': 30, 'gamma': 1},
  mean: 0.93104, std: 0.00663, params: {'C': 31, 'gamma': 0.001},
  mean: 0.96382, std: 0.00242, params: {'C': 31, 'gamma': 0.01},
  mean: 0.96156, std: 0.00174, params: {'C': 31, 'gamma': 0.1},
  mean: 0.96213, std: 0.00105, params: {'C': 31, 'gamma': 1}],
 {'C': 29, 'gamma': 0.01},
 0.96495210438799528)

In [190]:
#Fit tunned model on Test set
svc1.fit(X_test_vectorized, y_test)

# Predict on training set
predtest_y = svc1.predict(X_test_vectorized)

In [191]:
#Test Scores
target_names = ['0.0', '1.0']

#Build confusion matrix
cnf = confusion_matrix(y_test, predtest_y)

#Calcualte type I and type II errors
table_test = pd.crosstab(y_test, predtest_y, margins=True)
tI_errors = table_test.loc[0.0,1.0] / table_test.loc['All','All']
tII_errors = table_test.loc[1.0,0.0] / table_test.loc['All','All']

#Print accruacy results
print(classification_report(y_test, predtest_y, target_names=target_names))
print(cnf)
print((
    'SVC accuracy:{}\n'
    'Type I error:{}\n'
    'Type II error:{}\n'
).format(cross_val_score(svc1,X_test_vectorized,y_test,cv=kf).mean(), tI_errors, tII_errors))


             precision    recall  f1-score   support

        0.0       0.97      0.96      0.96       521
        1.0       0.98      0.99      0.98      1223

avg / total       0.98      0.98      0.98      1744

[[ 498   23]
 [  16 1207]]
SVC accuracy:0.9392227722869878
Type I error:0.013188073394495414
Type II error:0.009174311926605505


In [206]:
#Build the model considering tfidf
text_clf = Pipeline([
                      ('tfidf', TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True)),
                     ('clf', svc1)
])

#Fit the model on the train dataset
text_clf = text_clf.fit(X_train_vectorized, y_train)

In [207]:
#Run the classifier on the test data set
text_clf.fit(X_test_vectorized,y_test)

#Calculate accuracy with cross validation
cross_val_score(text_clf,X_test_vectorized,y_test,cv=kf).mean()


Out[207]:
0.9461104112079145

In [ ]: