In [1]:
#Not all of these may be needed, but importing everything up front keeps later cells simple.
from __future__ import division
import math
import re
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import PyPDF2
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
#Import all books into list objects.
#The original version repeated the same read loop once per book; a helper function does the same work.
pathname = 'C:/Users/dmdal/OneDrive/Documents/TextMining/Books_for_Project/'

def read_pdf_pages(filename, start_page):
    """Return a list holding the extracted text of each page, starting at
    start_page so that title pages and front matter are skipped."""
    pdf_file = open(pathname + filename, 'rb')     #'rb' for read binary mode
    pdfReader = PyPDF2.PdfFileReader(pdf_file)
    page_texts = []
    for i in range(start_page, pdfReader.numPages):
        page_texts.append(pdfReader.getPage(i).extractText())
    return page_texts

hemingway_text  = read_pdf_pages('Hemingway_OldManSea.pdf', 1)
faulkner_text   = read_pdf_pages('Faulkner_As_I_Lay_Dying.pdf', 9)
tolstoy_text    = read_pdf_pages('Tolstoy_Death_Of_Ivan.pdf', 1)
hemingway_text2 = read_pdf_pages('Hemingway_TheSunAlsoRises.pdf', 3)
faulkner_text2  = read_pdf_pages('Faulkner_light_in_august.pdf', 5)
tolstoy_text2   = read_pdf_pages('Tolstoy_Master_and_Man.pdf', 2)
austen_text     = read_pdf_pages('Austen_Persuasion.pdf', 3)
austen_text2    = read_pdf_pages('Austen_Pride_Prejudice.pdf', 3)
tolstoy_text3   = read_pdf_pages('Tolstoy_on_Shakespeare.pdf', 3)

PdfReadWarning: Xref table not zero-indexed. ID numbers for objects will be corrected. [pdf.py:1736]

In [3]:
#Turn each list object into a dataframe.
hemingway_df = pd.DataFrame(hemingway_text)
faulkner_df = pd.DataFrame(faulkner_text)
tolstoy_df = pd.DataFrame(tolstoy_text)
hemingway_df2 = pd.DataFrame(hemingway_text2)
faulkner_df2 = pd.DataFrame(faulkner_text2)
tolstoy_df2 = pd.DataFrame(tolstoy_text2)
austen_df = pd.DataFrame(austen_text)
austen_df2 = pd.DataFrame(austen_text2)
tolstoy_df3 = pd.DataFrame(tolstoy_text3)

#Confirm text has been brought in
#print(hemingway_df[0:5])
#print(faulkner_df.iloc[14])
#print(tolstoy_df.iloc[14])

In [4]:
#print(hemingway_df.shape)
#print(faulkner_df.shape)
#print(tolstoy_df.shape)
#print(austen_df.shape)
#print(hemingway_df2.shape)
#print(faulkner_df2.shape)
#print(tolstoy_df2.shape)
#print(austen_df2.shape)
#print(tolstoy_df3.shape)

#Label each dataframe with its author
hemingway_df['Author'] = 'Hemingway'
faulkner_df['Author'] = 'Faulkner'
tolstoy_df['Author'] = 'Tolstoy'
hemingway_df2['Author'] = 'Hemingway'
faulkner_df2['Author'] = 'Faulkner'
tolstoy_df2['Author'] = 'Tolstoy'
austen_df['Author'] = 'Austen'
austen_df2['Author'] = 'Austen'
tolstoy_df3['Author'] = 'Tolstoy'
#print(hemingway_df.head())

In [5]:
#Combine the books (note: the two Hemingway frames are left out of this set)
#books = [hemingway_df, hemingway_df2, ...]
books = [faulkner_df, tolstoy_df, austen_df, faulkner_df2, tolstoy_df2, austen_df2, tolstoy_df3]
complete_set = pd.concat(books)
#print(complete_set.shape)
#list(complete_set)   #inspect the column names
complete_set['OrigText'] = complete_set[0]
#print(complete_set.shape)

In [32]:
complete_CV = CountVectorizer(binary=False, lowercase = False, stop_words = 'english') 
complete_CV_dm = complete_CV.fit_transform(complete_set['OrigText'])
#print(complete_CV_dm.shape)

names = complete_CV.get_feature_names()
#print(type(names), len(names))

count = np.sum(complete_CV_dm.toarray(), axis = 0).tolist()
#print(type(count), len(count))
count_df = pd.DataFrame(count, index = names, columns = ['count'])

#count_df.sort_values(['count'], ascending = False).head(20)

#print(names)
#complete_set[1] raised a KeyError - there is no column 1; the columns are 0, 'Author' and 'OrigText'
complete_set['OrigText'][0:1]



In [7]:
from nltk.corpus import stopwords

nltk_stopwords = stopwords.words("english")

#print(type(nltk_stopwords))
#print(len(nltk_stopwords))
my_stopwords = nltk_stopwords + ["br", "said", "Mr", "It", "The", "Mrs", "did", "But",
                                 "0", "000", "10", "100", "11", "12", "13", "14", "15",
                                 "16", "17", "18", "19", "20", "25", "30", "40", "50",
                                 "500", "60"]
#capitalized forms are listed explicitly because the vectorizers below use lowercase=False,
#so NLTK's all-lowercase stopwords alone would not catch them
#print(len(my_stopwords))
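The hand-picked tokens above were presumably found by inspecting the text; a more systematic source of candidates is the count_df built in the CountVectorizer cell earlier. A sketch (the cutoff of 20 is arbitrary):

#Sketch: surface candidate stopwords from the corpus-wide counts computed earlier
candidates = count_df.sort_values(['count'], ascending=False).head(20)
print(candidates)
#my_stopwords = my_stopwords + list(candidates.index)   #append only after review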

In [31]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# get a feel for the distribution
complete_set['Author'].value_counts().plot(kind='bar')


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x12b004b26a0>

T1.

Read in or create a data frame with at least one column of text to be analyzed. This could be the text you used previously or new text. Choose a prediction you would like to make with these data and create the appropriate feature space. Identify the labels you will be trying to predict and proceed to create a train-test split. Using default model parameters, fit 3 classifiers (decision tree, naïve bayes, and logistic regression) to your dataset and subsequently generate predictions (just like we did in class). Feel free to set a random state variable where appropriate to facilitate replication. Assess the performance of the models using any of the measures (confusion matrices, precision, recall, f1-score, and accuracy).


In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# instantiate vectorizer(s)
cv1 = CountVectorizer(lowercase=False, 
                     stop_words=my_stopwords,
                     binary=False,
                     max_df=0.95, 
                     min_df=0.1,
                     ngram_range = (1,1)) 
tfidf1 = TfidfVectorizer(lowercase=False, 
                        stop_words= my_stopwords, 
                        max_df=0.95, 
                        min_df=0.1,
                        ngram_range = (1,1)) 

choice = TfidfVectorizer(lowercase = False,
                         stop_words = my_stopwords,
                         binary=True,
                         max_df=.92,
                         min_df=.1,
                         ngram_range = (1,1)) #choose your favorite parameter combination

# fit and transform text
cv_dm = cv1.fit_transform(complete_set['OrigText'])
tfidf_dm = tfidf1.fit_transform(complete_set['OrigText'])
choice_dm = choice.fit_transform(complete_set['OrigText'])
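To see how much the document-frequency cutoffs shrink the vocabulary, the three document-term matrices can be compared directly (a quick check, not part of the original run):

#Quick check: number of documents x vocabulary size for each vectorizer
print(cv_dm.shape, tfidf_dm.shape, choice_dm.shape)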

In [10]:
from sklearn.model_selection import train_test_split   #cross_validation was deprecated in sklearn 0.18

X = cv_dm.toarray()  #remember this is the output from the vectorizer and we are turning it into an array
#print(type(X), X[0:10])


y = complete_set['Author'].values #this is an array of labels
#print(type(y), y[0:10])



In [11]:
#print(complete_set['OrigText'][0:1])
#print("~~~~~~~~~~~~")
#print(complete_set['Author'][0:1])
#print("~~~~~~~~~~~~~")
#we created numeric representations of the text 
#print(pd.DataFrame(X,columns = cv1.get_feature_names())[0:1])

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) #random_state sets the seed

# function creates 4 output structures - order matters
#print(X_train)
#print(X_train.shape)
#print(X_test.shape)
#print(y_train.shape)
#print(y_test.shape)

In [13]:
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

# fit a CART model to the data
model = DecisionTreeClassifier(random_state = 42)
model.fit(X_train, y_train)

# make predictions
clf1_expected = y_test
clf1_predicted = model.predict(X_test)


#print(model.score(X_test,y_test))

# summarize the fit of the model
#print("accuracy: " + str(metrics.accuracy_score(clf1_expected, clf1_predicted)))
#print(metrics.classification_report(clf1_expected, clf1_predicted))

In [14]:
from sklearn.naive_bayes import MultinomialNB

# fit a Naive Bayes model to the data
model = MultinomialNB()
#print(model)
model.fit(X_train, y_train)

# make predictions
clf2_expected = y_test
clf2_predicted = model.predict(X_test)

#print(model.score(X_test, y_test))

# summarize the fit of the model
#print("accuracy: " + str(metrics.accuracy_score(clf2_expected, clf2_predicted)))
#print(metrics.classification_report(clf2_expected, clf2_predicted))

In [15]:
from sklearn.linear_model import LogisticRegression

# fit a logistic regression model to the data
model = LogisticRegression(random_state = 42)
#print(model)
model.fit(X_train, y_train)

# make predictions
clf3_expected = y_test
clf3_predicted = model.predict(X_test)

#print(model.score(X_test, y_test))

# summarize the fit of the model
#print("accuracy: " + str(metrics.accuracy_score(clf3_expected, clf3_predicted)))
#print(metrics.classification_report(clf3_expected, clf3_predicted))
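T1 asks for an assessment using measures such as confusion matrices; a compact side-by-side comparison of the three baseline models could look like this (a sketch reusing the predictions computed in the cells above):

#Sketch: compare the three baseline models using the predictions above
for name, predicted in [('Decision Tree', clf1_predicted),
                        ('Naive Bayes', clf2_predicted),
                        ('Logistic Regression', clf3_predicted)]:
    print(name, 'accuracy:', metrics.accuracy_score(y_test, predicted))
    print(metrics.confusion_matrix(y_test, predicted))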

Q1.

Write a short description of the results of these “baseline” models. Make sure your answer is no longer than four paragraphs, and should at minimum answer these questions: What decisions did you make when creating your feature space? Why?
How do these classifiers address your question? How did your models perform? Are you happy with the results? Why or why not? Audience: general – management or non-technical staff.

A1.

These "baseline" models preformed very well (best model is 99% and the worst model is 95%) but that is cause for concern.

What decisions did you make when creating your feature space? Why? I brought in all of the text from the PDFs since this is just a baseline model. The words, or "features", I removed were words in the English stop words. I removed these words because they are words that occur frequently and don't provide much or any information. The stop words are generic/common words that do not help with our prediction and because they do not help answer our question I have decided to remove them.

How do these classifiers address your question? I want to be able to predict an author based on the text from a page. The classifiers help me answer that question. Each classifier is an author which is what I'm trying to predict.

How did your models perform? Are you happy with the results? Why or why not? My models performed very well but I believe this is not a good thing. I have too many items in my word list compared to what I am trying to predict. The model is overfitted. I would need to either add more entries for prediction or break down my current entries from pages to paragraphs.
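A minimal sketch of the page-to-paragraph idea, assuming blank-line paragraph breaks survive PDF extraction (PyPDF2 may not preserve them cleanly):

#Sketch: split each page into paragraphs for more, smaller documents per author
paragraph_rows = []
for _, row in complete_set.iterrows():
    for para in str(row['OrigText']).split('\n\n'):
        if para.strip():
            paragraph_rows.append({'OrigText': para.strip(), 'Author': row['Author']})
paragraph_set = pd.DataFrame(paragraph_rows)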

T2. 

Using a variety of parameter settings (for classifiers or vectorizers), try to improve on the performance of the baseline models. At least 6 separate predictions should be run and the results reported in a table. You can use any combination of parameters and classifiers; you do not need to use all three classifiers.


In [16]:
from sklearn.model_selection import train_test_split   #cross_validation was deprecated in sklearn 0.18

X = tfidf_dm.toarray()  #remember this is the output from the vectorizer
#print(type(X), X[0:10])


y = complete_set['Author'].values #this is an array of labels
#print(type(y), y[0:10])

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) #random_state sets the seed

# function creates 4 output structures - order matters
#print(X_train.shape)
#print(X_test.shape)
#print(y_train.shape)
#print(y_test.shape)

In [18]:
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

# fit a CART model to the data
model = DecisionTreeClassifier(random_state = 42)
model.fit(X_train, y_train)

# make predictions
clf1_expected = y_test
clf1_predicted = model.predict(X_test)


#print(model.score(X_test,y_test))

# summarize the fit of the model
#print("accuracy: " + str(metrics.accuracy_score(clf1_expected, clf1_predicted)))
#print(metrics.classification_report(clf1_expected, clf1_predicted))

In [19]:
from sklearn.naive_bayes import MultinomialNB

# fit a Naive Bayes model to the data
model = MultinomialNB()
#print(model)
model.fit(X_train, y_train)

# make predictions
clf2_expected = y_test
clf2_predicted = model.predict(X_test)

#print(model.score(X_test, y_test))

# summarize the fit of the model
#print("accuracy: " + str(metrics.accuracy_score(clf2_expected, clf2_predicted)))
#print(metrics.classification_report(clf2_expected, clf2_predicted))

In [20]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# fit a logistic regression model to the data
model = LogisticRegression(random_state = 42)
#print(model)
model.fit(X_train, y_train)

# make predictions
clf3_expected = y_test
clf3_predicted = model.predict(X_test)

#print(model.score(X_test, y_test))

# summarize the fit of the model
#print("accuracy: " + str(metrics.accuracy_score(clf3_expected, clf3_predicted)))
#print(metrics.classification_report(clf3_expected, clf3_predicted))

In [21]:
cv1 = CountVectorizer(lowercase=False, 
                     stop_words=my_stopwords,
                     binary=False,
                     max_df=0.9, 
                     min_df=0.1,
                     ngram_range = (1,1))
cv_dm = cv1.fit_transform(complete_set['OrigText'])
X = cv_dm.toarray()  #remember this is the output from the vectorizer and we are turning it into an array
#print(type(X), X[0:10])


y = complete_set['Author'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [22]:
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

# fit a CART model to the data
model = DecisionTreeClassifier(random_state = 42)
model.fit(X_train, y_train)

# make predictions
clf1_expected = y_test
clf1_predicted = model.predict(X_test)


#print(model.score(X_test,y_test))

# summarize the fit of the model
#print("accuracy: " + str(metrics.accuracy_score(clf1_expected, clf1_predicted)))
#print(metrics.classification_report(clf1_expected, clf1_predicted))

In [23]:
from sklearn.naive_bayes import MultinomialNB

# fit a Naive Bayes model to the data
model = MultinomialNB()
#print(model)
model.fit(X_train, y_train)

# make predictions
clf2_expected = y_test
clf2_predicted = model.predict(X_test)

#print(model.score(X_test, y_test))

# summarize the fit of the model
#print("accuracy: " + str(metrics.accuracy_score(clf2_expected, clf2_predicted)))
#print(metrics.classification_report(clf2_expected, clf2_predicted))

In [24]:
from sklearn.linear_model import LogisticRegression

# fit a logistic regression model to the data
model = LogisticRegression(random_state = 42)
#print(model)
model.fit(X_train, y_train)

# make predictions
clf3_expected = y_test
clf3_predicted = model.predict(X_test)

#print(model.score(X_test, y_test))

# summarize the fit of the model
#print("accuracy: " + str(metrics.accuracy_score(clf3_expected, clf3_predicted)))
#print(metrics.classification_report(clf3_expected, clf3_predicted))

Q2.

Write a short description of the improvement you were able to make in your prediction. Make sure your answer is no longer than four paragraphs, and should at minimum answer these questions: What combination of classifiers and settings did you use and why? Which model fit “best” and what metric did you use for the comparison? Why? Are you happy with the results? Why or why not? What could you do to improve on the “best” model’s performance? Audience: technical – fellow data scientists or other technical staff.

A2.

                               Decision Tree   Naive Bayes   Logistic Regression
Accuracy with tf-idf weights          0.9826        0.9548                0.9965
Accuracy using 60/40 split            0.9739        0.9609                0.9843

What combination of classifiers and settings did you use and why? For the first 3 predictions I used the tf-idf (weighted) vectorizer with the same three classifiers as before: decision tree, naive Bayes, and logistic regression. I also kept all three author classes; dropping one would leave too little data, and if anything I would want to add more classes, or at least more data for the classes I have. Adding more authors would make the model more complex and more interesting than a three-way prediction, while adding more data per author would reduce the overfitting I believe I am currently experiencing.

For the next 3 predictions I used a different train/test split: 60% training and 40% testing. The accuracy is quite similar to the 70/30 split, which shows the models are fairly consistent. The drop in accuracy comes from the change to the min and max document frequency; I restricted them because I wanted to shrink the overall feature space, which was large enough to be driving overfitting.

Which model fit "best" and what metric did you use for the comparison? Why? The model that performed best was logistic regression with the tf-idf vectorizer, though given my overfitting concerns "best" is a loose term. I used accuracy as my comparison metric because I am interested in overall classification; if the task were attached to revenue or cost I would look at sensitivity or specificity instead, but there is no asymmetric cost to misclassifying a book's author.

Are you happy with the results? Why or why not? What could you do to improve on the "best" model's performance? The results are good, but I believe I need more data: I have too many features relative to documents. To improve the model I could tinker with the vectorizer settings (document-frequency cutoffs, n-gram range) and the classifier parameters, ideally while adding more documents per author. A systematic way to do that tuning is sketched below.
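A sketch of that kind of systematic tuning using a cross-validated grid search (the C values are illustrative, not tuned):

#Sketch: cross-validated grid search over logistic regression's C parameter
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(LogisticRegression(random_state=42),
                    param_grid={'C': [0.01, 0.1, 1.0, 10.0]},
                    cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)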

T3.

Choose one preprocessing option (stemming, lemmatization, custom dictionary, custom stopwords, etc.) and recreate your feature space. Rerun your 3 best models.


In [25]:
cv1 = CountVectorizer(lowercase=False, 
                     stop_words=my_stopwords,
                     binary=False,
                     max_df=0.95, 
                     min_df=0.1,
                     ngram_range = (1,1))
tfidf1 = TfidfVectorizer(lowercase=False, 
                        stop_words= my_stopwords, 
                        max_df=0.95, 
                        min_df=0.1,
                        ngram_range = (1,1))

In [26]:
import re
_dict = { 'boats':'boat', 'old man':'om', 'asianing.com':'', 'asiaing':'', 'fishes':'fish', '\n':'',
         'approaching':'approachin', 'boating':'boat', 'wall':'walls', 'undergarment':'undergarments',
         'tact':'tactful', 'wound':'wounded', 'wounding':'wounded', 'wounds':'wounded'}
#keys are written in lowercase because multiple_replace lowercases the text before matching

def multiple_replace(replacements, text):
    """Replace in 'text' all occurrences of any key in the given
    dictionary with its corresponding value. Returns the new string."""
    text = str(text).lower()

    # Build one regular expression from the dictionary keys, longest first
    # so that e.g. 'wounds' is matched before its prefix 'wound'
    regex = re.compile("(%s)" % "|".join(map(re.escape, sorted(replacements, key=len, reverse=True))))

    # For each match, look up the corresponding value in the dictionary
    return regex.sub(lambda mo: replacements[mo.group(0)], text)

complete_set['cleantext'] = complete_set.OrigText.apply(lambda x: multiple_replace(_dict, x))
#complete_set['cleantext'][0:1]
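As a quick sanity check, the replacement function can be run on a made-up sentence (illustrative input, not from the books):

#Illustrative check of the replacement dictionary
print(multiple_replace(_dict, 'The old man fishes from his boats'))
#expected output: 'the om fish from his boat'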

In [27]:
cv_dm = cv1.fit_transform(complete_set['cleantext'])
tfidf_dm = tfidf1.fit_transform(complete_set['cleantext'])
X = tfidf_dm.toarray()  
y = complete_set['Author'].values

#X = cv_dm.toarray()  
#y = complete_set['Author'].values #this is an array of labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) #random_state sets the seed

# function creates 4 output structures - order matters
#print(X_train.shape)
#print(X_test.shape)
#print(y_train.shape)
#print(y_test.shape)

In [28]:
model = LogisticRegression(random_state = 42)
#print(model)
model.fit(X_train, y_train)

# make predictions
clf3_expected = y_test
clf3_predicted = model.predict(X_test)

print(model.score(X_test, y_test))


0.979166666667

In [29]:
model = DecisionTreeClassifier(random_state = 42)
model.fit(X_train, y_train)

# make predictions
clf1_expected = y_test
clf1_predicted = model.predict(X_test)

print(model.score(X_test, y_test))


0.954861111111

In [30]:
X = cv_dm.toarray()  
y = complete_set['Author'].values #this is an array of labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = LogisticRegression(random_state = 42)
#print(model)
model.fit(X_train, y_train)

# make predictions
clf3_expected = y_test
clf3_predicted = model.predict(X_test)

print(model.score(X_test, y_test))


0.993055555556

Q3.

Write a short description of the exercise and the outcome. Make sure your answer is no longer than three paragraphs, and should at minimum answer these questions: How did the preprocessing affect the feature space? Why did you choose that option? What changed (not just size)? How did the preprocessing affect the performance of the models? Is this the result you would have expected? Are you satisfied with the performance of any of the models? Why or Why not? What else might you investigate? Audience: technical – fellow data scientists or other technical staff.

A3.

How did the preprocessing affect the feature space? Why did you choose that option? What changed (not just size)? Preprocessing with a custom dictionary reduced my feature space because it collapsed similar tokens into one. I chose a custom dictionary because I noticed several tokens that were near-duplicates and could be merged. As an example, undergarment and undergarments can be reduced to a single token, since they carry the same information for my prediction question.

How did the preprocessing affect the performance of the models? Is this the result you would have expected? It slightly lowered the accuracy of two of my models. In one sense I expected this, because collapsing tokens removes features that could have been used for prediction, and one of the tokens I merged may have carried signal I was unaware of. In another sense I did not expect it, because consolidating redundant tokens should, if anything, make the model more precise. On balance I believe the models are better off with the reduced feature space, given the overfitting.

Are you satisfied with the performance of any of the models? Why or Why not? What else might you investigate? Of the 3, I am satisfied with the decision tree model and its 95% accuracy. That figure suggests it is not as overfit as the two models at 99%, yet it is still accurate enough to help identify an author from a submitted page. I would also want to investigate other words to remove and add to my current dictionary, or automate the collapsing with a stemmer, as sketched below.
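One way to automate that collapsing is stemming with the PorterStemmer already imported at the top of the notebook. A sketch, assuming simple whitespace tokenization is adequate:

#Sketch: stemming collapses inflected forms (e.g. 'wounding'/'wounds' -> 'wound')
#automatically instead of enumerating them in a hand-built dictionary
stemmer = PorterStemmer()

def stem_text(text):
    return ' '.join(stemmer.stem(tok) for tok in str(text).lower().split())

complete_set['stemtext'] = complete_set['OrigText'].apply(stem_text)
#the vectorizers could then be refit on complete_set['stemtext']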

Q4.

In no more than a paragraph, outline your final project. What question are you looking to answer? What text are you using? With whom are you working? (Note: you will not be strictly held to this description since there are still techniques that we will cover in class. This is meant to get people thinking about how to get started more than a week before the project is due.)

A4.

Team members: Cory Lowe, Nathan Shores, and Dylan Dale.

Project outline: We would like to predict a well-known author based on the text from one of their books. The authors currently included are Hemingway, Tolstoy, Austen, and Faulkner, and our corpus consists of pages from their well-known novels. How could this be used? Library science, attributing works of unknown authorship, and detecting plagiarism.