In [1]:
##### Imports

from __future__ import division

import codecs
import operator
import os
import pprint
import re
import string
import sys

import nltk
import numpy as np
import pandas as pd
from numpy import random
from numpy.random import permutation, shuffle
from pandas import Series, DataFrame
from scipy import stats

from matplotlib import pyplot
%matplotlib inline

from sklearn import cross_validation, neighbors
from sklearn.cross_validation import KFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.svm import LinearSVC
sys.stdout = codecs.getwriter('utf_8')(sys.stdout)
sys.stdin = codecs.getreader('utf_8')(sys.stdin)
sys.path.append('/Users/Hoyt/PythonScripts/Utils')
from MLtools import *
from RichertUtils import *

#if not set, include the path to where the POS tagger lives, including the name of the actual .jar file
os.environ['CLASSPATH'] = ('.;C:\Program Files (x86)\Java\jre7\lib\ext\QTJava.zip;'
                           'C:\Users\Public\Documents\MyData\PythonFiles\stanford-postagger-full-2014-06-16\stanford-postagger.jar')
os.environ['JAVAHOME'] = 'c:\Windows\SysWOW64\\'   #give the path to the directory that holds your java.exe (e.g., C:\Windows\SysWOW64)

from nltk.tag.stanford import POSTagger
#be sure to use the appropriate model
tagger = POSTagger('c:\Users\Public\Documents\MyData\PythonFiles\stanford-postagger-full-2014-06-16\models\english-left3words-distsim.tagger',
                   'c:\Users\Public\Documents\MyData\PythonFiles\stanford-postagger-full-2014-06-16\stanford-postagger.jar')

In [ ]:
# Import code for Rich's machine

sys.path.append('/Users/richardjeanso/Dropbox/')
from MLtools import *
from Utils import *

### Implement Stanford POS Tagger 
java_path = "/Library/Java/JavaVirtualMachines/jdk1.7.0_60.jdk/Contents/Home/bin/java"   #point this at wherever your local java binary actually lives
os.environ['JAVAHOME'] = java_path

path_to_model = "/users/richardjeanso/stanford-tagger/stanford/models/english-bidirectional-distsim.tagger"
path_to_jar = "/users/richardjeanso/stanford-tagger/stanford/stanford-postagger.jar"
tagger = nltk.tag.stanford.POSTagger(path_to_model, path_to_jar)

In [41]:
#use this code to grab novels, chunk them, put into a data frame, and pickle for later use
#note: if you've already done this, just unpickle the file in the next cell

#cleaner functions
import unicodedata

def remove_accents(input_str):
    if input_str == '':
        return input_str
    nkfd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])

#need to create separate cleaner functions, because sometimes we want punctuation and sometimes we don't
#use this first function when chunking and storing chunks--it gets rid of, or replaces, things that cause unicode errors
def clean_unicode_accents(raw):
    import re
    raw = re.sub(r'I\.', '', raw)
    raw = re.sub(r'\]', '', raw)
    raw = re.sub(r'II', '', raw)
    raw = re.sub(r'\[', '', raw)
    raw = re.sub(r'\*', '', raw)
    raw = re.sub(r'[\n\r\t]', ' ', raw)
    raw = re.sub(u'\u2013','-',raw)   #this is an en-dash
    raw = re.sub(u'\u2014','-',raw)   #this is an em-dash
    raw = re.sub(u'\u2044','',raw)
    raw = re.sub(u'\u201c','"',raw)   #this is left " mark; just replacing them here
    raw = re.sub(u'\u201d','"',raw)   #this is right " mark
    raw = re.sub(u'\ufeff','',raw)
    raw = re.sub(u'\u2018','\'',raw)  #this is left ' mark
    raw = re.sub(u'\u2019','\'',raw)  #this is right ' mark
    raw = re.sub(u'\ufb02','',raw)
    raw = re.sub(u'\u2026', '', raw)
    raw = re.sub(u'\u2022', '', raw)
    #these next items have to be marked as unicode since you're dealing with utf-8 strings
    raw = re.sub(u'£', '', raw)
    raw = re.sub(u'«', '', raw)
    raw = re.sub(u'»', '', raw)
    raw = re.sub(u'¢', '', raw)
    raw = re.sub(u'æ', 'e', raw)
    raw = re.sub(u'©', '', raw)
    raw = remove_accents(raw)
    return raw

def cleaner(text):
    text = re.sub(r'[0-9]', '', text)
    text = re.sub(r'[,.;:"?!*()\']', '', text)
    text = re.sub(r'-', ' ', text)
    text = re.sub(r'[\n\t]', ' ', text)
    #text = remove_accents(text)
    return text
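
#quick illustrative check of the two cleaners above on toy strings (not from the corpus); uncomment to run
#print clean_unicode_accents(u'\u201cVoil\u00e0\u2014yes\u201d\n')   #-> "Voila-yes"  (smart quotes normalized, em-dash to hyphen, accent stripped, newline to space)
#print cleaner(u'Well--it was 1893, no?')                            #-> Well  it was  no  (digits and punctuation dropped, hyphens become spaces)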

#import the data frame for SOC corpus
columns = ['book_id', 'libraries', 'novel_title', 'auth_last', 'auth_first', 'auth_dates', 'publ_city', 'publisher',
           'publ_date', 'source', 'nationality', 'genre']

#For Hoyt's Machine
df = pd.read_csv("c:\Users\Hoyt\Dropbox\SOC_TEXTS\SOC_TEXTS.csv", names=columns)

#For Rich's Machine
#df = pd.read_csv("/Users/richardjeanso/Dropbox/SOC_TEXTS/SOC_TEXTS.csv", names=columns)

df = df.ix[1:]   #filter out first row (the column headers from original .csv)
df['filepath'] = Series('',index=df.index)   #add column for filepaths
df['chunk'] = Series('',index=df.index)    #add column for text chunks
df['chunk_id'] = Series('1',index=df.index)    #not needed, but keeping for consistency with realism corpus


#this is code from original pipeline (not necessary for the SOC texts)
#set your chunk_length (in characters) -- this will impact following cell too, so treat as global variable
#chunk_length = 1500

#generate filepaths for each text we have and load in the texts, chunking as you go
corpus_path = "c:\Users\Hoyt\Dropbox\SOC_TEXTS\\"
#corpus_path = "/Users/richardjeanso/Dropbox/SOC_TEXTS/"

for i in df.index:
    df.filepath[i] = corpus_path + str(df.book_id[i]) + ".txt"  #assign filepath
    with codecs.open(df.filepath[i], encoding="utf-8", errors="ignore") as text:
        raw = text.read()
    raw = clean_unicode_accents(raw)
    df.chunk[i] = raw

#drop the columns we don't need
df.drop(['libraries', 'auth_dates', 'publ_city', 'publisher', 'source', 'nationality', 'filepath', 'auth_first'], axis=1, inplace=True)

#the cells below expect this frame under the name soc_chunks_df
soc_chunks_df = df

In [42]:
#now you're ready to send list of chunks for pos_tagging

#function to POS tag a chunk of text
def pos_tagger(chunk_list, tagset="stanford"):
    pos_chunks = []
    for chunk in chunk_list:
        word_list = []
        sent_list = nltk.sent_tokenize(chunk)
        for sent in sent_list:
            words = nltk.word_tokenize(sent)
            #this next little bit of code saves us from endless UnicodeDecodeErrors!
            word_list.append([el.encode('unicode_escape') for el in words])  
        pos_chunks.append(word_list)
    pos_chunks = map(tagger.tag_sents, pos_chunks)
    if tagset == "universal":
        pos_chunks = change_tagset(pos_chunks)
    return pos_chunks

def change_tagset(tagged_corpus):
    new_tagged_corpus = []
    for chunk in tagged_corpus:
        new_chunk = []
        for sent in chunk:
            new_sent = []
            for item in sent:
                if item[1].startswith('N'):         #include an if statement for every tag you want to change
                    new_sent.append((item[0], 'N'))
                elif item[1].startswith('V'):
                    new_sent.append((item[0], 'V'))
                elif item[1].startswith('J'):
                    new_sent.append((item[0], 'J'))
                else:
                    new_sent.append(item)
            new_chunk.append(new_sent)
        new_tagged_corpus.append(new_chunk)
    return new_tagged_corpus
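
#illustrative check of change_tagset on a toy one-chunk, one-sentence corpus (uncomment to run)
#toy = [[[('The', 'DT'), ('dogs', 'NNS'), ('barked', 'VBD'), ('loudly', 'RB')]]]
#print change_tagset(toy)   #-> [[[('The', 'DT'), ('dogs', 'N'), ('barked', 'V'), ('loudly', 'RB')]]]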

#get the chunks
chunk_list = soc_chunks_df.chunk.values
#pos tag them
pos_chunk_list = pos_tagger(chunk_list, "universal")       
#incorporate into the data frame
soc_chunks_df['pos_chunks'] = Series(pos_chunk_list, index=soc_chunks_df.index)   #column name matches what the feature cell below expects
#pickle it
soc_chunks_df.to_pickle(r"c:\Users\Hoyt\Dropbox\SOC_TEXTS\soc_chunks_df.pkl")

In [43]:
#now build the feature sets for the soc_chunks 
#note: because these are cleaner chunks, I've adapted the code so it doesn't cut off front and end fragments
soc_chunks_df = pd.read_pickle(r"c:\Users\Hoyt\Dropbox\SOC_TEXTS\soc_chunks_df.pkl")
soc_chunks_df.shape


Out[43]:
(300, 8)

In [45]:
#build the dictionary

#code for neologism detector (expanded version -- all words)
with codecs.open('c:\Users\Hoyt\Dropbox\SOC_PROJECT\WEB_DICT_1913.txt','r', 'utf-8') as f:
    raw = f.read()

#with codecs.open('/Users/richardjeanso/Dropbox/SOC_PROJECT/WEB_DICT_1913.txt','r', 'utf-8') as f:
#    raw = f.read()

raw = raw[1:]   #get rid of the initial BOM character
Xdict_words = re.split(' ', raw)

import unicodedata

def remove_accents(input_str):
    nkfd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])

Xdict_entries = []
for word in Xdict_words:
    word = ''.join(c for c in word if c not in string.punctuation)
    word = re.sub(r'-', '', word)
    if len(word) > 1 and not re.search(r'[0-9]', word) and re.search(r'[AEIOUYaeiouy]', word):
        word = word.lower()
        word = remove_accents(word)
        Xdict_entries.append(word)

new_entries = ["a", "o", "i", "couldn", "mustn", "wasn", "wouldn", "ain", "ll", "needn", "shouldn", "needn", 
               "oughtn", "hasn", "hadn", "didn", "doesn", "don't", "n't", "didn't", "'s", "'d", "'ll", "'m", ]

for el in new_entries:
    Xdict_entries.append(el)

# Add in British spellings of words, i.e. "colour" and "aeroplane"
with codecs.open('c:\Users\Hoyt\Dropbox\SOC_PROJECT\BRITISH_SPELLING.txt','r', 'utf-8') as f:
    raw = f.read()
    british = re.split('\n|\r', raw)

for word in british:
    Xdict_entries.append(word)

Xdict_entries = list(set(Xdict_entries))
Xdict_entries.sort()         #do this for more efficient lookup in later function
len(Xdict_entries)


Out[45]:
215480

In [46]:
#functions for adding additional features to the data frame

#returns the median sentence length (in characters) for a chunk
def median_sent_length(chunk):
    chunk_sentences = nltk.sent_tokenize(chunk)
    sent_length = []
    for sent in chunk_sentences:
        sent_length.append(len(sent))
    return np.median(sent_length)
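
#quick illustrative check on toy sentences (lengths are 4, 8, and 18 characters); uncomment to run
#print median_sent_length(u'One. Two two. Three three three.')   #-> 8.0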

#returns the proportion of sentences in the chunk that end with a noun
def noun_ending(pos_chunk):
    #pos_chunk = pos_chunk[1:-1]  #eliminate first and last items, since these are likely fragments
    noun_endings = 0
    total_sents = 0
    for sent in pos_chunk:
        if len(sent) > 2:           #get rid of obvious non-sentences
            tag_index = -1          #we want to start from the end of the sentence
            final_tag = sent[tag_index][1]
            while re.search('[^A-Z$]+', final_tag) and abs(tag_index) != len(sent):    #while tag not punctuation...
                tag_index = tag_index - 1             #step back one element
                final_tag = sent[tag_index][1]
            if final_tag == 'N':                      #keep track of nouns
                noun_endings += 1
            total_sents += 1
    if total_sents == 0:
        return 0.0
    else:
        return noun_endings/total_sents               #normalize 

#returns the proportion of verbless sentences in each chunk
def verbless_sents(pos_chunk):
    #pos_chunk = pos_chunk[1:-1]
    verbless_sents = 0
    total_sents = 0
    for sent in pos_chunk:
        if len(sent) > 2:   #get rid of the non-sentences
            verb_counter = 0
            for tag in sent:
                if tag[1] == 'V' or tag[1] == 'MD':
                    verb_counter +=1
            if verb_counter == 0:
                verbless_sents +=1
            total_sents += 1
    if total_sents == 0:
        return 0.0
    else:
        return verbless_sents/total_sents

#returns the average per-sentence ratio of personal and possessive pronoun tags to all non-punctuation tags
def per_pronoun_use(pos_chunk):
    #pos_chunk = pos_chunk[1:-1]
    sent_ratios = []
    for sent in pos_chunk:
        per_pronouns = 0
        total_tags = 0
        for tag in sent:
            if tag[1] == 'PRP' or tag[1] == 'PRP$':
                per_pronouns += 1
            #don't include punctuation in your totals
            if re.search(r'[A-Z$]+', tag[1]):
                total_tags += 1
        if total_tags != 0:
            sent_ratios.append(per_pronouns/total_tags)   #do we normalize for sentence length in some other way?
    if len(sent_ratios) == 0:                             #avoid taking the mean of an empty list
        return 0.0
    return np.mean(sent_ratios)                           #do we need to normalize for # of sentences in chunk?
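
#illustrative check of the three functions above on a toy tagged chunk (uncomment to run)
#toy_pos = [[('She', 'PRP'), ('saw', 'V'), ('it', 'PRP'), ('.', '.')],
#           [('No', 'DT'), ('answer', 'N'), ('.', '.')]]
#print noun_ending(toy_pos)      #-> 0.5 (only the second sentence ends on a noun)
#print verbless_sents(toy_pos)   #-> 0.5 (only the second sentence lacks a verb)
#print per_pronoun_use(toy_pos)  #-> roughly 0.33 (2 of 3 countable tags in the first sentence are pronouns, none in the second)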

#get multiple type/token ratios for each chunk

#cleaner functions
def cleaner(text):
    text = re.sub(r'[0-9]', '', text)
    text = re.sub(r'[,.;:"?!*()\']', '', text)
    text = re.sub(r'-', ' ', text)
    text = re.sub(r'[\n\t]', ' ', text)
    #text = remove_accents(text)
    return text

def rmv_stopwords(text):
    chunk = nltk.word_tokenize(text)
    new_chunk = []
    for word in chunk:
        if word not in jockers_stopwords:
            new_chunk.append(word)
    return ' '.join(new_chunk)

def rmv_capitals(text):
    chunk = nltk.sent_tokenize(text)
    new_chunk = []
    for sent in chunk:
        if len(sent) > 1:         #avoid empty sentences
            if sent[0].isupper():     #convert the capitalized head-word to lowercase, since we want to keep most of these
                sent = sent[0].lower() + sent[1:]
            sent = re.sub(r'[A-Z][a-z\'.]+', '', sent)   #gets rid of all capitalized words in a sentence; also Mr./Mrs.
            new_chunk.append(sent)
    return ' '.join(new_chunk)     #reconvert to a string

#basic type/token ratio using all words in chunk
def tt_ratio(chunk):
    chunk = cleaner(chunk)   #gets rid of punctuation and numbers
    chunk = nltk.word_tokenize(chunk.lower())
    if len(set(chunk)) == 0:
        return 0.0
    else:
        return len(set(chunk))/len(chunk)   #types over tokens: lower values mean more lexical repetition
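
#illustrative check on a toy string (6 tokens, 5 distinct types after cleaning and lowercasing); uncomment to run
#print tt_ratio(u'the cat saw the dog run')   #-> 0.8333...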

#returns type/token ratio without stopwords
def tt_ratio_no_stopwords(chunk):
    chunk = cleaner(chunk)
    chunk = rmv_stopwords(chunk.lower())
    chunk = nltk.word_tokenize(chunk)
    if len(set(chunk)) == 0:
        return 0.0
    else:
        return len(set(chunk))/len(chunk)

#returns type/token ratio without stopwords or capitalized words
def tt_ratio_no_capitals(chunk):
    chunk = rmv_capitals(chunk)
    chunk = rmv_stopwords(chunk.lower())
    chunk = nltk.word_tokenize(chunk)
    if len(set(chunk)) == 0:
        return 0.0
    else:
        return len(set(chunk))/len(chunk)

# Onom detector and measurer
with codecs.open('c:\Users\Hoyt\Dropbox\SOC_PROJECT\ONOM_LIST.txt','r', 'utf-8') as f:
    raw = f.read()
    onom_list = re.split('\n|\r', raw)
    
def onom(chunk):
    onoms = []
    already_checked = []
    chunk = rmv_capitals(chunk)
    chunk = nltk.word_tokenize(cleaner(chunk))
    #chunk = chunk[1:-1]
    for word in chunk:
        if word not in already_checked:
            already_checked.append(word)
            if word.lower() in onom_list:
                onoms.append(word)
    if len(already_checked) == 0:
        return 0.0
    else:
        return len(set(onoms))/len(set(already_checked))

#an efficient way to look up strings in a large list
from bisect import bisect_left
#the list here needs to be sorted
def bi_contains(lst, item):
    """ efficient `item in lst` for sorted lists """
    # if item is larger than the last its not in the list, but the bisect would 
    # find `len(lst)` as the index to insert, so check that first. Else, if the 
    # item is in the list then it has to be at index bisect_left(lst, item)
    return (item <= lst[-1]) and (lst[bisect_left(lst, item)] == item)    
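
#tiny illustrative check: bi_contains assumes the list is already sorted (uncomment to run)
#fruit = ['apple', 'banana', 'cherry']
#print bi_contains(fruit, 'banana')   #-> True
#print bi_contains(fruit, 'mango')    #-> False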
    
def neo(chunk):
    neologisms_in_chunk = 0
    num_words = 0
    chunk = rmv_capitals(chunk)
    chunk = nltk.word_tokenize(cleaner(chunk))
    #chunk = chunk[1:-1]
    for word in chunk:
        if len(word) > 1:
            if bi_contains(Xdict_entries, word) != True:   #if word not in the dictionary created above
                neologisms_in_chunk += 1
            num_words += 1 
    if num_words == 0:
        return 0.0
    else:
        return neologisms_in_chunk/num_words
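
#smoke test for the neologism rate on a toy string (uncomment to run); the exact value depends on
#the contents of Xdict_entries, so no expected number is given here
#print neo(u'the waters shimmerglimmered quietly')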

#function that calculates the proportion of sentences per chunk starting with a PRP (should we exclude "It"?)
def per_pronoun_head(pos_chunk):
    #pos_chunk = pos_chunk[1:]     #leave the last sentence in, since we're looking at head words
    prp_sents = 0
    total_sents = 0
    for sent in pos_chunk:
        if len(sent) > 2:
            tag_index = 0
            #while tag not punct
            while re.search('[^A-Z$]+', sent[tag_index][1]) and tag_index != (len(sent)-1): #or sent[tag_index][0] == '': 
                tag_index += 1   #step forward one element
            total_sents += 1
            if sent[tag_index][1] == 'PRP':
                prp_sents += 1
    if total_sents == 0:
        return 0.0
    else:
        return prp_sents/total_sents
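
#illustrative check on toy tagged sentences (uncomment to run): one of the two sentences opens with a personal pronoun
#toy_pos = [[('He', 'PRP'), ('ran', 'V'), ('home', 'N'), ('.', '.')],
#           [('The', 'DT'), ('door', 'N'), ('slammed', 'V'), ('.', '.')]]
#print per_pronoun_head(toy_pos)   #-> 0.5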

#returns the proportion of sentences per chunk that begin with obvious adverbial modification
pos_list = ["RB", "RP", "TO", "PRP$", "CC", "IN", "DT", "EX"]
word_list = ["Is", "Had", "Said", "Says", "Come", "Looked", "Look", "Make"] #NEED TO EXPAND THIS
end_char_list = ["g", "n", "d"]     #This may be too restrictive, but catches most everything we want for now

def adv_modification(pos_chunk):
    #pos_chunk = pos_chunk[1:]   #leave the last sentence in, since we're looking at head words
    adv_sents = 0
    total_sents = 0             #for this total, will only include sents that start with capital letter 
    for sent in pos_chunk:
        if len(sent) > 2:
            #print sent[0:5]
            tag_index = 0
            #while tag not punct
            while (re.search('[^A-Z$]+', sent[tag_index][1]) and tag_index != (len(sent)-1)) or sent[tag_index][0] == '': 
                tag_index += 1   #step forward one element
            if re.search(r'[A-Z]', sent[tag_index][0][0]):     #only look at sentences that start with capital letter
                total_sents += 1                               #count these toward the total
                first_tag = sent[tag_index][1]
                second_tag = sent[tag_index + 1][1]
                first_word = sent[tag_index][0]
                second_word = sent[tag_index + 1][0]
                #print first_tag
                #print second_tag
                #check the starting sequence of words and tags
                if first_tag == 'V' and first_word[-1] in end_char_list and second_tag in pos_list and second_word != 'n\'t' :  
                    adv_sents += 1
    if total_sents == 0:
        return 0.0
    else:
        return adv_sents/total_sents
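
#illustrative check on toy tagged sentences (uncomment to run): the first sentence opens with a
#participle followed by an adverb ('Looking down'), the second does not
#toy_pos = [[('Looking', 'V'), ('down', 'RB'), ('he', 'PRP'), ('paused', 'V'), ('.', '.')],
#           [('She', 'PRP'), ('paused', 'V'), ('.', '.')]]
#print adv_modification(toy_pos)   #-> 0.5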
    
#function to find ! or ? in non-dialogue passages
def q_e(chunk):
    chunk = nltk.sent_tokenize(chunk.encode('unicode_escape'))
    partial_count = 0       #keep track of ! and ? that appear outside of dialogue
    total_count = 0
    in_quote = 0            #boolean variable to keep track of whether we're in or outside of quote
    for sent in chunk:
        if sent[0] == "\"" or sent[0] == "\'":         #if starting item is quote mark, then we're in a quote
            in_quote = 1
        for char in sent:
            if char == "?" or char == "!":        
                if in_quote == 0:
                    partial_count = partial_count + 1
                    total_count = total_count + 1
                elif in_quote == 1:
                    total_count = total_count + 1      #this means we were in a quote, so just add to total
        if sent[-1] == "\"" or re.search(r'[.!?]\'$', sent):   #the sentence closes a quotation
            in_quote = 0
            partial_count = partial_count - 1    #this was actually a quote, so any ? or ! seen should not count
    if partial_count > 0:
        return partial_count/total_count    #ratio of q or e as function of total number of q or e
    else:
        return 0.0
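
#illustrative check on a toy passage (uncomment to run): with no dialogue at all, every '?' and '!' counts toward the ratio
#print q_e(u'No dialogue here. Why not? Because!')   #-> 1.0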

#function to find ellipses in passage
def find_ellip(chunk):
    count = len(re.findall(r'(\.\s){2}', chunk.encode('utf-8'))) + \
            len(re.findall(u"…", chunk.encode('utf-8'))) + \
            len(re.findall(r'(\.\.)+', chunk.encode('utf-8')))
    return count/len(chunk) 
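
#illustrative check on a toy string (uncomment to run): one spaced ellipsis plus one '..' in a 38-character string
#print find_ellip(u'He waited. . . and waited.. until dawn')   #-> 2/38, roughly 0.05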

#going to have to update stop_words list to include personal names (if we are including bag-of-words)
new_dir = 'c:\Users\Hoyt\Dropbox\SOC_PROJECT\\'
#new_dir = '/Users/richardjeanso/Dropbox/SOC_PROJECT/'
with codecs.open(new_dir + "jockers.txt", "r", "utf-8") as text:
    raw = text.read()
#need to turn the stopwords into a list
jockers_stopwords = nltk.word_tokenize(raw)

In [47]:
#load up the class labels and features into the data frame

#always label soc as the "0" class -- all other classes will be labeled "1"
soc_chunks_df['class_labels'] = [0]*len(soc_chunks_df)

#this is only if you're interested in bag of words
#vectorizer = TfidfVectorizer(min_df=4, stop_words=jockers_stopwords)
#word_features = vectorizer.fit_transform(combined_df.chunk).toarray()
#for i, col in enumerate(vectorizer.get_feature_names()):
#    combined_df[col] = word_features[:,i]

#all other features
soc_chunks_df['sent_length'] = soc_chunks_df.chunk.map(lambda x: median_sent_length(x))
print "added sent_length..."
soc_chunks_df['noun_ending'] = soc_chunks_df.pos_chunks.map(lambda x: noun_ending(x))
print "added noun_ending..."
soc_chunks_df['verbless_sents'] = soc_chunks_df.pos_chunks.map(lambda x: verbless_sents(x))
print "added verbless_sents..."
soc_chunks_df['per_pronouns'] = soc_chunks_df.pos_chunks.map(lambda x: per_pronoun_use(x))
print "added per_pronouns..."
soc_chunks_df['per_pronoun_head'] = soc_chunks_df.pos_chunks.map(lambda x: per_pronoun_head(x)) 
print "added per_pronoun_head..."
soc_chunks_df['adv_modification'] = soc_chunks_df.pos_chunks.map(lambda x: adv_modification(x)) 
print "added adv_modification..."
soc_chunks_df['tt_ratio'] = soc_chunks_df.chunk.map(lambda x: tt_ratio(x))
soc_chunks_df['tt_ratio_no_stopwords'] = soc_chunks_df.chunk.map(lambda x: tt_ratio_no_stopwords(x))
soc_chunks_df['tt_ratio_no_capitals'] = soc_chunks_df.chunk.map(lambda x: tt_ratio_no_capitals(x))
print "added all tt_ratios..."
soc_chunks_df['onomotopoeia'] = soc_chunks_df.chunk.map(lambda x: onom(x))
print "added onomotopoeia..."
soc_chunks_df['neologisms'] = soc_chunks_df.chunk.map(lambda x: neo(x))
print "added neologisms..."
soc_chunks_df['ellipses'] = soc_chunks_df.chunk.map(lambda x: find_ellip(x))
print "added ellipses counts..."
soc_chunks_df['q_e'] = soc_chunks_df.chunk.map(lambda x: q_e(x))
print "added q_e counts..."

soc_chunks_df.to_pickle("c:\Users\Hoyt\Dropbox\SOC_TEXTS\soc_df.pkl")
soc_chunks_df.shape


added sent_length...
added noun_ending...
added verbless_sents...
added per_pronouns...
added per_pronoun_head...
added adv_modification...
added all tt_ratios...
added onomotopoeia...
added neologisms...
added ellipses counts...
added q_e counts...
Out[47]:
(300, 22)