In [24]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re

In [25]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [26]:
#download the stopword list (common words like "the", "at", "a", "an" that carry little meaning)
#and the punkt model for tokenizing text into sentences
#http://www.nltk.org/
nltk.download("punkt")
nltk.download("stopwords")


[nltk_data] Downloading package punkt to /home/quoniam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/quoniam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Out[26]:
True
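
The stopword list is not actually used later in this section; purely as a sanity check, a cell like the one below (an addition, not part of the original notebook) shows a few of the English stopwords that were just downloaded.

In [ ]:
from nltk.corpus import stopwords

#peek at the first few English stopwords from the downloaded list
print(stopwords.words("english")[:10])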

In [27]:
#collect the book file paths, matching every .txt file in ./data
book_filenames = sorted(glob.glob("./data/*.txt"))

In [28]:
#list the books that were found
print("Found books:")
book_filenames


Found books:
Out[28]:
['./data/got1.txt',
 './data/got2.txt',
 './data/got3.txt',
 './data/got4.txt',
 './data/got5.txt']

In [29]:
#step 1: process the data

#initialize an empty unicode string; the text of every book is concatenated into this single in-memory corpus
corpus_raw = u""
#open each book as UTF-8, read it, and append it to the raw corpus
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        corpus_raw += book_file.read()
    print("Corpus is now {0} characters long".format(len(corpus_raw)))


Reading './data/got1.txt'...
Corpus is now 1770659 characters long
Reading './data/got2.txt'...
Corpus is now 4071041 characters long
Reading './data/got3.txt'...
Corpus is now 6391405 characters long
Reading './data/got4.txt'...
Corpus is now 8107945 characters long
Reading './data/got5.txt'...
Corpus is now 9719485 characters long

In [30]:
#load the pre-trained punkt sentence tokenizer for English
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [33]:
#tokenize into sentences
raw_sentences = tokenizer.tokenize(corpus_raw)

print(raw_sentences[0:5])


['This edition contains the complete text of the original hardcover edition.', 'NOT ONE WORD HAS BEEN OMITTED.', 'A CLASH OF KINGS\n\nA Bantam Spectra Book\n\nPUBLISHING HISTORY\n\nBantam Spectra hardcover edition published February 1999\n\nBantam Spectra paperback edition / September 2000\n\nSPECTRA and the portrayal of a boxed “s” are trademarks of Bantam Books, a division of Random House, Inc.\n\nAll rights reserved.', 'Copyright © 1999 by George R. R. Martin.', 'Maps by James Sinclair.']

In [32]:
#convert a raw sentence into a list of words:
#strip everything that is not a letter (punctuation, digits, hyphens), then split on whitespace
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Z]"," ", raw)
    words = clean.split()
    return words
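
Note that sentence_to_wordlist keeps stopwords and the original casing. If you wanted to make use of the stopword list downloaded earlier, a variant might look like the sketch below; sentence_to_wordlist_filtered is a hypothetical helper added here for illustration, not part of the original notebook.

In [ ]:
from nltk.corpus import stopwords

#hypothetical variant: lowercase the words and drop English stopwords
stop_words = set(stopwords.words("english"))

def sentence_to_wordlist_filtered(raw):
    clean = re.sub("[^a-zA-Z]", " ", raw)
    return [w for w in clean.lower().split() if w not in stop_words]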

In [34]:
#for each non-empty raw sentence, build the list of its words
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

In [36]:
#print an example
print(raw_sentences[3])
print(sentence_to_wordlist(raw_sentences[3]))


Copyright © 1999 by George R. R. Martin.
['Copyright', 'by', 'George', 'R', 'R', 'Martin']

In [37]:
#count the total number of word tokens across all sentences
token_count = sum([len(sentence) for sentence in sentences])
print("The book corpus contains {0:,} tokens".format(token_count))


The book corpus contains 1,818,103 tokens
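
With the sentences tokenized, the corpus is ready for gensim's Word2Vec, imported above as w2v. The cell below is a minimal training sketch added for illustration, not the notebook's actual settings: the hyperparameter values are assumptions, and the embedding-size argument is named vector_size in gensim 4+ (it was size in older releases).

In [ ]:
#minimal sketch: train a skip-gram word2vec model on the tokenized sentences
#(hyperparameters are illustrative assumptions, not the notebook's actual settings)
model = w2v.Word2Vec(
    sentences,
    sg=1,                                 #skip-gram
    vector_size=300,                      #embedding dimensionality ("size" in gensim < 4.0)
    min_count=3,                          #ignore very rare words
    window=7,                             #context window
    workers=multiprocessing.cpu_count(),  #parallelize over CPU cores
    seed=1,
)

#quick usage check: nearest neighbours of a word that certainly occurs in the corpus
print(model.wv.most_similar("Stark", topn=3))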

In [ ]: