In [24]:
import codecs
import glob
import logging
import multiprocessing
import os
import pprint
import re
In [25]:
import nltk
import gensim.models.word2vec as w2v
import sklearn.manifold
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
In [26]:
#download stopwords (common filler words like "the", "at", "a", "an")
#and the punkt model for tokenizing text into sentences
#http://www.nltk.org/
nltk.download("punkt")
nltk.download("stopwords")
Out[26]:
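As a quick check of what these downloads provide, here is a minimal sketch (not part of the original notebook): punkt supplies the sentence tokenizer loaded below, and the stopword list enumerates the filler words mentioned above.
In [ ]:
#illustrative example only: peek at a few of the downloaded English stopwords
from nltk.corpus import stopwords
print(stopwords.words("english")[:10])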
In [27]:
#collect the book filenames (every .txt file in ./data)
book_filenames = sorted(glob.glob("./data/*.txt"))
In [28]:
#print books
print("Found books:")
book_filenames
Out[28]:
In [29]:
#Step 1: process the data
#initialize an empty unicode string; we'll concatenate all the book text into this single in-memory corpus
corpus_raw = u""
#open each book as UTF-8, read it,
#and add its text to the raw corpus
for book_filename in book_filenames:
    print("Reading '{0}'...".format(book_filename))
    with codecs.open(book_filename, "r", "utf-8") as book_file:
        corpus_raw += book_file.read()
    print("Corpus is now {0} characters long".format(len(corpus_raw)))
In [30]:
#load the pre-trained punkt sentence tokenizer for English
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
In [33]:
#tokenize into sentences
raw_sentences = tokenizer.tokenize(corpus_raw)
print(raw_sentences[0:5])
In [32]:
#convert a sentence into a list of words:
#strip out everything that isn't a letter (punctuation, digits, hyphens), then split on whitespace
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Z]", " ", raw)
    words = clean.split()
    return words
In [34]:
#for each raw sentence, build the list of its words; the result is a list of tokenized sentences
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))
In [36]:
#print an example
print(raw_sentences[3])
print(sentence_to_wordlist(raw_sentences[3]))
In [37]:
#count the word tokens across all sentences
token_count = sum(len(sentence) for sentence in sentences)
print("The book corpus contains {0:,} tokens".format(token_count))
In [ ]: