In [1]:
import nltk
import string
from nltk.corpus import gutenberg, brown, wordnet
from neo4j.v1 import GraphDatabase, basic_auth
from nltk.stem import WordNetLemmatizer

In [2]:
# INSERT YOUR NEO4j AUTHENACATION DETAILS HERE
NEO4J_BOLT_URL = "bolt://localhost:7687"
NEO4J_USERNAME = "neo4j"
NEO4J_PASSWORD = ""

# CHANGE YOUR CUSTOM STOP WORDS
CUSTOM_STOP_WORDS = "'',--,``".split(",")

In [3]:
# SELECT YOUR IMPORT DATA HERE

# OPTION 1
# File
# file = open("text.txt", "r")
# text = file.read()
# sents = [nltk.word_tokenize(sentence) for sentence in nltk.sent_tokenize(text)]
# tagged_sents = nltk.pos_tag_sents(sents);

# OPTION 2
# bible-kjv in gutenberg
# sents = gutenberg.sents('bible-kjv.txt')
# tagged_sents = nltk.pos_tag_sents(sents);

# OPTION 3
# chesterton-ball in gutenberg
sents = gutenberg.sents('chesterton-ball.txt')
tagged_sents = nltk.pos_tag_sents(sents);

# OPTION 4
# Brown
# tagged_sents = brown.tagged_sents()

In [4]:
driver = GraphDatabase.driver(NEO4J_BOLT_URL, auth=basic_auth(NEO4J_USERNAME, NEO4J_PASSWORD))
wordnet_lemmatizer = WordNetLemmatizer() 
puncuation = string.punctuation + "“”’‘"

# Stopwords
mysql_innodb_full_text_search_stop_words = "a's able about above according accordingly across actually after afterwards again against ain't all allow allows almost alone along already also although always am among amongst an and another any anybody anyhow anyone anything anyway anyways anywhere apart appear appreciate appropriate are aren't around as aside ask asking associated at available away awfully be became because become becomes becoming been before beforehand behind being believe below beside besides best better between beyond both brief but by c'mon c's came can can't cannot cant cause causes certain certainly changes clearly co com come comes concerning consequently consider considering contain containing contains corresponding could couldn't course currently definitely described despite did didn't different do does doesn't doing don't done down downwards during each edu eg eight either else elsewhere enough entirely especially et etc even ever every everybody everyone everything everywhere ex exactly example except far few fifth first five followed following follows for former formerly forth four from further furthermore get gets getting given gives go goes going gone got gotten greetings had hadn't happens hardly has hasn't have haven't having he he's hello help hence her here here's hereafter hereby herein hereupon hers herself hi him himself his hither hopefully how howbeit however i'd i'll i'm i've ie if ignored immediate in inasmuch inc indeed indicate indicated indicates inner insofar instead into inward is isn't it it'd it'll it's its itself just keep keeps kept know known knows last lately later latter latterly least less lest let let's like liked likely little look looking looks ltd mainly many may maybe me mean meanwhile merely might more moreover most mostly much must my myself name namely nd near nearly necessary need needs neither never nevertheless new next nine no nobody non none noone nor normally not nothing novel now nowhere obviously of off often oh ok okay old on once one ones only onto or other others otherwise ought our ours ourselves out outside over overall own particular particularly per perhaps placed please plus possible presumably probably provides que quite qv rather rd re really reasonably regarding regardless regards relatively respectively right said same saw say saying says second secondly see seeing seem seemed seeming seems seen self selves sensible sent serious seriously seven several shall she should shouldn't since six so some somebody somehow someone something sometime sometimes somewhat somewhere soon sorry specified specify specifying still sub such sup sure t's take taken tell tends th than thank thanks thanx that that's thats the their theirs them themselves then thence there there's thereafter thereby therefore therein theres thereupon these they they'd they'll they're they've think third this thorough thoroughly those though three through throughout thru thus to together too took toward towards tried tries truly try trying twice two un under unfortunately unless unlikely until unto up upon us use used useful uses using usually value various very via viz vs want wants was wasn't way we we'd we'll we're we've welcome well went were weren't what what's whatever when whence whenever where where's whereafter whereas whereby wherein whereupon wherever whether which while whither who who's whoever whole whom whose why will willing wish with within without won't wonder would wouldn't yes yet you you'd you'll you're you've your yours yourself yourselves zero".split(" ")
nltk_stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your".split(",")
single_character_stop_words = "a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z".split(",")
old_fashioned_stop_words = "thou,thee,thy,thine,ye,wa,shalt,hath,hast".split(",")
puncuation_stop_words = list(puncuation);

stop_words = list(set(
    CUSTOM_STOP_WORDS +
    mysql_innodb_full_text_search_stop_words + 
    nltk_stop_words + 
    single_character_stop_words +
    old_fashioned_stop_words +
    puncuation_stop_words))

In [5]:
selected_sentences = tagged_sents

In [6]:
def is_noun(tag):
    return tag in ['NN', 'NNS', 'NNP', 'NNPS']
def is_verb(tag):
    return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
def is_adverb(tag):
    return tag in ['RB', 'RBR', 'RBS']
def is_adjective(tag):
    return tag in ['JJ', 'JJR', 'JJS']

def tag_to_wordnet_pos(tag):
    if is_adjective(tag):
        return wordnet.ADJ
    elif is_noun(tag):
        return wordnet.NOUN
    elif is_adverb(tag):
        return wordnet.ADV
    elif is_verb(tag):
        return wordnet.VERB
    return wordnet.NOUN

DEQUOTE_PUNCTUATIONS = tuple(list(puncuation))
def dequote(s):
    while s.startswith(DEQUOTE_PUNCTUATIONS):
        s = s[1:]
    while s.endswith(DEQUOTE_PUNCTUATIONS):
        s = s[:-1]
    
    return s

In [7]:
def create_constraint_for_sentence(tx):
    tx.run("CREATE CONSTRAINT ON (w: Word) ASSERT w.name IS UNIQUE")
def create_constraint_for_word(tx):
    tx.run("CREATE CONSTRAINT ON (s: Sentence) ASSERT s.name IS UNIQUE")

In [8]:
def generate_sentence_query(tx, sentence):
    not_numerical_words = [word for word in sentence if not word[0].isdigit()]
    lower_words = [(word[0].lower(), word[1]) for word in not_numerical_words]
    not_stopwords_words = [(word[0].lower(), word[1]) for word in lower_words if word[0] not in stop_words]
    normalized_words = [wordnet_lemmatizer.lemmatize(word[0], tag_to_wordnet_pos(word[1])) for word in not_stopwords_words]
    dequoted_words = [dequote(word) for word in normalized_words]
    words = [word for word in dequoted_words if len(word) > 0]
    
    tx.run("WITH $words AS words, $sentence AS sentence "
           "MERGE (s: Sentence {name: sentence}) "
           ""
           "WITH words, s "
           "UNWIND range(0, size(words) - 2) AS i "
           "MERGE (w1: Word {name: words[i]}) ON CREATE SET w1.count = 0 "
           "MERGE (w1)-[ai1:APPEARS_IN]->(s) "
           "MERGE (w2: Word {name: words[i + 1]}) ON CREATE SET w2.count = 0 "
           "MERGE (w2)-[ai2:APPEARS_IN]->(s)  "
           "MERGE (w1)-[r:NEXT]->(w2) ON CREATE SET r.count = 1 ON MATCH SET r.count = r.count + 1 "
           ""
           "WITH collect(w1) AS w1s, collect(w2) AS w2s "
           "WITH w1s + last(w2s) AS merged_words "
           ""
           "UNWIND merged_words AS merged_word "
           "SET merged_word.count = merged_word.count + 1 "
           "",
           {
               "words": words,
               "sentence": " ".join([word[0] for word in sentence])
           })

In [9]:
print("Started insertion")

number_of_finished_queries = 0

with driver.session() as session:
    session.write_transaction(create_constraint_for_sentence)
    session.write_transaction(create_constraint_for_word)

    for sentence in selected_sentences:
        session.write_transaction(generate_sentence_query, sentence)
        number_of_finished_queries = number_of_finished_queries + 1
        if (number_of_finished_queries % 500 == 0):
            print(number_of_finished_queries)
        
print("Ended insertion")


Started insertion
500
1000
1500
2000
2500
3000
3500
4000
4500
Ended insertion

In [8]:
english_stopwords


Out[8]:
["a's",
 'able',
 'about',
 'above',
 'according',
 'accordingly',
 'across',
 'actually',
 'after',
 'afterwards',
 'again',
 'against',
 "ain't",
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'appear',
 'appreciate',
 'appropriate',
 'are',
 "aren't",
 'around',
 'as',
 'aside',
 'ask',
 'asking',
 'associated',
 'at',
 'available',
 'away',
 'awfully',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'believe',
 'below',
 'beside',
 'besides',
 'best',
 'better',
 'between',
 'beyond',
 'both',
 'brief',
 'but',
 'by',
 "c'mon",
 "c's",
 'came',
 'can',
 "can't",
 'cannot',
 'cant',
 'cause',
 'causes',
 'certain',
 'certainly',
 'changes',
 'clearly',
 'co',
 'com',
 'come',
 'comes',
 'concerning',
 'consequently',
 'consider',
 'considering',
 'contain',
 'containing',
 'contains',
 'corresponding',
 'could',
 "couldn't",
 'course',
 'currently',
 'definitely',
 'described',
 'despite',
 'did',
 "didn't",
 'different',
 'do',
 'does',
 "doesn't",
 'doing',
 "don't",
 'done',
 'down',
 'downwards',
 'during',
 'each',
 'edu',
 'eg',
 'eight',
 'either',
 'else',
 'elsewhere',
 'enough',
 'entirely',
 'especially',
 'et',
 'etc',
 'even',
 'ever',
 'every',
 'everybody',
 'everyone',
 'everything',
 'everywhere',
 'ex',
 'exactly',
 'example',
 'except',
 'far',
 'few',
 'fifth',
 'first',
 'five',
 'followed',
 'following',
 'follows',
 'for',
 'former',
 'formerly',
 'forth',
 'four',
 'from',
 'further',
 'furthermore',
 'get',
 'gets',
 'getting',
 'given',
 'gives',
 'go',
 'goes',
 'going',
 'gone',
 'got',
 'gotten',
 'greetings',
 'had',
 "hadn't",
 'happens',
 'hardly',
 'has',
 "hasn't",
 'have',
 "haven't",
 'having',
 'he',
 "he's",
 'hello',
 'help',
 'hence',
 'her',
 'here',
 "here's",
 'hereafter',
 'hereby',
 'herein',
 'hereupon',
 'hers',
 'herself',
 'hi',
 'him',
 'himself',
 'his',
 'hither',
 'hopefully',
 'how',
 'howbeit',
 'however',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'ie',
 'if',
 'ignored',
 'immediate',
 'in',
 'inasmuch',
 'inc',
 'indeed',
 'indicate',
 'indicated',
 'indicates',
 'inner',
 'insofar',
 'instead',
 'into',
 'inward',
 'is',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'keep',
 'keeps',
 'kept',
 'know',
 'known',
 'knows',
 'last',
 'lately',
 'later',
 'latter',
 'latterly',
 'least',
 'less',
 'lest',
 'let',
 "let's",
 'like',
 'liked',
 'likely',
 'little',
 'look',
 'looking',
 'looks',
 'ltd',
 'mainly',
 'many',
 'may',
 'maybe',
 'me',
 'mean',
 'meanwhile',
 'merely',
 'might',
 'more',
 'moreover',
 'most',
 'mostly',
 'much',
 'must',
 'my',
 'myself',
 'name',
 'namely',
 'nd',
 'near',
 'nearly',
 'necessary',
 'need',
 'needs',
 'neither',
 'never',
 'nevertheless',
 'new',
 'next',
 'nine',
 'no',
 'nobody',
 'non',
 'none',
 'noone',
 'nor',
 'normally',
 'not',
 'nothing',
 'novel',
 'now',
 'nowhere',
 'obviously',
 'of',
 'off',
 'often',
 'oh',
 'ok',
 'okay',
 'old',
 'on',
 'once',
 'one',
 'ones',
 'only',
 'onto',
 'or',
 'other',
 'others',
 'otherwise',
 'ought',
 'our',
 'ours',
 'ourselves',
 'out',
 'outside',
 'over',
 'overall',
 'own',
 'particular',
 'particularly',
 'per',
 'perhaps',
 'placed',
 'please',
 'plus',
 'possible',
 'presumably',
 'probably',
 'provides',
 'que',
 'quite',
 'qv',
 'rather',
 'rd',
 're',
 'really',
 'reasonably',
 'regarding',
 'regardless',
 'regards',
 'relatively',
 'respectively',
 'right',
 'said',
 'same',
 'saw',
 'say',
 'saying',
 'says',
 'second',
 'secondly',
 'see',
 'seeing',
 'seem',
 'seemed',
 'seeming',
 'seems',
 'seen',
 'self',
 'selves',
 'sensible',
 'sent',
 'serious',
 'seriously',
 'seven',
 'several',
 'shall',
 'she',
 'should',
 "shouldn't",
 'since',
 'six',
 'so',
 'some',
 'somebody',
 'somehow',
 'someone',
 'something',
 'sometime',
 'sometimes',
 'somewhat',
 'somewhere',
 'soon',
 'sorry',
 'specified',
 'specify',
 'specifying',
 'still',
 'sub',
 'such',
 'sup',
 'sure',
 "t's",
 'take',
 'taken',
 'tell',
 'tends',
 'th',
 'than',
 'thank',
 'thanks',
 'thanx',
 'that',
 "that's",
 'thats',
 'the',
 'their',
 'theirs',
 'them',
 'themselves',
 'then',
 'thence',
 'there',
 "there's",
 'thereafter',
 'thereby',
 'therefore',
 'therein',
 'theres',
 'thereupon',
 'these',
 'they',
 "they'd",
 "they'll",
 "they're",
 "they've",
 'think',
 'third',
 'this',
 'thorough',
 'thoroughly',
 'those',
 'though',
 'three',
 'through',
 'throughout',
 'thru',
 'thus',
 'to',
 'together',
 'too',
 'took',
 'toward',
 'towards',
 'tried',
 'tries',
 'truly',
 'try',
 'trying',
 'twice',
 'two',
 'un',
 'under',
 'unfortunately',
 'unless',
 'unlikely',
 'until',
 'unto',
 'up',
 'upon',
 'us',
 'use',
 'used',
 'useful',
 'uses',
 'using',
 'usually',
 'value',
 'various',
 'very',
 'via',
 'viz',
 'vs',
 'want',
 'wants',
 'was',
 "wasn't",
 'way',
 'we',
 "we'd",
 "we'll",
 "we're",
 "we've",
 'welcome',
 'well',
 'went',
 'were',
 "weren't",
 'what',
 "what's",
 'whatever',
 'when',
 'whence',
 'whenever',
 'where',
 "where's",
 'whereafter',
 'whereas',
 'whereby',
 'wherein',
 'whereupon',
 'wherever',
 'whether',
 'which',
 'while',
 'whither',
 'who',
 "who's",
 'whoever',
 'whole',
 'whom',
 'whose',
 'why',
 'will',
 'willing',
 'wish',
 'with',
 'within',
 'without',
 "won't",
 'wonder',
 'would',
 "wouldn't",
 'yes',
 'yet',
 'you',
 "you'd",
 "you'll",
 "you're",
 "you've",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'zero',
 'a',
 'able',
 'about',
 'across',
 'after',
 'all',
 'almost',
 'also',
 'am',
 'among',
 'an',
 'and',
 'any',
 'are',
 'as',
 'at',
 'be',
 'because',
 'been',
 'but',
 'by',
 'can',
 'cannot',
 'could',
 'dear',
 'did',
 'do',
 'does',
 'either',
 'else',
 'ever',
 'every',
 'for',
 'from',
 'get',
 'got',
 'had',
 'has',
 'have',
 'he',
 'her',
 'hers',
 'him',
 'his',
 'how',
 'however',
 'i',
 'if',
 'in',
 'into',
 'is',
 'it',
 'its',
 'just',
 'least',
 'let',
 'like',
 'likely',
 'may',
 'me',
 'might',
 'most',
 'must',
 'my',
 'neither',
 'no',
 'nor',
 'not',
 'of',
 'off',
 'often',
 'on',
 'only',
 'or',
 'other',
 'our',
 'own',
 'rather',
 'said',
 'say',
 'says',
 'she',
 'should',
 'since',
 'so',
 'some',
 'than',
 'that',
 'the',
 'their',
 'them',
 'then',
 'there',
 'these',
 'they',
 'this',
 'tis',
 'to',
 'too',
 'twas',
 'us',
 'wants',
 'was',
 'we',
 'were',
 'what',
 'when',
 'where',
 'which',
 'while',
 'who',
 'whom',
 'why',
 'will',
 'with',
 'would',
 'yet',
 'you',
 'your',
 'thou',
 'thee',
 'thy',
 'thine',
 'ye']

In [21]:
selected_sentences


Out[21]:
[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlanta', 'NP-TL'), ("''", "''"), ('for', 'IN'), ('the', 'AT'), ('manner', 'NN'), ('in', 'IN'), ('which', 'WDT'), ('the', 'AT'), ('election', 'NN'), ('was', 'BEDZ'), ('conducted', 'VBN'), ('.', '.')], ...]

In [56]:
sentence = selected_sentences[0]

not_numerical_words = [word for word in sentence if not word[0].isdigit()]
not_punucation_words = [word for word in not_numerical_words if word[0] not in puncuation]
lower_words = [(word[0].lower(), word[1]) for word in not_punucation_words]
not_stopwords_words = [(word[0].lower(), word[1]) for word in lower_words if word[0] not in english_stopwords]
normalized_words = [wordnet_lemmatizer.lemmatize(word[0], penn_to_wn(word[1])) for word in not_stopwords_words]

normalized_words


Out[56]:
['fulton',
 'county',
 'grand',
 'jury',
 'friday',
 'investigation',
 "atlanta's",
 'recent',
 'primary',
 'election',
 'produce',
 'evidence',
 'irregularity',
 'place']

In [34]:
dequote("’the")


Out[34]:
'the'

In [31]:
DEQUOTE_PUNCTUATIONS


Out[31]:
('!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~',
 '“',
 '”')

In [9]:
list(puncuation)


Out[9]:
['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~',
 '“',
 '”',
 '’',
 '‘']

In [ ]: