First, let's analyze some text...
...
Tagging and parsing into trees are different things; a quick contrast below:
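A tagger returns a flat list of (word, tag) pairs, while a parser nests those tags into phrases. A minimal sketch with NLTK's built-in tagger (assumes the standard NLTK tokenizer and tagger data are downloaded):
In [ ]:
import nltk
# tagging: flat (word, tag) pairs, no structure
print nltk.pos_tag(nltk.word_tokenize("The quick brown fox jumps"))
# parsing (demoed below) nests these same tags into a tree of phrases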
Other important words:
NLTK is the mother of all NLP toolkits.
There are so many parsers:
In [ ]:
sent = "Each of us is full of shit in our own special way"
# setup display for demo
%matplotlib inline
import os
os.environ['DISPLAY'] = 'localhost:1'
In [ ]:
from stat_parser import Parser
parser = Parser()
tree = parser.parse(sent)  # returns an nltk Tree instance
tree
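Since the result is a plain nltk Tree, the usual Tree methods apply, for example:
In [ ]:
tree.pprint()        # bracketed (S (NP ...) (VP ...)) text form
print tree.height()  # depth of the parse tree
# tree.draw()        # opens a Tk window with the rendered tree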
In [ ]:
from textblob import TextBlob
blob = TextBlob(sent)
blob.parse()
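TextBlob also exposes the tags and noun phrases directly, which is often all you need:
In [ ]:
print blob.tags          # (word, POS) tuples
print blob.noun_phrases  # detected noun phrases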
In [ ]:
import nltk
# assumes malt.jar and the pre-trained engmalt.linear-1.7.mco model
# are available in the current working directory
mp = nltk.parse.malt.MaltParser(os.getcwd(),
                                model_filename="engmalt.linear-1.7.mco")
mp.parse_one(sent.split()).tree()
In [ ]:
from pattern.en import parse, pprint
s = parse(sent,
          tokenize=True,   # tokenize the input, i.e. split punctuation from words
          tags=True,       # find part-of-speech tags
          chunks=True,     # find chunk tags, e.g. "the black cat" = NP = noun phrase
          relations=True,  # find relations between chunks
          lemmata=True,    # find word lemmata
          light=False)
pprint(s)
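pprint() renders a table, but parse() itself returns a slash-delimited tagged string; its split() method yields one annotation list per token:
In [ ]:
print s.split()  # nested lists: sentences -> tokens -> annotations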
In [ ]:
from spacy.en import English
parser = English()
parsedData = parser(unicode(sent))
In [ ]:
for i, token in enumerate(parsedData):
    print "original:", token.orth, token.orth_
    print "lowercased:", token.lower, token.lower_
    print "lemma:", token.lemma, token.lemma_
    print "shape:", token.shape, token.shape_
    print "prefix:", token.prefix, token.prefix_
    print "suffix:", token.suffix, token.suffix_
    print "log probability:", token.prob
    print "Brown cluster id:", token.cluster
    print "----------------------------------------"
    if i > 1:
        break
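The same pipeline also runs a dependency parse; each token carries its relation label and its head (attribute names as in this era's spaCy API):
In [ ]:
for token in parsedData:
    print token.orth_, token.pos_, token.dep_, "<-", token.head.orth_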
In [ ]:
from visualize_word_graph import draw_graph
draw_graph("dog")
In [ ]:
draw_graph("noise", hypernym=True)
In [ ]:
bad_sounds = ['The sound in the place is terrible.',
              'dining with clatter and the occasional smell of BMW exausts',
              'Also, the acoustics are not conducive to having any sort of conversation.']
not_bad_sounds = ["not to sound like a snob",
                  "at your table and you can tune the sound to whichever game you're interested in",
                  "oh god I sound old!"]
In [ ]:
from pattern.en import parse, pprint
def print_parts(sents):
    for sent in sents:
        s = parse(sent,
                  tokenize=True,   # tokenize the input, i.e. split punctuation from words
                  tags=True,       # find part-of-speech tags
                  chunks=True,     # find chunk tags, e.g. "the black cat" = NP = noun phrase
                  relations=True,  # find relations between chunks
                  lemmata=True,    # find word lemmata
                  light=False)
        print sent
        pprint(s)

sents = bad_sounds + not_bad_sounds
print_parts(sents)
Penn Treebank Project tag and chunk guide
Tag | Description | Example |
--- | --- | --- |
CC | conjunction, coordinating | and, or, but |
CD | cardinal number | five, three, 13% |
DT | determiner | the, a, these |
EX | existential there | there were six boys |
FW | foreign word | mais |
IN | conjunction, subordinating or preposition | of, on, before, unless |
JJ | adjective | nice, easy |
JJR | adjective, comparative | nicer, easier |
JJS | adjective, superlative | nicest, easiest |
LS | list item marker | |
MD | verb, modal auxiliary | may, should |
NN | noun, singular or mass | tiger, chair, laughter |
NNS | noun, plural | tigers, chairs, insects |
NNP | noun, proper singular | Germany, God, Alice |
NNPS | noun, proper plural | we met two Christmases ago |
PDT | predeterminer | both his children |
POS | possessive ending | 's |
PRP | pronoun, personal | me, you, it |
PRP$ | pronoun, possessive | my, your, our |
RB | adverb | extremely, loudly, hard |
RBR | adverb, comparative | better |
RBS | adverb, superlative | best |
RP | adverb, particle | about, off, up |
SYM | symbol | % |
TO | infinitival to | what to do? |
UH | interjection | oh, oops, gosh |
VB | verb, base form | think |
VBZ | verb, 3rd person singular present | she thinks |
VBP | verb, non-3rd person singular present | I think |
VBD | verb, past tense | they thought |
VBN | verb, past participle | a sunken ship |
VBG | verb, gerund or present participle | thinking is fun |
WDT | wh-determiner | which, whatever, whichever |
WP | wh-pronoun, personal | what, who, whom |
WP$ | wh-pronoun, possessive | whose, whosever |
WRB | wh-adverb | where, when |
. | punctuation mark, sentence closer | .;?* |
, | punctuation mark, comma | , |
: | punctuation mark, colon | : |
( | contextual separator, left paren | ( |
) | contextual separator, right paren | ) |
Tag | Description | Words | Example | % |
--- | --- | --- | --- | --- |
NP | noun phrase | DT+RB+JJ+NN + PR | the strange bird | 51 |
PP | prepositional phrase | TO+IN | in between | 19 |
VP | verb phrase | RB+MD+VB | was looking | 9 |
ADVP | adverb phrase | RB | also | 6 |
ADJP | adjective phrase | CC+RB+JJ | warm and cosy | 3 |
SBAR | subordinating conjunction | IN | whether or not | 3 |
PRT | particle | RP | up the stairs | 1 |
INTJ | interjection | UH | hello | 0 |
In [ ]:
from pattern.en import parsetree
from pattern.search import search
for sent in sents:
    t = parsetree(sent)
    print
    print sent
    print "Tagged Sent:", t
    print "Verbs:", search('VB*', t)  # all verb forms
    print "ADJP:", search('ADJP', t)  # adjective phrases
    print "Nouns:", search('NN', t)   # nouns
In [ ]:
from nltk.corpus import wordnet as wn
from pattern.en import parsetree
from pattern.search import taxonomy, WordNetClassifier, search
taxonomy.classifiers.append(WordNetClassifier())
def get_parts(word, pos, recursive=False):
    parts = [word, ]
    parts += taxonomy.children(word, pos=pos, recursive=recursive)
    parts += taxonomy.parents(word, pos=pos, recursive=recursive)
    return parts

def word_search(t, word, pos):
    parts = get_parts(word, pos)
    results = search(pos, t)
    for result in results:
        # print result.string, parts
        if any(x in result.string.split() for x in parts):
            return True
    return False

def run_a_rule(sent, word, pos):
    t = parsetree(sent)
    return word_search(t, word, pos)
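A quick sanity check of what the taxonomy expansion returns (the exact contents depend on your local WordNet data):
In [ ]:
print get_parts('noise', 'NN')[:10]  # 'noise' plus its WordNet children and parents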
In [ ]:
print "1. 'sound' is a NN"
print run_a_rule(sents[0], 'noise', 'NN')
print "2. clatter is a NN"
print run_a_rule(sents[1], 'noise', 'NN')
print "3. acoustics is NNS and RB Not"
print run_a_rule(sents[2], 'acoustics', 'NNS') and run_a_rule(sents[2], 'not', 'RB')
print "4. sound is a VB"
print run_a_rule(sents[3], 'noise', 'VB*')
print "5. Sounds is JJ"
print run_a_rule(sents[4], 'sound', 'JJ')
print "6. sound is VBP"
print run_a_rule(sents[5], 'noise', 'VB*')
In [ ]:
def ext_func(tgt):
    return bool(not (run_a_rule(tgt, 'noise', 'VB*') and not run_a_rule(tgt, 'sound', 'JJ'))
                and (run_a_rule(tgt, 'noise', 'NN') or run_a_rule(tgt, 'acoustics', 'NNS') or
                     (run_a_rule(tgt, 'acoustics', 'NNS') and run_a_rule(tgt, 'not', 'RB'))))

print "bad noises in review:"
for sent in bad_sounds:
    print "\t" + sent
    assert ext_func(sent)
print
print "no mention of bad noises:"
for sent in not_bad_sounds:
    print "\t" + sent
    assert not ext_func(sent)
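For readability, the same predicate can be flattened with De Morgan's law; note that NNS(acoustics) or (NNS(acoustics) and RB(not)) reduces to just NNS(acoustics), so an equivalent sketch is:
In [ ]:
def ext_func_simple(tgt):
    # a verb use of a noise word disqualifies, unless 'sound' also appears as an adjective
    sound_as_verb = run_a_rule(tgt, 'noise', 'VB*') and not run_a_rule(tgt, 'sound', 'JJ')
    # a bad-noise mention needs a noise noun or 'acoustics' as a plural noun
    noise_mention = run_a_rule(tgt, 'noise', 'NN') or run_a_rule(tgt, 'acoustics', 'NNS')
    return not sound_as_verb and noise_mention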
In [14]:
import zipfile
import pickle
from lxml import etree
from StringIO import StringIO

zf = zipfile.ZipFile('nhtsa_as_xml.zip', 'r')
nhtsa_injured = zf.read('nhtsa_injured.xml')
nhtsa_not_injured = zf.read('nhtsa_not_injured.xml')
xml_injured = etree.parse(StringIO(nhtsa_injured))
xml_not_injured = etree.parse(StringIO(nhtsa_not_injured))

def injured(l):
    # map the raw flag column to labels: '0' means no injury
    return ['injured' if str(x) != '0' else 'notinjured' for x in l]

def data(x):
    out = [x.xpath("//rows/row/@c1"),
           injured(x.xpath("//rows/row/@c8")),
           x.xpath("//rows/row/@c2")]
    return list(reversed(zip(*out)))

xml_injured_data = data(xml_injured)[:800]
xml_not_injured_data = data(xml_not_injured)[:800]
In [15]:
xml_injured_data[0]
Out[15]:
In [16]:
from visualize_word_graph import draw_graph
draw_graph("injury")
Out[16]:
In [17]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from pattern.search import taxonomy, search

taxonomy.append('dislocated', type='injury')
taxonomy.append('sustained', type='injury')
taxonomy.append('burn', type='injury')
taxonomy.append('injury', type='hurt')

def check_sustained(text):
    # the uppercase 'HURT' constraint matches anything under 'hurt' in the taxonomy
    return len(search('HURT', text)) > 0

def feats(text):
    words = text.replace(".", "").split()
    out = dict([(word, True) for word in words])
    # replace the raw SUSTAINED token with the output of the hand-written rule
    if 'SUSTAINED' in out:
        del out['SUSTAINED']
    out['rule(SUSTAINED)'] = check_sustained(text)
    return out

negfeats = [(feats(f[2]), 'not') for f in xml_not_injured_data]
posfeats = [(feats(f[2]), 'injure') for f in xml_injured_data]

# hold out the last quarter of each class for testing
negcutoff = len(negfeats)*3/4
poscutoff = len(posfeats)*3/4
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

classifier = NaiveBayesClassifier.train(trainfeats)
print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
classifier.show_most_informative_features(n=100)
classifier.classify(feats("HE SUSTAINED INJURY"))
Out[17]:
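The point of feats() is to fold a hand-written taxonomy rule into an otherwise plain bag-of-words feature set; the injected feature is easy to inspect:
In [ ]:
print feats("HE SUSTAINED INJURY")  # the raw SUSTAINED token is replaced by rule(SUSTAINED)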
posh converts rules like:

return bool(not (run_a_rule(tgt, 'noise', 'VB*') and not run_a_rule(tgt, 'sound', 'JJ'))
            and (run_a_rule(tgt, 'noise', 'NN') or run_a_rule(tgt, 'acoustics', 'NNS') or
                 (run_a_rule(tgt, 'acoustics', 'NNS') and run_a_rule(tgt, 'not', 'RB'))))

to a single pattern:

SENT: (!VB*(noise+3) and !JJ(sound+3)) and (NN(noise+2) | NNS(acoustics) | (NNS(acoustics) & RB(not)))
Coming soon to: https://github.com/brianray/posh
Copy of this presentation found here: https://github.com/brianray/puppy_dec_2015