For example, the sentence "the cat barked or swam" can be matched by the regular expression:
((the|a) (cat|dog|fish)) (barked|slept|swam) ((and|or) (barked|slept|swam))*
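As a quick sketch (not from the original notebook), the same pattern can be checked against a few sentences; the extra inputs below are just illustrative:

import re

pattern = r"((the|a) (cat|dog|fish)) (barked|slept|swam) ((and|or) (barked|slept|swam))*"
# re.match returns a match object when the pattern matches from the start of the string, otherwise None
for s in ["the cat barked or swam", "a fish slept and barked", "the dog meowed"]:
    print(s, "->", bool(re.match(pattern, s)))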
In [124]:
alist=[7,1,3,4,5]
print(min(alist))
In [114]:
import re
In [122]:
string="One person found xxx helpful"
if ("One" in string):
print(True)
else:
print(False)
In [87]:
pattern="((the|a) (cat|dog|fish)) (barked|slept|swam) ((and|or) (barked|slept|swam))*"
m=re.match(pattern, "the cat barked or swam")
print m.groups()
print m.group(),"\n"
print m.span()
print m.start()
print m.end(),"\n"
print m.group()[0],"\n",m.group()[21],"\n",m.group()[0:22]
Another way to match a sentence is to construct a regular grammar:
In [40]:
import nltk
from IPython.core.display import display
grammar = nltk.CFG.fromstring("""
S -> 'the' S1
S1 -> 'cat' VP
VP -> 'barked' VP1
VP1 -> C VP
C -> 'or' | 'and'
VP -> 'swam'
""")
sent = ['the', 'cat', 'barked', 'or', 'swam']
parser = nltk.ChartParser(grammar)
for tree in parser.parse(sent):
    print(tree)
    display(tree)
Now, here is a sentence that shows ambiguity, since it can be parsed into two different trees.
A well-known example of ambiguity is shown in (2), from the Groucho Marx movie, Animal Crackers (1930):
(2) While hunting in Africa, I shot an elephant in my pajamas. How he got into my pajamas, I don't know.
This grammar permits the sentence to be analyzed in two ways, depending on whether the prepositional phrase "in my pajamas" describes the elephant or the shooting event.
In [27]:
import nltk
from IPython.core.display import display
groucho_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
parser = nltk.ChartParser(groucho_grammar)
for tree in parser.parse(sent):
    print(tree)
    display(tree)
Stanford NLP
Installation guides
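The cells in this section assume a local Stanford NLP setup: a Java JDK plus the Stanford jar and model files downloaded to disk. Each cell points JAVA_HOME at the JDK home, CLASSPATH at the jar directory, and STANFORD_MODELS at the model directory; the /Users/chweng/... paths below are machine-specific and need to be adjusted.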
In [2]:
import nltk
import os
# set the environment variables
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home"
os.environ["CLASSPATH"] = "/Users/chweng/Desktop/StanfordNLP/jars"
os.environ["STANFORD_MODELS"] = "/Users/chweng/Desktop/StanfordNLP/models"
from nltk.tokenize import StanfordTokenizer
tokenizer = StanfordTokenizer()
sent = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
print(tokenizer.tokenize(sent))
In [9]:
import nltk
import os
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home"
os.environ["CLASSPATH"] = "/Users/chweng/Desktop/StanfordNLP/jars"
os.environ["STANFORD_MODELS"] = "/Users/chweng/Desktop/StanfordNLP/models"
from nltk.tag import StanfordNERTagger
eng_tagger = StanfordNERTagger(model_filename=r'english.all.3class.distsim.crf.ser.gz')
print(eng_tagger.tag(['New York',]))
print(eng_tagger.tag('Rami Eid is studying at Stony Brook University in New York'.split()))
In [11]:
from nltk.tag import StanfordPOSTagger
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home"
os.environ["CLASSPATH"] = "/Users/chweng/Desktop/StanfordNLP/jars"
os.environ["STANFORD_MODELS"] = "/Users/chweng/Desktop/StanfordNLP/models"
eng_tagger = StanfordPOSTagger(model_filename=r'english-bidirectional-distsim.tagger')
print(eng_tagger.tag('What is the airspeed of an unladen swallow ?'.split()))
In [16]:
from nltk.parse.stanford import StanfordParser
from IPython.core.display import display
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home"
os.environ["CLASSPATH"] = "/Users/chweng/Desktop/StanfordNLP/jars"
os.environ["STANFORD_MODELS"] = "/Users/chweng/Desktop/StanfordNLP/models"
eng_parser = StanfordParser()
structure = eng_parser.parse("the quick brown fox jumps over the lazy dog".split())
for tree in structure:
    print(tree)
    display(tree)
#print(list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split())))
In [17]:
from nltk.parse.stanford import StanfordDependencyParser
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home"
os.environ["CLASSPATH"] = "/Users/chweng/Desktop/StanfordNLP/jars"
os.environ["STANFORD_MODELS"] = "/Users/chweng/Desktop/StanfordNLP/models"
eng_parser = StanfordDependencyParser()
res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
for row in res[0].triples():
    print(row)
In [82]:
import nltk
print(nltk.corpus.gutenberg.fileids())
In [56]:
print(nltk.corpus.gutenberg.raw("whitman-leaves.txt")[:500])
In [71]:
print(nltk.corpus.gutenberg.words("whitman-leaves.txt")[:100])
In [67]:
for j in range(2):
    print(nltk.corpus.gutenberg.sents("whitman-leaves.txt")[j])
In [45]:
num_char = len(nltk.corpus.gutenberg.raw("whitman-leaves.txt"))    # length of the raw text, including spaces and punctuation
num_words = len(nltk.corpus.gutenberg.words("whitman-leaves.txt")) # number of word tokens
num_sents = len(nltk.corpus.gutenberg.sents("whitman-leaves.txt")) # number of sentences
num_vocab = len(set([w.lower() for w in nltk.corpus.gutenberg.words("whitman-leaves.txt")])) # vocabulary size (distinct lowercased words)
# print the vocabulary size and the total number of word tokens
print(len(set([w.lower() for w in nltk.corpus.gutenberg.words("whitman-leaves.txt")])), len([w.lower() for w in nltk.corpus.gutenberg.words("whitman-leaves.txt")]))
# print the average word length (including one trailing space), the average sentence length,
# and the average number of times each word appears in the text
print(int(num_char/num_words), int(num_words/num_sents), int(num_words/num_vocab), "whitman-leaves.txt")
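As a small extension (not in the original notebook), the same three ratios can be printed for every file in the Gutenberg corpus, reusing exactly the calls above:

import nltk

for fileid in nltk.corpus.gutenberg.fileids():
    num_char = len(nltk.corpus.gutenberg.raw(fileid))      # raw length: characters, spaces, punctuation
    num_words = len(nltk.corpus.gutenberg.words(fileid))   # word tokens
    num_sents = len(nltk.corpus.gutenberg.sents(fileid))   # sentences
    num_vocab = len(set(w.lower() for w in nltk.corpus.gutenberg.words(fileid)))  # distinct lowercased words
    print(int(num_char/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid)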
In [84]:
import nltk
from nltk.corpus import brown
print(brown.categories())
new_texts=brown.words(categories='news')
print(brown.raw(categories='news')[:300]+"\n")
wfreq=nltk.FreqDist([w.lower() for w in new_texts])
modals=['can','could','may','might','must','will']
for m in modals:
    print(m + ':', wfreq[m])
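As a follow-up sketch (not part of the original notebook), a ConditionalFreqDist can tabulate the same modal counts across several Brown categories at once; the three categories picked below are only examples:

import nltk
from nltk.corpus import brown

# condition = genre, sample = lowercased word; counts every word in every category
cfd = nltk.ConditionalFreqDist(
    (genre, word.lower())
    for genre in brown.categories()
    for word in brown.words(categories=genre))
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=['news', 'religion', 'romance'], samples=modals)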
In [50]:
len([w.lower() for w in new_texts])
Out[50]:
In [92]:
a='123 as2'
print(a.startswith('123 111'))
print(a.startswith('123 a'))
Here's a built-in corpus that records the inaugural speeches of U.S. presidents:
In [95]:
from nltk.corpus import inaugural
print(inaugural.words("1793-Washington.txt")[:200])
In [105]:
from nltk.corpus import inaugural
print(len(inaugural.fileids()))
print([fileid for fileid in inaugural.fileids()])
import nltk
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()
cfd.tabulate()
In [113]:
[w.lower()
 for fileid in inaugural.fileids()
 for w in inaugural.words(fileid)
 for target in ['america', 'citizen']
 if w.lower().startswith(target)][:20]
Out[113]:
In [104]:
import nltk
from nltk.tokenize import word_tokenize
sent = "the the the dog dog some other words that we do not care about"
print(word_tokenize(sent))
cfdist = nltk.ConditionalFreqDist((len(word), word) for word in word_tokenize(sent))
cfdist.plot()
cfdist.tabulate()  # tabulate() prints the table itself and returns None, so no print() wrapper is needed