For example, the sentence "the cat barked or swam" can be matched by the regular expression:
((the|a) (cat|dog|fish)) (barked|slept|swam) ((and|or) (barked|slept|swam))*
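
A quick sanity check of the pattern (a minimal sketch; the test sentences are illustrative):
In [ ]:

import re
pattern = r"((the|a) (cat|dog|fish)) (barked|slept|swam) ((and|or) (barked|slept|swam))*"
for s in ["the cat barked or swam", "a fish slept and barked", "the cat meowed"]:
    # fullmatch succeeds only if the whole sentence fits the pattern
    print(s, "->", bool(re.fullmatch(pattern, s)))
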
In [124]:
    
alist = [7, 1, 3, 4, 5]
print(min(alist))
    
    
In [114]:
    
import re
    
In [122]:
    
string="One person found xxx helpful"
if ("One" in string):
    print(True)
else:
    print(False)
    
    
In [87]:
    
pattern="((the|a) (cat|dog|fish)) (barked|slept|swam) ((and|or) (barked|slept|swam))*"
m=re.match(pattern, "the cat barked or swam")
print m.groups()
print m.group(),"\n"
print m.span()
print m.start()
print m.end(),"\n"
print m.group()[0],"\n",m.group()[21],"\n",m.group()[0:22]
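
Individual capture groups can also be read by index (a small sketch reusing m from the cell above):
In [ ]:

print(m.group(1))   # 'the cat'  -- the outermost noun-phrase group
print(m.group(4))   # 'barked'   -- the first verb group
print(m.group(5))   # 'or swam'  -- the conjunction clause
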
    
    
Another way to match the sentence is to construct a regular grammar (written below as an NLTK context-free grammar):
In [40]:
    
import nltk
from IPython.core.display import display
grammar = nltk.CFG.fromstring("""
S -> 'the' S1
S1 -> 'cat' VP
VP -> 'barked' VP1
VP1 ->  C VP
C  -> 'or' | 'and'
VP -> 'swam'
""")
sent = ['the', 'cat', 'barked', 'or', 'swam']
parser = nltk.ChartParser(grammar)
for tree in parser.parse(sent):
    print(tree)
    display(tree)
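
As a quick check (reusing the parser above), a word sequence the grammar cannot derive simply yields no trees:
In [ ]:

# all four words are covered by the grammar, but no complete S can be built
trees = list(parser.parse(['the', 'cat', 'swam', 'or']))
print(len(trees))   # 0 -- the trailing 'or' is left dangling
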
    
    
    
Now, here is a sentence that is ambiguous: it can be parsed into two different trees.
A well-known example of ambiguity is shown in (2), from the Groucho Marx movie, Animal Crackers (1930):
(2) While hunting in Africa, I shot an elephant in my pajamas. How he got into my pajamas, I don't know.
This grammar permits the sentence to be analyzed in two ways, depending on whether the prepositional phrase "in my pajamas" describes the elephant or the shooting event.
In [27]:
    
import nltk
from IPython.core.display import display
groucho_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
parser = nltk.ChartParser(groucho_grammar)
for tree in parser.parse(sent):
    print(tree)
    display(tree)
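
Counting the parses confirms the ambiguity (reusing parser and sent from the cell above):
In [ ]:

print(len(list(parser.parse(sent))))   # 2 -- one tree per reading
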
    
    
    
    
    
Stanford NLP
Installation guides
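The cells below assume a local Java installation plus the Stanford NLP jar and model files; the exact paths are machine-specific (the ones used here come from the author's machine). A minimal sketch to verify that the environment variables point at real directories before running the taggers and parsers:
In [ ]:

import os
# machine-specific paths -- adjust for your own setup
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home"
os.environ["CLASSPATH"] = "/Users/chweng/Desktop/StanfordNLP/jars"
os.environ["STANFORD_MODELS"] = "/Users/chweng/Desktop/StanfordNLP/models"
for var in ("JAVA_HOME", "CLASSPATH", "STANFORD_MODELS"):
    print(var, os.path.isdir(os.environ[var]))   # True if the directory exists
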
In [2]:
    
import nltk
import os
# set the environment variables needed by the Stanford tools
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home" 
os.environ["CLASSPATH"] = "/Users/chweng/Desktop/StanfordNLP/jars"
os.environ["STANFORD_MODELS"] = "/Users/chweng/Desktop/StanfordNLP/models"
from nltk.tokenize import StanfordTokenizer
tokenizer = StanfordTokenizer()
sent = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\nThanks."
print(tokenizer.tokenize(sent))
    
    
In [9]:
    
import nltk
import os
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home" 
os.environ["CLASSPATH"] = "/Users/chweng/Desktop/StanfordNLP/jars"
os.environ["STANFORD_MODELS"] = "/Users/chweng/Desktop/StanfordNLP/models"
from nltk.tag import StanfordNERTagger
eng_tagger = StanfordNERTagger(model_filename=r'english.all.3class.distsim.crf.ser.gz')  # resolved via STANFORD_MODELS
print(eng_tagger.tag(['New York',]))   # a single (multi-word) token
print(eng_tagger.tag('Rami Eid is studying at Stony Brook University in New York'.split()))
    
    
In [11]:
    
from nltk.tag import StanfordPOSTagger
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home" 
os.environ["CLASSPATH"] = "/Users/chweng/Desktop/StanfordNLP/jars"
os.environ["STANFORD_MODELS"] = "/Users/chweng/Desktop/StanfordNLP/models"
eng_tagger = StanfordPOSTagger(model_filename=r'english-bidirectional-distsim.tagger')
print(eng_tagger.tag('What is the airspeed of an unladen swallow ?'.split()))
    
    
In [16]:
    
from nltk.parse.stanford import StanfordParser
from IPython.core.display import display
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home" 
os.environ["CLASSPATH"] = "/Users/chweng/Desktop/StanfordNLP/jars"
os.environ["STANFORD_MODELS"] = "/Users/chweng/Desktop/StanfordNLP/models"
eng_parser = StanfordParser()
structure=eng_parser.parse("the quick brown fox jumps over the lazy dog".split())
for tree in structure:
    print(tree)
    display(tree)
#print(list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split())))
    
    
    
In [17]:
    
from nltk.parse.stanford import StanfordDependencyParser
os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/jdk1.8.0_25.jdk/Contents/Home" 
os.environ["CLASSPATH"] = "/Users/chweng/Desktop/StanfordNLP/jars"
os.environ["STANFORD_MODELS"] = "/Users/chweng/Desktop/StanfordNLP/models"
eng_parser = StanfordDependencyParser()
res = list(eng_parser.parse("the quick brown fox jumps over the lazy dog".split()))
for row in res[0].triples():
    print(row)
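
Each triple has the form ((head_word, head_tag), relation, (dependent_word, dependent_tag)); a small formatting loop over the res computed above makes the dependencies easier to read:
In [ ]:

for (head, htag), rel, (dep, dtag) in res[0].triples():
    print("{}/{} --{}--> {}/{}".format(head, htag, rel, dep, dtag))
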
    
    
In [82]:
    
import nltk
print(nltk.corpus.gutenberg.fileids())
    
    
In [56]:
    
print(nltk.corpus.gutenberg.raw("whitman-leaves.txt")[:500])
    
    
In [71]:
    
print(nltk.corpus.gutenberg.words("whitman-leaves.txt")[:100])
    
    
In [67]:
    
for j in range(2):
    print(nltk.corpus.gutenberg.sents("whitman-leaves.txt")[j])
    
    
In [45]:
    
num_char  = len(nltk.corpus.gutenberg.raw("whitman-leaves.txt"))    # length of the raw text, including spaces and punctuation
num_words = len(nltk.corpus.gutenberg.words("whitman-leaves.txt"))  # number of words
num_sents = len(nltk.corpus.gutenberg.sents("whitman-leaves.txt"))  # number of sentences
num_vocab = len(set(w.lower() for w in nltk.corpus.gutenberg.words("whitman-leaves.txt")))  # vocabulary size (distinct lowercased words)
# print the vocabulary size and the total word count
print(num_vocab, num_words)
# print the average word length (including one whitespace character), the average
# sentence length, and the average number of times each word appears in the text
print(int(num_char/num_words), int(num_words/num_sents), int(num_words/num_vocab), "whitman-leaves.txt")
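
The same statistics generalize to every text in the corpus (a sketch along the lines of the NLTK book's Gutenberg example):
In [ ]:

for fileid in nltk.corpus.gutenberg.fileids():
    num_char  = len(nltk.corpus.gutenberg.raw(fileid))
    num_words = len(nltk.corpus.gutenberg.words(fileid))
    num_sents = len(nltk.corpus.gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in nltk.corpus.gutenberg.words(fileid)))
    print(int(num_char/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid)
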
    
    
In [84]:
    
import nltk
from nltk.corpus import brown
print(brown.categories())
news_text = brown.words(categories='news')
print(brown.raw(categories='news')[:300]+"\n")
wfreq = nltk.FreqDist(w.lower() for w in news_text)
modals=['can','could','may','might','must','will']
for m in modals:
    print(m + ':',wfreq[m])
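
The comparison extends naturally across genres with a conditional frequency distribution (a sketch following the NLTK book's modal-verbs example):
In [ ]:

cfd = nltk.ConditionalFreqDist(
    (genre, word.lower())
    for genre in brown.categories()
    for word in brown.words(categories=genre))
cfd.tabulate(conditions=['news', 'religion', 'hobbies'], samples=modals)
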
    
    
In [50]:
    
len([w.lower() for w in news_text])
    
In [92]:
    
a = '123 as2'
print(a.startswith('123 111'))   # False
print(a.startswith('123 a'))     # True
    
    
In [95]:
    
from nltk.corpus import inaugural
print(inaugural.words("1793-Washington.txt")[:200])
    
    
Here is a built-in corpus that records the inaugural speeches of the presidents of the U.S.:
In [105]:
    
import nltk
from nltk.corpus import inaugural
print(len(inaugural.fileids()))
print([fileid for fileid in inaugural.fileids()])
# condition = the target word; event = the year (first four characters of the fileid)
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target))
cfd.plot()
cfd.tabulate()
    
    
    
    
In [113]:
    
[w.lower()
 for fileid in inaugural.fileids()
 for w in inaugural.words(fileid)
 for target in ['america', 'citizen']
 if w.lower().startswith(target)][:20]
    
In [104]:
    
import nltk
from nltk.tokenize import word_tokenize
sent = "the the the dog dog some other words that we do not care about"
print(word_tokenize(sent))
# condition = word length; event = the word itself
cfdist = nltk.ConditionalFreqDist((len(word), word) for word in word_tokenize(sent))
cfdist.plot()
cfdist.tabulate()   # tabulate() prints the table itself and returns None