In [1]:
import nltk
from nltk.corpus import names
import random
In [2]:
name = [(n,'M') for n in names.words('male.txt')] + [(n,'F') for n in names.words('female.txt')]
random.shuffle(name)
name[:10]
Out[2]:
Define a function that generates the features; here the feature is simply the last letter of the name.
In [3]:
def gender_feature(name): return {'last_letter': name[-1]}
featuresets = [(gender_feature(n), g) for (n,g) in name]
train, test = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train)
In [4]:
classifier.classify({'last_letter': 'a'})
Out[4]:
In [5]:
nltk.classify.accuracy(classifier, test)
Out[5]:
In [6]:
classifier.show_most_informative_features()
Try different features, for example adding the first letter, or the length of the name.
In [7]:
def gender_feature(name): return {'last_letter': name[-1], 'first_letter': name[0]}
featuresets = [(gender_feature(n), g) for (n,g) in name]
train, test = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train)
In [8]:
nltk.classify.accuracy(classifier, test)
Out[8]:
In [9]:
def gender_feature(name): return {'last_letter': name[-1], 'first_letter': name[0], 'len': len(name)}
featuresets = [(gender_feature(n), g) for (n,g) in name]
train, test = featuresets[500:], featuresets[:500]
classifier = nltk.NaiveBayesClassifier.train(train)
In [12]:
nltk.classify.accuracy(classifier, test)
Out[12]:
In [13]:
classifier.show_most_informative_features()
In [17]:
from nltk.corpus import movie_reviews
In [18]:
movie_reviews.categories()
Out[18]:
In [21]:
movie_reviews.fileids('neg')[:5], movie_reviews.fileids('pos')[:5]
Out[21]:
In [22]:
documents = [(list(movie_reviews.words(f)), c)
             for c in movie_reviews.categories()
             for f in movie_reviews.fileids(c)]
random.shuffle(documents)
In [41]:
# use the 2000 most frequent words as features
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = [w for (w, c) in all_words.most_common()[:2000]]
word_features[:5]
Out[41]:
In [42]:
def document_features(doc):
    document_words = set(doc)  # a set automatically removes duplicate words
    features = {w: w in document_words for w in word_features}
    return features
In [44]:
tmp = document_features(['the', 'she'])
[key for key in tmp if tmp[key]]
Out[44]:
In [45]:
featuresets = [(document_features(d), c) for (d, c) in documents]
train, test = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train)
nltk.classify.accuracy(classifier, test)
Out[45]:
In [46]:
classifier.show_most_informative_features()
In [2]:
import nltk
from nltk.corpus import brown
fdist = nltk.FreqDist()
fdist.update([w[-1:] for w in brown.words()])
fdist.update([w[-2:] for w in brown.words()])
fdist.update([w[-3:] for w in brown.words()])
In [2]:
common_suf = [k for (k, c) in fdist.most_common()[:100]]
common_suf[:5]
Out[2]:
In [3]:
def pos_features(word):
    features = {suffix: word.lower().endswith(suffix) for suffix in common_suf}
    return features
In [4]:
featuresets = [(pos_features(n), pos) for (n, pos) in brown.tagged_words(categories='news')]
size = int(len(featuresets) * 0.1)
train, test = featuresets[size:], featuresets[:size]
size
Out[4]:
In [5]:
classifier = nltk.DecisionTreeClassifier.train(train)
nltk.classify.accuracy(classifier, test)
Out[5]:
In [11]:
# a decision tree can print out its structure
print(classifier.pretty_format(depth=10))
In [1]:
def pos_features(sentence, i):
    features = {"suf_1": sentence[i][-1:], "suf_2": sentence[i][-2:], "suf_3": sentence[i][-3:]}
    if i == 0:
        features['prev'] = '*'
    else:
        features['prev'] = sentence[i-1]
    return features
In [3]:
pos_features(brown.sents()[0], 8)
Out[3]:
In [6]:
brown.sents()[0][7:10]
Out[6]:
In [7]:
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent, i), tag))
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)
Out[7]:
In [9]:
classifier.show_most_informative_features()
In [10]:
def pos_features(sentence, i, history):
    features = {"suf_1": sentence[i][-1:], "suf_2": sentence[i][-2:], "suf_3": sentence[i][-3:]}
    if i == 0:
        features['prev-word'] = '*'
        features['prev-tag'] = '*'
    else:
        features['prev-word'] = sentence[i-1]
        features['prev-tag'] = history[i-1]
    return features
In [11]:
# define our own tagger class
class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            # enumerate yields (index, value) pairs
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return list(zip(sentence, history))
In [12]:
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)
tagger.evaluate(test_sents)
Out[12]:
In [14]:
import random
from nltk.corpus import brown
tagged_sents = list(brown.tagged_sents(categories='news'))
random.shuffle(tagged_sents)
# hold out 10% of the data as the test set
size = int(len(tagged_sents) * 0.1)
train_set, test_set = tagged_sents[size:], tagged_sents[:size]
Take a courtroom as an example. Precision is the probability that someone found guilty is actually guilty; if precision is 0.95, then 5% of those convicted were wrongly convicted. Recall is the probability that a guilty person is actually convicted; if recall is 0.85, then 15% of the guilty are wrongly set free. The F-score is then the harmonic mean of the two: (2 * 0.95 * 0.85) / (0.95 + 0.85) ≈ 0.897.
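A quick sanity check of that arithmetic (illustrative only; the numbers are the hypothetical courtroom figures above, not output from any classifier in this notebook):

precision, recall = 0.95, 0.85
f_score = 2 * precision * recall / (precision + recall)  # harmonic mean
print(round(f_score, 3))  # 0.897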
A confusion matrix is used to analyze which input classes get mapped to which output classes.
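As a small sketch (my own illustration with made-up reference and predicted tag sequences, not data from this notebook), nltk.ConfusionMatrix tabulates reference labels against a classifier's output:

import nltk

reference = ['NN', 'VB', 'NN', 'DT', 'NN', 'JJ']
predicted = ['NN', 'NN', 'NN', 'DT', 'VB', 'JJ']
cm = nltk.ConfusionMatrix(reference, predicted)
print(cm.pretty_format(sort_by_count=True))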
To classify with a probabilistic model, the simplest approach is to pick the label with the highest probability; for example, a unigram model of English would predict every word to be 'the'. Once we know the probability model $P(\text{label} \mid \text{features})$, we can classify a new input by choosing $\arg\max_\ell P(\ell \mid \text{features})$.
$$ P(\text{label} \mid \text{features}) = \frac{P(\text{features}, \text{label})}{P(\text{features})} = \frac{P(\text{features} \mid \text{label}) \, P(\text{label})}{P(\text{features})} = \frac{\prod_{f \in \text{features}} P(f \mid \text{label}) \, P(\text{label})}{P(\text{features})} $$
where the last step makes the naive Bayes assumption that the features are independent given the label. In practice, we estimate $P(f \mid \text{label})$ as $\text{count}(f, \text{label}) / \text{count}(\text{label})$. In the POS-tagging problem, for instance, if we want $P(\text{suffix\_es} \mid \text{NN})$, the estimate is $\text{count}(\text{suffix\_es}, \text{NN}) / \text{count}(\text{NN})$.
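A minimal sketch of that estimate (my own addition, not a cell from the original notebook; it assumes the Brown corpus is available as in the cells above) computes count(suffix_es, NN) / count(NN) over the news category:

from nltk.corpus import brown

nn_count = 0
nn_es_count = 0
for word, tag in brown.tagged_words(categories='news'):
    if tag == 'NN':
        nn_count += 1
        if word.lower().endswith('es'):
            nn_es_count += 1

# estimate of P(suffix_es | NN) = count(suffix_es, NN) / count(NN)
print(nn_es_count / nn_count)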