Naive Bayes Algorithm
Using Python's built-in packages
In [5]:
from nltk.corpus import movie_reviews
from sklearn.cross_validation import StratifiedShuffleSplit
import nltk
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
We use the bundled NLTK library to obtain the data. From the built-in corpus we first extract the categories, and then, for each file under a category, collect the words belonging to that category.
The data is returned as a list of (word list, label) tuples.
In [7]:
def get_data():
    dataset = []
    y_labels = []
    # Extract the categories
    for cat in movie_reviews.categories():
        # For each file under this category
        for fileid in movie_reviews.fileids(cat):
            # Get the words belonging to this category
            words = list(movie_reviews.words(fileid))
            dataset.append((words, cat))
            y_labels.append(cat)
    return dataset, y_labels
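As a quick sanity check, the structure returned by get_data can be inspected directly. This is a minimal sketch, assuming the movie_reviews corpus has already been downloaded with nltk.download('movie_reviews'):
In [ ]:
dataset, y_labels = get_data()
# The corpus holds 2000 reviews, 1000 per category ('neg' and 'pos')
print(len(dataset))
# Each instance is a (word list, category) tuple
words, label = dataset[0]
print(label)
print(words[:5])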
In [13]:
def get_train_test(input_dataset, ylabels):
    '''
    Split the data into training and test sets
    '''
    train_size = 0.7
    test_size = 1 - train_size
    stratified_split = StratifiedShuffleSplit(ylabels, test_size=test_size, n_iter=1, random_state=77)
    for train_indx, test_indx in stratified_split:
        train = [input_dataset[i] for i in train_indx]
        train_y = [ylabels[i] for i in train_indx]
        test = [input_dataset[i] for i in test_indx]
        test_y = [ylabels[i] for i in test_indx]
    return train, test, train_y, test_y
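Note that StratifiedShuffleSplit is imported above from sklearn.cross_validation, which was removed in scikit-learn 0.20. On newer scikit-learn versions a roughly equivalent 70/30 stratified split can be written against sklearn.model_selection; the sketch below is one possible adaptation (the name get_train_test_v2 and the import alias are illustrative, not part of the original code):
In [ ]:
# Alias the newer class so it does not clash with the import above
from sklearn.model_selection import StratifiedShuffleSplit as StratifiedShuffleSplitNew

def get_train_test_v2(input_dataset, ylabels, test_size=0.3, seed=77):
    # n_splits=1 yields a single stratified shuffle split
    splitter = StratifiedShuffleSplitNew(n_splits=1, test_size=test_size, random_state=seed)
    # The first argument is only used for its length, so the labels can stand in for X
    train_indx, test_indx = next(splitter.split(ylabels, ylabels))
    train = [input_dataset[i] for i in train_indx]
    train_y = [ylabels[i] for i in train_indx]
    test = [input_dataset[i] for i in test_indx]
    test_y = [ylabels[i] for i in test_indx]
    return train, test, train_y, test_y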
The next three functions are feature-generation functions: classification needs features (attributes), and given a review, these functions generate a set of features from it.
In [14]:
def build_word_features(instance):
    # Use a dictionary to hold the features
    feature_set = {}
    # The word list is the first item of the instance tuple
    words = instance[0]
    for word in words:
        feature_set[word] = 1
    return (feature_set, instance[1])
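A toy instance (not taken from the corpus) makes the output format clear: every word becomes a binary presence feature and the label is passed through unchanged.
In [ ]:
toy = (['a', 'good', 'movie'], 'pos')
print(build_word_features(toy))
# -> ({'a': 1, 'good': 1, 'movie': 1}, 'pos')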
In [15]:
def build_negate_features(instance):
    '''
    If a word is preceded by either 'not' or 'no',
    this function adds the prefix 'Not_' to that word.
    It also does not insert the negation word itself
    ('not' or 'no') into the feature dictionary.
    '''
    # Retrieve the words, i.e. the first item of the instance tuple
    words = instance[0]
    final_words = []
    # Use a boolean flag to track whether the previous word was a negation word
    negate = False
    negate_words = ['no', 'not']
    for word in words:
        if negate:
            word = 'Not_' + word
            negate = False
        if word not in negate_words:
            final_words.append(word)
        else:
            negate = True
    feature_set = {}
    for word in final_words:
        feature_set[word] = 1
    return (feature_set, instance[1])
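Again on a toy instance (illustrative only): the negation word itself is dropped and the word that follows it gets the 'Not_' prefix.
In [ ]:
toy = (['not', 'a', 'good', 'movie'], 'neg')
print(build_negate_features(toy))
# -> ({'Not_a': 1, 'good': 1, 'movie': 1}, 'neg')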
In [16]:
def remove_stop_words(in_data):
    stopword_list = stopwords.words('english')
    negate_words = ['no', 'not']
    # Keep the negation words; they carry sentiment information
    new_stopwords = [word for word in stopword_list if word not in negate_words]
    label = in_data[1]
    # Remove the stop words
    words = [word for word in in_data[0] if word not in new_stopwords]
    return (words, label)
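A quick check on a made-up sentence, assuming the NLTK stopwords corpus has been downloaded with nltk.download('stopwords'). Note that 'not' survives because it was removed from the stop word list.
In [ ]:
toy = (['this', 'is', 'not', 'a', 'good', 'movie'], 'neg')
print(remove_stop_words(toy))
# -> (['not', 'good', 'movie'], 'neg')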
In [17]:
def build_keyphrase_features(instance):
    feature_set = {}
    instance = remove_stop_words(instance)
    words = instance[0]
    # Find bigrams using raw frequency counts
    bigram_finder = BigramCollocationFinder.from_words(words)
    # Rank the bigrams by raw frequency in descending order and keep the top 400
    bigrams = bigram_finder.nbest(BigramAssocMeasures.raw_freq, 400)
    for bigram in bigrams:
        feature_set[bigram] = 1
    return (feature_set, instance[1])
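On a toy instance (illustrative; it also relies on the stopwords corpus because remove_stop_words is called first), the feature keys are the bigram tuples themselves:
In [ ]:
toy = (['good', 'movie', 'good', 'movie', 'bad', 'plot'], 'pos')
features, label = build_keyphrase_features(toy)
print(sorted(features.keys()))
# -> [('bad', 'plot'), ('good', 'movie'), ('movie', 'bad'), ('movie', 'good')]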
In [18]:
def build_model(features):
    model = nltk.NaiveBayesClassifier.train(features)
    return model
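nltk.NaiveBayesClassifier.train expects a list of (feature dictionary, label) tuples, which is exactly what the feature builders above produce. A minimal illustrative run with two hand-made instances:
In [ ]:
toy_features = [({'good': 1}, 'pos'), ({'bad': 1}, 'neg')]
toy_model = build_model(toy_features)
# The presence of 'good' pushes the prediction towards 'pos'
print(toy_model.classify({'good': 1}))
# -> pos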
In [19]:
def probe_model(model, features, dataset_type='Train'):
    # Report classification accuracy on the given feature set
    accuracy = nltk.classify.accuracy(model, features)
    print '\n' + dataset_type + ' Accuracy = %0.2f'%(accuracy*100) + '%'
In [20]:
def show_features(model, no_features=5):
    print '\nFeature Importance'
    print '==================='
    # show_most_informative_features prints the table itself and returns None
    model.show_most_informative_features(no_features)
Getting the right model on the first attempt is very hard; it takes repeated experiments with different features and parameter tuning.
In [21]:
def build_model_cycle_1(train_data, dev_data):
    # Build features for the training set
    train_features = map(build_word_features, train_data)
    # Build features for the dev set
    dev_features = map(build_word_features, dev_data)
    # Build the model
    model = build_model(train_features)
    # Probe the model
    probe_model(model, train_features)
    probe_model(model, dev_features, 'Dev')
    return model
In [22]:
def build_model_cycle_2(train_data, dev_data):
    # Build features for the training set
    train_features = map(build_negate_features, train_data)
    # Build features for the dev set
    dev_features = map(build_negate_features, dev_data)
    # Build the model
    model = build_model(train_features)
    # Probe the model
    probe_model(model, train_features)
    probe_model(model, dev_features, 'Dev')
    return model
In [23]:
def build_model_cycle_3(train_data, dev_data, test_data):
    # Build features for the training set
    train_features = map(build_keyphrase_features, train_data)
    # Build features for the dev set
    dev_features = map(build_keyphrase_features, dev_data)
    # Build the model
    model = build_model(train_features)
    # Probe the model
    probe_model(model, train_features)
    probe_model(model, dev_features, 'Dev')
    # Evaluate on the held-out test set only in this final cycle
    test_features = map(build_keyphrase_features, test_data)
    probe_model(model, test_features, 'Test')
    return model
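One portability caveat: the three cycle functions rely on map() returning a list, which is Python 2 behaviour. Under Python 3, map() returns a single-use iterator, so the features would be exhausted after build_model and the accuracy check in probe_model would see no data. A hypothetical Python 3 variant of the first cycle is sketched below (it also assumes the print statements in probe_model and show_features have been rewritten as print() calls):
In [ ]:
def build_model_cycle_1_py3(train_data, dev_data):
    # list() materialises the iterator returned by map() under Python 3,
    # so the same features can be reused by build_model and probe_model
    train_features = list(map(build_word_features, train_data))
    dev_features = list(map(build_word_features, dev_data))
    model = build_model(train_features)
    probe_model(model, train_features)
    probe_model(model, dev_features, 'Dev')
    return model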
In [25]:
if __name__ == '__main__':
    input_dataset, y_labels = get_data()
    # Training data
    train_data, all_test_data, train_y, all_test_y = get_train_test(input_dataset, y_labels)
    # Dev data
    dev_data, test_data, dev_y, test_y = get_train_test(all_test_data, all_test_y)
    # Look at the sizes of the different datasets
    print '\nOriginal Data size =', len(input_dataset)
    print '\nTraining Data size =', len(train_data)
    print '\nDev Data size =', len(dev_data)
    print '\nTest Data size =', len(test_data)
    # The different modelling cycles
    model_cycle_1 = build_model_cycle_1(train_data, dev_data)
    show_features(model_cycle_1)
    model_cycle_2 = build_model_cycle_2(train_data, dev_data)
    show_features(model_cycle_2)
    model_cycle_3 = build_model_cycle_3(train_data, dev_data, test_data)
    show_features(model_cycle_3)