The Naive Bayes Algorithm

Using off-the-shelf Python packages (NLTK and scikit-learn)


In [5]:
from nltk.corpus import movie_reviews
from sklearn.cross_validation import StratifiedShuffleSplit  # pre-0.18 scikit-learn API; see the modern equivalent below
import nltk
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

Fetch the data with NLTK's built-in movie_reviews corpus: the corpus is organized by category, and for each file under a category we collect the words that belong to that category.

The data comes back as a list of (word list, category) tuples, plus a separate list of the labels.


In [7]:
def get_data():
    dataset = []
    y_labels = []
    # Extract the categories
    for cat in movie_reviews.categories():
        # For every file under this category
        for fileid in movie_reviews.fileids(cat):
            # Get the words belonging to this category
            words = list(movie_reviews.words(fileid))
            dataset.append((words,cat))
            y_labels.append(cat)
    return dataset,y_labels
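
A quick sanity check of the shape of the data (this assumes the corpus has been fetched once with nltk.download('movie_reviews'); the printed tokens are illustrative):

In [ ]:
dataset,y_labels = get_data()
print len(dataset),len(y_labels)   # 2000 instances and 2000 labels
words,cat = dataset[0]
print cat,words[:3]                # the label and the first few tokens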

In [13]:
def get_train_test(input_dataset,ylabels):
    '''
    Split the data into training and test sets.
    '''
    train_size = 0.7
    test_size = 1 - train_size
    
    stratified_split = StratifiedShuffleSplit(ylabels,test_size=test_size,n_iter=1,random_state=77)
    
    for train_indx,test_indx in stratified_split:
        train = [input_dataset[i] for i in train_indx]
        train_y = [ylabels[i] for i in train_indx]
        
        test = [input_dataset[i] for i in test_indx]
        test_y = [ylabels[i] for i in test_indx]
        
    return train,test,train_y,test_y
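
The function above targets the pre-0.18 scikit-learn API, where StratifiedShuffleSplit takes the labels in its constructor and is itself iterable. On scikit-learn 0.18+ the class moved to sklearn.model_selection and exposes a split(X, y) method instead; a minimal equivalent sketch:

In [ ]:
# Sketch of the same 70/30 stratified split on scikit-learn >= 0.18
from sklearn.model_selection import StratifiedShuffleSplit

def get_train_test_modern(input_dataset,ylabels):
    splitter = StratifiedShuffleSplit(n_splits=1,test_size=0.3,random_state=77)
    for train_indx,test_indx in splitter.split(input_dataset,ylabels):
        train = [input_dataset[i] for i in train_indx]
        train_y = [ylabels[i] for i in train_indx]
        test = [input_dataset[i] for i in test_indx]
        test_y = [ylabels[i] for i in test_indx]
    return train,test,train_y,test_y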

The next three functions are feature generators. For classification we need to supply features, or attributes; given a review, each of these functions produces a set of features from it.


In [14]:
def build_word_features(instance):
    # Store the features in a dictionary
    feature_set = {}
    # The word list is the first element of the instance tuple
    words = instance[0]
    for word in words:
        feature_set[word] = 1 
    return (feature_set,instance[1])
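
A tiny illustration of this binary bag-of-words encoding, on a made-up instance:

In [ ]:
build_word_features((['a','great','movie'],'pos'))
# -> ({'a': 1, 'great': 1, 'movie': 1}, 'pos')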

In [15]:
def build_negate_features(instance):
    '''
    If a word is preceded by 'not' or 'no',
    this function prefixes that word with 'Not_'.
    The negation word itself ('not' or 'no') is
    not inserted into the feature dictionary.
    '''
    # Retrieve the words, i.e. the first element of the instance tuple
    words = instance[0]
    final_words = []
    # Use a boolean flag to track whether the previous word was a negation word
    negate = False
    negate_words = ['no','not']
    for word in words:
        if negate:
            word = 'Not_' + word
            negate = False
        if word not in negate_words:
            final_words.append(word)
        else:
            negate = True
    feature_set = {}
    for word in final_words:
        feature_set[word] = 1
    return (feature_set,instance[1])
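
For example, on a made-up instance the negation is folded into the following word:

In [ ]:
build_negate_features((['not','funny','at','all'],'neg'))
# -> ({'Not_funny': 1, 'at': 1, 'all': 1}, 'neg')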

In [16]:
def remove_stop_words(in_data):
    stopword_list = stopwords.words('english')
    negate_words = ['no','not']
    # Keep 'no' and 'not' so that the negation features are preserved
    new_stopwords = [word for word in stopword_list if word not in negate_words]
    label = in_data[1]
    # Remove the stop words
    words = [word for word in in_data[0] if word not in new_stopwords]
    return (words,label)
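
A quick check on a made-up instance (this assumes the stop-word list has been fetched once with nltk.download('stopwords')); note that 'not' survives the filtering:

In [ ]:
remove_stop_words((['this','is','not','a','good','movie'],'neg'))
# 'this', 'is', 'a' are stop words; 'not' is deliberately kept
# -> (['not', 'good', 'movie'], 'neg')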

In [17]:
def build_keyphrase_features(instance):
    feature_set = {}
    instance = remove_stop_words(instance)
    words = instance[0]
    # Collect bigrams from the word sequence, counted by raw frequency
    bigram_finder = BigramCollocationFinder.from_words(words)
    # Sort the bigrams by frequency, descending, and take the top 400
    bigrams = bigram_finder.nbest(BigramAssocMeasures.raw_freq,400)
    for bigram in bigrams:
        feature_set[bigram] = 1
    return (feature_set,instance[1])
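
For example, on a made-up instance the stop words are dropped first and the remaining adjacent word pairs become the features:

In [ ]:
features,label = build_keyphrase_features((['one','of','the','worst','movies','of','the','year'],'neg'))
# after stop-word removal the words are ['one','worst','movies','year'],
# so the feature keys are ('one','worst'), ('worst','movies'), ('movies','year')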

In [18]:
def build_model(features):
    model = nltk.NaiveBayesClassifier.train(features)
    return model

In [19]:
def probe_model(model,features,dataset_type='Train'):
    accuracy = nltk.classify.accuracy(model,features)
    print '\n' + dataset_type + ' Accuracy = %0.2f'%(accuracy*100) + '%'

In [20]:
def show_features(model,no_features=5):
    print '\nFeature Importance'
    print '==================='
    # show_most_informative_features prints its table and returns None,
    # so call it directly rather than wrapping it in print
    model.show_most_informative_features(no_features)
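
A toy run of the three helpers above on two hand-made instances (illustration only; the real accuracies appear in the output at the end):

In [ ]:
toy_train = [build_word_features((['great','movie'],'pos')),
             build_word_features((['awful','movie'],'neg'))]
toy_model = build_model(toy_train)
probe_model(toy_model,toy_train)   # prints the accuracy on the toy set
show_features(toy_model,2)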

It is very hard to get the model right on the first attempt; building a good model takes repeated experiments with different features and with parameter tuning.


In [21]:
def build_model_cycle_1(train_data,dev_data):
    # Build features for the training set
    train_features = map(build_word_features,train_data)
    # Build features for the dev set
    dev_features = map(build_word_features,dev_data)
    # Train the model
    model = build_model(train_features)
    # Probe the model on both sets
    probe_model(model,train_features)
    probe_model(model,dev_features,'Dev')
    
    return model

In [22]:
def build_model_cycle_2(train_data,dev_data):
    # Build features for the training set
    train_features = map(build_negate_features,train_data)
    # Build features for the dev set
    dev_features = map(build_negate_features,dev_data)
    # Train the model
    model = build_model(train_features)
    # Probe the model on both sets
    probe_model(model,train_features)
    probe_model(model,dev_features,'Dev')
    return model

In [23]:
def build_model_cycle_3(train_data,dev_data,test_data):
    # Build features for the training set
    train_features = map(build_keyphrase_features,train_data)
    # Build features for the dev set
    dev_features = map(build_keyphrase_features,dev_data)
    # Train the model
    model = build_model(train_features)
    # Probe the model on train, dev, and finally the held-out test set
    probe_model(model,train_features)
    probe_model(model,dev_features,'Dev')
    test_features = map(build_keyphrase_features,test_data)
    probe_model(model,test_features,'Test')
    return model

In [25]:
if __name__ == '__main__':
    input_dataset,y_labels = get_data()
    # First split: training data vs. the rest
    train_data,all_test_data,train_y,all_test_y = get_train_test(input_dataset,y_labels)
    # Second split: dev data vs. test data
    dev_data,test_data,dev_y,test_y = get_train_test(all_test_data,all_test_y)
    
    # Look at the sizes of the different datasets
    print '\nOriginal Data size =',len(input_dataset)
    print '\nTraining Data size =',len(train_data)
    print '\nDev Data size = ',len(dev_data)
    print '\nTest Data size = ',len(test_data)
    
    # The three modeling passes
    model_cycle_1 = build_model_cycle_1(train_data,dev_data)
    show_features(model_cycle_1)
    
    model_cycle_2 = build_model_cycle_2(train_data,dev_data)
    show_features(model_cycle_2)
    
    model_cycle_3 = build_model_cycle_3(train_data,dev_data,test_data)
    show_features(model_cycle_3)


Original Data size = 2000

Training Data size = 1399

Dev Data size =  420

Test Data size =  181

Train Accuracy = 97.57%

Dev Accuracy = 68.57%

Feature Importance
===================
Most Informative Features
               stupidity = 1                 neg : pos    =     15.6 : 1.0
                  warned = 1                 neg : pos    =     11.7 : 1.0
             wonderfully = 1                 pos : neg    =     11.5 : 1.0
             outstanding = 1                 pos : neg    =     11.0 : 1.0
            unconvincing = 1                 neg : pos    =     11.0 : 1.0

Train Accuracy = 98.00%

Dev Accuracy = 69.29%

Feature Importance
===================
Most Informative Features
               stupidity = 1                 neg : pos    =     15.6 : 1.0
             wonderfully = 1                 pos : neg    =     14.7 : 1.0
               Not_funny = 1                 neg : pos    =     13.0 : 1.0
                  warned = 1                 neg : pos    =     11.7 : 1.0
             outstanding = 1                 pos : neg    =     11.0 : 1.0

Train Accuracy = 100.00%

Dev Accuracy = 82.86%

Test Accuracy = 76.80%

Feature Importance
===================
Most Informative Features
     (u'waste', u'time') = 1                 neg : pos    =     13.0 : 1.0
      (u'one', u'worst') = 1                 neg : pos    =     13.0 : 1.0
        (u'-', u'notch') = 1                 pos : neg    =     11.7 : 1.0
      (u'perfect', u',') = 1                 pos : neg    =     11.7 : 1.0
      (u'.', u'cameron') = 1                 pos : neg    =     11.7 : 1.0
