In [21]:
import jieba
import os
import time
import random
import sklearn
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import pylab as py
import matplotlib.pylab as plt

In [22]:
# Crude word de-duplication: load a one-word-per-line file into a set
def make_word_set(words_file):
    words_set = set()
    with open(words_file, 'r', encoding='utf-8') as fp:  # decode at open time; str has no .decode() in Python 3
        for line in fp:
            word = line.strip()
            if len(word) > 0:  # the set takes care of duplicates
                words_set.add(word)
    return words_set
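
A quick sanity check: point the function at a one-word-per-line stopword file. The path below is a placeholder, not a file shipped with this notebook.

# Hypothetical usage -- substitute the path of your own stopword file:
# stopwords_set = make_word_set('./stopwords_cn.txt')
# print(len(stopwords_set))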

In [23]:
# Text processing, i.e. the sample-generation step
def text_processing(folder_path, test_size=0.2):
    folder_list = os.listdir(folder_path)
    data_list = []
    class_list = []

    # Walk the class folders
    for folder in folder_list:
        new_folder_path = os.path.join(folder_path, folder)
        files = os.listdir(new_folder_path)
        # Read the files
        j = 1
        for file in files:
            if j > 100: # cap at 100 sample files per class to keep memory in check; remove the cap to use them all
                break
            with open(os.path.join(new_folder_path, file), 'r', encoding='utf-8') as fp:
                raw = fp.read()
            ## jieba Chinese word segmentation, as usual
            ##jieba.enable_parallel(4) # parallel segmentation; the argument is the process count (not supported on Windows)
            word_cut = jieba.cut(raw, cut_all=False) # accurate mode; returns an iterable generator
            word_list = list(word_cut) # materialize the generator into a list of words
            ##jieba.disable_parallel() # turn parallel segmentation back off

            data_list.append(word_list) # sample: list of words
            class_list.append(folder) # label: the folder name
            j += 1
    
    ## Crude train/test split: shuffle, then slice
    data_class_list = list(zip(data_list, class_list)) # materialize: a zip object has no len() and cannot be shuffled
    random.shuffle(data_class_list)
    index = int(len(data_class_list)*test_size)+1
    train_list = data_class_list[index:]
    test_list = data_class_list[:index]
    train_data_list, train_class_list = zip(*train_list)
    test_data_list, test_class_list = zip(*test_list)

    # Alternatively, use sklearn's built-in splitter (cross_validation was renamed model_selection):
    #train_data_list, test_data_list, train_class_list, test_class_list = sklearn.model_selection.train_test_split(data_list, class_list, test_size=test_size)
    

    # Count word frequencies into all_words_dict
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            if word in all_words_dict: # dict.has_key() was removed in Python 3
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1

    # Sort by frequency, descending; sorted() returns a list of (word, count) pairs
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1], reverse=True)
    all_words_list = [word for word, count in all_words_tuple_list] # a zip object is not subscriptable in Python 3

    return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list
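
For reference, jieba's two main segmentation modes differ in coverage: accurate mode (cut_all=False, used above) yields a non-overlapping segmentation, while full mode emits every dictionary word it can find, overlaps included. A quick comparison on the classic example sentence:

# jieba.cut modes, for reference:
# list(jieba.cut("南京市长江大桥", cut_all=False))  # accurate mode: non-overlapping segmentation
# list(jieba.cut("南京市长江大桥", cut_all=True))   # full mode: every matching dictionary word, overlaps included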

In [24]:
print('start')
folder_path = './Database/SogouC/Sample'
all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = text_processing(folder_path)


start
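
From here, the natural next step is to turn the word lists into feature vectors and train the MultinomialNB classifier imported at the top. The sketch below is a minimal illustration under assumed choices (a hypothetical words_to_features helper and a top-1000 frequency cutoff), not this notebook's later cells:

# Minimal follow-up sketch (words_to_features and the 1000-word cutoff are assumptions):
def words_to_features(word_list, feature_words):
    words = set(word_list)
    return [1 if w in words else 0 for w in feature_words]  # bag-of-words presence vector

feature_words = all_words_list[:1000]  # assumed cutoff: top 1000 frequent words
train_X = [words_to_features(d, feature_words) for d in train_data_list]
test_X = [words_to_features(d, feature_words) for d in test_data_list]

clf = MultinomialNB()
clf.fit(train_X, train_class_list)
print(clf.score(test_X, test_class_list))  # accuracy on the held-out split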