2. Testklassifikation mit Naive Bayes


In [11]:
from numpy import linalg as LA
#for float division
from __future__ import division
# import the time model to allow python to pause.
import time
import numpy as np
import matplotlib.pyplot as plt
import math
from numpy import linalg as LA
import scipy as sp
import urllib2
from urllib2 import urlopen, URLError, HTTPError
import tarfile as tar
import sys
import os
import re

%matplotlib inline

In [12]:
downloadFileName = '20news-18828.tar.gz'

In [3]:
# Get folder path containing text files
path = os.path.curdir+'/'+downloadFileName
folder_name = "20news-18828/"

subFolder = ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']

tar_file = tar.open(path, mode='r:gz')

for dir_name in subFolder:
    subdir_and_files = [
        tarinfo for tarinfo in tar_file.getmembers()
        if tarinfo.name.startswith(folder_name + dir_name)
        ]
    tar_file.extractall(members=subdir_and_files)
    
tar_file.close()

Lesen Sie alle Dateien aus diesen vier Verzeichnissen in eine Array von Strings ein (d.h. ein File in einem String). Speichern Sie zusätzlich die Klassenzugehörigkeit jedes Dokuments in einem Vektor ab (Kontrolle: Sie müssten jetzt 3387 Strings im Speicher haben)


In [4]:
data = []
labels = []

for dir_name in subFolder:
    
    directory_path = folder_name + dir_name
    file_names = os.listdir(directory_path)
    for name in file_names:
        
        f = open(directory_path + '/' + name, 'r')
        data.append(f.read())
        f.close()
        labels.append(dir_name)
        
data_length = len(data)
print data_length


3387

Im nächsten Schritt muss jeder String in Worte (Tokens) zerlegt werden, die durch Leerzeichen, Kommas etc. voneinander getrennt sind. Hierzu setzen wir das Python-Standardpaket re ein, das zur Analyse regulärer Ausdrücke dient. Durch folgenden Befehl werden alle Tokens eines Strings textline in einer Liste l gespeichert, nachdem er zuvor mit lower() in Kleinbuchstaben umgewandelt wurde.


In [5]:
def strip_header ( text ):
    _before,_blankline,after = text.partition ('\n\n')
    return after

In [6]:
def tokenStringToList(data):
    l = re.compile(r"(?u)\b\w\w+\b").findall(data.lower())
    return l

In [7]:
token_list = []

for i in range(0, len(data)):
    l = tokenStringToList(strip_header (data[i]))
    
    for j in range(0, len(l)):
        
        if l[j] not in token_list:
            token_list.append(l[j])
            
vector_length = len(token_list)

print vector_length


41777

Berechnen Sie für jeden Text einen Merkmalsvektor, der für jedes Wort des Vokabulars seine Häufigkeit innerhalb des Texts enthält.


In [8]:
matrix = np.zeros((data_length, vector_length))

for i in range(0, len(data)):
    j = 0
    for token in token_list:
        matrix[i][j] = data[i].count(token)
        j += 1

In [10]:
len(matrix)


Out[10]:
3387

Verwenden Sie die ersten 60% der Daten als Trainingsdatensatz, die restlichen als Testdatensatz. Trainieren Sie damit einen multinomialen naiven Bayes-Klassifikator. Bestimmen Sie den Anteil korrekter Klassifikationen auf Ihren Trainings- und Testdaten. Wie gut generalisiert Ihr Klassifikator?


In [13]:
def classification(vector, labels, p_class, p_word_in_class):
    p = np.zeros((len(p_class)))
    for i in range (0, len(p_class)):
        p[i] = p_class[i] + np.sum(np.multiply(p_word_in_class[i], vector))
    c = np.argmax(p)
    return labels[c]

def testClassification(test_data, test_label, log_p_class, log_p_word_in_class):
    hit = 0.0

    for i in range (0, len(test_data)):
        test_index = i
        class_name = classification(test_data[test_index], subFolder, log_p_class, log_p_word_in_class)
        if test_label[i] == class_name:
            hit += 1

    hit /= len(test_data)
    hit *= 100

    s = "Korrekte Klassifikationen: %.2f%%"% (hit)
    print s

In [14]:
count_files = []

for dir_name in subFolder:
    directory_path = folder_name + dir_name
    count_files.append(len(os.listdir(directory_path)))
    
count_training = 0 # 60%
count_test = 0 # 40%

for i in range (0, len(count_files)):     
    count_60 = (int)(round(0.6 * count_files[i], 0))
    count_training += count_60
    count_test += (count_files[i] - count_60)

In [15]:
training_data = np.zeros((count_training,vector_length))
test_data = np.zeros((count_test,vector_length))
training_label = []
test_label = []

index_start = 0
index_end = 0
index_training = 0
index_test = 0

for dir_name in subFolder:
    
    directory_path = folder_name + dir_name
    count_files = len(os.listdir(directory_path))
    count_60 = (int)(round(0.6 * count_files, 0))
    index_end += count_files
    
    for i in range(index_start, index_end):
        if i < index_start + count_60:
            training_data[index_training,:] = matrix[i,:]
            index_training += 1
            training_label.append(dir_name)
        else:
            test_data[index_test,:] = matrix[i,:]
            index_test += 1
            test_label.append(dir_name)
    
    index_start = index_end

In [16]:
p_class = np.zeros((len(subFolder))) 
num_words_class = np.zeros((len(subFolder)))   
num_words_in_vocabulary = len(training_data[0])
p_num_words_in_class = np.zeros((len(subFolder), len(training_data[0])))

for i in range(0, len(training_data)):
    label = training_label[i]
    vector = training_data[i]

    if label == subFolder[0]:
        class_index = 0
    elif label == subFolder[1]:
        class_index = 1
    elif label == subFolder[2]:
        class_index = 2
    elif label == subFolder[3]:
        class_index = 3
        
    p_class[class_index] += 1
    num_words_class[class_index] += np.sum(vector)
    p_num_words_in_class[class_index] += vector

p_class /= len(training_data)

for i in range(0, len(p_num_words_in_class)):
    p_num_words_in_class[i] += 1
    p_num_words_in_class[i] = p_num_words_in_class[i] / (num_words_class[i] + num_words_in_vocabulary)

In [17]:
log_p_class = np.log10(p_class)
log_p_word_in_class = np.log10(p_num_words_in_class)

In [18]:
#Testdaten
testClassification(test_data, test_label, log_p_class, log_p_word_in_class)


Korrekte Klassifikationen: 91.73%

In [20]:
#Trainingsdaten
testClassification(training_data, training_label, log_p_class, log_p_word_in_class)


Korrekte Klassifikationen: 94.78%