In [11]:
from numpy import linalg as LA
# for float division
from __future__ import division
# import the time module to allow python to pause
import time
import numpy as np
import matplotlib.pyplot as plt
import math
import scipy as sp
import urllib2
from urllib2 import urlopen, URLError, HTTPError
import tarfile as tar
import sys
import os
import re
%matplotlib inline
In [12]:
downloadFileName = '20news-18828.tar.gz'
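The archive itself is never fetched in the cells shown here, although urllib2 was imported for exactly that purpose. A minimal download sketch, assuming the commonly used mirror at qwone.com (the URL is an assumption; adjust it to wherever the archive actually lives):

if not os.path.exists(downloadFileName):
    downloadUrl = 'http://qwone.com/~jason/20Newsgroups/' + downloadFileName  # assumed mirror URL
    try:
        response = urlopen(downloadUrl)
        with open(downloadFileName, 'wb') as f:
            f.write(response.read())
    except (HTTPError, URLError) as e:
        print 'Download failed:', e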
In [3]:
# Get folder path containing text files
path = os.path.curdir + '/' + downloadFileName
folder_name = "20news-18828/"
subFolder = ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
tar_file = tar.open(path, mode='r:gz')
for dir_name in subFolder:
    subdir_and_files = [
        tarinfo for tarinfo in tar_file.getmembers()
        if tarinfo.name.startswith(folder_name + dir_name)
    ]
    tar_file.extractall(members=subdir_and_files)
tar_file.close()
Read all files from these four directories into an array of strings (i.e. one file per string). In addition, store the class membership of each document in a vector (check: you should now have 3387 strings in memory).
In [4]:
data = []
labels = []
for dir_name in subFolder:
    directory_path = folder_name + dir_name
    file_names = os.listdir(directory_path)
    for name in file_names:
        f = open(directory_path + '/' + name, 'r')
        data.append(f.read())
        f.close()
        labels.append(dir_name)
data_length = len(data)
print data_length
In the next step, each string has to be split into words (tokens), which are separated by whitespace, commas, etc. For this we use the Python standard package re, which handles regular expressions. The following command stores all tokens of a string textline in a list l, after it has first been converted to lowercase with lower(). Before tokenizing, strip_header removes the message header of each post, i.e. everything up to the first blank line.
In [5]:
def strip_header(text):
    _before, _blankline, after = text.partition('\n\n')
    return after
In [6]:
def tokenStringToList(data):
    l = re.compile(r"(?u)\b\w\w+\b").findall(data.lower())
    return l
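A quick illustration of what the regular expression keeps: only runs of at least two word characters, everything lowercased (the sample sentence is made up):

print tokenStringToList("The Space-Shuttle lifted off at 9 a.m.!")
# -> ['the', 'space', 'shuttle', 'lifted', 'off', 'at']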
In [7]:
token_list = []
token_set = set()  # set for fast membership tests; token_list keeps the insertion order
for i in range(0, len(data)):
    l = tokenStringToList(strip_header(data[i]))
    for j in range(0, len(l)):
        if l[j] not in token_set:
            token_set.add(l[j])
            token_list.append(l[j])
vector_length = len(token_list)
print vector_length
Compute a feature vector for each text that contains, for each word of the vocabulary, its frequency within that text.
In [8]:
# Map each token to its column index for fast lookups
token_index = {token: j for j, token in enumerate(token_list)}
matrix = np.zeros((data_length, vector_length))
for i in range(0, len(data)):
    # Count token occurrences per document; a plain string .count() would also
    # match substrings of longer words and would include the header.
    for token in tokenStringToList(strip_header(data[i])):
        matrix[i][token_index[token]] += 1
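A quick shape check (one row per document, one column per vocabulary word); if the counts come from the tokenized, header-stripped texts, each row sum equals the token count of the corresponding document:

print matrix.shape                                    # expected (3387, vector_length)
print matrix[0].sum()                                 # number of tokens in the first document
print len(tokenStringToList(strip_header(data[0])))   # should match the row sum above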
In [10]:
len(matrix)
Out[10]:
3387
Use the first 60% of the data as the training set and the rest as the test set. Train a multinomial naive Bayes classifier on it. Determine the fraction of correct classifications on your training and test data. How well does your classifier generalize?
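For reference, the decision rule that the following cells implement (in log space, with Laplace smoothing over the vocabulary $V$) is

$$\hat{c} \;=\; \arg\max_{c}\Big(\log P(c) \;+\; \sum_{w \in V} n_w \,\log P(w \mid c)\Big),
\qquad P(w \mid c) \;=\; \frac{N_{w,c} + 1}{N_c + |V|},$$

where $n_w$ is the count of word $w$ in the document to classify, $N_{w,c}$ is the total count of $w$ over all training documents of class $c$, and $N_c$ is the total number of word occurrences in class $c$. Base-10 logarithms are used below; the base does not affect the argmax.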
In [13]:
def classification(vector, labels, p_class, p_word_in_class):
    p = np.zeros((len(p_class)))
    for i in range(0, len(p_class)):
        # log prior + sum of word counts times log word probabilities
        p[i] = p_class[i] + np.sum(np.multiply(p_word_in_class[i], vector))
    c = np.argmax(p)
    return labels[c]

def testClassification(test_data, test_label, log_p_class, log_p_word_in_class):
    hit = 0.0
    for i in range(0, len(test_data)):
        class_name = classification(test_data[i], subFolder, log_p_class, log_p_word_in_class)
        if test_label[i] == class_name:
            hit += 1
    hit /= len(test_data)
    hit *= 100
    s = "Correct classifications: %.2f%%" % hit
    print s
In [14]:
count_files = []
for dir_name in subFolder:
    directory_path = folder_name + dir_name
    count_files.append(len(os.listdir(directory_path)))
count_training = 0  # 60%
count_test = 0      # 40%
for i in range(0, len(count_files)):
    count_60 = int(round(0.6 * count_files[i], 0))
    count_training += count_60
    count_test += (count_files[i] - count_60)
In [15]:
training_data = np.zeros((count_training, vector_length))
test_data = np.zeros((count_test, vector_length))
training_label = []
test_label = []
index_start = 0
index_end = 0
index_training = 0
index_test = 0
for dir_name in subFolder:
    directory_path = folder_name + dir_name
    num_files = len(os.listdir(directory_path))
    count_60 = int(round(0.6 * num_files, 0))
    index_end += num_files
    for i in range(index_start, index_end):
        # the first 60% of each class go into the training set, the rest into the test set
        if i < index_start + count_60:
            training_data[index_training, :] = matrix[i, :]
            index_training += 1
            training_label.append(dir_name)
        else:
            test_data[index_test, :] = matrix[i, :]
            index_test += 1
            test_label.append(dir_name)
    index_start = index_end
In [16]:
p_class = np.zeros((len(subFolder)))
num_words_class = np.zeros((len(subFolder)))
num_words_in_vocabulary = len(training_data[0])
p_num_words_in_class = np.zeros((len(subFolder), len(training_data[0])))
for i in range(0, len(training_data)):
    label = training_label[i]
    vector = training_data[i]
    class_index = subFolder.index(label)
    p_class[class_index] += 1
    num_words_class[class_index] += np.sum(vector)
    p_num_words_in_class[class_index] += vector
p_class /= len(training_data)
# Laplace smoothing: add 1 to every word count, then normalize by the
# total word count of the class plus the vocabulary size
for i in range(0, len(p_num_words_in_class)):
    p_num_words_in_class[i] += 1
    p_num_words_in_class[i] = p_num_words_in_class[i] / (num_words_class[i] + num_words_in_vocabulary)
In [17]:
log_p_class = np.log10(p_class)
log_p_word_in_class = np.log10(p_num_words_in_class)
In [18]:
# Test data
testClassification(test_data, test_label, log_p_class, log_p_word_in_class)
In [20]:
# Training data
testClassification(training_data, training_label, log_p_class, log_p_word_in_class)
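As an optional cross-check, not part of the exercise, the same split can be fed to scikit-learn's multinomial naive Bayes; a minimal sketch, assuming scikit-learn is installed:

from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()  # uses Laplace smoothing (alpha=1.0) by default
clf.fit(training_data, training_label)
print "Training accuracy: %.2f%%" % (100 * clf.score(training_data, training_label))
print "Test accuracy: %.2f%%" % (100 * clf.score(test_data, test_label))

Since the model is the same, the two numbers should roughly agree with the figures printed by testClassification above.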