In [5]:
import re
import os
In [15]:
os.name
Out[15]:
In [19]:
if os.name == 'posix':
    baseDir = r'/home/hase/Documents/ZHAW/InfoEng/Lectures/Information_Retrieval/Exercises/PT_5_MiniRetrieve/'
    doc_path = r'/home/hase/Documents/ZHAW/InfoEng/Lectures/Information_Retrieval/Exercises/PT_5_MiniRetrieve/documents/'
    query_path = r'/home/hase/Documents/ZHAW/InfoEng/Lectures/Information_Retrieval/Exercises/PT_5_MiniRetrieve/queries/'
elif os.name == 'nt':
    baseDir = r'C:\ZHAW\IR\PT_5_MiniRetrieve\\'
    doc_path = r'C:\ZHAW\IR\PT_5_MiniRetrieve\documents\\'
STOPWORDS_PATH = 'stopwords.txt'
In [94]:
# First, read the entire document as a string
def readDoc(dir_path, file):
    path = dir_path + file
    with open(path, 'r') as f:
        string = f.read()
    return string
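As a side note, here is a minimal sketch of the same reader written with os.path.join, which does not depend on the trailing separators hard-coded in the paths above (the name readDocSafe and the explicit UTF-8 encoding are assumptions, not part of the exercise):
In [ ]:
# Hypothetical variant of readDoc: os.path.join handles the separator itself,
# and an explicit encoding avoids platform-dependent defaults (assumed UTF-8).
def readDocSafe(dir_path, file):
    path = os.path.join(dir_path, file)
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()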
In [93]:
# Read a document into a single string
string = readDoc(doc_path, '1')
#string
In [22]:
# Define a regex to split the string, and perform a simple tokenization
# A proper tokenize function that also removes stopwords is defined later
split_regex = r'\W+'
def simpleTokenize(string):
    """ A simple implementation of input string tokenization
    Args:
        string (str): input string
    Returns:
        list: a list of tokens
    """
    # Convert string to lowercase
    string = string.lower()
    # Tokenize using the split_regex definition
    raw_tokens = re.split(split_regex, string)
    # Remove empty tokens
    tokens = []
    for raw_token in raw_tokens:
        if len(raw_token) != 0:
            tokens.append(raw_token)
    return tokens
In [106]:
#print(simpleTokenize(string))
In [85]:
# File with stopwords
stopfile = os.path.join(baseDir, STOPWORDS_PATH)
print(stopfile)
# Create list of stopwords
stopwords = []
with open(stopfile, 'r') as s:
    stopwords_string = s.read()
    stopwords = re.split(split_regex, stopwords_string)
type(stopwords), len(stopwords)
Out[85]:
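Since the stopword list is only ever used for membership tests, converting it to a set is a cheap optimisation; a minimal sketch (the name stopword_set is an assumption, and tokenize() below could then check against it instead of the list):
In [ ]:
# Hypothetical optimisation: a set gives O(1) membership tests, so tokenize()
# would not have to scan the whole stopword list for every token.
stopword_set = set(stopwords)
len(stopword_set)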
In [54]:
def tokenize(string):
    """ An implementation of input string tokenization that excludes stopwords
    Args:
        string (str): input string
    Returns:
        list: a list of tokens without stopwords
    """
    tokens = simpleTokenize(string)
    # Keep only the tokens that are not on the stopwords list
    filtered = []
    for token in tokens:
        if token not in stopwords:
            filtered.append(token)
    return filtered
In [105]:
#tokenize(string)
Plan: for each document 'doc' in the collection D:
    tokenize 'doc' to obtain its list of tokens
    for each token in that list, update two dictionaries:

    inverted index     {token_one: {doc containing token_one: frequency of token_one in that doc, ...},
                        token_two: {doc containing token_two: frequency of token_two in that doc, ...},
                        ...}

    non-inverted index {doc_one: {token in doc_one: frequency of that token in doc_one, ...},
                        doc_two: {token in doc_two: frequency of that token in doc_two, ...},
                        ...}
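As a concrete illustration of the two shapes (a hand-made toy example, not taken from the exercise corpus):
In [ ]:
# Toy collection, made up purely to illustrate the two index layouts
toy_docs = {'1': 'cats chase mice', '2': 'mice fear cats cats'}

# Non-inverted index: document -> {token: frequency in that document}
#   {'1': {'cats': 1, 'chase': 1, 'mice': 1},
#    '2': {'mice': 1, 'fear': 1, 'cats': 2}}

# Inverted index: token -> {document: frequency of the token in that document}
#   {'cats': {'1': 1, '2': 2}, 'chase': {'1': 1},
#    'mice': {'1': 1, '2': 1}, 'fear': {'2': 1}}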
In [197]:
def noninvIndex(dir_path, num_files):
    """ A simple implementation of a non-inverted index,
    i.e. the token frequencies found in each document
    Args:
        dir_path (str): path where all the documents are stored
        num_files (int): number of files stored in dir_path; assuming each file name is a number
    Returns:
        docNoniIdx (dict): token frequencies for each document
    """
    # Create a list of files in the directory
    files = []
    total_files = int(num_files)
    for i in range(1, total_files + 1):
        files.append(str(i))
    # Dictionary to store the non-inverted index for all documents
    docNoniIdx = {}
    # Loop over the list of files to parse all the existing documents
    for file in files:
        # Read the file into a string
        path = dir_path + file
        with open(path, 'r') as f:
            string = f.read()
        # Tokenize the string; stopword removal can be switched on via tokenize()
        #tokens = tokenize(string)
        tokens = simpleTokenize(string)
        # With the list of tokens, build the non-inverted index for this document
        noniIdx = {}
        for token in tokens:
            if token not in noniIdx:
                noniIdx[token] = 1
            else:
                noniIdx[token] += 1
        docNoniIdx[file] = noniIdx
    return docNoniIdx
In [212]:
non_invIndex = noninvIndex(doc_path,10)
In [213]:
len(non_invIndex)
Out[213]:
In [214]:
non_invIndex
Out[214]:
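One thing the non-inverted index gives almost for free is the length of each document in tokens, which is typically needed later for score normalisation; a minimal sketch under that assumption:
In [ ]:
# Assumption: document lengths will be needed for normalising scores later.
# The length of a document is simply the sum of its stored token frequencies.
doc_lengths = {doc: sum(freqs.values()) for doc, freqs in non_invIndex.items()}
doc_lengths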
In [206]:
def invIndex(dir_path, num_files):
    """ A simple implementation of an inverted index,
    i.e. for each token, the documents it appears in and its frequency there
    Args:
        dir_path (str): path where all the documents are stored
        num_files (int): number of files stored in dir_path; assuming each file name is a number
    Returns:
        dociIdx (dict): frequency of each token per document, e.g.
                        dociIdx = {token: {'doc 1': freq, 'doc 2': freq},
                                   token_two: {'doc 1': freq}, ...}
    """
    # Create a list of files in the directory
    files = []
    total_files = int(num_files)
    for i in range(1, total_files + 1):
        files.append(str(i))
    # Dictionary to store the inverted index for all documents
    dociIdx = {}
    # Loop over the list of files to parse all the existing documents
    for file in files:
        # Read the file into a string
        path = dir_path + file
        with open(path, 'r') as f:
            string = f.read()
        # Tokenize the string; stopwords can be removed later via tokenize()
        tokens = simpleTokenize(string)
        # With the list of tokens, update the inverted index
        for token in tokens:
            if token not in dociIdx:
                # First occurrence of this token anywhere
                dociIdx[token] = {file: 1}
            elif file in dociIdx[token]:
                # Token already counted in this document
                dociIdx[token][file] += 1
            else:
                # Token seen before, but not yet in this document
                dociIdx[token][file] = 1
    return dociIdx
In [215]:
docinvIndex = invIndex(doc_path,10)
In [216]:
docinvIndex
Out[216]:
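To show why the inverted index is the useful shape for retrieval, here is a minimal sketch of answering a query with it (the helper name score_query and the plain term-frequency scoring are assumptions; the exercise may call for a different weighting such as TF-IDF):
In [ ]:
# Hypothetical query scoring: look up each query token's posting list in the
# inverted index and sum the term frequencies per document.
def score_query(query_string, inv_index):
    scores = {}
    for token in simpleTokenize(query_string):
        for doc, freq in inv_index.get(token, {}).items():
            scores[doc] = scores.get(doc, 0) + freq
    # Rank documents by descending score
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)

score_query('example query terms', docinvIndex)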