In [1]:
from IPython.display import Image
Image(filename='files/screenshot.png')
Out[1]:
In [2]:
from IPython.display import Image
Image(filename='files/whoosh.jpg')
Out[2]:
Whoosh is a library of classes and functions for indexing text and then searching the index. It allows you to develop custom search engines for your content.
but...
If you are a beginner, you have no team, you need a fast solution, you need to work in isolation, or you have a small project, then Whoosh is your solution; otherwise Elasticsearch might be your technology.
In [3]:
import csv
# peek at the column names of the product catalog
catalog = csv.DictReader(open('files/catalogo_head.csv'))
print list(catalog)[0].keys()
In [4]:
# print code, name and category for each product
catalog = csv.DictReader(open('files/catalogo_head.csv'))
for product in catalog:
    print product["Codigo"] + ' - ' + product["Articulo"] + ' - ' + product["Categoria"]
In [5]:
Image(filename='files/tf.png')
Out[5]:
In [6]:
Image(filename='files/idf.png')
Out[6]:
In [7]:
Image(filename='files/tfidf.png')
Out[7]:
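For reference, and assuming the three images above show the standard definitions (the normalized-count variant implemented in the functions below):

tf(t, d) = (occurrences of t in d) / (total words in d)
idf(t, D) = log( |D| / (1 + number of documents in D containing t) )
tfidf(t, d, D) = tf(t, d) * idf(t, D)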
In [8]:
from nltk.corpus import stopwords
import csv
stop_words_spa = stopwords.words("spanish")
stop_words_eng = stopwords.words("english")
# load a custom list of low-information adjectives to filter out
with open('files/adjetivos.csv', 'rb') as f:
    reader = csv.reader(f)
    adjetivos = []
    for row in reader:
        for word in row:
            adjetivos.append(word)
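As a quick check that the filters are loaded (a made-up phrase; the Spanish function words should disappear):

sample = u'el cargador de coche para tu movil'
print ' '.join(word for word in sample.split() if word not in stop_words_spa)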
In [9]:
import math
#tf-idf functions:
def tf(word, blob):
    # term frequency: occurrences of the word, normalized by document length
    return float(blob.words.count(word)) / len(blob.words)

def idf(word, bloblist):
    # inverse document frequency; the +1 avoids division by zero
    return math.log(len(bloblist) / (1.0 + n_containing(word, bloblist)))

def n_containing(word, bloblist):
    # number of documents in the corpus that contain the word
    return sum(1 for blob in bloblist if word in blob.words)

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)
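A quick sanity check on a toy corpus (three hypothetical product strings): words shared by several documents score near zero, while a word unique to one product scores higher.

from textblob import TextBlob as tb
toy = [tb(u'cargador coche usb'), tb(u'funda movil'), tb(u'cargador pared usb')]
for word in toy[0].words:
    print word, tfidf(word, toy[0], toy)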
In [10]:
import csv
from textblob import TextBlob as tb
catalog = csv.DictReader(open('files/catalogo_head.csv'))
bloblist = []
for product in catalog:
    text = unicode(product["Articulo"], encoding="utf-8", errors="ignore").lower()
    text = ' '.join([word for word in text.split() if word not in stop_words_spa])  # remove Spanish stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words_eng])  # remove English stopwords
    text = ' '.join([word for word in text.split() if word not in adjetivos])  # remove meaningless adjectives
    value = tb(text)  # bag of words
    bloblist.append(value)
tags = []
for blob in bloblist:
    # score every word of the product name against the whole catalog
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    # keep the three highest-scoring words as tags
    terms = ''
    for word, score in sorted_words[:3]:
        terms = terms + word + ' '
    tags.append(terms)
for t in tags:
    print unicode(t)
Types of fields:
Field boosting: a multiplier applied to the score of any term found in the field.
Stemming (great if you are working in English): removes suffixes, encoding the words in the index in a base form.
Variations (great if you are working in English): expands a query word into its morphological variations at search time.
In [11]:
from whoosh.lang.porter import stem
print "stemming: " + stem("analyse")
from whoosh.lang.morph_en import variations
print "variations: "
print list(variations("analyse"))[0:5]
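The same stemming is applied automatically at index time when a field uses StemmingAnalyzer; a minimal sketch (made-up sample text):

from whoosh.analysis import StemmingAnalyzer
ana = StemmingAnalyzer(minsize=3)
print [token.text for token in ana(u"analyzing chargers charged")]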
In [12]:
import csv
# the catalog now includes the generated "tags" column
catalog = csv.DictReader(open('files/catalogo_contags.csv'))
print list(catalog)[0].keys()
In [13]:
from whoosh.index import create_in
from whoosh.analysis import StemmingAnalyzer
from whoosh.fields import *
catalog = csv.DictReader(open('files/catalogo_contags.csv'))
data_set = []
for row in catalog:
    row["Categoria"] = unicode(row["Categoria"], encoding="utf-8", errors="ignore").lower()
    row["Articulo"] = unicode(row["Articulo"], encoding="utf-8", errors="ignore").lower()
    row["Articulo"] = ' '.join([word for word in row["Articulo"].split() if word not in stop_words_spa])
    row["Articulo"] = ' '.join([word for word in row["Articulo"].split() if word not in stop_words_eng])
    row["Articulo"] = ' '.join([word for word in row["Articulo"].split() if word not in adjetivos])
    row["tags"] = unicode(row["tags"], encoding="utf-8", errors="ignore")
    row["Ean"] = unicode(row["Ean"], encoding="utf-8", errors="ignore")
    row["Codigo"] = unicode(row["Codigo"], encoding="utf-8", errors="ignore")
    row["PVP"] = float(row["PVP"])
    row["Plazo"] = unicode(row["Plazo"], encoding="utf-8", errors="ignore")
    data_set.append(row)
print str(len(data_set)) + ' products'
In [14]:
schema = Schema(Codigo=ID(stored=True),
                Ean=TEXT(stored=True),
                Categoria=TEXT(analyzer=StemmingAnalyzer(minsize=3),
                               stored=True),
                Articulo=TEXT(analyzer=StemmingAnalyzer(minsize=3),
                              field_boost=2.0, stored=True),
                Tags=KEYWORD(field_boost=1.0, stored=True),
                PVP=NUMERIC(sortable=True),
                Plazo=TEXT(stored=True))
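KEYWORD treats the field value as a space-separated list of keywords, which is why the generated tags, joined with spaces, index cleanly; a quick look at the underlying analyzer (made-up input):

from whoosh.analysis import KeywordAnalyzer
ana = KeywordAnalyzer()
print [token.text for token in ana(u"cargador coche usb")]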
In [15]:
from whoosh import index
from datetime import datetime
start = datetime.now()
# create_in on a directory with an existing index clears its current contents
ix = create_in("indexdir", schema)
writer = ix.writer()
for product in data_set:
    writer.add_document(Codigo=unicode(product["Codigo"]),
                        Ean=unicode(product["Ean"]),
                        Categoria=unicode(product["Categoria"]),
                        Articulo=unicode(product["Articulo"]),
                        Tags=unicode(product["tags"]),
                        PVP=float(product["PVP"]))
writer.commit()
finish = datetime.now()
time = finish - start
print time
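If you want to reopen the existing index later without wiping it (for example, to search from another script), open_dir is the non-destructive counterpart; a minimal sketch:

from whoosh.index import open_dir
ix = open_dir("indexdir")  # reopens the index without clearing it
print ix.doc_count()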
In [16]:
Image(filename='files/screenshot_files.png')
Out[16]:
In [17]:
from whoosh.qparser import MultifieldParser, OrGroup
qp = MultifieldParser(["Categoria",
                       "Articulo",
                       "Tags",
                       "Ean",
                       "Codigo"],  # all selected fields
                      schema=ix.schema,  # with my schema
                      group=OrGroup)  # OR instead of AND
user_query = 'Cargador de coche USB'
user_query = unicode(user_query, encoding="utf-8", errors="ignore")
user_query = user_query.lower()
user_query = ' '.join([word for word in user_query.split() if word not in stop_words_spa])
user_query = ' '.join([word for word in user_query.split() if word not in stop_words_eng])
print "this is our query: " + user_query
q = qp.parse(user_query)
print "this is our parsed query: " + str(q)
In [18]:
with ix.searcher() as searcher:
    results = searcher.search(q)
    print str(len(results)) + ' hits'
    print results[0]["Codigo"] + ' - ' + results[0]["Articulo"] + ' - ' + results[0]["Categoria"]
In [19]:
with ix.searcher() as searcher:
    print '''
    ----------- word-scoring sorting ------------
    '''
    results = searcher.search(q)
    for hit in results:
        print hit["Articulo"] + ' - ' + str(hit["PVP"]) + ' eur'
    print '''
    --------------- PVP sorting -----------------
    '''
    results = searcher.search(q, sortedby="PVP")
    for hit in results:
        print hit["Articulo"] + ' - ' + str(hit["PVP"]) + ' eur'