In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# This code parses PIWIK log files (piwik.org) to extract user queries from digital catalogues.
# Example of input data:
#A4EB7F66122DFB4B 649796 2016-01-01 01:00:41 search.arch.be/nl/zoeken-naar-archieven/zoekresultaat/index/index/zoekterm/antwerpen/findaidstatus/verified-complete-draft/dao/1/lang/nl
# Written in the context of the MADDLAIN project (www.maddlain.iminds.be/en/home/)
# Anne Chardonnens - anchardo@ulb.ac.be - http://homepages.ulb.ac.be/~anchardo
# Simon Hengchen - shengche@ulb.ac.be - http://homepages.ulb.ac.be/~shengche
# Raphaël Hubain - rhubain@gmail.com
In [2]:
from time import sleep
import sys
class Counter():
    """Simple progress counter: prints the completed percentage while iterating over an array."""
    def __init__(self, array, position = 0):
        self.length = len(array)
        self.position = position
        self.printpercent = 0
    def loop(self):
        self.position += 1
        percent = self.position / self.length
        percent = int(percent * 100)
        if self.printpercent != percent:
            self.printpercent = percent
            print("%s %%" % str(self.printpercent))
In [3]:
import pandas as pd #pandas = python data analysis library: http://pandas.pydata.org/
from re import search, match
In [4]:
# functions to locate the keywords we want to extract from the URLs
def getv(url, m):
    # regular expression to isolate and keep only the words entered by the end user
    reg = r'.*\/' + m + r'\/([a-z_A-Zéèêàâäüùôöç0-9+ \'\.\-%,]*)\/.*'
    ismatch = match(reg, url)
    if ismatch:
        value = ismatch.group(1)
    else:
        value = 'NaN'
    return value
def get_mdic(url):
    mdic = {}
    # the other metadata composing the URL are also parsed
    mlist = ['zoekterm','beginjaar','eindjaar','rubriek','eadid','findaidstatus','dao','lang',
             'inventarisnr','level','scan-index','foto','page']
    for m in mlist:
        k = m
        v = getv(url, m)
        mdic[k] = v
    return mdic
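# A quick sanity check of getv() (an added sketch, not in the original notebook) against the example
# URL quoted in the header cell: 'zoekterm' should come back as 'antwerpen', while 'lang' should come
# back as 'NaN' because the pattern expects another '/' after the captured value and 'lang/nl' ends the URL.
demo_url = 'search.arch.be/nl/zoeken-naar-archieven/zoekresultaat/index/index/zoekterm/antwerpen/findaidstatus/verified-complete-draft/dao/1/lang/nl'
print(getv(demo_url, 'zoekterm'))  # expected: antwerpen
print(getv(demo_url, 'lang'))      # expected: NaN
print(get_mdic(demo_url))          # all the metadata keys at once, unmatched ones set to 'NaN'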
In [5]:
# parsing function
def parse(din):
    din.columns = ['visitorID','visitID','time','url'] # the input file contains 4 columns: the visitor ID, the visit ID, the timestamp of the visit and the URL
    sl = []
    c = Counter(din)
    for i, r in din.iterrows():
        c.loop()
        rdic = {}
        reg = r'.*\/zoekterm\/.*' # each user query is preceded by ".../zoekterm/" ("zoekterm" means keyword in Dutch)
        if search(reg, r.url):
            rdic = get_mdic(r.url)
            rdic['visitorID'] = r.visitorID
            rdic['visitID'] = r.visitID
            rdic['time'] = r.time
            rdic['url'] = r.url
            sl.append(pd.Series(rdic))
    dout = pd.DataFrame(sl)
    return dout
In [6]:
# urllib.parse could also have been considered, but did not work well in this context
import codecs
import io
f = open("input/PGCC_input_1month.txt", "r")
# input file, you can download it here: https://github.com/anchardo/PGCC
lines = f.readlines()
f.close()
x = io.open("input/PGCC_input_1month_ok.txt", "w", encoding="utf8")
for line in lines:
    line = line.replace("é", "e")
    line = line.replace("è", "e")
    line = line.replace("ç", "c")
    x.write(line)
x.close()
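# For reference, this is roughly what urllib.parse offers for decoding URL-encoded queries
# (a sketch of the alternative mentioned in the comment above, not the approach retained here):
from urllib.parse import unquote
print(unquote('zoekterm/antwerpen%20kaai'))  # percent-encoded spaces become real spaces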
In [7]:
f = 'input/PGCC_input_1month_ok.txt'
din = pd.read_csv(f, sep = '\t', header=None) #cf. pandas http://pandas.pydata.org/
din.columns = ['visitorID','visitID','time','url']
In [8]:
len(din) #number of lines (190 785)
Out[8]:
In [9]:
din.head() #check if everything is ok
Out[9]:
In [10]:
len(set(din.visitorID.tolist())) #number of visitors
Out[10]:
In [11]:
len(set(din.visitID.tolist())) #number of visits
Out[11]:
In [12]:
din.to_csv('output/A_first_parsing.csv') #export the first parsed file
In [13]:
dout = parse(din) # parse the metadata and user queries, this can take some time
# you can follow the progress through the printed percentages
In [14]:
dout.head(4) # check that everything looks fine; the many "NaN" values in the first (metadata) columns are normal!
Out[14]:
In [16]:
dout.to_csv('output/A_second_parsing.csv') #export output file
In [17]:
len(list(set(dout.visitID.tolist())))
Out[17]:
In [18]:
len(list(set(dout.visitorID.tolist())))
Out[18]:
In [19]:
dout = dout[['visitID','zoekterm','time']] # the time column serves here as a rough identifier of the individual HTTP request
In [20]:
dout.head() #at this stage, you should only have 3 columns: visitID, zoekterm (keyword), time
Out[20]:
In [21]:
len(list(set(dout.zoekterm.tolist())))
Out[21]:
In [22]:
len(list(set(dout.time.tolist())))
Out[22]:
In [23]:
len(list(set(dout.visitID.tolist())))
Out[23]:
In [24]:
len(dout) #length of file
Out[24]:
In [25]:
# questions? cf. Pandas documentation: http://pandas.pydata.org/pandas-docs/stable/groupby.html
In [26]:
dout = dout.groupby(['visitID','zoekterm']).count() #each time we have the same visitID + the same query, we'll keep the query only once
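# A toy illustration (hypothetical values, added for clarity) of what this groupby/count reduction
# does: the duplicated (visitID, zoekterm) pair collapses into a single row whose "time" column now
# holds the number of occurrences.
toy = pd.DataFrame({'visitID': [1, 1, 1, 2],
                    'zoekterm': ['antwerpen', 'antwerpen', 'gent', 'antwerpen'],
                    'time': ['t1', 't2', 't3', 't4']})
print(toy.groupby(['visitID', 'zoekterm']).count())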
In [27]:
dout.head()
Out[27]:
In [28]:
len(dout) #the number of lines should be lower after reduction
Out[28]:
In [29]:
dout.reset_index(inplace=True)
dout = dout.rename(columns = {'time':'queries_count'}) # we rename the "time" column to a more explicit name
In [30]:
dout.head(29)
Out[30]:
In [31]:
len(list(set(dout.visitID.tolist()))) # we lost some visits, probably because of blank cells
Out[31]:
In [32]:
len(list(set(dout.zoekterm.tolist())))
Out[32]:
In [33]:
dout.queries_count.sum() # total number of queries
# there is a loss we cannot explain, as with "visitID"... probably blank cells
Out[33]:
In [34]:
dout.to_csv('output/B_grouping.csv')
In [35]:
from re import sub
import re
# we now decide what has to be done to normalise the query text
# update: other special characters such as * & © À etc. should perhaps also be taken into account
def clean_text(text):
    if re.search(r"^[0-9\-\.\/]*$", text) != None:
        text = 'NULL' # we avoid keeping queries composed only of numbers
    elif text != 'NaN':
        text = str(text)
        text = text.lower() # convert to lower case
        text = sub(r'[^a-zéèêàâäüùôöç0-9\']', ' ', text) # we don't want to keep the "?" nor the "+" between 2 words
        text = text.replace('é', 'e') # this could certainly be shorter, but at least it does the job
        text = text.replace('è', 'e')
        text = text.replace('ê', 'e')
        text = text.replace('ë', 'e')
        text = text.replace('à', 'a')
        text = text.replace('â', 'a')
        text = text.replace('ä', 'a')
        text = text.replace('ü', 'u')
        text = text.replace('û', 'u')
        text = text.replace('ù', 'u')
        text = text.replace('ô', 'o')
        text = text.replace('ö', 'o')
        text = text.replace('ç', 'c')
        text = text.replace('\'', ' ')
        text = text.replace('  ', ' ')
        text = text.replace('  ', ' ')
        text = text.strip() # remove leading and trailing whitespace
    return text
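# A quick check of clean_text on made-up queries (added sketch): the accents, the hyphen and the
# apostrophe should all be normalised away, and number-only queries should become NULL.
print(clean_text("Château-d'Eau 1914"))  # expected: chateau d eau 1914
print(clean_text("14-18"))               # expected: NULL
# The long chain of replace() calls above could arguably be collapsed with Unicode decomposition;
# a minimal alternative sketch (not the version used in this notebook), to be applied after the
# lower-casing and punctuation handling:
import unicodedata
def strip_accents(text):
    # decompose accented characters (NFKD) and drop the combining marks
    return ''.join(ch for ch in unicodedata.normalize('NFKD', text)
                   if not unicodedata.combining(ch))
print(strip_accents("château ça è ü"))   # expected: chateau ca e u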
In [36]:
f = 'output/B_grouping.csv'
d = pd.read_csv(f, sep = ',', header=0, dtype=str, error_bad_lines=False)
In [37]:
d = d[['visitID','zoekterm','queries_count']]
In [38]:
d.head()
Out[38]:
In [39]:
# normalisation step
d['normalised_zoekterm'] = ''
subcount = 0
count = 0
for i, r in d.iterrows():
    subcount += 1
    if subcount == 500: # you can change this number depending on the size of the dataset
        count += subcount
        print(count)
        subcount = 0
    d.loc[i, 'normalised_zoekterm'] = clean_text(r.zoekterm)
d.head() # to check that it works correctly
In [64]:
d.to_csv("output/C_cleaning.csv")
In [65]:
d.head(200) # we can check a bigger sample to see whether everything looks fine
# queries composed only of numbers now appear as "NULL"
Out[65]:
In [ ]:
# the cleaning step has revealed new duplicates, and we want to keep each identical query only once per visit
In [6]:
d = pd.read_csv("output/C_cleaning.csv")
d = d.drop('Unnamed: 0', axis = 1)
#d = d.drop('Unnamed: 0.1', axis = 1)
In [7]:
d.head(10)
Out[7]:
In [8]:
len(d)
Out[8]:
In [9]:
d.queries_count.sum()
Out[9]:
In [10]:
len(list(set(d.normalised_zoekterm.tolist())))
Out[10]:
In [11]:
d = d.groupby(['visitID','normalised_zoekterm']).sum() #we want to keep each query only once, but keep track of the number of similar queries/visit
d.reset_index(inplace=True)
d.head(30)
Out[11]:
In [73]:
len(d)
Out[73]:
In [74]:
d.queries_count.sum()
Out[74]:
In [75]:
len(list(set(d.visitID.tolist())))
Out[75]:
In [76]:
d.to_csv('output/C_cleaning_without_duplicates.csv')
In [12]:
dno = d.copy()
dno['no'] = '1'
dno = dno[['visitID','queries_count','no']]
dno.head(10)
Out[12]:
In [13]:
dno = dno.groupby(['visitID','no']).count()
dno.reset_index(inplace=True)
dno = dno.rename(columns = {'queries_count':'requests'})
dno.head(5)
Out[13]:
In [14]:
dno["requests"].mean()
Out[14]:
In [15]:
dno["requests"].sum()
Out[15]:
In [17]:
len(dno)
Out[17]:
In [19]:
dno.to_csv('output/number_of_requests_by_visit.csv')
In [ ]:
# we prepare the input for the clustering step
In [77]:
input = d[['normalised_zoekterm']] #we keep only the normalised queries
In [78]:
input.to_csv('output/D/to_be_clustered.csv', index=False, header=False)
In [79]:
input.head()
Out[79]:
In [ ]:
# below: clustering of the normalised queries based on the Jaro distance (jellyfish.jaro_distance)
In [80]:
import os, sys, getopt, io
import json, random, jellyfish
from bs4 import BeautifulSoup as bs
from collections import defaultdict
from jellyfish import levenshtein_distance as ld, jaro_distance
In [104]:
import re, codecs, os, sys, getopt, io
import json, random, jellyfish
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs
from collections import defaultdict
from jellyfish import levenshtein_distance as ld, jaro_distance
content = [line.rstrip('\n') for line in io.open('output/D/to_be_clustered.csv')]
result = open('output/D/clusters_85.txt', "a")
Clusters = []
Centroid = []
Scores = []
for string in content:
    Matched = 0
    if len(Clusters) == 0:
        # the first query starts the first cluster and becomes its centroid
        Clusters.append([string])
        Centroid.append([string])
        Scores.append([])
        continue
    for ClustNum in range(len(Clusters)):
        Dist = jellyfish.jaro_distance(string, Centroid[ClustNum][0])
        if Dist > 0.85:
            # close enough to an existing centroid: join that cluster
            Clusters[ClustNum].append(string)
            if len(Scores[ClustNum]) == 0:
                Scores[ClustNum].append(Dist)
            Matched = 1
            break
    if Matched == 0:
        # no centroid is similar enough: the query starts a new cluster
        Clusters.append([string])
        Centroid.append([string])
        Scores.append([])
size = len(Clusters)
x = 0
while x < size:
    cluster = Clusters[x]
    centroid = Centroid[x]
    lines = [] # avoid shadowing the built-in "list"
    for cl in cluster:
        lines.append(str(cl) + '\t' + str(centroid[0]))
    result.write('\n'.join(lines))
    result.write('\n')
    x = x + 1
result.close()
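# To get a feel for the 0.85 threshold used above (an illustrative aside with made-up strings,
# not part of the original pipeline): near-identical queries score above it, unrelated ones well below.
print(jellyfish.jaro_distance("antwerpen", "antwerpem"))  # above 0.85: these would share a cluster
print(jellyfish.jaro_distance("antwerpen", "bruxelles"))  # well below 0.85: these would not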
In [93]:
d = pd.read_csv('output/C_cleaning_without_duplicates.csv') #we need the file with normalised queries
In [95]:
d = d.drop('Unnamed: 0', axis = 1)
In [96]:
d.head() #quick check
Out[96]:
In [89]:
clu = pd.read_csv('output/D/clusters_85.txt', sep='\t', header=None)
In [90]:
clu.columns = ['queries','clusters']
In [91]:
clu.head()
Out[91]:
In [92]:
len(set(clu.queries.tolist()))
Out[92]:
In [145]:
len(set(clu.clusters.tolist()))
Out[145]:
In [158]:
# returns the cluster centroid associated with a normalised query
def get_cluster(clu, text):
    out = ""
    try:
        out = clu.loc[clu.queries == text, 'clusters'].values[0]
    except:
        out = "NAN (cluster not found)"
    return out
In [161]:
d['cluster'] = ""
In [169]:
c = Counter(d)
for i, r in d.iterrows():
    c.loop()
    text = r.normalised_zoekterm
    d.loc[i, 'cluster'] = get_cluster(clu, text)
In [170]:
len(set(d.cluster.tolist()))
Out[170]:
In [174]:
d = d[['cluster','queries_count','normalised_zoekterm','visitID']]
In [178]:
d.sort_values(by='cluster')
Out[178]:
In [179]:
d.to_csv('output/D/Clusters_Data_Reconciliation.csv') #the end!