In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# This code parses PIWIK log files (piwik.org) to extract user queries from digital catalogues.
# Example of input data:
#A4EB7F66122DFB4B 649796 2016-01-01 01:00:41 search.arch.be/nl/zoeken-naar-archieven/zoekresultaat/index/index/zoekterm/antwerpen/findaidstatus/verified-complete-draft/dao/1/lang/nl
# Written in the context of the MADDLAIN project (www.maddlain.iminds.be/en/home/)
# Anne Chardonnens - anchardo@ulb.ac.be - http://homepages.ulb.ac.be/~anchardo
# Simon Hengchen - shengche@ulb.ac.be - http://homepages.ulb.ac.be/~shengche
# Raphaël Hubain - rhubain@gmail.com
In [2]:
from time import sleep
import sys
class Counter():
    """Simple progress counter: prints the completed percentage while iterating over an array."""
    def __init__(self, array, position = 0):
        self.length = len(array)
        self.position = position
        self.printpercent = 0
    def loop(self):
        self.position += 1
        percent = self.position / self.length
        percent = int(percent * 100)
        if self.printpercent != percent:
            self.printpercent = percent
            print("%s %%" % str(self.printpercent))
In [3]:
import pandas as pd #pandas = python data analysis library: http://pandas.pydata.org/
from re import search, match
In [4]:
# functions to locate the keywords we want to extract from the URLs
def getv(url, m):
    # regular expression to isolate and keep only the words entered by the end user
    reg = r'.*\/' + m + r'\/([a-z_A-Zéèêàâäüùôöç0-9+ \'\.\-%,]*)\/.*'
    ismatch = match(reg, url)
    if ismatch:
        value = ismatch.group(1)
    else:
        value = 'NaN'
    return value
def get_mdic(url):
    mdic = {}
    # the other metadata composing the URL are also parsed
    mlist = ['zoekterm','beginjaar','eindjaar','rubriek','eadid','findaidstatus','dao','lang',
             'inventarisnr','level','scan-index','foto','page']
    for m in mlist:
        k = m
        v = getv(url, m)
        mdic[k] = v
    return mdic
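# A quick sanity check of getv() (an added sketch, not in the original notebook) against the example
# URL quoted in the header cell: 'zoekterm' should come back as 'antwerpen', while 'lang' should come
# back as 'NaN' because the pattern expects another '/' after the captured value and 'lang/nl' ends the URL.
demo_url = 'search.arch.be/nl/zoeken-naar-archieven/zoekresultaat/index/index/zoekterm/antwerpen/findaidstatus/verified-complete-draft/dao/1/lang/nl'
print(getv(demo_url, 'zoekterm'))  # expected: antwerpen
print(getv(demo_url, 'lang'))      # expected: NaN
print(get_mdic(demo_url))          # all the metadata keys at once, unmatched ones set to 'NaN'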
In [5]:
# parsing function
def parse(din):
    din.columns = ['visitorID','visitID','time','url'] # the input file contains 4 columns: the visitor ID, the visit ID, the timestamp of the visit and the URL
    sl = []
    c = Counter(din)
    for i, r in din.iterrows():
        c.loop()
        rdic = {}
        reg = r'.*\/zoekterm\/.*' # each user query is preceded by ".../zoekterm/" ("zoekterm" means keyword in Dutch)
        if search(reg, r.url):
            rdic = get_mdic(r.url)
            rdic['visitorID'] = r.visitorID
            rdic['visitID'] = r.visitID
            rdic['time'] = r.time
            rdic['url'] = r.url
            sl.append(pd.Series(rdic))
    dout = pd.DataFrame(sl)
    return dout
In [6]:
# urllib.parse could also have been considered, but did not work well in this context
import codecs
import io
f = open("input/PGCC_input_1month.txt", "r")
# input file, you can download it here: https://github.com/anchardo/PGCC
lines = f.readlines()
f.close()
x = io.open("input/PGCC_input_1month_ok.txt", "w", encoding="utf8")
for line in lines:
    line = line.replace("é", "e")
    line = line.replace("è", "e")
    line = line.replace("ç", "c")
    x.write(line)
x.close()
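# For reference, this is roughly what urllib.parse offers for decoding URL-encoded queries
# (a sketch of the alternative mentioned in the comment above, not the approach retained here):
from urllib.parse import unquote
print(unquote('zoekterm/antwerpen%20kaai'))  # percent-encoded spaces become real spaces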
In [7]:
f = 'input/PGCC_input_1month_ok.txt'
din = pd.read_csv(f, sep = '\t', header=None) #cf. pandas http://pandas.pydata.org/
din.columns = ['visitorID','visitID','time','url']
In [8]:
len(din) #number of lines (190 785)
Out[8]:
In [9]:
din.head() #check if everything is ok
Out[9]:
In [10]:
len(set(din.visitorID.tolist())) #number of visitors
Out[10]:
In [11]:
len(set(din.visitID.tolist())) #number of visits
Out[11]:
In [12]:
din.to_csv('output/A_first_parsing.csv') #export the first parsed file
In [13]:
dout = parse(din) # parse the metadata and user queries, this can take some time
# you can follow the progress through the printed percentages
In [14]:
dout.head(4) # check that everything looks fine; the many "NaN" values in the first (metadata) columns are normal!
Out[14]:
In [16]:
dout.to_csv('output/A_second_parsing.csv') #export output file
In [17]:
len(list(set(dout.visitID.tolist())))
Out[17]:
In [18]:
len(list(set(dout.visitorID.tolist())))
Out[18]:
In [19]:
dout = dout[['visitID','zoekterm','time']] # the time column serves here as a rough identifier of the individual HTTP request
In [20]:
dout.head() #at this stage, you should only have 3 columns: visitID, zoekterm (keyword), time
Out[20]:
In [21]:
len(list(set(dout.zoekterm.tolist())))
Out[21]:
In [22]:
len(list(set(dout.time.tolist())))
Out[22]:
In [23]:
len(list(set(dout.visitID.tolist())))
Out[23]:
In [24]:
len(dout) #length of file
Out[24]:
In [25]:
# questions? cf. Pandas documentation: http://pandas.pydata.org/pandas-docs/stable/groupby.html
In [26]:
dout = dout.groupby(['visitID','zoekterm']).count() #each time we have the same visitID + the same query, we'll keep the query only once
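# A toy illustration (hypothetical values, added for clarity) of what this groupby/count reduction
# does: the duplicated (visitID, zoekterm) pair collapses into a single row whose "time" column now
# holds the number of occurrences.
toy = pd.DataFrame({'visitID': [1, 1, 1, 2],
                    'zoekterm': ['antwerpen', 'antwerpen', 'gent', 'antwerpen'],
                    'time': ['t1', 't2', 't3', 't4']})
print(toy.groupby(['visitID', 'zoekterm']).count())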
In [27]:
dout.head()
Out[27]:
In [28]:
len(dout) #the number of lines should be lower after reduction
Out[28]:
In [29]:
dout.reset_index(inplace=True)
dout = dout.rename(columns = {'time':'queries_count'}) # we rename the "time" column to a more explicit name
In [30]:
dout.head(29)
Out[30]:
In [31]:
len(list(set(dout.visitID.tolist()))) # we lost some visits, probably because of blank cells
Out[31]:
In [32]:
len(list(set(dout.zoekterm.tolist())))
Out[32]:
In [33]:
dout.queries_count.sum() # total number of queries
# there is a loss we cannot explain, as with "visitID"... probably blank cells
Out[33]:
In [34]:
dout.to_csv('output/B_grouping.csv')
In [35]:
from re import sub
import re
# we now decide what has to be done to normalise the query text
# update: other special characters such as * & © À etc. should perhaps also be taken into account
def clean_text(text):
    if re.search(r"^[0-9\-\.\/]*$", text) != None:
        text = 'NULL' # we avoid keeping queries composed only of numbers
    elif text != 'NaN':
        text = str(text)
        text = text.lower() # convert to lower case
        text = sub(r'[^a-zéèêàâäüùôöç0-9\']', ' ', text) # we don't want to keep the "?" nor the "+" between 2 words
        text = text.replace('é', 'e') # this could certainly be shorter, but at least it does the job
        text = text.replace('è', 'e')
        text = text.replace('ê', 'e')
        text = text.replace('ë', 'e')
        text = text.replace('à', 'a')
        text = text.replace('â', 'a')
        text = text.replace('ä', 'a')
        text = text.replace('ü', 'u')
        text = text.replace('û', 'u')
        text = text.replace('ù', 'u')
        text = text.replace('ô', 'o')
        text = text.replace('ö', 'o')
        text = text.replace('ç', 'c')
        text = text.replace('\'', ' ')
        text = text.replace('  ', ' ')
        text = text.replace('  ', ' ')
        text = text.strip() # remove leading and trailing whitespace
    return text
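# A quick check of clean_text on made-up queries (added sketch): the accents, the hyphen and the
# apostrophe should all be normalised away, and number-only queries should become NULL.
print(clean_text("Château-d'Eau 1914"))  # expected: chateau d eau 1914
print(clean_text("14-18"))               # expected: NULL
# The long chain of replace() calls above could arguably be collapsed with Unicode decomposition;
# a minimal alternative sketch (not the version used in this notebook), to be applied after the
# lower-casing and punctuation handling:
import unicodedata
def strip_accents(text):
    # decompose accented characters (NFKD) and drop the combining marks
    return ''.join(ch for ch in unicodedata.normalize('NFKD', text)
                   if not unicodedata.combining(ch))
print(strip_accents("château ça è ü"))   # expected: chateau ca e u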
In [36]:
f = 'output/B_grouping.csv'
d = pd.read_csv(f, sep = ',', header=0, dtype=str, error_bad_lines=False)
In [37]:
d = d[['visitID','zoekterm','queries_count']]
In [38]:
d.head()
Out[38]:
In [39]:
# normalisation step
d['normalised_zoekterm'] = ''
subcount = 0
count = 0
for i, r in d.iterrows():
    subcount += 1
    if subcount == 500: # you can change this number depending on the size of the dataset
        count += subcount
        print(count)
        subcount = 0
    d.loc[i, 'normalised_zoekterm'] = clean_text(r.zoekterm)
d.head() # to check that it works correctly
In [64]:
d.to_csv("output/C_cleaning.csv")
In [65]:
d.head(200) # we can check a bigger sample to see whether everything looks fine
# queries composed only of numbers now appear as "NULL"
Out[65]:
In [ ]:
# the cleaning step has revealed new duplicates, and we want to keep each identical query only once per visit
In [6]:
d = pd.read_csv("output/C_cleaning.csv")
d = d.drop('Unnamed: 0', axis = 1)
#d = d.drop('Unnamed: 0.1', axis = 1)
In [7]:
d.head(10)
Out[7]:
In [8]:
len(d)
Out[8]:
In [9]:
d.queries_count.sum()
Out[9]:
In [10]:
len(list(set(d.normalised_zoekterm.tolist())))
Out[10]:
In [11]:
d = d.groupby(['visitID','normalised_zoekterm']).sum() #we want to keep each query only once, but keep track of the number of similar queries/visit
d.reset_index(inplace=True)
d.head(30)
Out[11]:
In [73]:
len(d)
Out[73]:
In [74]:
d.queries_count.sum()
Out[74]:
In [75]:
len(list(set(d.visitID.tolist())))
Out[75]:
In [76]:
d.to_csv('output/C_cleaning_without_duplicates.csv')
In [12]:
dno = d.copy()
dno['no'] = '1'
dno = dno[['visitID','queries_count','no']]
dno.head(10)
Out[12]:
In [13]:
dno = dno.groupby(['visitID','no']).count()
dno.reset_index(inplace=True)
dno = dno.rename(columns = {'queries_count':'requests'})
dno.head(5)
Out[13]:
In [14]:
dno["requests"].mean()
Out[14]:
In [15]:
dno["requests"].sum()
Out[15]:
In [17]:
len(dno)
Out[17]:
In [19]:
dno.to_csv('output/number_of_requests_by_visit.csv')
In [ ]:
# we prepare the input for the clustering step
In [77]:
input = d[['normalised_zoekterm']] #we keep only the normalised queries
In [78]:
input.to_csv('output/D/to_be_clustered.csv', index=False, header=False)
In [79]:
input.head()
Out[79]:
In [ ]:
# below: clustering of the normalised queries based on the Jaro distance (jellyfish.jaro_distance)
In [80]:
import os, sys, getopt, io
import json, random, jellyfish
from bs4 import BeautifulSoup as bs
from collections import defaultdict
from jellyfish import levenshtein_distance as ld, jaro_distance
In [104]:
import re, codecs, os, sys, getopt, io
import json, random, jellyfish
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs
from collections import defaultdict
from jellyfish import levenshtein_distance as ld, jaro_distance
content = [line.rstrip('\n') for line in io.open('output/D/to_be_clustered.csv')]
result = open('output/D/clusters_85.txt', "a")
Clusters = []
Centroid = []
Scores = []
for string in content:
    Matched = 0
    if len(Clusters) == 0:
        # the first query starts the first cluster and becomes its centroid
        Clusters.append([string])
        Centroid.append([string])
        Scores.append([])
        continue
    for ClustNum in range(len(Clusters)):
        Dist = jellyfish.jaro_distance(string, Centroid[ClustNum][0])
        if Dist > 0.85:
            # close enough to an existing centroid: join that cluster
            Clusters[ClustNum].append(string)
            if len(Scores[ClustNum]) == 0:
                Scores[ClustNum].append(Dist)
            Matched = 1
            break
    if Matched == 0:
        # no centroid is similar enough: the query starts a new cluster
        Clusters.append([string])
        Centroid.append([string])
        Scores.append([])
size = len(Clusters)
x = 0
while x < size:
    cluster = Clusters[x]
    centroid = Centroid[x]
    lines = [] # avoid shadowing the built-in "list"
    for cl in cluster:
        lines.append(str(cl) + '\t' + str(centroid[0]))
    result.write('\n'.join(lines))
    result.write('\n')
    x = x + 1
result.close()
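# To get a feel for the 0.85 threshold used above (an illustrative aside with made-up strings,
# not part of the original pipeline): near-identical queries score above it, unrelated ones well below.
print(jellyfish.jaro_distance("antwerpen", "antwerpem"))  # above 0.85: these would share a cluster
print(jellyfish.jaro_distance("antwerpen", "bruxelles"))  # well below 0.85: these would not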
In [93]:
d = pd.read_csv('output/C_cleaning_without_duplicates.csv') #we need the file with normalised queries
In [95]:
d = d.drop('Unnamed: 0', axis = 1)
In [96]:
d.head() #quick check
Out[96]:
In [89]:
clu = pd.read_csv('output/D/clusters_85.txt', sep='\t', header=None)
In [90]:
clu.columns = ['queries','clusters']
In [91]:
clu.head()
Out[91]:
In [92]:
len(set(clu.queries.tolist()))
Out[92]:
In [145]:
len(set(clu.clusters.tolist()))
Out[145]:
In [158]:
# returns the cluster centroid associated with a normalised query
def get_cluster(clu, text):
    out = ""
    try:
        out = clu.loc[clu.queries == text, 'clusters'].values[0]
    except:
        out = "NAN (cluster not found)"
    return out
In [161]:
d['cluster'] = ""
In [169]:
c = Counter(d)
for i, r in d.iterrows():
    c.loop()
    text = r.normalised_zoekterm
    d.loc[i, 'cluster'] = get_cluster(clu, text)
In [170]:
len(set(d.cluster.tolist()))
Out[170]:
In [174]:
d = d[['cluster','queries_count','normalised_zoekterm','visitID']]
In [178]:
d.sort_values(by='cluster')
Out[178]:
In [179]:
d.to_csv('output/D/Clusters_Data_Reconciliation.csv') #the end!