In [38]:
# This cell has to be run to prepare the Jupyter notebook.
# The %... is a Jupyter "magic" command, and is not part of the Python language.
# In this case we're just telling the plotting library to draw things on
# the notebook, instead of on a separate window.
%matplotlib inline
# See all the "as ..." constructs? They're just aliasing the package names.
# That way we can call methods like plt.plot() instead of matplotlib.pyplot.plot().
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Widen pandas' display settings so the small sample frames render fully.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
from time import time
from math import log
# import the k-means clustering algorithm from scikit-learn
from sklearn.cluster import KMeans
# seaborn: statistical plotting styles on top of matplotlib
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
[Kuhlthau 1991, Ingwersen 1996, Ellis/Haugan 1997, Reiterer et al. 2000] und viele mehr...
In [16]:
# Build a small toy corpus: every document is a point in a 2-D term space
# (term frequencies for two terms).
sample_points = {
    'doc01': (1, 2),
    'doc02': (10, 10),
    'doc03': (60, 40),
    'doc04': (100, 80),
    'doc05': (99, 81),
    'doc06': (1, 1),
    'doc07': (45, 55),
    'doc08': (9, 10),
    'doc09': (11, 11),
    'doc10': (1, 11),
}
# Transpose so documents become rows and terms become columns.
sampleDF = pd.DataFrame(data=sample_points).transpose()
sampleDF.columns = ['term 2 (z.B. Retrieval)', 'term 1 (z.B. Information)']
sampleDF.head(10)
Out[16]:
In [17]:
# Scatter-plot the documents in term space; the x and y columns must be
# named explicitly because a DataFrame has no default scatter layout.
sampleDF.plot(kind='scatter', x=0, y=1, alpha=0.75, s=70)
Out[17]:
In [18]:
vq=[0,1,1,0]
vd1=[50,5,0,0]
vd2=[0,2,2,0]
print "similarity between Vq and Vd1: %i"%np.inner(vq,vd1) # inner product is another name for the scalar/dot product
print "similarity between Vq and Vd2: %i"%np.inner(vq,vd2)
In [31]:
# Query and two document vectors; the first dimension is a stopword whose
# huge raw count dominates the similarity scores.
vq = [100, 0, 0, 1, 2]
b = [99, 1, 2, 0, 0]
c = [80, 0, 0, 2, 2]
print("Before stopword elimination")
# Bug fix: the original referenced the undefined names `a` (here) and `a2`
# (below), raising NameError on a fresh kernel; the query vector is vq/vq2.
print("\tsimilarity between Vq and b: %i" % np.inner(vq, b))
print("\tsimilarity between Vq and c: %i" % np.inner(vq, c))
# The same vectors with the stopword dimension removed.
vq2 = [0, 0, 1, 2]
b2 = [1, 2, 0, 0]
c2 = [0, 0, 2, 2]
print("\nAfter stopword elimination")
print("\tsimilarity between Vq and b: %i" % np.inner(vq2, b2))
print("\tsimilarity between Vq and c: %i" % np.inner(vq2, c2))
In [54]:
# Illustrate why idf uses a logarithm: the raw quotient N/ni explodes for
# rare terms, while log(N/ni) dampens the effect and grows smoothly.
N = 1000.0
ni = list(range(1, 500))
quotient = [N / i for i in ni]
quotientLog = [log(N / i) for i in ni]
plt.plot(ni, quotient, label="N/ni")
plt.plot(ni, quotientLog, label="log(N/ni)")
plt.axis([0, 500, 0, 100])
plt.ylabel('Result (limited to 100)')
plt.xlabel("ni")
plt.title("Sample results of idf, N=1000")
plt.legend()
Out[54]:
In [19]:
# Number of clusters k-means should look for.
true_k = 3
# Initialize k-means; a fixed random_state makes the (otherwise stochastic)
# centroid initialization — and thus the cluster labelling — reproducible.
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, random_state=42)
# Fit the algorithm on the sample data.
km.fit(sampleDF)
# Attach the detected cluster labels as a new column to the original frame.
sampleDF['cluster'] = km.labels_
# DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values() is the supported replacement.
sampleDF = sampleDF.sort_values('cluster')
sampleDF.head(10)
Out[19]:
In [20]:
# Wrap the learned centroid coordinates in a DataFrame for easy inspection.
clusterCenters = pd.DataFrame(km.cluster_centers_)
clusterCenters.head()
Out[20]:
In [21]:
# Scatter-plot the sample data and keep the axes object for annotation.
ax = sampleDF.plot(kind='scatter', x=0, y=1, alpha=0.75, s=70)
# Overlay the cluster centroids in red.
plt.scatter(x=clusterCenters[0], y=clusterCenters[1], color='red')
# Centre one circle on each of the three centroids to highlight the clusters.
centres = [(clusterCenters[0][k], clusterCenters[1][k]) for k in range(3)]
# Unfilled circles with an arbitrary radius of 20.
circles = [plt.Circle(centre, 20, color='r', fill=False) for centre in centres]
# Add each circle to the plot; keep the returned patches so the last one is
# the cell's displayed value, matching the original cell's output.
patches = [ax.add_patch(circle) for circle in circles]
patches[-1]
Out[21]:
Kuhlthau 1991 Kuhlthau, C.C.: Inside the Search Process: Information Seeking from the User’s Perspective. Jrnl. o. t. American Soc. f. Information Science 42(5) (1991) 361–371
Ingwersen 1996 Ingwersen, P.: Cognitive Perspectives of Information Retrieval Interaction: Elements of a Cognitive IR Theory. Journal of Documentation 52 (1996) 3–50
Ellis/Haugan 1997 Ellis, D., Haugan, M.: Modelling the Information Seeking Patterns of Engineers and Research Scientists in an Industrial Environment. Journal of Documentation 53(4) (1997) 384–403
Reiterer et al. 2000 Reiterer, H., Mußler, G., Mann, T., Handschuh, S.: INSYDER. In: Proc. of the 23rd SIGIR ’00, ACM (2000) 112–119
Salton et al. 1975 Salton, G., Wong, A., Yang, C.S.: A Vector Space Model for Automatic Indexing. Communications of the ACM 18(11) (Nov. 1975) 613–620
van Rijsbergen 1979 van Rijsbergen, Cornelis J.: Information Retrieval. 2. London : Butterworths, 1979
Bates 1989 Bates, M.: The Design of Browsing and Berrypicking Techniques for the Online Search Interface. Online Review 13(5) (1989) 407–424
In [ ]: