In [14]:
"""Clustering"""
# flat and hierarchical clustering

# The sklearn.cluster package provides a wide range of clustering approaches

## k-means: the most widely used flat clustering algorithm


Out[14]:
'Clustering'
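
Before running k-means on real text, a minimal sketch on made-up 2-D points illustrates the flat-clustering idea (the toy data and cluster count are assumptions for illustration, not part of this notebook's run):

# toy k-means sketch: two obvious blobs, two clusters
import numpy as np
from sklearn.cluster import KMeans

toy = np.array([[1.0, 1.0], [1.2, 0.8], [0.9, 1.1],
                [8.0, 8.0], [8.2, 7.9], [7.8, 8.1]])
toy_km = KMeans(n_clusters=2, random_state=0).fit(toy)
print(toy_km.labels_)           # one cluster id per point
print(toy_km.cluster_centers_)  # one centroid per cluster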

In [15]:
### 20newsgroup

import sklearn.datasets
import scipy as sp

all_data = sklearn.datasets.fetch_20newsgroups(subset="all")
print("Number of total posts: %i" % len(all_data.filenames))

groups = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
train_data = sklearn.datasets.fetch_20newsgroups(subset="train",
                                                 categories=groups)
print("Number of training posts in tech groups:", len(train_data.filenames))


Number of total posts: 18846
Number of training posts in tech groups: 3529
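
To get a feel for what fetch_20newsgroups returns, one can inspect the bunch object directly (a sketch; the output is not reproduced here):

print(train_data.target_names)   # the six selected tech groups
print(train_data.target[:10])    # integer label per post
print(train_data.data[0][:200])  # start of the raw text of the first post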

In [16]:
# Real-world data is noisy; configure the vectorizer to ignore the noise

import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')

from sklearn.feature_extraction.text import TfidfVectorizer


class StemmedTfidfVectorizer(TfidfVectorizer):

    def build_analyzer(self):
        # Wrap the default analyzer so every token is stemmed before it
        # enters the vocabulary.
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5,
                                    stop_words='english',
                                    decode_error='ignore')

vectorized = vectorizer.fit_transform(train_data.data)
num_samples, num_features = vectorized.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))


#samples: 3529, #features: 4712
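
To see what the stemming wrapper does, one can run the analyzer on a small made-up sentence (the example words are illustrative, not from the dataset; inflected forms such as 'images' and 'imaging' should collapse to a shared stem):

print(list(vectorizer.build_analyzer()("imaging images imagine imagination")))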

In [17]:
# We now have a pool of 3,529 posts, each represented by a feature vector of 4,712 dimensions.
# That is what k-means takes as input. We fix the number of clusters at 50.

from sklearn.cluster import KMeans
num_clusters = 50
km = KMeans(n_clusters=num_clusters, n_init=1, verbose=1, random_state=3)
clustered = km.fit(vectorized)

print("km.labels_=%s" % km.labels_)
print("km.labels_.shape=%s" % km.labels_.shape)


Initialization complete
Iteration  0, inertia 5686.053
Iteration  1, inertia 3164.888
Iteration  2, inertia 3132.208
Iteration  3, inertia 3111.713
Iteration  4, inertia 3098.584
Iteration  5, inertia 3092.191
Iteration  6, inertia 3087.277
Iteration  7, inertia 3084.100
Iteration  8, inertia 3082.800
Iteration  9, inertia 3082.234
Iteration 10, inertia 3081.949
Iteration 11, inertia 3081.843
Iteration 12, inertia 3081.791
Iteration 13, inertia 3081.752
Iteration 14, inertia 3081.660
Iteration 15, inertia 3081.617
Iteration 16, inertia 3081.589
Iteration 17, inertia 3081.571
Converged at iteration 17: center shift 0.000000e+00 within tolerance 2.069005e-08
km.labels_=[48 23 31 ...,  6  2 22]
km.labels_.shape=3529
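
A quick way to see how the 3,529 posts are distributed over the 50 clusters is to count the label occurrences (a sketch; the actual counts are not reproduced here):

import numpy as np
cluster_sizes = np.bincount(km.labels_, minlength=num_clusters)
print(cluster_sizes)
print("largest cluster: %d posts" % cluster_sizes.max())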

In [21]:
# Use the fitted k-means model to assign a cluster to a new post

new_post = \
    """Disk drive problems. Hi, I have a problem with my hard disk.
After 1 year it is working only sporadically now.
I tried to format it, but now it doesn't boot any more.
Any ideas? Thanks.
"""
new_post_vec = vectorizer.transform([new_post])
new_post_label = km.predict(new_post_vec)[0]

# No need to compare new_post_vec against all post vectors; only posts in the same cluster matter
similar_indices = (km.labels_ == new_post_label).nonzero()[0]

similar = []
for i in similar_indices:
    dist = sp.linalg.norm((new_post_vec - vectorized[i]).toarray())
    similar.append((dist, train_data.data[i]))
    
similar = sorted(similar)
print("Count similar: %i" % len(similar))

show_at_1 = similar[0]
show_at_2 = similar[int(len(similar) / 10)]
show_at_3 = similar[int(len(similar) / 2)]

print("=== #1 ===")
print(show_at_1)
print()

print("=== #2 ===")
print(show_at_2)
print()

print("=== #3 ===")
print(show_at_3)


Count similar: 56
=== #1 ===
(1.0378441731334074, u"From: Thomas Dachsel <GERTHD@mvs.sas.com>\nSubject: BOOT PROBLEM with IDE controller\nNntp-Posting-Host: sdcmvs.mvs.sas.com\nOrganization: SAS Institute Inc.\nLines: 25\n\nHi,\nI've got a Multi I/O card (IDE controller + serial/parallel\ninterface) and two floppy drives (5 1/4, 3 1/2) and a\nQuantum ProDrive 80AT connected to it.\nI was able to format the hard disk, but I could not boot from\nit. I can boot from drive A: (which disk drive does not matter)\nbut if I remove the disk from drive A and press the reset switch,\nthe LED of drive A: continues to glow, and the hard disk is\nnot accessed at all.\nI guess this must be a problem of either the Multi I/o card\nor floppy disk drive settings (jumper configuration?)\nDoes someone have any hint what could be the reason for it.\nPlease reply by email to GERTHD@MVS.SAS.COM\nThanks,\nThomas\n+-------------------------------------------------------------------+\n| Thomas Dachsel                                                    |\n| Internet: GERTHD@MVS.SAS.COM                                      |\n| Fidonet:  Thomas_Dachsel@camel.fido.de (2:247/40)                 |\n| Subnet:   dachsel@rnivh.rni.sub.org (UUCP in Germany, now active) |\n| Phone:    +49 6221 4150 (work), +49 6203 12274 (home)             |\n| Fax:      +49 6221 415101                                         |\n| Snail:    SAS Institute GmbH, P.O.Box 105307, D-W-6900 Heidelberg |\n| Tagline:  One bad sector can ruin a whole day...                  |\n+-------------------------------------------------------------------+\n")
()
=== #2 ===
(1.1503043264096682, u'From: rpao@mts.mivj.ca.us (Roger C. Pao)\nSubject: Re: Booting from B drive\nOrganization: MicroTech Software\nLines: 34\n\nglang@slee01.srl.ford.com (Gordon Lang) writes:\n\n>David Weisberger (djweisbe@unix.amherst.edu) wrote:\n>: I have a 5 1/4" drive as drive A.  How can I make the system boot from\n>: my 3 1/2" B drive?  (Optimally, the computer would be able to boot\n>: from either A or B, checking them in order for a bootable disk.  But\n>: if I have to switch cables around and simply switch the drives so that\n>: it can\'t boot 5 1/4" disks, that\'s OK.  Also, boot_b won\'t do the trick\n>: for me.)\n>: \n>: Thanks,\n>:   Davebo\n>We had the same issue plague us for months on our Gateway.  I finally\n>got tired of it so I permanently interchanged the drives.  The only\n>reason I didn\'t do it in the first place was because I had several\n>bootable 5-1/4\'s and some 5-1/4 based install disks which expected\n>the A drive.  I order all new software (and upgrades) to be 3-1/2 and\n>the number of "stupid" install programs that can\'t handle an alternate\n>drive are declining with time - the ones I had are now upgraded.  And\n>as for the bootable 5-1/4\'s I just cut 3-1/2 replacements.\n\n>If switching the drives is not an option, you might be able to wire up\n>a drive switch to your computer chasis.  I haven\'t tried it but I think\n>it would work as long as it is wired carefully.\n\nI did this.  I use a relay (Radio Shack 4PDT) instead of a huge\nswitch.  This way, if the relay breaks, my drives will still work.\n\nIt works fine, but you may still need to change the CMOS before the\ndrive switch will work correctly for some programs.\n\nrp93\n-- \nRoger C. Pao  {gordius,bagdad}!mts!rpao, rpao@mts.mivj.ca.us\n')
()
=== #3 ===
(1.2793959084781283, u'From: vg@volkmar.Stollmann.DE (Volkmar Grote)\nSubject: IBM PS/1 vs TEAC FD\nDistribution: world\nOrganization: Me? Organized?\nLines: 21\n\nHello,\n\nI already tried our national news group without success.\n\nI tried to replace a friend\'s original IBM floppy disk in his PS/1-PC\nwith a normal TEAC drive.\nI already identified the power supply on pins 3 (5V) and 6 (12V), shorted\npin 6 (5.25"/3.5" switch) and inserted pullup resistors (2K2) on pins\n8, 26, 28, 30, and 34.\nThe computer doesn\'t complain about a missing FD, but the FD\'s light\nstays on all the time. The drive spins up o.k. when I insert a disk,\nbut I can\'t access it.\nThe TEAC works fine in a normal PC.\n\nAre there any points I missed?\n\nThank you.\n\tVolkmar\n\n---\nVolkmar.Grote@Stollmann.DE\n')
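
To get an idea of what the matched cluster is about, one can look at the highest-weighted terms of its centroid (a sketch; the resulting term list is not reproduced here):

import numpy as np
terms = vectorizer.get_feature_names()
centroid = km.cluster_centers_[new_post_label]
print([terms[i] for i in np.argsort(centroid)[::-1][:10]])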

In [23]:
"""noise analysis again"""
import sklearn.datasets

groups = [
    'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
train_data = sklearn.datasets.fetch_20newsgroups(subset="train",
                                                 categories=groups)

labels = train_data.target
num_clusters = 50  # sp.unique(labels).shape[0]

import nltk.stem
english_stemmer = nltk.stem.SnowballStemmer('english')

from sklearn.feature_extraction.text import TfidfVectorizer


class StemmedTfidfVectorizer(TfidfVectorizer):

    def build_analyzer(self):
        # Same stemming wrapper as above.
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5,
                                    stop_words='english',
                                    decode_error='ignore')
vectorized = vectorizer.fit_transform(train_data.data)

post_group = zip(train_data.data, train_data.target)
# Create a list of tuples that can be sorted by
# the length of the posts
all_posts = [(len(post[0]), post[0], train_data.target_names[post[1]])
             for post in post_group]
graphics = sorted([post for post in all_posts if post[2] == 'comp.graphics'])
print(graphics[5])

noise_post = graphics[5][1]

analyzer = vectorizer.build_analyzer()
print(list(analyzer(noise_post)))


(245, u'From: SITUNAYA@IBM3090.BHAM.AC.UK\nSubject: test....(sorry)\nOrganization: The University of Birmingham, United Kingdom\nLines: 1\nNNTP-Posting-Host: ibm3090.bham.ac.uk\n\n==============================================================================\n', 'comp.graphics')
[u'situnaya', u'ibm3090', u'bham', u'ac', u'uk', u'subject', u'test', u'sorri', u'organ', u'univers', u'birmingham', u'unit', u'kingdom', u'line', u'nntp', u'post', u'host', u'ibm3090', u'bham', u'ac', u'uk']

In [24]:
useful = set(analyzer(noise_post)).intersection(vectorizer.get_feature_names())
print(sorted(useful))
# ['ac', 'birmingham', 'host', 'kingdom', 'nntp', 'sorri', 'test', 'uk', 'unit', 'univers']

for term in sorted(useful):
    print('IDF(%s)=%.2f' % (term,
                            vectorizer._tfidf.idf_[vectorizer.vocabulary_[term]]))


[u'ac', u'birmingham', u'host', u'kingdom', u'nntp', u'sorri', u'test', u'uk', u'unit', u'univers']
IDF(ac)=3.51
IDF(birmingham)=6.77
IDF(host)=1.74
IDF(kingdom)=6.68
IDF(nntp)=1.77
IDF(sorri)=4.14
IDF(test)=3.83
IDF(uk)=3.70
IDF(unit)=4.42
IDF(univers)=1.91
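
The IDF values alone do not tell the whole story; the tf-idf weights the noise post actually ends up with can be inspected directly (a sketch; the output is not shown):

noise_vec = vectorizer.transform([noise_post])
for term in sorted(useful):
    print('TFIDF(%s)=%.2f' % (term,
                              noise_vec[0, vectorizer.vocabulary_[term]]))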

In [25]:
"""sklearn.metrics"""


from sklearn import metrics
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
# Homogeneity: 0.400
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
# Completeness: 0.206
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
# V-measure: 0.272
print("Adjusted Rand Index: %0.3f" %
      metrics.adjusted_rand_score(labels, km.labels_))
# Adjusted Rand Index: 0.064
print("Adjusted Mutual Information: %0.3f" %
      metrics.adjusted_mutual_info_score(labels, km.labels_))
# Adjusted Mutual Information: 0.197
print(("Silhouette Coefficient: %0.3f" %
       metrics.silhouette_score(vectorized, labels, sample_size=1000)))
# Silhouette Coefficient: 0.006


Homogeneity: 0.445
Completeness: 0.231
V-measure: 0.304
Adjusted Rand Index: 0.094
Adjusted Mutual Information: 0.223
Silhouette Coefficient: 0.006
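
Note that the silhouette coefficient is the only purely unsupervised measure here; it is usually computed on the predicted cluster assignment rather than on the true labels, which would look like this (value not reproduced):

print("Silhouette Coefficient (cluster labels): %0.3f" %
      metrics.silhouette_score(vectorized, km.labels_, sample_size=1000))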

In [27]:
new_post_vec = vectorizer.transform([new_post])
new_post_label = km.predict(new_post_vec)[0]

similar_indices = (km.labels_ == new_post_label).nonzero()[0]

similar = []
for i in similar_indices:
    dist = sp.linalg.norm((new_post_vec - vectorized[i]).toarray())
    similar.append((dist, train_data.data[i]))

similar = sorted(similar)
print("Count similar: %i" % len(similar))


Count similar: 56
