In [91]:
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint

import sys
from time import time

import numpy as np
%pylab inline


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['clf', 'e']
`%matplotlib` prevents importing * from pylab and numpy

In [35]:
dataset = fetch_20newsgroups(subset='train')

In [36]:
print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()


11314 documents
20 categories
()

In [38]:
labels = dataset.target
true_k = np.unique(labels).shape[0]

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_df=0.5,min_df=2, stop_words='english',use_idf=True)

In [52]:
t0 = time()
X = vectorizer.fit_transform(dataset.data)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()


done in 3.360153s
n_samples: 11314, n_features: 56122
()

In [67]:
X.shape


Out[67]:
(11314, 56122)

In [71]:
X


Out[71]:
<11314x56122 sparse matrix of type '<type 'numpy.float64'>'
	with 1180487 stored elements in Compressed Sparse Row format>

In [72]:
import scipy.sparse

In [75]:
scipy.sparse.issparse(X)


Out[75]:
True

In [78]:
scipy.sparse.isspmatrix_csr(X)


Out[78]:
True

In [83]:
XT = scipy.sparse.csr_matrix.transpose(X)

In [85]:
XT.shape


Out[85]:
(56122, 11314)

In [87]:
A =XT*X

In [88]:
A.shape


Out[88]:
(56122, 56122)

In [113]:
e = scipy.sparse.linalg.eigsh(A,10)

In [ ]:
e

In [123]:
A.shape


Out[123]:
(56122, 56122)

In [124]:
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

In [125]:
km = KMeans(n_clusters=20, init='k-means++', max_iter=100, n_init=1,verbose=True)

In [126]:
km.fit(X)


Initialization complete
Iteration  0, inertia 21125.031
Iteration  1, inertia 11015.579
Iteration  2, inertia 10962.996
Iteration  3, inertia 10937.959
Iteration  4, inertia 10924.570
Iteration  5, inertia 10917.009
Iteration  6, inertia 10910.716
Iteration  7, inertia 10903.537
Iteration  8, inertia 10897.802
Iteration  9, inertia 10896.361
Iteration 10, inertia 10895.680
Iteration 11, inertia 10895.077
Iteration 12, inertia 10894.385
Iteration 13, inertia 10893.487
Iteration 14, inertia 10892.448
Iteration 15, inertia 10891.038
Iteration 16, inertia 10890.493
Iteration 17, inertia 10890.295
Iteration 18, inertia 10890.204
Iteration 19, inertia 10890.135
Iteration 20, inertia 10890.029
Iteration 21, inertia 10889.974
Iteration 22, inertia 10889.936
Iteration 23, inertia 10889.916
Converged at iteration 23
Out[126]:
KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=20, n_init=1,
    n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001,
    verbose=True)

In [127]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels, sample_size=1000))


Homogeneity: 0.365
Completeness: 0.508
V-measure: 0.425
Adjusted Rand-Index: 0.085
Silhouette Coefficient: 0.006

In [134]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()


Top terms per cluster:

In [135]:
terms


Out[135]:
[u'00',
 u'000',
 u'0000',
 u'00000',
 u'00000000',
 u'0000000004',
 u'0000000005',
 u'0000001200',
 u'000005102000',
 u'000021',
 u'0000vec',
 u'0001',
 u'000152',
 u'0002',
 u'000359',
 u'00041032',
 u'000413',
 u'0004422',
 u'0005',
 u'0005111312',
 u'0005111312na1em',
 u'000531',
 u'00072',
 u'0009',
 u'000k',
 u'000th',
 u'001',
 u'0010',
 u'00100111b',
 u'0010580b',
 u'0011',
 u'001116',
 u'001127',
 u'001230',
 u'0013',
 u'001319',
 u'001321',
 u'001428',
 u'001555',
 u'001642',
 u'001707',
 u'001718',
 u'001813',
 u'002',
 u'0020',
 u'002118',
 u'0022',
 u'002222',
 u'002251w',
 u'002302',
 u'0028',
 u'0029',
 u'002937',
 u'003',
 u'003015',
 u'003258u19250',
 u'0033',
 u'003522',
 u'003719',
 u'003749',
 u'004',
 u'0044',
 u'004418',
 u'0049',
 u'005',
 u'005131',
 u'005245',
 u'005314',
 u'0059',
 u'0060',
 u'0062',
 u'0065',
 u'0068',
 u'007',
 u'0076',
 u'0078',
 u'008',
 u'0086',
 u'009',
 u'0096b0f0',
 u'0096b11b',
 u'0096b294',
 u'0098',
 u'00am',
 u'00bjgood',
 u'00ecgillespi',
 u'00ecgillespie',
 u'00mbstultz',
 u'00pm',
 u'00r',
 u'01',
 u'010',
 u'0100',
 u'01000000b',
 u'01001111b',
 u'01002',
 u'01003',
 u'010235',
 u'010329',
 u'010734',
 u'010821',
 u'0109',
 u'011',
 u'011042',
 u'0111',
 u'0112',
 u'0114',
 u'011720',
 u'011805',
 u'011823',
 u'011855',
 u'012',
 u'0123456789',
 u'0126',
 u'013',
 u'013037',
 u'013651',
 u'0138',
 u'013939',
 u'014',
 u'01420',
 u'014237',
 u'015',
 u'015415',
 u'015442',
 u'0158',
 u'01580',
 u'016',
 u'01609',
 u'0164',
 u'01701',
 u'01720',
 u'01730',
 u'01752',
 u'01803',
 u'01821',
 u'0183',
 u'0184',
 u'01854',
 u'018b',
 u'01a',
 u'01wb',
 u'02',
 u'020',
 u'0200',
 u'020347',
 u'020356',
 u'020359',
 u'0205',
 u'020504',
 u'0209',
 u'021021',
 u'02106',
 u'02115',
 u'02118',
 u'02138',
 u'02139',
 u'02142',
 u'02154',
 u'02172',
 u'02173',
 u'022',
 u'022113',
 u'02215',
 u'022218',
 u'022222',
 u'0223',
 u'02238',
 u'0226',
 u'022922',
 u'022926',
 u'023017',
 u'023044',
 u'0235',
 u'023730',
 u'023b',
 u'024',
 u'024036',
 u'024103',
 u'024222',
 u'024246',
 u'024423',
 u'0245',
 u'02451203',
 u'025',
 u'025027',
 u'025240',
 u'025426',
 u'025818u28037',
 u'025924',
 u'02678944',
 u'02903',
 u'02917',
 u'02p',
 u'02tl',
 u'02tm_',
 u'02uv',
 u'03',
 u'030',
 u'0300',
 u'030031',
 u'0303',
 u'030412',
 u'030636',
 u'031',
 u'031349',
 u'031423',
 u'031616',
 u'0318',
 u'031823',
 u'032017',
 u'032022',
 u'032251',
 u'032345',
 u'032623',
 u'032828',
 u'033',
 u'0330',
 u'033446',
 u'034',
 u'034101',
 u'034226',
 u'0346',
 u'0349',
 u'0350',
 u'035020',
 u'035406',
 u'0357',
 u'0358',
 u'036',
 u'0362',
 u'0366',
 u'037',
 u'0372',
 u'038',
 u'0382',
 u'0384',
 u'0391',
 u'03hz',
 u'03i',
 u'03k',
 u'04',
 u'040',
 u'0400',
 u'040254',
 u'040449',
 u'041',
 u'0410',
 u'04110',
 u'041343',
 u'041741',
 u'042',
 u'0423',
 u'043',
 u'043426',
 u'043654',
 u'0437',
 u'043935',
 u'044140',
 u'044323',
 u'0444',
 u'044749',
 u'045',
 u'0454',
 u'045526',
 u'045612',
 u'045651',
 u'045u',
 u'047',
 u'0493',
 u'04g',
 u'04p',
 u'04q',
 u'05',
 u'050',
 u'0500',
 u'050046mvs104',
 u'050127',
 u'050311',
 u'050451',
 u'050550',
 u'051',
 u'0510',
 u'0511',
 u'051746',
 u'051942',
 u'052',
 u'052005',
 u'052120rap115',
 u'053',
 u'0530',
 u'0533',
 u'053333',
 u'053553',
 u'053736',
 u'053748rap115',
 u'053905',
 u'054820',
 u'055',
 u'055100',
 u'055109',
 u'055341',
 u'056',
 u'0565',
 u'0578',
 u'0582',
 u'059',
 u'0593',
 u'0598',
 u'05apr93',
 u'05l',
 u'06',
 u'060',
 u'0600',
 u'060010',
 u'060493101758',
 u'060493114752',
 u'060540',
 u'060553',
 u'06066',
 u'0608',
 u'061',
 u'06108',
 u'06111',
 u'061326',
 u'0615',
 u'062055',
 u'062219',
 u'062907',
 u'06320',
 u'063425',
 u'0635',
 u'064',
 u'064028',
 u'064804',
 u'0649',
 u'065',
 u'06520',
 u'0663',
 u'0666',
 u'0667',
 u'067',
 u'0674',
 u'06840',
 u'0688',
 u'069',
 u'06eh',
 u'06p',
 u'06paul',
 u'06w',
 u'07',
 u'070',
 u'0700',
 u'0705',
 u'07059',
 u'0709',
 u'071',
 u'071791',
 u'071823',
 u'0721',
 u'0729',
 u'073',
 u'073051',
 u'073457ripbc',
 u'0735',
 u'0739',
 u'074',
 u'074054',
 u'0747',
 u'074836',
 u'075',
 u'0751',
 u'075822',
 u'077',
 u'07748',
 u'0777',
 u'079',
 u'0792',
 u'0793',
 u'0795',
 u'08',
 u'0801',
 u'08057',
 u'081',
 u'081052',
 u'0812',
 u'0815',
 u'0820',
 u'0821',
 u'082152',
 u'0823',
 u'082430',
 u'0824e',
 u'082502acps6992',
 u'0826',
 u'083',
 u'083057',
 u'0832',
 u'083324',
 u'084',
 u'084042',
 u'0845',
 u'085',
 u'0850',
 u'08502',
 u'08520',
 u'085337',
 u'08540',
 u'085638',
 u'085848',
 u'086',
 u'0863',
 u'0866',
 u'087',
 u'088',
 u'0883',
 u'08836',
 u'089',
 u'0891',
 u'0895',
 u'08a283a0',
 u'08h',
 u'09',
 u'0900',
 u'090030',
 u'090731',
 u'0908',
 u'091',
 u'091051',
 u'091139',
 u'091202',
 u'091258',
 u'091844',
 u'092101',
 u'0922',
 u'092246dlmqc',
 u'092830',
 u'092954',
 u'0930',
 u'093227',
 u'093231',
 u'093300',
 u'093527',
 u'093914',
 u'094',
 u'094509',
 u'095',
 u'0950',
 u'0952',
 u'095220',
 u'0953',
 u'0962',
 u'0965',
 u'0969',
 u'0987',
 u'0988',
 u'099',
 u'0996',
 u'0_',
 u'0___',
 u'0a',
 u'0b',
 u'0b10',
 u'0b14',
 u'0b15',
 u'0b16',
 u'0bn',
 u'0c',
 u'0cg',
 u'0d',
 u'0d2',
 u'0db',
 u'0df',
 u'0e',
 u'0e1',
 u'0e9',
 u'0ek',
 u'0ep',
 u'0f',
 u'0fh',
 u'0fk',
 u'0g',
 u'0g4',
 u'0g8',
 u'0ggv',
 u'0gn',
 u'0h',
 u'0h2',
 u'0h9',
 u'0ha',
 u'0hd',
 u'0ht',
 u'0i',
 u'0ic',
 u'0iv',
 u'0ivbtm9',
 u'0ivbud',
 u'0ivbud9',
 u'0ivbudk',
 u'0ivbvl',
 u'0ivf2l',
 u'0j',
 u'0k',
 u'0k5',
 u'0kd',
 u'0kj',
 u'0km',
 u'0l',
 u'0m',
 u'0m75u',
 u'0m8b',
 u'0m8bnh',
 u'0ma',
 u'0max',
 u'0mk',
 u'0mk80',
 u'0mvbdi',
 u'0mvbgt',
 u'0mvmk',
 u'0n',
 u'0n1',
 u'0o',
 u'0p',
 u'0pd',
 u'0q',
 u'0qax',
 u'0qq',
 u'0qu',
 u'0qvq',
 u'0r',
 u'0r_',
 u'0ra6abh107h',
 u'0rdf',
 u'0rhj',
 u'0s',
 u'0sl',
 u'0t',
 u'0t7',
 u'0tb',
 u'0tbxn',
 u'0tbxom',
 u'0tg',
 u'0th',
 u'0tq',
 u'0tq33',
 u'0tq6',
 u'0u',
 u'0u1',
 u'0u14',
 u'0v',
 u'0va',
 u'0w',
 u'0w5',
 u'0w5r',
 u'0wa',
 u'0wc',
 u'0we',
 u'0wk',
 u'0x',
 u'0x0',
 u'0x00',
 u'0x01',
 u'0x03',
 u'0x0f',
 u'0x10',
 u'0x100',
 u'0x20',
 u'0x21',
 u'0x2e0',
 u'0x30',
 u'0x37f',
 u'0x3c',
 u'0x3f',
 u'0xff',
 u'0y',
 u'0z',
 u'10',
 u'100',
 u'1000',
 u'10000',
 u'100000',
 u'100015',
 u'100024',
 u'10009',
 u'1000cc',
 u'1000r',
 u'1000s',
 u'1000w',
 u'1000yds',
 u'1001',
 u'10012',
 u'10016',
 u'10018',
 u'1002',
 u'10022',
 u'1003',
 u'10036',
 u'100387',
 u'100444',
 u'100452',
 u'1005',
 u'1006',
 u'1007',
 u'1008',
 u'1009',
 u'100921rk0vsanu',
 u'10098',
 u'100dpi',
 u'100hz',
 u'100k',
 u'100km',
 u'100lez',
 u'100m',
 u'100ma',
 u'100mb',
 u'100mhz',
 u'100mph',
 u'100ns',
 u'100s',
 u'101',
 u'1010',
 u'10101',
 u'101010',
 u'101044',
 u'1011',
 u'10115',
 u'10118',
 u'1012',
 u'101241',
 u'1013',
 u'101323',
 u'1014',
 u'1015',
 u'1016',
 u'101636',
 u'1017',
 u'10176',
 u'10179',
 u'1018',
 u'1019',
 u'101944',
 u'101957',
 u'101e',
 u'102',
 u'1020',
 u'102007',
 u'10206',
 u'1021',
 u'1023',
 u'1024',
 u'1024x758',
 u'1024x768',
 u'1024x768x16',
 u'1024x768x256',
 u'1024x768x65536',
 u'1024x768x70',
 u'1024x786x24',
 u'1025',
 u'10250',
 u'1026',
 u'1027',
 u'10273',
 u'10274',
 u'102756',
 u'1028',
 u'1029',
 u'102nd',
 u'103',
 u'1030',
 u'103038',
 u'1031',
 u'1032',
 u'103237',
 u'10326',
 u'1033',
 u'1034',
 u'1035',
 u'1036',
 u'1037',
 u'1038',
 u'1039',
 u'103rd',
 u'104',
 u'1040',
 u'104158',
 u'1042',
 u'1043',
 u'10438',
 u'1045',
 u'1046',
 u'1047',
 u'104746',
 u'1048',
 u'1049',
 u'105',
 u'1050',
 u'1051',
 u'10510',
 u'1053',
 u'105307',
 u'1054',
 u'1055',
 u'1056',
 u'1057',
 u'105738',
 u'1058',
 u'10580',
 u'105809',
 u'1059',
 u'105m',
 u'105mb',
 u'106',
 u'1060',
 u'10601',
 u'1061',
 u'10615',
 u'1062',
 u'1063',
 u'1064',
 u'1065',
 u'1066',
 u'10669',
 u'1067',
 u'1068',
 u'1069',
 u'10694',
 u'106ps',
 u'107',
 u'1070',
 u'1071',
 u'10711',
 u'1072',
 u'1073',
 u'1074',
 u'10748539',
 u'1075',
 u'1076',
 u'1077',
 u'1078',
 u'1079',
 u'10792',
 u'108',
 u'1080',
 u'1081',
 u'1082',
 u'1083',
 u'1084',
 u'10847',
 u'1085',
 u'1088',
 u'10886',
 u'1089',
 u'10890',
 u'109',
 u'1090',
 u'10901',
 u'1091',
 u'1093',
 u'1094',
 u'10946',
 u'1095',
 u'1096',
 u'1097',
 u'1098',
 u'1099',
 u'10_',
 u'10a',
 u'10base',
 u'10baset',
 u'10c',
 u'10cm',
 u'10e20',
 u'10h',
 u'10k',
 u'10km',
 u'10m',
 u'10mb',
 u'10mhz',
 u'10mil',
 u'10min',
 u'10mm',
 u'10pm',
 u'10s',
 u'10th',
 u'10v',
 u'10w',
 u'10w40',
 u'10x',
 u'10x20',
 u'11',
 u'110',
 u'1100',
 u'110021',
 u'11004',
 u'1101',
 u'1102',
 u'1105',
 u'11074',
 u'1108',
 u'1109',
 u'110m',
 u'110v',
 u'110vac',
 u'111',
 u'1111',
 u'11111101b',
 u'1113',
 u'1114',
 u'1115',
 u'11150',
 u'111652',
 u'111713',
 u'1118',
 u'11181',
 u'1119',
 u'111919',
 u'112',
 u'1120',
 u'1121',
 u'1122',
 u'11229',
 u'11230',
 u'1124',
 u'1126',
 u'1127',
 u'1128',
 u'1129',
 u'11292',
 u'112f',
 u'113',
 u'11311',
 u'113128',
 u'1132',
 u'113223',
 u'1133',
 u'11331',
 u'1134',
 u'1135',
 u'11353',
 u'11361',
 u'11363',
 u'1137',
 u'1138',
 u'113956',
 u'113p',
 u'113q',
 u'113qs',
 u'113qw',
 u'113s',
 u'113s1',
 u'114',
 u'1140',
 u'1141',
 u'114127',
 u'11414',
 u'114158',
 u'1142',
 u'1143',
 u'1144',
 u'11448',
 u'1145',
 u'1145w1',
 u'1146',
 u'11467',
 u'1147',
 u'11471',
 u'11473',
 u'1147902781',
 u'1149',
 u'115',
 u'1150',
 u'115080',
 u'1151',
 u'11522',
 u'115288',
 u'115290',
 u'1152x900',
 u'115300',
 u'115313',
 u'115397',
 u'1154',
 u'115437',
 u'11546',
 u'11548',
 u'1155',
 u'115511',
 u'115565',
 u'1156',
 u'1157',
 u'115707',
 u'11573',
 u'115863',
 u'1159',
 u'115a',
 u'115vac',
 u'116',
 u'1160',
 u'116005',
 u'1161',
 u'11613',
 u'1163',
 u'116305',
 u'11632',
 u'1164',
 u'1165',
 u'11670',
 u'1169',
 u'11690',
 u'117',
 u'1170',
 u'11732',
 u'1174',
 u'1175',
 u'11751',
 u'1176',
 u'11769k',
 u'1177',
 u'11770',
 u'11782',
 u'1179',
 u'11797',
 u'118',
 u'1180',
 u'11800',
 u'11812',
 u'11825',
 u'11830',
 u'11836',
 u'1185',
 u'1186',
 u'11861',
 u'11888',
 u'119',
 u'1190',
 u'1192',
 u'1192d',
 u'1194',
 u'1196',
 u'11971',
 u'1198',
 u'11a',
 u'11b',
 u'11h',
 u'11k',
 u'11oq',
 u'11pm',
 u'11sdpa',
 u'11th',
 u'12',
 u'120',
 u'1200',
 u'12009',
 u'1200cc',
 u'1200dpi',
 u'1200x',
 u'1201',
 u'1203',
 u'120311',
 u'120399',
 u'1204',
 u'120466',
 u'1205',
 u'12050',
 u'12056',
 u'1206',
 u'120666',
 u'1207',
 u'12073',
 u'1208',
 u'1209',
 u'12091',
 u'12092',
 u'120958',
 u'120k',
 u'120km',
 u'120kvolt',
 u'120lb',
 u'120m',
 u'120mb',
 u'120mph',
 u'120v',
 u'120vac',
 u'121',
 u'1210',
 u'121019',
 u'1211',
 u'121134',
 u'1212',
 u'121236',
 u'1213',
 u'12134',
 u'12139',
 u'1214',
 u'121411',
 u'1215',
 u'12176',
 u'12180',
 u'12187',
 u'122',
 u'1220',
 ...]

In [143]:
for i in range(true_k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print terms[ind]
    print


Cluster 0:
turkish
armenian
armenians
armenia
turks
argic
turkey
serdar
genocide
zuma

Cluster 1:
com
sandvik
stratus
cramer
optilink
fbi
kent
people
article
islam

Cluster 2:
drive
scsi
ide
controller
drives
hard
disk
floppy
bus
mac

Cluster 3:
baseball
year
game
team
games
players
runs
braves
pitching
article

Cluster 4:
intercon
amanda
walker
clipper
com
corporation
systems
herndon
crypto
chaos

Cluster 5:
nasa
space
gov
henry
alaska
toronto
moon
zoo
larc
spencer

Cluster 6:
geb
banks
gordon
pitt
cs
dsl
n3jxp
cadre
chastity
shameful

Cluster 7:
hockey
team
ca
game
nhl
play
players
season
leafs
toronto

Cluster 8:
uk
ac
university
cam
dcs
ed
mathew
posting
host
nntp

Cluster 9:
card
video
drivers
vga
monitor
cards
bus
windows
driver
diamond

Cluster 10:
key
clipper
encryption
chip
keys
escrow
government
com
nsa
algorithm

Cluster 11:
__
___
berkeley
ax
simms
_____
____
baalke
vram
jpl

Cluster 12:
israel
israeli
jews
arab
jake
arabs
lebanese
israelis
adam
policy

Cluster 13:
keith
caltech
livesey
sgi
solntze
wpd
jon
schneider
cco
morality

Cluster 14:
car
cars
com
engine
oil
dealer
radar
article
just
good

Cluster 15:
access
digex
pat
express
net
online
prb
communications
com
greenbelt

Cluster 16:
windows
window
dos
file
files
ms
program
com
mouse
use

Cluster 17:
god
jesus
bible
christians
people
christ
christian
faith
believe
church

Cluster 18:
gun
guns
people
firearms
com
weapons
don
crime
control
militia

Cluster 19:
com
university
posting
host
article
nntp
know
like
ca
just


In [ ]: