In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import cPickle as pickle
import codecs
import skfuzzy as fuzz 
import time

from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score

In [2]:
%matplotlib inline
sns.set_palette("deep", desat=.6)
sns.set_context(rc={"figure.figsize": (8, 4)})

In [3]:
data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

In [19]:
data.target


Out[19]:
array([10, 16, 14, ...,  4,  6,  7])

In [36]:
data.data[0]


Out[36]:
u'From: stimpy@dev-null.phys.psu.edu (Gregory Nagy)\nSubject: Re: ESPN UP YOURS .........\nOrganization: Penn State Laboratory for Elementary Steam Physics\nLines: 52\nNNTP-Posting-Host: dev-null.phys.psu.edu\n\nIn article <C5u542.3CD@news.udel.edu> tmavor@earthview.cms.udel.edu writes:\n>>\n>>[Various justifiable rantings on ESPN coverage by several deleted]\n>>\n>\n>The only way to change ESPN\'s thinking, if it is even possible, is to complain\n>to them directly.  Anyone know there telephone # in Bristol, Ct?  \n\nHeh... Try the rec.autos.sport FAQ. They are always calling ESPN to complain.\nI\'m sure you could find the number for ABC there too, as many west-coast \nviewers were compaining about how something as boring as hockey cut into\nthe Long Beach GP. =)\n\n>\n>I do find it hard to believe that ESPN doesn\'t think viewers will simply\n>change the channel from a boring game....I know I did.  And then, when\n>they didn\'t show the NYI-Wash overtime(s), I was livid!  If I wanted\n>to watch baseball, I could have turned on the Phillies-Padres extra\n>inning game....instead, I went to bed angry......I boycotted ESPN\'s\n>morning Sportscenter today, I was still so incensed.\n\nWere you (and several of the other people here it seems) asleep the day\n"contracts" were explained? ASPN has a piece of paper saying it MUST\nshow that baseball game if it happens. Many businesses payedd money to\nhave their commercials run during a baseball game. This is a business,\nnot your own personal video servant.\n\n>\n>My wife says I shouldn\'t go to bed angry, but last nite.........GRRRRRRR!\n>\n\nMaybe you should put that anger into something positive. For example, I saw\nads for the new Dodge both on the ESPN and KBL broadcasts. Why not write to\nDodge saying that "thanks to the ads run during the STANLEY CUP PLAYOFFS, \nyou will now concider their products in the future. They love to hear stuff\nlike that and in the future will be more willing to buy commercial time\nfor hockey games, giving ESPN (and other networks) more incentive to carry\ngames (just one example)\n\nCome on people, as great as we think it is, Hockey does not leapfrog the\n"big three" overight.\n\n> \n>---------------------------------------------------------------------\n>Tim Mavor\t\t   |  "I am known by many names.......\n>College of Marine Studies  |   some call me.........Tim."\n>Univ. of Delaware\t   |    \n>Newark, DE 19716\t   |  "You know much that is hidden, O\' Tim!"\n>tmavor@pandora.cms.udel.edu|  \tMonty Python and the Holy Grail\t---------------------------------------------------------------------\n>\n\n\n'

In [39]:
vectorizer = CountVectorizer(strip_accents='unicode',
                             stop_words='english',
                             max_df=0.75,
                             token_pattern=u'(?u)\\b[a-zA-z][a-zA-z]+\\b')
vectors = vectorizer.fit_transform(data.data)
print len(vectorizer.vocabulary_)
vectorizer.vocabulary_


68395
Out[39]:
{u'unsupportable': 62946,
 u'nunnery': 41985,
 u'sonja': 55781,
 u'woods': 66162,
 u'hanging': 26088,
 u'woody': 66169,
 u'kalmar': 32136,
 u'localized': 35186,
 u'disobeying': 16969,
 u'yougoslavie': 67715,
 u'sprague': 56358,
 u'voecking': 64445,
 u'canea': 9514,
 u'opener': 42926,
 u'ioannidis': 30391,
 u'rscharfy': 51895,
 u'discribed': 16848,
 u'bratislava': 8280,
 u'transvestism': 61007,
 u'mc_rssqp_fqod': 37176,
 u'surfmodl': 58109,
 u'broward': 8589,
 u'bringing': 8462,
 u'dascenzo': 14826,
 u'kolassa': 33162,
 u'wooded': 66155,
 u'enddate': 19482,
 u'vibrational': 64060,
 u'wooden': 66156,
 u'wednesday': 65246,
 u'andrei_lomakin': 3610,
 u'rauno': 49144,
 u'circuitry': 11123,
 u'thrace': 59914,
 u'complainers': 12280,
 u'borgward': 7991,
 u'gorman': 24758,
 u'keirnan': 32440,
 u'consenting': 12754,
 u'targa': 58901,
 u'inanimate': 29116,
 u'mdata': 37380,
 u'ingnorance': 29601,
 u'errors': 20050,
 u'dialogs': 16352,
 u'defenses': 15379,
 u'usenet': 63207,
 u'designing': 15965,
 u'televangelists': 59278,
 u'replaced': 50394,
 u'evolutionism': 20485,
 u'nukurangi': 41952,
 u'gakhaus': 23522,
 u'mailings': 36260,
 u'brainwashed': 8247,
 u'affiliates': 2501,
 u'highpass': 27082,
 u'affiliated': 2500,
 u'jdwj\\': 31183,
 u'poeples': 46157,
 u'kids': 32748,
 u'^]_fas': 535,
 u'controversy': 13059,
 u'circumferences': 11138,
 u'jumprun': 31907,
 u'sig_alrm': 54626,
 u'neurologist': 40996,
 u'spotty': 56343,
 u'tredysvr': 61083,
 u'freemason': 23000,
 u'topography': 60554,
 u'projection': 47427,
 u'_knowledge_': 1127,
 u'battisti': 6296,
 u'spacewalks': 55963,
 u'stern': 56962,
 u'dn]': 17349,
 u'dn^': 17350,
 u'constuct': 12870,
 u'umumi': 62228,
 u'dnd': 17358,
 u'dnf': 17361,
 u'dna': 17351,
 u'blashephemers': 7462,
 u'polyribosylribitol': 46302,
 u'insecurity': 29809,
 u'dnk': 17363,
 u'ramstein': 48972,
 u'dnp': 17365,
 u'distortions': 17135,
 u'sermons': 53869,
 u'spaying': 56026,
 u'dnz': 17367,
 u'benedikt': 6759,
 u'deloreans': 15586,
 u'populations': 46391,
 u'meteorologist': 37902,
 u'macwherehouse': 36102,
 u'cherbayev': 10695,
 u'adventist': 2368,
 u'reingold': 50050,
 u'intake': 29971,
 u'morally': 39306,
 u'pirenne': 45736,
 u'cgj[j^': 10371,
 u'matroid': 36979,
 u'phang': 45307,
 u'ofte': 42581,
 u'wang': 64921,
 u'i_i': 28310,
 u'hyatt': 28146,
 u'alflatoxin': 3002,
 u'hslrswi': 27884,
 u'i_c': 28308,
 u'i_b': 28306,
 u'disparagement': 16986,
 u'want': 64932,
 u'pinto': 45707,
 u'i_s': 28314,
 u'beyer': 6952,
 u'travel': 61030,
 u'copious': 13197,
 u'sclark': 53144,
 u'brassard': 8277,
 u'barbra': 6076,
 u'seds': 53543,
 u'assimilated': 4897,
 u'bordres': 7983,
 u'dinosaurs': 16617,
 u'wrong': 66411,
 u'wrona': 66410,
 u'_exclusion_': 1001,
 u'sentencing': 53757,
 u'glenns': 24441,
 u'menlo': 37713,
 u'disengaging': 16883,
 u'snugly': 55544,
 u'welcomed': 65317,
 u'listlessness': 34992,
 u'stoicism': 57119,
 u'conciously': 12468,
 u'instmanflags': 29920,
 u'airbags': 2769,
 u'beerhall': 6564,
 u'aargh': 1549,
 u'playhouse': 45962,
 u'fir': 22042,
 u'uor': 63032,
 u'xmptablewidget': 66961,
 u'uow': 63037,
 u'fit': 22109,
 u'screaming': 53252,
 u'gridiron': 25155,
 u'discourse': 16823,
 u'folate': 22494,
 u'cfs': 10349,
 u'uoa': 63023,
 u'fia': 21817,
 u'fif': 21871,
 u'fig': 21879,
 u'uoe': 63024,
 u'reentered': 49783,
 u'fin': 21967,
 u'fio': 22038,
 u'fil': 21909,
 u'sxiwlam': 58422,
 u'songwriter': 55778,
 u'vouchers': 64548,
 u'effects': 18780,
 u'turboed': 61592,
 u'eedsp': 18736,
 u'rasterized': 49080,
 u'photolyzed': 45483,
 u'barton': 6171,
 u'telekom': 59253,
 u'ingria': 29612,
 u'rasterizer': 49081,
 u'timeout': 60166,
 u'ingrid': 29613,
 u'uninfluenced': 62633,
 u'iibtin': 28651,
 u'parasites': 44237,
 u'isys': 30699,
 u'syf': 58448,
 u'toleration': 60440,
 u'xuserfilesearchpath': 67242,
 u'syd': 58442,
 u'enviroment': 19764,
 u'adapt': 2116,
 u'aquarids': 4261,
 u'jkbi': 31463,
 u'abbott': 1583,
 u'lovingkindness': 35451,
 u'mattter': 37011,
 u'estimate': 20197,
 u'whippits': 65540,
 u'chlorine': 10844,
 u'dgr': 16262,
 u'isotex': 30650,
 u'dgp': 16261,
 u'pfaff': 45259,
 u'disturbed': 17164,
 u'dischw': 16773,
 u'yester': 67555,
 u'purpose': 48017,
 u'mcafree': 37178,
 u'loook': 35339,
 u'breed': 8358,
 u'curiouser': 14308,
 u'callous': 9374,
 u'megabytes': 37577,
 u'olds': 42700,
 u'bakelite': 5879,
 u'renovated': 50310,
 u'masten': 36893,
 u'oldx': 42703,
 u'needed': 40800,
 u'acsnet': 2038,
 u'master': 36894,
 u'ddmc': 15062,
 u'tranceiver': 60886,
 u'graywings': 25051,
 u'hiya': 27255,
 u'genesis': 23947,
 u'cyprs': 14519,
 u'berlioz': 6845,
 u'rewards': 50965,
 u'mdgoodma': 37393,
 u'aaaahhh': 1519,
 u'mutilated': 40093,
 u'positively': 46476,
 u'ahmed': 2706,
 u'nonperfect': 41606,
 u'bannister': 6028,
 u'duckworth': 18110,
 u'tois': 60420,
 u'varje': 63683,
 u'ahmet': 2709,
 u'exclaimed': 20614,
 u'feeling': 21612,
 u'excessivly': 20597,
 u'hederson': 26625,
 u'atfedilen': 5046,
 u'miscellanea': 38570,
 u'starhawk': 56709,
 u'nwxrxe': 42070,
 u'eslami': 20129,
 u'spiff': 56185,
 u'affairs': 2488,
 u'wholesome': 65608,
 u'hymen': 28184,
 u'vga': 64031,
 u'coskrey': 13388,
 u'vgx': 64036,
 u'vgr': 64034,
 u'shipments': 54317,
 u'memebers': 37663,
 u'kariya': 32225,
 u'diminishing': 16583,
 u'xdmcplibsrc': 66729,
 u'simplify': 54786,
 u'soemone': 55626,
 u'jburgin': 31134,
 u'dg[': 16249,
 u'resonated': 50639,
 u'kc^[_n': 32361,
 u'halasi': 25905,
 u'majd': 36304,
 u'sy_': 58436,
 u'vertices': 63971,
 u'tech': 59126,
 u'fugitives': 23242,
 u'hiatt': 27019,
 u'saying': 52813,
 u'dickey': 16392,
 u'tempter': 59341,
 u'padded': 43928,
 u'ptcburp': 47839,
 u'tempted': 59340,
 u'klimek': 32987,
 u'apace': 3991,
 u'pallate': 44045,
 u'rsx[x': 51931,
 u'kinase': 32811,
 u'nordique': 41644,
 u'lube': 35550,
 u'uprade': 63086,
 u'xivo': 66866,
 u'wntshoxy': 66078,
 u'plate': 45926,
 u'jtpoupor': 31821,
 u'plato': 45937,
 u'platt': 45943,
 u'mcnab': 37317,
 u'photoelectric': 45471,
 u'altogether': 3254,
 u'massacring': 36878,
 u'icebreakers': 28404,
 u'droning': 17942,
 u'jaguar': 30975,
 u'ldr': 34158,
 u'nicely': 41223,
 u'patch': 44466,
 u'openly': 42933,
 u'iat^': 28354,
 u'mipmapping': 38525,
 u'programmatic': 47390,
 u'lot_': 35400,
 u'oppposition': 43006,
 u'xv_': 67249,
 u'etid': 20277,
 u'pinon': 45699,
 u'cslip': 14129,
 u'lots': 35405,
 u'irq': 30517,
 u'irs': 30551,
 u'xvu': 67279,
 u'xvt': 67276,
 u'xvi': 67265,
 u'xvj': 67272,
 u'srinivas': 56503,
 u'conductive': 12541,
 u'ira': 30457,
 u'irb': 30471,
 u'irc': 30474,
 u'cordoned': 13245,
 u'ire': 30478,
 u'irf': 30481,
 u'xvf': 67262,
 u'discipline': 16779,
 u'extend': 20913,
 u'nature': 40619,
 u'utcsri': 63291,
 u'rtty': 51960,
 u'ultrasonics': 62163,
 u'pharisee': 45314,
 u'extent': 20928,
 u'vboi': 63747,
 u'almos': 3173,
 u'lookit': 35324,
 u'keyspace': 32610,
 u'lookin': 35322,
 u'parys': 44384,
 u'fearlessly': 21552,
 u'libyan': 34699,
 u'gopher': 24736,
 u'minuses': 38515,
 u'knossos': 33085,
 u'dabbott': 14593,
 u'brindle': 8459,
 u'fr^': 22864,
 u'fr]': 22863,
 u'frc': 22962,
 u'fra': 22866,
 u'frg': 23050,
 u'fri': 23051,
 u'fro': 23104,
 u'bothers': 8064,
 u'frw': 23155,
 u'frv': 23154,
 u'fry': 23156,
 u'toning': 60503,
 u'obese': 42242,
 u'contemporateous': 12934,
 u'spit': 56235,
 u'albeaj': 2919,
 u'hepb': 26815,
 u'xiangxin': 66828,
 u'spid': 56176,
 u'doubts': 17688,
 u'spin': 56201,
 u'propellants': 47519,
 u'wildcat': 65721,
 u'contingencies': 12967,
 u'iitvax': 28682,
 u'professionally': 47353,
 u'voltmeter': 64498,
 u'misconstrued': 38585,
 u'canspace': 9542,
 u'polysaccharide': 46304,
 u'majid': 36309,
 u'unox': 62805,
 u'wildswans': 65729,
 u'kilcore': 32767,
 u'conditioned': 12521,
 u'koht': 33142,
 u'diservice': 16885,
 u'breakfast': 8324,
 u'hone': 27528,
 u'memorial': 37679,
 u'memoriam': 37680,
 u'mcontent': 37333,
 u'honk': 27538,
 u'democracies': 15637,
 u'spews': 56159,
 u'bonepart': 7874,
 u'hons': 27551,
 u'kidd': 32737,
 u'jartsu': 31073,
 u'kinsley': 32857,
 u'hartje': 26268,
 u'frenchmen': 23029,
 u'zeos': 68085,
 u'altar': 3211,
 u'academic': 1782,
 u'stillness': 57048,
 u'academia': 1781,
 u'nibby': 41215,
 u'mentorg': 37735,
 u'nurishment': 41995,
 u'corporate': 13300,
 u'larrison': 33920,
 u'absurdities': 1754,
 u'appropriately': 4202,
 u'stnt': 57097,
 u'jimfr': 31416,
 u'isodata': 30634,
 u'ha[': 25735,
 u'lasse': 33950,
 u'bighelmet': 7095,
 u'hibbitt': 27023,
 u'hah': 25838,
 u'hai': 25853,
 u'haj': 25886,
 u'hal': 25901,
 u'ham': 25958,
 u'han': 26022,
 u'hao': 26121,
 u'hab': 25744,
 u'hac': 25767,
 u'advancement': 2356,
 u'hay': 26416,
 u'mcnamara': 37322,
 u'wiving': 66016,
 u'haq': 26144,
 u'har': 26146,
 u'hat': 26322,
 u'hav': 26374,
 u'haw': 26395,
 u'etrbom': 20289,
 u'municipal': 39978,
 u'elders': 18987,
 u'gibanez': 24221,
 u'werdna': 65364,
 u'unequivocally': 62526,
 u'objective': 42263,
 u'indicative': 29328,
 u'stallings': 56630,
 u'miff': 38265,
 u'loompanics': 35332,
 u'utilites': 63311,
 u'ebroadwe': 18522,
 u'expartiate': 20759,
 u'defragmenter': 15439,
 u'stero': 56967,
 u'crowd': 13976,
 u'crowe': 13980,
 u'czech': 14538,
 u'mosques': 39406,
 u'crown': 13984,
 u'anarres': 3546,
 u'deflection': 15424,
 u'captive': 9612,
 u'defragmented': 15438,
 u'contiplating': 12986,
 u'deragatory': 15857,
 u'communicantes': 12160,
 u'lockdown': 35201,
 u'inhuman': 29644,
 u'hawks_dw': 26409,
 u'locksmithing': 35212,
 u'syphilis': 58552,
 u'completly': 12300,
 u'herrin': 26893,
 u'starring': 56722,
 u'vasilion': 63704,
 u'moflngan': 39061,
 u'mollard': 39125,
 u'restlessness': 50734,
 u'dnc': 17357,
 u'anomalous': 3778,
 u'acog': 2005,
 u'evangelistical': 20408,
 u'benched': 6739,
 u'dmitri_kvartalnov': 17324,
 u'kellett': 32457,
 u'kilgore': 32768,
 u'keith_primeau': 32445,
 u'mangoe': 36504,
 u'marshall': 36786,
 u's]wz': 52243,
 u'douay': 17666,
 u'marshals': 36789,
 u'aspencade': 4819,
 u'shoots': 54378,
 u'dissassembly': 17077,
 u'despised': 16006,
 u'fabric': 21101,
 u'raped': 49033,
 u'grasping': 25004,
 u'despises': 16007,
 u'rapes': 49034,
 u'medallists': 37490,
 u'alexandrian': 2989,
 u'perfumes': 44998,
 u'spurting': 56409,
 u'hadorot': 25809,
 u'ybqiegkrbcg': 67479,
 u'kazemzadeh': 32330,
 u'thesaurus': 59772,
 u'universityy': 62710,
 u'perfumed': 44997,
 u'unliveable': 62747,
 u'congratulations': 12651,
 u'humbled': 28000,
 u'toolkits': 60529,
 u'rippling': 51268,
 u'dny': 17366,
 u'largish': 33908,
 u'nicest': 41226,
 u'soldering': 55675,
 u'ishbeld': 30593,
 u'passenger': 44413,
 u'disgrace': 16888,
 u'flyback': 22408,
 u'twincom': 61735,
 u'xk_': 66877,
 u'bedsheet': 6539,
 u'slurring': 55295,
 u'mclure': 37302,
 u'xkm': 66889,
 u'xkc': 66878,
 u'confsesed': 12629,
 u'cambodia': 9415,
 u'asshole': 4888,
 u'rspk': 51913,
 u'pasadena': 44387,
 u'xfwapn': 66788,
 u'xkw': 66891,
 u'xku': 66890,
 u'palms': 44059,
 u'fi^_[': 21815,
 u'rightousness': 51202,
 u'palme': 44055,
 u'xthe': 67168,
 u'aragorn': 4291,
 u'hawkes': 26403,
 u'explosions': 20866,
 u'loren': 35372,
 u'contadictorily': 12906,
 u'shootout': 54377,
 u'o_wcp_': 42185,
 u'omran': 42796,
 u'childs': 10781,
 u'egypte': 18845,
 u'chaim': 10402,
 u'corps': 13307,
 u'aneksantizias': 3637,
 u'aohr': 3977,
 u'nezareti': 41155,
 u'amplification': 3450,
 u'mb\\y': 37121,
 u'grappler': 25000,
 u'mb\\t': 37120,
 u'freelance': 22995,
 u'reneb': 50291,
 u'underlining': 62438,
 u'cscrjn': 14096,
 u'foreseable': 22637,
 u'macho': 36021,
 u'oversight': 43664,
 u'tenacious': 59344,
 u'windshields': 65838,
 u'paychecks': 44595,
 u'flatbed': 22210,
 u'jerk': 31267,
 u'jere': 31261,
 u'olympus': 42748,
 u'embark': 19223,
 u'unpleasent': 62819,
 u'rechargable': 49527,
 u'exact': 20528,
 u'minute': 38516,
 u'_like_': 1142,
 u'constutitional': 12871,
 u'skewed': 54998,
 u'facman': 21146,
 u'_said_': 1287,
 u'opstad': 43025,
 u'reimpose': 50035,
 u'multiprocessing': 39930,
 u'brent_gilchrist': 8373,
 u'adorno': 2308,
 u'__u__': 834,
 u'hindered': 27132,
 u'copyrighted': 13225,
 u'wbruvold': 65147,
 u'heavyweight': 26602,
 u'chopping': 10888,
 u'winecj': 65846,
 u'bagging': 5842,
 u'xthis': 67171,
 u'celebrated': 10168,
 u'zrz': 68307,
 u'cnob': 11650,
 u'zrm': 68303,
 u'geography': 24025,
 u'krasnodar': 33326,
 u'drafted': 17799,
 u'possiblility': 46507,
 u'climbs': 11443,
 u'honour': 27546,
 u'vanderbilt': 63620,
 u'address': 2168,
 u'baudot': 6312,
 u'benson': 6795,
 u'shamelessness': 54098,
 u'impacted': 28909,
 u'queue': 48551,
 u'sprouted': 56393,
 u'windowing': 65823,
 u'ensoniq': 19675,
 u'unreconstructed': 62863,
 u'^^wo': 547,
 u'darnell': 14800,
 u'opposes': 42999,
 u'toxoplasmosis': 60750,
 u'ooooooopsssss': 42890,
 u'perished': 45024,
 u'gibbons': 24225,
 u'opposed': 42998,
 u'_free_': 1025,
 u'consoled': 12802,
 u'inetray': 29426,
 u'shatterstar': 54163,
 u'ooooooh': 42886,
 u'wvt': 66513,
 u'approving': 4211,
 u'consoles': 12803,
 u'wondeing': 66128,
 u'edgeways': 18645,
 u'zabpp': 67925,
 u'zebee': 68047,
 u'orujev': 43284,
 u'rutherford': 52118,
 u'oscillator': 43308,
 u'following': 22516,
 u'xv_error': 67251,
 u'munroe': 39986,
 u'kxh': 33559,
 u'mailboxes': 36250,
 u'amarian': 3300,
 u'listens': 34987,
 u'litre': 35018,
 u'_g\\_v': 1030,
 u'thanking': 59628,
 u'subroutines': 57586,
 u'powerbooks': 46628,
 u'extensions': 20923,
 u'meddler': 37494,
 u'powersupplies': 46650,
 u'convincingly': 13125,
 u'fueled': 23231,
 u'overscanning': 43657,
 u'insitute': 29843,
 u'surfing': 58107,
 u'kodachrome': 33129,
 u'savasi': 52768,
 u'remotes': 50255,
 u'inhabiting': 29621,
 u'subsisting': 57620,
 u'nsi': 41873,
 u'professors': 47357,
 u'pettefar': 45242,
 u'jimmy': 31421,
 u'kxq': 33562,
 u'tddd': 59081,
 u'kgibson': 32642,
 u'multipath': 39911,
 u'janne_ojanen': 31038,
 u'jig': 31398,
 u'disconnect': 16799,
 u'jin': 31426,
 u'jio': 31433,
 u'sysmptoms': 58577,
 u'jim': 31406,
 u'jih': 31401,
 u'techie': 59132,
 u'\\pq': 283,
 u'jiq': 31436,
 u'apron': 4233,
 u'nsc': 41868,
 u'_vttjq]': 1394,
 u'bashful': 6216,
 u'deburie': 15156,
 u'aprox': 4234,
 u'sorted': 55842,
 u'ladysmith': 33710,
 u'didn': 16420,
 u'stevek': 56989,
 u'_you_': 1456,
 u'tchecoslovaquie': 59045,
 u'instability': 29864,
 u'quarter': 48484,
 u'quartet': 48492,
 u'stevel': 56990,
 u'abortionists': 1674,
 u'advacned': 2353,
 u'bursting': 8995,
 u'storeys': 57177,
 u'mlbizer': 38838,
 u'entering': 19692,
 u'vipunen': 64245,
 u'disasters': 16742,
 u'nsu': 41890,
 u'seriously': 53863,
 u'praechtig': 46724,
 u'nsw': 41891,
 u'telix': 59290,
 u'incentives': 29154,
 u'crazies': 13740,
 u'grandma': 24955,
 u'byte': 9153,
 u'bootloader': 7959,
 u'dsharedcode': 18022,
 u'hamed': 25972,
 u'dashnag': 14834,
 u'dashnak': 14836,
 u'modest': 39023,
 u'benching': 6741,
 u'hamer': 25974,
 u'quibble': 48557,
 u'jjao': 31448,
 u'hemorrhoids': 26773,
 u'spoked': 56270,
 u'zuflikarpasic': 68327,
 u'graingenes': 24936,
 u'spoken': 56272,
 u'gsdo': 25326,
 u'parkinson': 44280,
 u'vfast': 64020,
 u'spokes': 56274,
 u'periodical': 45012,
 u'eventhandlers': 20436,
 u'amipro': 3398,
 u'uhgv': 62060,
 u'lingering': 34897,
 u'militarily': 38345,
 u'surges': 58117,
 u'gervasi': 24097,
 u'arround': 4611,
 u'absorbs': 1736,
 u'surged': 58112,
 u'hoyle': 27769,
 u'crossroads': 13964,
 u'partisitions': 44364,
 u'wandering': 64917,
 u'koranic': 33217,
 u'turned': 61634,
 u'auroral': 5299,
 u'auroras': 5300,
 u'jewell': 31312,
 u'hohocon': 27390,
 u'clearcoat': 11361,
 u'guldenspoorstraat': 25479,
 u'turner': 61635,
 u'zoe': 68242,
 u'zog': 68244,
 u'coliseum': 11849,
 u'sankar': 52615,
 u'zok': 68247,
 u'mii_': 38286,
 u'zoo': 68258,
 u'zop': 68269,
 u'zot': 68277,
 u'bhagwans': 6997,
 u'zoy': 68280,
 u'aerostich': 2454,
 u'opposite': 43001,
 u'squalor': 56434,
 u'spewing': 56158,
 u'bwise': 9116,
 u'grateful': 25015,
 u'croatianism': 13906,
 u'msdos_uploads': 39696,
 u'touchy': 60676,
 u'tackling': 58691,
 u'scavanger': 52927,
 u'jitters': 31441,
 u'messier': 37853,
 u'touche': 60669,
 u'multitrack': 39947,
 u'wynn': 66561,
 u'hmpetro': 27309,
 u'imagines': 28813,
 u'friction': 23053,
 u'inconsistent': 29221,
 u'lexidata': 34558,
 u'decanoate': 15163,
 u'imagined': 28812,
 u'coutnry': 13547,
 u'recomended': 49592,
 u'pitsburgh': 45771,
 u'overeagerness': 43588,
 u'koslov': 33247,
 u'pgleason': 45283,
 u'punishement': 47964,
 u'rejoiced': 50089,
 u'windowed': 65822,
 u'balconies': 5903,
 u'rejoices': 50090,
 u'etcetera': 20225,
 u'moepi': 39055,
 u'recombinant': 49588,
 u'justified': 31951,
 u'userprofile': 63219,
 u'pimr': 45669,
 u'neutrophilic': 41025,
 u'racal': 48754,
 u'cunews': 14273,
 u'g^sn': 23460,
 u'noisier': 41540,
 u'airfares': 2778,
 u'bbhome': 6356,
 u'mcnrg^dc': 37329,
 u'gtefsd': 25363,
 u'tejgvhl': 59217,
 u'weatherford': 65223,
 u'defensively': 15382,
 u'dexter': 16225,
 u'nrsv': 41864,
 u'activating': 2052,
 u'sponsorship': 56293,
 u'vaccines': 63507,
 u'kilometer': 32789,
 u'moons': 39281,
 u'exploration': 20857,
 u'vender': 63849,
 u'tocopherol': 60390,
 u'mixdown': 38742,
 u'uos': 63034,
 u'annexing': 3729,
 u'euve': 20374,
 u'jumbo': 31899,
 u'wickedly': 65650,
 u'misphrased': 38631,
 u'sboop': 52839,
 u'flipped': 22301,
 u'workplace': 66219,
 u'uoz': 63039,
 u'semitic': 53697,
 u'grooming': 25233,
 u'decodable': 15233,
 u'fix': 22125,
 u'votrax': 64544,
 u'allowance': 3137,
 u'iplmail': 30434,
 u'erdem': 19960,
 u'transcaucasia': 60909,
 u'collider': 11892,
 u'\\zgd': 344,
 u'west': 65385,
 u'wesw': 65405,
 u'wesp': 65382,
 u'wess': 65383,
 u'incubation': 29268,
 u'orff': 43166,
 u'motives': 39456,
 u'biafra': 7019,
 u'admail': 2244,
 u'xannounce': 66621,
 u'spokestwinkie': 56281,
 u'yzx': 67884,
 u'yzy': 67885,
 u'yzz': 67886,
 u'photon': 45486,
 u'readings': 49327,
 u'photos': 45494,
 u'tightened': 60119,
 u'[vn': 157,
 u'abject': 1644,
 u'extant': 20912,
 u'[vm': 156,
 u'[vk': 155,
 u'talented': 58780,
 u'kimura': 32809,
 u'chyang': 11031,
 u'yz[': 67878,
 u'vecchi': 63785,
 u'ivem': 30782,
 u'ives': 30784,
 u'dulimart': 18149,
 u'voicemail': 64455,
 u'graphed': 24973,
 u'ballpark': 5944,
 u'singla': 54840,
 u'attackers': 5143,
 u'sriram': 56506,
 u'zonker': 68257,
 u'technology': 59160,
 u'evansmp': 20414,
 u'verified': 63927,
 u'arceneaux': 4320,
 u'afdc': 2482,
 u'dalnet': 14663,
 u'cunyvm': 14282,
 u'verifies': 63928,
 u'afds': 2483,
 u'otte': 43410,
 u'nimeroff': 41349,
 u'visually': 64321,
 u'assigns': 4895,
 u'strawman': 57278,
 u'tolar': 60430,
 u'alshalalbi': 3205,
 u'jview': 31979,
 u'constraining': 12854,
 u'exergetical': 20679,
 u'advertisement': 2385,
 u'elint': 19109,
 u'wholeness': 65605,
 u'chevette': 10726,
 u'ilgili': 28729,
 u'gwx': 25635,
 u'comer': 12029,
 u'persistently': 45116,
 u'macdac': 35983,
 u'planets': 45899,
 u'fatcity': 21425,
 u'malodah': 36416,
 u'generator': 23937,
 u'mandolin': 36490,
 u'plunderers': 46064,
 u'grounder': 25252,
 u'malouf': 36419,
 u'aborts': 1676,
 u'valuelessness': 63589,
 u'absense': 1717,
 u'zion': 68171,
 u'rejoin': 50092,
 u'diphenhydramine': 16632,
 u'sums': 57844,
 u'spokespersons': 56280,
 u'traffic': 60846,
 u'preference': 46884,
 u'remco': 50223,
 u'fugazi': 23240,
 u'sensational': 53733,
 u'malfunctions': 36386,
 u'thyroiditis': 60040,
 u'memmove': 37670,
 u'superiority': 57960,
 u'aldred': 2962,
 u'obstruct': 42340,
 u'seating': 53454,
 u'crossposting': 13961,
 u'bestraffas': 6903,
 u'complementarity': 12287,
 u'whiters': 65572,
 u'gu_x': 25388,
 u'substance': 57625,
 u'catologues': 9939,
 u'measurin': 37458,
 u'sciousness': 53135,
 u'drivable': 17914,
 u'francois': 22925,
 u'transmutation': 60981,
 u'hareket': 26204,
 u'averse': 5473,
 u'disinformation': 16916,
 u'sess': 53910,
 u'^dg]': 569,
 u'disparaging': 16987,
 u'outskated': 43535,
 u'spacelab': 55951,
 u'thailand': 59614,
 u'graal': 24875,
 u'hopkins': 27592,
 u'revolutionists': 50950,
 u'hosch': 27655,
 u'golem': 24670,
 u'corelscsi': 13257,
 u'sensitively': 53744,
 u'perturbed': 45161,
 u'porsche': 46422,
 u'disrupts': 17068,
 u'pivot': 45794,
 u'rossi': 51722,
 u'yqxx': 67764,
 u'gnashing': 24547,
 u'seales': 53409,
 u'dialtone': 16356,
 u'gleam': 24423,
 u'glean': 24424,
 u'redirection': 49718,
 u'sealed': 53408,
 u'frobinson': 23105,
 u'bubble': 8724,
 u'witt': 66007,
 u'witr': 66003,
 ...}
from sklearn.naive_bayes import MultinomialNB from sklearn import metrics newsgroups = fetch_20newsgroups(subset='all') newsgroups_target_train = newsgroups.target[0:(len(newsgroups.data)*0.7)] newsgroups_target_test = newsgroups.target[(len(newsgroups.data)*0.7):len(newsgroups.data)] print 'Counting labels in test dataset' from collections import Counter counts = Counter(newsgroups_target_test) print counts print len(counts) vectorizer = CountVectorizer(stop_words='english', min_df=0.001) vectors_count = vectorizer.fit_transform(newsgroups.data) vectors = TfidfTransformer(use_idf=True, norm=u'l2').fit_transform(vectors_count) n, m = vectors.shape print int(n*0.7),n vectors_train = vectors[0:int(n*0.7), :] vectors_test = vectors[int(n*0.7):n, :] clf = MultinomialNB(alpha=.01) clf.fit(vectors_train, newsgroups_target_train) pred = clf.predict(vectors_test) metrics.f1_score(newsgroups_target_test, pred, average='weighted')

w/o tf-idf: 0.89394081689789762 w tf-idf: 0.89268981555882321 w tf-idf norm: 0.89268981555882321


In [84]:
%run davies_bouldin.py

In [89]:
%run util.py

In [94]:
%run kmeans_script.py


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
/Users/lucasbrunialti/git/biclustering/experiments/kmeans_script.py in <module>()
     80 
     81 if __name__ == '__main__':
---> 82     main()

/Users/lucasbrunialti/git/biclustering/experiments/kmeans_script.py in main()
     71 
     72 def main():
---> 73     function_to_run = sys.argv[1]
     74 
     75     current_module = sys.modules[__name__]

IndexError: list index out of range

In [6]:
def get_dataset():
    return fetch_20newsgroups(subset='all')


def preprocess(newsgroups_data):
    vectorizer = CountVectorizer(stop_words='english', min_df=0.001)
    X = vectorizer.fit_transform(newsgroups_data.data)

    X_train_norm_tfidf = TfidfTransformer(norm=u'l2', use_idf=True).fit_transform(X)
    X_train_tfidf = TfidfTransformer(use_idf=True).fit_transform(X)
    X_train_norm = TfidfTransformer(norm=u'l2', use_idf=False).fit_transform(X)
    X_train = TfidfTransformer(use_idf=False).fit_transform(X)

    return X_train, X_train_norm, X_train_tfidf, X_train_norm_tfidf

In [7]:
dataset = get_dataset()
X_train, X_train_norm, X_train_tfidf, X_train_norm_tfidf = preprocess(dataset)

In [10]:
from onmtf import matrix_factorization_clustering
U, S, V, labels_pred, _, error = matrix_factorization_clustering(X_train.toarray()[0:1000], 20, 20, num_iters=1000)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-10-aa63241d7c07> in <module>()
      1 from onmtf import matrix_factorization_clustering
----> 2 U, S, V, labels_pred, _, error = matrix_factorization_clustering(X_train.toarray()[0:1000], 20, 20, num_iters=1000)

/Users/lucasbrunialti/git/biclustering/experiments/onmtf.py in matrix_factorization_clustering(X, k, l, factorization_func, norm, num_iters)
     37 
     38     for i in range(num_iters):
---> 39         U, S, V = factorization_func(X, U, S, V)
     40 
     41         error_ant = error

/Users/lucasbrunialti/git/biclustering/experiments/onmtf.py in onmtf(X, U, S, V)
      5 def onmtf(X, U, S, V):
      6     U *= ((X.dot(V).dot(S.T)) / (U.dot(S).dot(V.T).dot(X.T).dot(U)))
----> 7     V *= ((X.T.dot(U).dot(S)) / (V.dot(S.T).dot(U.T).dot(X).dot(V)))
      8     S *= ((U.T.dot(X).dot(V)) / (U.T.dot(U).dot(S).dot(V.T).dot(V)))
      9     return U, S, V

KeyboardInterrupt: 

In [5]:
a = pd.read_pickle('experiments/all_news_df.pkl')
a['channel'].unique()


Out[5]:
array(['igay', 'deles', 'tecnologia', 'ultimosegundo', 'arena', 'economia',
       'luxo', 'saude', 'igirl', 'jovem', 'delas', 'esporte', 'gente'], dtype=object)

In [6]:
%run run_algo.py ig fnmtf


ERROR: File `u'run_algo.py'` not found.

In [ ]: