In [1]:
from pymongo import MongoClient

In [2]:
client=MongoClient()
client=MongoClient('mongodb://.../')
db=client['eventData']
sen=db.documents_english

In [3]:
from nltk.tokenize import RegexpTokenizer

In [4]:
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

In [7]:
tokenizer = RegexpTokenizer(r'\w+')
# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

In [10]:
%%time
texts = []
actuallyTrained=0;
for i in sen.find():
        try:
            raw = ''.join(i['document']).lower()        
            tokens = tokenizer.tokenize(raw)
            stopped_tokens = [i for i in tokens if not i in en_stop]
            stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
            texts.append(stemmed_tokens) 
            actuallyTrained=actuallyTrained+1
        except:
            pass
print(actuallyTrained)


1161388
CPU times: user 1h 6min 12s, sys: 7min 44s, total: 1h 13min 56s
Wall time: 1h 16min 49s

In [12]:
%%time
dictionary = corpora.Dictionary(texts)


CPU times: user 2min 35s, sys: 1.14 s, total: 2min 36s
Wall time: 2min 37s

In [26]:
dictionary.token2id


Out[26]:
{'call': 0,
 '800': 1,
 '342': 2,
 '2507': 3,
 'send': 4,
 'question': 5,
 'websit': 6,
 'extens': 7,
 'umd': 8,
 'edu': 9,
 'hgic': 10,
 'plant': 11,
 'week': 12,
 'giant': 13,
 'pussi': 14,
 'willow': 15,
 'japanes': 16,
 'salix': 17,
 'chaenomeloid': 18,
 's': 19,
 'way': 20,
 'won': 21,
 't': 22,
 'feel': 23,
 'fond': 24,
 'can': 25,
 'pet': 26,
 'seed': 27,
 'catalog': 28,
 'often': 29,
 'safe': 30,
 'pledg': 31,
 'somewher': 32,
 'near': 33,
 'front': 34,
 'assur': 35,
 'custom': 36,
 'compani': 37,
 'carri': 38,
 'genet': 39,
 'engin': 40,
 'univers': 41,
 'maryland': 42,
 'home': 43,
 'garden': 44,
 'inform': 45,
 'center': 46,
 'offer': 47,
 'free': 48,
 'pest': 49,
 'famili': 50,
 'will': 51,
 'also': 52,
 'receiv': 53,
 'friend': 54,
 'tuesday': 55,
 '11': 56,
 '30': 57,
 'march': 58,
 'life': 59,
 'tribut': 60,
 '5616': 61,
 'old': 62,
 'court': 63,
 'road': 64,
 'follow': 65,
 'funer': 66,
 'servic': 67,
 '12': 68,
 'pm': 69,
 'paid': 70,
 'death': 71,
 'notic': 72,
 'view': 73,
 'sign': 74,
 'onlin': 75,
 'guestbook': 76,
 'baltimoresun': 77,
 'com': 78,
 'obituari': 79,
 'lieu': 80,
 'flower': 81,
 'pleas': 82,
 'rememb': 83,
 'annel': 84,
 'memori': 85,
 'contribut': 86,
 'american': 87,
 'cancer': 88,
 'societi': 89,
 '8219': 90,
 'town': 91,
 'dr': 92,
 'baltimor': 93,
 'md': 94,
 '21236': 95,
 'lectur': 96,
 'tell': 97,
 'battl': 98,
 'tune': 99,
 'music': 100,
 'scale': 101,
 'place': 102,
 'context': 103,
 'quarrel': 104,
 'world': 105,
 'art': 106,
 'philosophi': 107,
 'religion': 108,
 'polit': 109,
 'scienc': 110,
 'chamber': 111,
 'benjamin': 112,
 'myer': 113,
 'yoon': 114,
 'nah': 115,
 'cho': 116,
 'young': 117,
 'bae': 118,
 'guest': 119,
 'perform': 120,
 'schubert': 121,
 'string': 122,
 'quintet': 123,
 'c': 124,
 'major': 125,
 '7': 126,
 'p': 127,
 'm': 128,
 'howard': 129,
 'commun': 130,
 'colleg': 131,
 'monteabaro': 132,
 'recit': 133,
 'hall': 134,
 '10901': 135,
 'littl': 136,
 'patux': 137,
 'parkway': 138,
 'columbia': 139,
 'futur': 140,
 'pianist': 141,
 'david': 142,
 'wasser': 143,
 'program': 144,
 'titl': 145,
 'day': 146,
 'earth': 147,
 '4': 148,
 'sunday': 149,
 'feb': 150,
 '23': 151,
 '16': 152,
 'danc': 153,
 'concert': 154,
 'misako': 155,
 'ballet': 156,
 'present': 157,
 'spring': 158,
 'collag': 159,
 '2': 160,
 'jim': 161,
 'rous': 162,
 'theatr': 163,
 'wild': 164,
 'lake': 165,
 'high': 166,
 'school': 167,
 '5460': 168,
 'trumpet': 169,
 'counti': 170,
 'histor': 171,
 'philip': 172,
 'merril': 173,
 'expert': 174,
 'african': 175,
 'histori': 176,
 'memorabilia': 177,
 'cultur': 178,
 'apprais': 179,
 'public': 180,
 'televis': 181,
 'show': 182,
 'chesapeak': 183,
 'collect': 184,
 'librari': 185,
 'miller': 186,
 'branch': 187,
 '8421': 188,
 'frederick': 189,
 'author': 190,
 'black': 191,
 'seri': 192,
 'forum': 193,
 'former': 194,
 'assist': 195,
 'u': 196,
 'secretari': 197,
 'agricultur': 198,
 'robert': 199,
 'thompson': 200,
 'speak': 201,
 'farm': 202,
 'tabl': 203,
 'first': 204,
 'presbyterian': 205,
 'church': 206,
 '9535': 207,
 'circl': 208,
 '18': 209,
 'film': 210,
 'screen': 211,
 'discuss': 212,
 'hopework': 213,
 'hip': 214,
 'hop': 215,
 'beyond': 216,
 'beat': 217,
 'rhyme': 218,
 'saturday': 219,
 '22': 220,
 'jewish': 221,
 'congreg': 222,
 'continu': 223,
 '22nd': 224,
 'annual': 225,
 'matchmak': 226,
 '5885': 227,
 'oliv': 228,
 'room': 229,
 '200': 230,
 'admiss': 231,
 '17': 232,
 'senior': 233,
 'student': 234,
 '443': 235,
 '518': 236,
 '1500': 237,
 'email': 238,
 'boxoffic': 239,
 'howardcc': 240,
 'gala': 241,
 'celebr': 242,
 'held': 243,
 '29': 244,
 'horowitz': 245,
 'visual': 246,
 'classic': 247,
 'italian': 248,
 'dish': 249,
 'altern': 250,
 'vibrant': 251,
 'red': 252,
 'tomato': 253,
 'slice': 254,
 'creami': 255,
 'mozzarella': 256,
 'fragrant': 257,
 'bright': 258,
 'green': 259,
 'basil': 260,
 'leav': 261,
 'color': 262,
 'flag': 263,
 'say': 264,
 'lot': 265,
 'peopl': 266,
 'harbaugh': 267,
 'said': 268,
 'featur': 269,
 'work': 270,
 'local': 271,
 'artist': 272,
 'oil': 273,
 'watercolor': 274,
 'acryl': 275,
 'pastel': 276,
 'mix': 277,
 'media': 278,
 'potteri': 279,
 'jewelri': 280,
 'fuse': 281,
 'glass': 282,
 'lemmon': 283,
 'dulaney': 284,
 'valley': 285,
 'inc': 286,
 '10': 287,
 'w': 288,
 'padonia': 289,
 'york': 290,
 'timonium': 291,
 '21093': 292,
 'monday': 293,
 'august': 294,
 '9pm': 295,
 'thrift': 296,
 'outlet': 297,
 'extrem': 298,
 'popular': 299,
 'big': 300,
 'ceo': 301,
 'campisi': 302,
 'thrill': 303,
 'team': 304,
 'icon': 305,
 'brand': 306,
 'give': 307,
 'new': 308,
 'destin': 309,
 'find': 310,
 'outstand': 311,
 'save': 312,
 'product': 313,
 'know': 314,
 'love': 315,
 'sale': 316,
 'hostess': 317,
 'cake': 318,
 'merchandis': 319,
 'store': 320,
 'repres': 321,
 'make': 322,
 'donat': 323,
 'st': 324,
 'jude': 325,
 'children': 326,
 'research': 327,
 'hospit': 328,
 'detail': 329,
 'wound': 330,
 'includ': 331,
 'casualti': 332,
 'report': 333,
 'aug': 334,
 '5': 335,
 'shoot': 336,
 'kill': 337,
 'maj': 338,
 'gen': 339,
 'harold': 340,
 'highest': 341,
 'rank': 342,
 'member': 343,
 'afghan': 344,
 'war': 345,
 'offici': 346,
 'shooter': 347,
 'armi': 348,
 'soldier': 349,
 'close': 350,
 'target': 351,
 'open': 352,
 'fire': 353,
 'train': 354,
 'base': 355,
 'least': 356,
 '15': 357,
 'among': 358,
 'specialist': 359,
 'reservist': 360,
 'northern': 361,
 'california': 362,
 'manag': 363,
 'appl': 364,
 'cupertino': 365,
 'calif': 366,
 'headquart': 367,
 'accord': 368,
 'move': 369,
 'shield': 370,
 'british': 371,
 'colonel': 372,
 'rifl': 373,
 'pistol': 374,
 'struck': 375,
 'six': 376,
 'bullet': 377,
 'weapon': 378,
 'one': 379,
 'aid': 380,
 '31': 381,
 'year': 382,
 'captain': 383,
 'volunt': 384,
 'deploy': 385,
 'shot': 386,
 'multipl': 387,
 'time': 388,
 'paralyz': 389,
 'waist': 390,
 'serv': 391,
 'affair': 392,
 'offic': 393,
 'navi': 394,
 'noncommiss': 395,
 'injur': 396,
 'unit': 397,
 'state': 398,
 'readi': 399,
 'signific': 400,
 'addit': 401,
 'econom': 402,
 'militari': 403,
 'iraq': 404,
 'less': 405,
 'sectarian': 406,
 'govern': 407,
 'defens': 408,
 'chuck': 409,
 'hagel': 410,
 'john': 411,
 'kerri': 412,
 'stand': 413,
 'fulli': 414,
 'support': 415,
 'inclus': 416,
 'iraqi': 417,
 'particularli': 418,
 'fight': 419,
 'islam': 420,
 'last': 421,
 'plane': 422,
 'began': 423,
 'conduct': 424,
 'airstrik': 425,
 'forc': 426,
 'tri': 427,
 'advanc': 428,
 'irbil': 429,
 'capit': 430,
 'semiautonom': 431,
 'kurdistan': 432,
 'region': 433,
 'attack': 434,
 'ten': 435,
 'thousand': 436,
 'civilian': 437,
 'sinjar': 438,
 'aircraft': 439,
 'nightli': 440,
 'drop': 441,
 'relief': 442,
 'suppli': 443,
 'yazidi': 444,
 'minor': 445,
 'trap': 446,
 'barren': 447,
 'mount': 448,
 'mcdonnel': 449,
 'special': 450,
 'correspond': 451,
 'mostaghim': 452,
 'tehran': 453,
 'reuter': 454,
 'washington': 455,
 'post': 456,
 'lawmak': 457,
 'tap': 458,
 'succeed': 459,
 'nouri': 460,
 'al': 461,
 'maliki': 462,
 'prime': 463,
 'minist': 464,
 'appear': 465,
 'back': 466,
 'neighbor': 467,
 'iran': 468,
 'crucial': 469,
 'baghdad': 470,
 'alli': 471,
 'faction': 472,
 'militia': 473,
 'deepli': 474,
 'divid': 475,
 'nation': 476,
 'alreadi': 477,
 'endors': 478,
 'nomin': 479,
 'haider': 480,
 'ibadi': 481,
 'next': 482,
 'premier': 483,
 'power': 484,
 'top': 485,
 'iranian': 486,
 'secur': 487,
 'ali': 488,
 'shamkhani': 489,
 'publicli': 490,
 'welcom': 491,
 'legal': 492,
 'process': 493,
 'elect': 494,
 'semioffici': 495,
 'far': 496,
 'news': 497,
 'agenc': 498,
 'comment': 499,
 'came': 500,
 'longtim': 501,
 'deputi': 502,
 'speaker': 503,
 'countri': 504,
 'parliament': 505,
 'formal': 506,
 'becom': 507,
 'polar': 508,
 'figur': 509,
 'vow': 510,
 'challeng': 511,
 'coup': 512,
 'arab': 513,
 'recent': 514,
 'visit': 515,
 'suprem': 516,
 'council': 517,
 'presid': 518,
 'hassan': 519,
 'rouhani': 520,
 'probabl': 521,
 'posit': 522,
 'observ': 523,
 'twin': 524,
 'two': 525,
 'add': 526,
 'momentum': 527,
 'deep': 528,
 'differ': 529,
 'constel': 530,
 'issu': 531,
 'middl': 532,
 'east': 533,
 'elsewher': 534,
 'seem': 535,
 'agre': 536,
 'step': 537,
 'asid': 538,
 'embroil': 539,
 'monthslong': 540,
 'stalem': 541,
 'face': 542,
 'grave': 543,
 'threat': 544,
 'sunni': 545,
 'muslim': 546,
 'milit': 547,
 'overrun': 548,
 'much': 549,
 'north': 550,
 'west': 551,
 'critic': 552,
 'polici': 553,
 'push': 554,
 'camp': 555,
 'charg': 556,
 'reject': 557,
 'variou': 558,
 'kurdish': 559,
 'lead': 560,
 'shiit': 561,
 'like': 562,
 'wield': 563,
 'great': 564,
 'influenc': 565,
 'must': 566,
 'balanc': 567,
 'demand': 568,
 'sensit': 569,
 'name': 570,
 'cabinet': 571,
 'believ': 572,
 'fail': 573,
 'salam': 574,
 'promin': 575,
 'sheik': 576,
 'embattl': 577,
 'tribe': 578,
 'sinc': 579,
 'fighter': 580,
 'qaida': 581,
 'offshoot': 582,
 'seiz': 583,
 'northwestern': 584,
 'june': 585,
 'sent': 586,
 '700': 587,
 'personnel': 588,
 'protect': 589,
 'diplomat': 590,
 'take': 591,
 'stock': 592,
 'capac': 593,
 'organ': 594,
 'citizen': 595,
 'advocaci': 596,
 'group': 597,
 'line': 598,
 'now': 599,
 'transit': 600,
 'advoc': 601,
 'brace': 602,
 'jan': 603,
 '21': 604,
 'inaugur': 605,
 'gov': 606,
 'larri': 607,
 'hogan': 608,
 'express': 609,
 'skeptic': 610,
 'abil': 611,
 'pay': 612,
 'separ': 613,
 'purpl': 614,
 'light': 615,
 'rail': 616,
 'project': 617,
 'connect': 618,
 'suburb': 619,
 'montgomeri': 620,
 'princ': 621,
 'georg': 622,
 'woodlawn': 623,
 'citi': 624,
 'travel': 625,
 'ground': 626,
 'part': 627,
 'downtown': 628,
 'southeast': 629,
 'mayor': 630,
 'stephani': 631,
 'rawl': 632,
 'blake': 633,
 'told': 634,
 'prioriti': 635,
 'ask': 636,
 'keep': 637,
 'budget': 638,
 'resid': 639,
 'live': 640,
 'along': 641,
 'propos': 642,
 'rout': 643,
 'bicycl': 644,
 'current': 645,
 'leader': 646,
 'administr': 647,
 'cum': 648,
 'other': 649,
 'come': 650,
 'see': 651,
 'slip': 652,
 'away': 653,
 'rep': 654,
 'elijah': 655,
 'e': 656,
 'join': 657,
 'wednesday': 658,
 'night': 659,
 'plan': 660,
 'baltimorean': 661,
 'voic': 662,
 'heard': 663,
 'still': 664,
 'acknowledg': 665,
 'fate': 666,
 'lie': 667,
 'hand': 668,
 'tilghman': 669,
 'island': 670,
 'watermen': 671,
 'plead': 672,
 'guilti': 673,
 'friday': 674,
 'district': 675,
 'illeg': 676,
 '185': 677,
 '925': 678,
 'pound': 679,
 'stripe': 680,
 'bass': 681,
 'bay': 682,
 'investig': 683,
 'hayden': 684,
 'allegedli': 685,
 'threaten': 686,
 'harm': 687,
 'wit': 688,
 'cooper': 689,
 'roll': 690,
 'explet': 691,
 'man': 692,
 'ok': 693,
 'care': 694,
 'document': 695,
 'feder': 696,
 'grand': 697,
 'juri': 698,
 'indict': 699,
 'novemb': 700,
 'crimin': 701,
 'conspiraci': 702,
 'involv': 703,
 'harvest': 704,
 'interst': 705,
 'known': 706,
 'rockfish': 707,
 'fish': 708,
 'import': 709,
 'commerci': 710,
 'recreat': 711,
 'speci': 712,
 'sever': 713,
 'stay': 714,
 'healthi': 715,
 'amid': 716,
 'pollut': 717,
 'michael': 718,
 'd': 719,
 '41': 720,
 'william': 721,
 'j': 722,
 'lednum': 723,
 '42': 724,
 'admit': 725,
 'sell': 726,
 '498': 727,
 '293': 728,
 'ring': 729,
 'oper': 730,
 '2007': 731,
 '2011': 732,
 'sold': 733,
 'wholesal': 734,
 'pennsylvania': 735,
 'delawar': 736,
 'case': 737,
 'depart': 738,
 'natur': 739,
 'resourc': 740,
 'found': 741,
 'anchor': 742,
 'net': 743,
 'kent': 744,
 'eastern': 745,
 'side': 746,
 'bridg': 747,
 'three': 748,
 'ago': 749,
 'hard': 750,
 'truce': 751,
 'collaps': 752,
 'within': 753,
 'hour': 754,
 'isra': 755,
 'went': 756,
 'miss': 757,
 'dead': 758,
 'heavi': 759,
 'palestinian': 760,
 'southern': 761,
 'gaza': 762,
 'strip': 763,
 'appar': 764,
 'captur': 765,
 'nightmar': 766,
 'scenario': 767,
 'vastli': 768,
 'complic': 769,
 'ceas': 770,
 'netanyahu': 771,
 'accus': 772,
 'hama': 773,
 'control': 774,
 'flagrantli': 775,
 'violat': 776,
 'humanitarian': 777,
 'paus': 778,
 'nearli': 779,
 '1': 780,
 '500': 781,
 '63': 782,
 'troop': 783,
 'insist': 784,
 'clash': 785,
 'rafah': 786,
 'tip': 787,
 'occur': 788,
 'start': 789,
 'abort': 790,
 'accept': 791,
 'israel': 792,
 'version': 793,
 'event': 794,
 'condemn': 795,
 'quick': 796,
 'breakdown': 797,
 'closer': 798,
 'strain': 799,
 'relat': 800,
 'barack': 801,
 'obama': 802,
 'white': 803,
 'hous': 804,
 'confer': 805,
 'arrang': 806,
 'halt': 807,
 'entir': 808,
 'right': 809,
 'dismantl': 810,
 'network': 811,
 'tunnel': 812,
 'ad': 813,
 'reduc': 814,
 'bloodsh': 815,
 'anger': 816,
 'violenc': 817,
 'boil': 818,
 'bank': 819,
 'die': 820,
 'reignit': 821,
 'specter': 822,
 'widen': 823,
 'conflict': 824,
 'outsid': 825,
 'run': 826,
 'area': 827,
 'sought': 828,
 'identifi': 829,
 '2nd': 830,
 'lt': 831,
 'hadar': 832,
 'goldin': 833,
 '60': 834,
 'tank': 835,
 'shell': 836,
 'rake': 837,
 'neighborhood': 838,
 'cover': 839,
 'flee': 840,
 'clinic': 841,
 'swiftli': 842,
 'overwhelm': 843,
 'unlik': 844,
 'concess': 845,
 'unknown': 846,
 'fishermen': 847,
 'put': 848,
 'sea': 849,
 'wafi': 850,
 'jamal': 851,
 'ventur': 852,
 'ruin': 853,
 'hope': 854,
 'might': 855,
 'spokesman': 856,
 'col': 857,
 'peter': 858,
 'lerner': 859,
 'confront': 860,
 'destroy': 861,
 'infiltr': 862,
 'allow': 863,
 'term': 864,
 'suspect': 865,
 'chao': 866,
 'drag': 867,
 'fear': 868,
 'meanwhil': 869,
 'fresh': 870,
 'barrag': 871,
 'rocket': 872,
 'mortar': 873,
 'round': 874,
 'seek': 875,
 'prove': 876,
 'scrambl': 877,
 'bomb': 878,
 'shelter': 879,
 'even': 880,
 'though': 881,
 'estim': 882,
 'stockpil': 883,
 'substanti': 884,
 'diminish': 885,
 'sobelman': 886,
 'jerusalem': 887,
 'paul': 888,
 'richter': 889,
 'lking': 890,
 'tribun': 891,
 'saudi': 892,
 'riyadh': 893,
 'arabia': 894,
 'king': 895,
 'abdullah': 896,
 'ii': 897,
 'broke': 898,
 'silenc': 899,
 '3': 900,
 'saw': 901,
 'intern': 902,
 'offens': 903,
 'describ': 904,
 'crime': 905,
 'sponsor': 906,
 'terror': 907,
 'speech': 908,
 'innoc': 909,
 'mutil': 910,
 'bodi': 911,
 'contravent': 912,
 'teach': 913,
 'soon': 914,
 'daybreak': 915,
 'quickli': 916,
 'upon': 917,
 'respit': 918,
 'trek': 919,
 'devast': 920,
 'go': 921,
 'zone': 922,
 'bombard': 923,
 'hornet': 924,
 'puzzl': 925,
 'poli': 926,
 'domin': 927,
 'board': 928,
 'ball': 929,
 'well': 930,
 'creat': 931,
 'insid': 932,
 'damascu': 933,
 '6': 934,
 'secu': 935,
 'arena': 936,
 'milford': 937,
 'mill': 938,
 'defeat': 939,
 'westlak': 940,
 '75': 941,
 '53': 942,
 'lauren': 943,
 'hit': 944,
 'seven': 945,
 'throw': 946,
 'jenna': 947,
 'kaufman': 948,
 'made': 949,
 'final': 950,
 '09': 951,
 'minut': 952,
 'veteran': 953,
 '25': 954,
 'fall': 955,
 'class': 956,
 '4a': 957,
 'season': 958,
 'good': 959,
 'job': 960,
 'prang': 961,
 'especi': 962,
 'earli': 963,
 'second': 964,
 'half': 965,
 'turnov': 966,
 'convert': 967,
 'chanc': 968,
 'mani': 969,
 'layup': 970,
 'overcom': 971,
 'use': 972,
 'hold': 973,
 'eight': 974,
 'point': 975,
 'morgan': 976,
 'newton': 977,
 'forward': 978,
 'got': 979,
 'plenti': 980,
 'help': 981,
 'contain': 982,
 'teammat': 983,
 'took': 984,
 'game': 985,
 'third': 986,
 'quarter': 987,
 'smith': 988,
 'runner': 989,
 'pendleton': 990,
 'fed': 991,
 '24': 992,
 'boost': 993,
 '27': 994,
 'carlo': 995,
 'betancur': 996,
 'fifth': 997,
 'stage': 998,
 'pari': 999,
 ...}

In [14]:
%%time
corpus = [dictionary.doc2bow(text) for text in texts]


CPU times: user 2min 17s, sys: 34.8 s, total: 2min 52s
Wall time: 2min 58s

In [15]:
%%time
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=20, id2word = dictionary, passes=1)


CPU times: user 43min, sys: 1min 25s, total: 44min 25s
Wall time: 44min 36s

In [110]:
ldamodel[corpus[1]]


Out[110]:
[(5, 0.57942845600499893), (8, 0.39703790016898888)]

In [44]:
ldamodel


Out[44]:
<gensim.models.ldamodel.LdaModel at 0x116530b00>

In [16]:
actuallyTrained


Out[16]:
1161388

change result into multidimensinal array in order to feed in k-means model,

the no of dimension is the no of topics


In [18]:
%%time
dim=20
result=[]
for i in range(0,actuallyTrained):
    feature=[]
    previousindex=0
    for item in ldamodel[corpus[i]]:
        index=item[0]
        #print(index)
        for beforeindex in range(previousindex,index):
            feature.append(0)
        feature.append(item[1])
        previousindex=index+1
    while (len(feature)<dim):
        feature.append(0);  #add in 0 at the end
    result.append(feature)


CPU times: user 23min 14s, sys: 1min 38s, total: 24min 52s
Wall time: 3h 15min 36s

In [20]:
from sklearn.cluster import KMeans
import numpy as np

In [21]:
%%time
kmeanstest=np.array(result)


CPU times: user 2.03 s, sys: 193 ms, total: 2.22 s
Wall time: 2.23 s

In [77]:
X = np.array([[1, 2,3], [1, 4,3], [1, 0,3],
...               [4, 2,4], [4, 4,5], [4, 0,6]])

In [23]:
%%time
kmeans = KMeans(n_clusters=20, random_state=0).fit(kmeanstest)


CPU times: user 2min 47s, sys: 1min 5s, total: 3min 52s
Wall time: 3min 41s

In [28]:
kmeans.labels_


Out[28]:
array([ 5, 19, 19, ...,  1,  2,  9], dtype=int32)

In [ ]: