In [8]:
import pandas as pd
import numpy as np

from gensim.models import Word2Vec
from gensim.models.wrappers import FastText

from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import cophenet
from scipy.cluster.hierarchy import fcluster

from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from sklearn.preprocessing import MaxAbsScaler

from plotly import tools
import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go

# Initialise plotly's offline notebook mode so that the py.iplot() calls
# further down can render their figures inline.
py.init_notebook_mode()



In [3]:
# Autoreload changed modules: mode 2 re-imports all loaded modules before each
# cell executes, so edits to local .py files are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [9]:
# Load data
# Optional precomputed tables (disabled; nothing active in this cell uses them):
# idfs = pd.read_csv("datasets/idf_scores.csv")
# ft_counts = pd.read_csv("datasets/ft_counts_0.9.csv")
# ft_neighs = pd.read_csv("datasets/ft_neighbours_0.9.csv")
# wv_counts = pd.read_csv("datasets/wv_counts_0.9.csv")
# wv_neighs = pd.read_csv("datasets/wv_neighbours_0.9.csv")

# Number of words to embed and plot; also the length of the neighbour list.
N_WORDS = 1000

wv_model = Word2Vec.load('datasets/wv_model')
ft_model = FastText.load("datasets/ft_model")

# Model whose vectors are visualised in the projection cells below.
used_model = wv_model

# All queries go through `.wv` (the KeyedVectors object), matching the
# `.wv.vocab` access here; calling most_similar / __getitem__ directly on the
# model is the deprecated spelling of the same lookups.
vocab = list(used_model.wv.vocab)

# (tag, similarity) pairs closest to "electronic" in the FastText space.
top = ft_model.wv.most_similar("electronic", topn=N_WORDS)
top_tags = [tag for tag, _similarity in top]

# NOTE(review): `vocab` is in dict-key order, so this slice is not "the most
# frequent words". If that was the intent, `used_model.wv.index2word[:N_WORDS]`
# is presumably the frequency-sorted list -- confirm.
used_vocab = vocab[:N_WORDS]
# NOTE(review): `top_tags` comes from ft_model while `used_model` is wv_model;
# enabling the line below may hit tags missing from the word2vec vocabulary.
# used_vocab = top_tags

vecs = used_model.wv[used_vocab]

# NOTE(review): `raw` is not defined in this cell; the scaler would presumably
# be fitted on `vecs` -- confirm before enabling.
#scaler = MaxAbsScaler()
#vecs = scaler.fit_transform(raw)

In [10]:
# Number of entries in the selected model's vocabulary.
len(vocab)


Out[10]:
194020

In [11]:
# Echo the words selected for embedding.
# NOTE(review): this displays the entire list; a slice such as used_vocab[:20]
# would keep the notebook output short.
used_vocab


Out[11]:
['',
 'chao',
 'debajo',
 'musicalizacion',
 'kuplia',
 'willowz',
 'arden',
 'jersey',
 'maruiat',
 'kibrit',
 'postapo',
 'mettal',
 'ciente',
 'recokn',
 'solitaires',
 'cybersyb',
 'koss',
 'liknande',
 'oahoahoahoaaahoeeeh',
 'megaman',
 'wezz',
 'ravenorwegianrockdance',
 'lipin',
 'originalz',
 'goldschool',
 'psyche',
 'sonntagsmucke',
 'digigrind',
 'metakl',
 'guesswhat',
 'coldplaaaay',
 'vanilla',
 'moloku',
 'chinadoll',
 'tangente',
 'crohran',
 'megadeth',
 'sowhyami',
 'insolens',
 'ramped',
 'hagedorn',
 'plastique',
 'weltkulturerbe',
 'fragil',
 'sarkymaton',
 'nocte',
 'truces',
 'mcaulay',
 'naviganti',
 'chupala',
 'kioka',
 'lynchcast',
 'yoyoyoyo',
 'wher',
 'ranchhand',
 'pagtapos',
 'almostalways',
 'piticlin',
 'taoandzen',
 'pharao',
 'jaaaaam',
 'sooki',
 'puerto',
 'shimmerdrone',
 '2212',
 'paddymac',
 'umorale',
 'reek',
 'conya',
 'wirchs',
 'epple',
 'orchestrar',
 'bestofwellenwahn',
 'chained',
 'bravinha',
 'missyouu',
 'fakeheads',
 'sunkil',
 'hunches',
 'rana',
 'concsious',
 'sommerferien',
 'drank',
 'verrrr',
 'meky',
 'cucc',
 'redrum',
 'emmanuelle',
 'rahatlatici',
 '80tall',
 'bloku',
 'gigliolla',
 'troubleismy',
 'ramonesque',
 'desky',
 'relaciono',
 'miiracleenetwork',
 'vanakool',
 'airlock',
 'fas',
 'song01',
 'nomads',
 'paramour',
 'allownetworkinginternal',
 'rhp',
 'shredwork',
 'chukhung',
 'africana',
 'mccombs',
 'chubbys',
 'llegan',
 'uppity',
 'britt',
 'homeparty',
 'vasti',
 'plantcore',
 'kmem8200',
 'habbib',
 'alegrinho',
 'diplomats',
 'benhe',
 'happyrap',
 'fatzzy',
 'crveno',
 'minitous',
 'pesada',
 'geister',
 'sleeptracksradio',
 'nadar',
 '34000',
 'scopriremo',
 'germanotta',
 'sentiments',
 'postpunknewwave',
 'pseudopop',
 'politicians',
 'vodalist',
 'ashlis',
 'rymes',
 'congratulations',
 'huzurlu',
 'japanischer',
 'pilu',
 'klimatem',
 'lugosis',
 'lunatico',
 'docii',
 'tunnelmallinen',
 'durchdrehmugge',
 'simonandgarfunkel',
 'niemals',
 'psychodelka',
 'rasperries',
 'farfarawayinyourmind',
 'aaaaaaargh',
 'bamboleooooooooooo',
 '111111',
 'jazztlan',
 'impmusic',
 'triunfo',
 'puracane',
 'finnsvenska',
 'mente',
 'oeste',
 'racionais',
 'ffmrfeelingood',
 'jusz',
 'cute',
 'siam',
 'saens',
 'casette',
 'venerea',
 'sounk',
 'grungeweasel',
 'river',
 'digitalissim',
 'seent',
 'no3',
 'benawesomez',
 'vays',
 'indianish',
 'queimadura',
 'rs2',
 'minox',
 'significant',
 'plavi',
 'afuckingmazing',
 'foat',
 'pyty',
 'same',
 'hipo',
 'dietcunt',
 'boe',
 'marquiz',
 'bewak',
 'meurtries',
 'af30r',
 'or',
 'remedios',
 'scanalt',
 'heavyfuckinmetal',
 'spunkastic',
 'cespedes',
 'atmosferica',
 'racontuer',
 'victorious',
 'rnb',
 'uykuyuy',
 'greatauthors',
 'lovvvveeeeeeeeeeeeee',
 '8eed',
 'incognita',
 'cooleh',
 'orto',
 'perusrokkia',
 'xzk',
 'ciptaan',
 'allm',
 'schnhauser',
 'mirrormirror',
 'screamo0',
 'groudbreaking',
 'dragged',
 'dunoyer',
 'cantautori',
 'breeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee',
 'electrindi',
 'oficio',
 'fulvia',
 'chiildhoodsongs',
 'bitterblue',
 'smpho',
 'skrik',
 'beging',
 'klfd',
 'q112fh',
 'heijari',
 'havaya',
 'acid',
 'bouger',
 'recuerdos',
 'liljoew',
 'darkseed',
 'femcs',
 'lige',
 'rmarx',
 'tumbos',
 'industrialtrance',
 'creamyjazz',
 'mengamuk',
 'przytulance',
 'toocoolforlocalband',
 'mound',
 'nagu',
 'ebulient',
 'imagining',
 'codec',
 'titi',
 'parsisiusti',
 'kappe',
 'tentaclepr0n',
 'hedningarna',
 'grinderman',
 'charmsucky',
 'heimnar',
 'aviatory',
 '2wave',
 'entersandman',
 'grapevine',
 'potentiale',
 'fama',
 'alternatvie',
 'ryja',
 'naive',
 'monotones',
 'adamblanchardgetthis',
 'androids',
 'crash',
 'berimbau',
 'unconventional',
 'indiespunk',
 'kamuki',
 'ambientspare',
 'ailton',
 'neito',
 'undercurrent',
 'andiepam',
 'trampo',
 'ancestral',
 'bohannon',
 'radau',
 'punches',
 'alm1ghty',
 'siisti',
 'blkkeys',
 'sbng',
 'konnyu',
 'papacito',
 'eri943',
 'whate',
 'oyif',
 'hamerfall',
 'neocave',
 'smooooooth',
 '1900s',
 'jesuuuuuschristfuckinghell',
 'superbeasto',
 'grugne',
 'duffy',
 'chaquico',
 'baju',
 'heardl',
 'amazzing',
 'piz',
 'sexaaay',
 'scarsi',
 'nakibird',
 'mikefav32',
 'a04',
 'screwed',
 'paralizza',
 'kropla',
 'sought',
 'queimadas',
 'mordha',
 'pou',
 'ozn',
 'smiles',
 'lyrichahah',
 'ingleesa',
 'chlixit',
 'songcraft',
 'briz',
 'rockstories',
 'smokemarijuana',
 'mutantesmu',
 'zdark',
 'fascinating',
 'prostreet',
 'loveloveloveloveit',
 'crucifijo',
 'foss',
 'guitarrear',
 'hammil',
 'ambertek',
 'boxish',
 'betasim',
 'fvhaggard',
 'emooooooooooooooooo',
 'mestizajeando',
 'vangace',
 'daion705',
 'sdmc',
 'gutta',
 'biopoly',
 'aphextwin',
 'marginal',
 'thedoc',
 'sydstats',
 'clasicall',
 'stueck',
 'anpe',
 'pleurer',
 'maneras',
 'collectionmy',
 'caqrbine',
 'dorsia',
 'drugstore',
 'clews',
 'hellsboutique',
 'houseman',
 'gosty',
 'looooooooooooooooooooooooove',
 'denzel',
 'erectric',
 'svaraste',
 'sologasm',
 'sparklemotion',
 'u5',
 'keiko02',
 'ikad',
 'moonrise',
 'captain',
 'jinglee',
 'spurt',
 'outter',
 'cuddletastic',
 'luzne',
 'gosp',
 'tpsytrance',
 'jjjjjjjjjjjjjjeeeeeeeeeeejjeejjejjeej',
 'alkmucke',
 'kmz',
 'lerenmijnheren',
 'delat',
 'rhys',
 'oyku',
 'sister',
 'jalilly',
 'nauman',
 'dancehallshatamessi',
 'crail',
 'balda',
 'roadrunner',
 'luvs',
 'inrtuders',
 'been102',
 'hyponotic',
 'ihaveseenyoubefore',
 'hier',
 'bulls',
 'rikiparty',
 '1969radio',
 'presa',
 'ooooooooooooooooooooooooooooooooooooooo',
 'jcanale',
 'owleygirl',
 'mirroball',
 'roughshod',
 'nextdecade',
 'schwubbelig',
 'chooooooooooon',
 'promiise',
 'abandon',
 'brhg',
 'eyeball',
 'theatrebridge',
 'tarre',
 'primitives',
 'fazujace',
 'wellness',
 'hatin',
 'acostarse',
 'myusa',
 'gimmiesomo',
 'defiant',
 'gr8t',
 'myhome',
 'serien',
 'bailable',
 'blooms',
 'knoc',
 'plasma',
 'pseudorock',
 'linijki',
 'dinlen',
 'bonham',
 'rezolutne',
 'arlindo',
 'hayko',
 'tich',
 'horrorfilmmusik',
 'fggggg',
 'saglar',
 'purandaradasa',
 'shoris',
 'kolifornija',
 'cherixin',
 'xiaxian',
 'ukhiphop',
 'traurig',
 'franchutecanadian',
 'demes',
 'vampiry',
 'berliner',
 'rotle',
 'sancuary',
 'turlu',
 'ligar',
 'taek',
 'asturies',
 'doleur',
 'ease',
 'mathijs',
 'loungelic',
 'differed',
 'cattaneo',
 'govertgoudglans',
 'speel',
 'ethnotrance',
 'primrose',
 'rainawaytown',
 'hallelujiah',
 'jpunk',
 'prettymuch',
 'ottima',
 'enrici',
 'slosh',
 'parketilla',
 'finskii',
 'arier',
 'woodcutters',
 'astray',
 'sup',
 'threesixty',
 'superguat',
 'arjen',
 'krzyczysz',
 'chikka',
 'atercipelados',
 'cxcxc',
 'osfold',
 'boooo',
 'pomponette',
 'casete',
 'clix',
 'wt40',
 'rockzinho',
 'paedophilia',
 'electrofeelings',
 'rabow',
 'allsopp',
 'mamaa',
 '112612',
 'ok10',
 'lamb_fav',
 'veikala',
 'kmem1100',
 'contrario',
 '205',
 'britfolk',
 'parfait',
 'sonati',
 'roaddog23',
 'inspirujace',
 'hp',
 'airspace',
 'sexyvoices',
 'cleaveland',
 'briffett',
 'waa',
 'sommerregen',
 'aetzend',
 'hollyradio',
 'tung',
 'fusiion',
 'diefenbach',
 'jebesh',
 'gundanytt',
 'searchmore',
 'maenpaa',
 'kinsey',
 'ammia',
 'lopu',
 'electroboogie360',
 'dxm',
 'vampiryc',
 'alices',
 'dibil',
 'warrenhill',
 'hain',
 'vacuuming',
 'parlantes',
 'murderer',
 'zuglied',
 'brasil2',
 'hamrony',
 'baddis',
 'ringish',
 'schooeen',
 'hidria',
 'graft',
 'lottery',
 'hi10',
 'ambar',
 'comrade',
 'devamachine',
 'lovedbygdchill',
 'oop',
 'groovypanda',
 'metallista',
 'nala',
 'bossanovastyle',
 'posthop',
 'tow',
 'asw',
 'goldminds',
 'psyhodelic',
 'heard',
 'veshi',
 'seriozni',
 'easyy',
 'robledillo',
 'paleta',
 'humourous',
 'rancing',
 'hastetheday',
 'ruiz',
 'absorption',
 'mulle',
 'bo',
 'kiisu',
 'super8',
 'lovg',
 'threadless',
 'manuella',
 'gaelle',
 'parliment',
 'deepclassic',
 'kekekekekeek',
 'nolicom',
 'euphamism',
 'acerlo',
 'bassist',
 'dotchcity',
 'masis',
 'iceflames',
 'difinition',
 'carnatic',
 'finini',
 'staubtrocken',
 'juaaaaaaa',
 'analogies',
 'gimma',
 'yogalounge',
 'eelsoul',
 'beckers',
 'antagonism',
 'riseandshine',
 'amazine',
 'cobraaa',
 'whyt',
 'sluitwell',
 'janvier',
 'orisha',
 'emerica',
 'longstrangeday',
 'tempetuous',
 'tranquilote',
 'lanarkshire',
 'homosapiens',
 'yolculari',
 'rhinoplasty',
 'whatami',
 'pazuzu',
 'maravillioso',
 'danmce',
 'angst',
 'bobdylanesque',
 'kilsyth',
 'lern',
 'rechtvaardig',
 'airp',
 'thegoods',
 'robopop',
 'chrizzi',
 'symhony',
 'revienta',
 'edita',
 'karuna',
 'summerimte',
 'slowpaly',
 'blouson',
 'patochman',
 'zgromadzenie',
 'zuvin',
 'niekais',
 'qaf',
 'rabgibi',
 'dualde',
 'vecinidad',
 'axperimental',
 'smgalvan',
 'frisbee',
 'szczegolne',
 'kloser',
 'wraugh',
 'trebala',
 'pissedoff',
 'lyly',
 'voet',
 'weiberlektro',
 'chabad',
 'gills',
 'sereno',
 'ghospel',
 'titto',
 'chumaaaaaaaaaaaaaa',
 'femalepower',
 'guapo',
 'anotherexistence',
 'ifm',
 'mecca',
 'indulged',
 'emp',
 'loogeetoremember',
 'omgtslmm',
 'autosound',
 'hutt',
 'eha',
 'passion',
 'monika',
 '4armindosalvador',
 'escuela',
 'staves',
 'gahan',
 'tyler',
 'alakaoomasaakkaaburahhhsigmaramamsima',
 'gnijacy',
 'dyshi',
 'tediumtediumtedium',
 'arvika07',
 'tornaado',
 'rabbitsongs',
 'lora',
 'eskil',
 'eeh',
 'tracksilove',
 'zolinal',
 'sooner',
 'listeningdownbeat',
 'avenir',
 'donoso',
 'wcw',
 'felicita',
 '130102',
 'gimore',
 'pinapple',
 'me01',
 'indistrial',
 'gat',
 'lulzy',
 'luwli',
 'manigance',
 'nomal',
 'anazi',
 'flykkiller',
 'engvall',
 'keywordlinks',
 'generate',
 'brownel',
 'doing',
 'shk3',
 'firt',
 'adicting',
 'ghom',
 'jonvangelis',
 'electrosynth',
 'trav',
 'blaskapelle',
 'essayage',
 'crocker',
 'wans',
 'hihhulimeininki',
 'unfaitgful',
 'hemels',
 'tornqvist',
 'rhmc',
 'apreciando',
 'roxo',
 'bubblz',
 'hieras',
 'generaltunes',
 'antchpop',
 'hing',
 'mixage',
 'galvanisme',
 'mrauuuu',
 'groul',
 'drab',
 'desengano',
 'justthey',
 'europunk',
 'trolls',
 'favoritefavoritefavorite',
 'orch',
 'mobral',
 'rozkmina',
 'cecile',
 'salsaflute',
 'chyba',
 'kellaway',
 'willget',
 'espnol',
 'ascoltata',
 'digest',
 'tenenenenet',
 'irrenanstalt',
 'kropka',
 'shoutbox',
 'dejamos',
 'douleur',
 'alepupa',
 'literatec',
 'apolda',
 'ajentina',
 'funkyfolk',
 'classe',
 'ventos',
 'demolicated',
 'pedaly',
 'gretyl',
 'nunerd',
 'caddyshack',
 'tailors',
 'uitsteekend',
 'verdna',
 'deadeyes',
 'barika',
 'chicho',
 'ranchofresa',
 '200703',
 'pano',
 'kardemumma',
 'perofelia',
 'stepdad',
 'klibi',
 'uppdate',
 'gozyasim',
 'peppermints',
 'melorock',
 'pussies',
 'starlight',
 'progressiivista',
 'eaegasm',
 'ollies',
 'someting',
 'muisto',
 'no18',
 'remember',
 'bco',
 'audioscrobbler',
 'catalogue',
 'alittlstrange',
 'ashlyn',
 'surfpsychobilly',
 'slowcored',
 'rodrigues',
 'plume',
 'haidhausen',
 'chillmix',
 'enorrmes',
 'zenobi',
 'ancient',
 'sommin',
 'sylvia',
 'gutless',
 'melodians',
 'soffya',
 'sacks',
 'jblvr',
 'ntvg',
 'harmonizer',
 'loyd',
 '10x10',
 'zakir4',
 'kutles',
 'existia',
 '17050',
 'dgbustoscl',
 'civic',
 '20er',
 'dreamish',
 'aghwwwwwww',
 'pysgoa',
 'remained',
 'killrockstars',
 'wymiata',
 'gardflang',
 'affection',
 'no94',
 'zimbo',
 'httpwwwlastfmlistenartistjames2520brownsimilarartists',
 'fliddler',
 'babel',
 'iwc',
 '333333',
 'biff',
 'bln',
 'metaconnection',
 'trasch',
 'gamemusic',
 'embarrasing',
 'eksperiementelt',
 'rustycanuckdreamlist',
 'elecpop',
 'opere',
 'godgammel',
 'shastz',
 'eyw',
 'sumlin',
 'bawbag',
 'synchro001',
 'woh',
 'hiiiiii',
 'arch',
 'nazametku',
 'abcdef',
 'psychodela',
 'ducktails',
 'jazzyfunkhouse',
 'martinh',
 'omalley',
 'altergoth',
 'interested',
 'bobitto',
 'nwobh',
 'brainy',
 'vism',
 'gooooooooooood',
 'kuddi',
 'ep33n',
 'ucze',
 'akela',
 'sodomize',
 'httpwwwlastfmmusicthebloodbrothersloverhymeswithhideouscarwreck',
 'kickedassedness',
 'rancy',
 'olvicarte',
 'pitam',
 'thesoundofphiladelphia',
 'swdeish',
 'meine80iger',
 'adans',
 'avantjazz',
 'esquina',
 'bonemill',
 'patto',
 'friedhof',
 'absolutefavouret',
 'lulea',
 'kt',
 'zmilk',
 'harrisonian',
 'pmt',
 'crabbucket',
 'kuvia',
 'bonk',
 'awesomevoice',
 'sprano',
 'rikki',
 'liraelle',
 'catches',
 'hounddog',
 'aif',
 'eyeonkope',
 'fghfghg',
 'biyou',
 'metaller',
 'pointe',
 'nostrakraehe',
 'stoppedie',
 'sammyhaggar',
 'toho',
 'torso',
 'money',
 'remover',
 'coulda',
 'snowpatrol',
 'flexatone',
 '1940',
 'golpe',
 '130',
 'miladstein',
 'savion',
 '860',
 'phoenix',
 'oozzoozzmmsskk',
 'appreciates',
 'coverisbetter',
 'rhymefest',
 'banjo',
 'menschliche',
 'humankindness',
 'chillaut',
 'polner',
 'keparadz',
 'goldies',
 'paratiizizzaaa',
 'pwer',
 'odio',
 'apsun',
 'nemrem',
 'flummigt',
 'aca',
 'ladycore',
 'budgie1234',
 'netherland',
 'baltimore',
 'morska',
 'resembles',
 'novacain',
 'accusing',
 'bazoo',
 'revolverhelden',
 'prance',
 'pram',
 'preparator',
 'blackgrind',
 'hellmutts',
 'rockabilly',
 'biddy',
 'akkon',
 'ftm',
 'freundschaft',
 'cliff',
 'cdlm',
 'maihar',
 'ignant',
 'gbu']

In [6]:
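# Create dataframe of the nearest neighbours of "electronic" with their counts.
# (Disabled: it reads `ft_counts`, whose CSV load is commented out in the
# load-data cell.)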
#wv_model.most_similar("electronic", topn=10)
#top = ft_model.most_similar("electronic", topn=100)

#df = pd.DataFrame(columns=('tag_name', 'sim', 'count', 'neighbours_count', 'is_neighbour_count'))

#for i, row in enumerate(top):
#    tag = row[0]
#    sim = row[1]
#    count = ft_counts[ft_counts.tag_name == tag]["count"].values[0]
#    incount = ft_counts[ft_counts.tag_name == tag]["is_neighbour_count"].values[0]
#    nhcount = ft_counts[ft_counts.tag_name == tag]["neighbour_count"].values[0]

#    df.loc[len(df)] = [tag, sim, count, nhcount, incount]

    # if count == ncount:
    #     df.loc[len(df)] = [tag, sim, count, nhcount, incount]

In [7]:
#from sklearn.neighbors import LocalOutlierFactor
#from sklearn.ensemble import IsolationForest

#lof = LocalOutlierFactor(n_neighbors=20)
#lof_pred = lof.fit_predict(vecs)

#iso = IsolationForest()
#iso.fit(vecs)
#iso_pred = iso.decision_function(vecs)

In [8]:
# Plotting data TSNE
# t-SNE is stochastic; a fixed random_state makes the 2-D layout reproducible
# across kernel restarts.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(vecs)

# One row per word: 2-D coordinates plus the word itself (hover text in the
# plot). Assigning the column directly raises if the lengths disagree instead
# of silently padding with NaN.
data = pd.DataFrame(X_tsne, columns=['x', 'y'])
data['word'] = used_vocab

In [9]:
# Scatter plot of the t-SNE projection; hovering a marker shows its word.
trace = go.Scatter(
    x=data['x'],
    y=data['y'],
    mode='markers',
    text=data['word'],
    # Optional colouring by outlier score; needs `iso_pred` from the
    # (currently disabled) IsolationForest cell.
    # marker=dict(
    #     size='6',
    #     color=iso_pred,
    #     colorscale='Viridis',
    #     showscale=True
    # )
)

layout = go.Layout(title='t-SNE projection of word vectors')

# The trace list is passed directly instead of being assigned to `data`, so
# the DataFrame is not clobbered and this cell can be re-run on its own.
# Plot and embed in ipython notebook!
py.iplot(go.Figure(data=[trace], layout=layout), filename='basic-scatter')



In [10]:
# Plotting data PCA
# Uses its own names so the t-SNE estimator and embedding (`tsne`, `X_tsne`)
# are not overwritten by a PCA result.
# random_state is fixed because scikit-learn may select the randomized SVD
# solver for an input of this size.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(vecs)

# One row per word: the two principal-component coordinates plus the word
# itself (hover text in the plot).
data = pd.DataFrame(X_pca, columns=['x', 'y'])
data['word'] = used_vocab

In [11]:
# Scatter plot of the PCA projection; hovering a marker shows its word.
trace = go.Scatter(
    x=data['x'],
    y=data['y'],
    mode='markers',
    text=data['word'],
    # Optional colouring by outlier score; needs `iso_pred` from the
    # (currently disabled) IsolationForest cell.
    # marker=dict(
    #     size='6',
    #     color=iso_pred,
    #     colorscale='Viridis',
    #     showscale=True
    # )
)

layout = go.Layout(title='PCA projection of word vectors')

# The trace list is passed directly instead of being assigned to `data`, so
# the DataFrame is not clobbered and this cell can be re-run on its own.
# Plot and embed in ipython notebook!
py.iplot(go.Figure(data=[trace], layout=layout), filename='basic-scatter')



In [12]:
# Plotting data SVD
# Uses its own names so the t-SNE estimator and embedding (`tsne`, `X_tsne`)
# are not overwritten by an SVD result.
# TruncatedSVD defaults to a randomized solver, so random_state is fixed to
# keep the projection reproducible.
svd = TruncatedSVD(n_components=2, random_state=42)
X_svd = svd.fit_transform(vecs)

# One row per word: the two SVD coordinates plus the word itself (hover text
# in the plot).
data = pd.DataFrame(X_svd, columns=['x', 'y'])
data['word'] = used_vocab

In [13]:
# Scatter plot of the truncated-SVD projection; hovering a marker shows its word.
trace = go.Scatter(
    x=data['x'],
    y=data['y'],
    mode='markers',
    text=data['word'],
    # Optional colouring by outlier score; needs `iso_pred` from the
    # (currently disabled) IsolationForest cell.
    # marker=dict(
    #     size='6',
    #     color=iso_pred,
    #     colorscale='Viridis',
    #     showscale=True
    # )
)

layout = go.Layout(title='Truncated SVD projection of word vectors')

# The trace list is passed directly instead of being assigned to `data`, so
# the DataFrame is not clobbered and this cell can be re-run on its own.
# Plot and embed in ipython notebook!
py.iplot(go.Figure(data=[trace], layout=layout), filename='basic-scatter')