流距离嵌入

  • window: 5,即同一句话中,只统计相距不超过 5 的词对。
  • 统计时考虑词序,例如:“We may encounter many defeats, but we must not be defeated.” 中,encounter-defeats 词对的计数会 +1,而 defeats-encounter 词对不会 +1。

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the two precomputed matrices from their .npy dumps. Later cells index
# both arrays with the same (row, col) pairs.
avgdist_file = './data/dist_avg.npy'
avgdist = np.load(avgdist_file)

count_file = './data/count.npy'
count = np.load(count_file)

In [3]:
# Collapse the matrix into a 1-D vector.
avgdist_flat = avgdist.flatten()

# Keep only the non-zero entries, then sort them ascending. The boolean mask
# returns a fresh copy, so the in-place sort leaves avgdist_flat untouched.
avgdist_flat_nonz = avgdist_flat[avgdist_flat != 0]
avgdist_flat_nonz.sort()

In [4]:
# Line plot of the sorted non-zero average distances (x axis = rank).
# Works on the current Axes directly, which is what the pyplot wrappers
# delegate to.
ax = plt.gca()
ax.plot(avgdist_flat_nonz)
ax.set_title('sorted data sequence')
ax.set_ylabel('value')
plt.show()



In [5]:
# Histogram of the non-zero average distances.
#
# np.linspace returns bin EDGES, so n_bins bins require n_bins + 1 edges.
# The outer edges are rounded outward (floor of the minimum, ceil of the
# maximum) so that every sample lies inside the binned range; rounding
# inward would silently leave out samples beyond the shrunken range.
n_bins = 30
bins = np.linspace(np.floor(np.min(avgdist_flat_nonz)),
                   np.ceil(np.max(avgdist_flat_nonz)),
                   n_bins + 1)

plt.hist(avgdist_flat_nonz, bins=bins, alpha=0.5)
plt.title('Histogram')
plt.xlabel('Value (30 evenly spaced bins)')
plt.ylabel('Count')

plt.show()



In [6]:
# Summary statistics over the non-zero average distances.
# ndarray.var() uses NumPy's default ddof=0, i.e. the population variance.
avgdist_mean = avgdist_flat_nonz.mean()
avgdist_var = avgdist_flat_nonz.var()

# Single parenthesised argument: prints the same text as the print statement.
print('均值: %f' % avgdist_mean)
print('方差: %f' % avgdist_var)


均值: 2.859198
方差: 0.551506

In [7]:
# Smallest non-zero average distance and the positions of every word pair
# that attains it.
avgdist_min = np.min(avgdist_flat_nonz)

# Compare against the computed minimum rather than a literal 1.0, so the
# reported count stays correct if the data's minimum is not exactly 1.0.
# avgdist_min is an element of avgdist, so exact float equality is safe here.
# min_idx is a tuple of index arrays, one per axis of avgdist.
min_idx = np.where(avgdist == avgdist_min)

print('最小值:%f' % avgdist_min)
print('平均距离是:%f 的词对个数:%d' % (avgdist_min, len(min_idx[0])))


最小值:1.000000
平均距离是:1.000000 的词对个数:12814

结果分析

avg_dist 值小的词对,可能出现的频次并不高,只是每次都紧挨着出现。因此改用 count 从大到小排序,结果更有实际意义。


In [11]:
from train import Vocabulary

# Project-local helper; presumably maps a matrix index back to its word
# string via get_word() -- confirm against train.py.
vocab = Vocabulary()

# One record per word pair located at min_idx: the pair rendered as
# "first-second", plus its entry in the count matrix.
words = [
    {'key': '%s-%s' % (vocab.get_word(row), vocab.get_word(col)),
     'cnt': count[row, col]}
    for row, col in zip(min_idx[0], min_idx[1])
]

# Highest count first; the sort is stable, so ties keep their original order.
words = sorted(words, key=lambda rec: rec['cnt'], reverse=True)

# Show the 100 most frequent of those pairs.
for rec in words[:100]:
    print(rec)


{'cnt': 1639.0, 'key': 'foo-fighters'}
{'cnt': 1593.0, 'key': 'originally-computed'}
{'cnt': 1370.0, 'key': 'adequate-yearly'}
{'cnt': 1289.0, 'key': 'eyed-peas'}
{'cnt': 1239.0, 'key': 'atmospheric-administration'}
{'cnt': 1068.0, 'key': 'garden-merit'}
{'cnt': 992.0, 'key': 'costa-mesa'}
{'cnt': 876.0, 'key': 'palestine-exploration'}
{'cnt': 866.0, 'key': 'lonely-hearts'}
{'cnt': 838.0, 'key': 'electronic-frontier'}
{'cnt': 838.0, 'key': 'sage-publications'}
{'cnt': 673.0, 'key': 'simple-minds'}
{'cnt': 580.0, 'key': 'deeply-discounted'}
{'cnt': 577.0, 'key': 'costs-exemption'}
{'cnt': 539.0, 'key': 'jimmy-eat'}
{'cnt': 516.0, 'key': 'spider-catalog'}
{'cnt': 481.0, 'key': 'coach-composition'}
{'cnt': 479.0, 'key': 'statutory-instrument'}
{'cnt': 477.0, 'key': 'stanley-cups'}
{'cnt': 466.0, 'key': 'aid-ratio'}
{'cnt': 459.0, 'key': 'naughty-dog'}
{'cnt': 457.0, 'key': 'trade-paperbacks'}
{'cnt': 448.0, 'key': 'teen-hunger'}
{'cnt': 446.0, 'key': 'merry-wives'}
{'cnt': 445.0, 'key': 'restrictions-index'}
{'cnt': 439.0, 'key': 'nathan-bedford'}
{'cnt': 406.0, 'key': 'blair-witch'}
{'cnt': 377.0, 'key': 'diamond-harbour'}
{'cnt': 361.0, 'key': 'vampire-weekend'}
{'cnt': 351.0, 'key': 'index-unless'}
{'cnt': 350.0, 'key': 'hudson-soft'}
{'cnt': 349.0, 'key': 'ar-greater'}
{'cnt': 349.0, 'key': 'purple-hearts'}
{'cnt': 347.0, 'key': 'oliver-hazard'}
{'cnt': 343.0, 'key': 'cam-newton'}
{'cnt': 339.0, 'key': 'slightly-injured'}
{'cnt': 339.0, 'key': 'lie-detector'}
{'cnt': 318.0, 'key': 'cooking-vinyl'}
{'cnt': 318.0, 'key': 'capability-brown'}
{'cnt': 308.0, 'key': 'combination-thereof'}
{'cnt': 303.0, 'key': 'gregory-terrace'}
{'cnt': 302.0, 'key': 'sunshine-review'}
{'cnt': 300.0, 'key': 'gets-rid'}
{'cnt': 299.0, 'key': 'hood-canal'}
{'cnt': 290.0, 'key': 'leisure-software'}
{'cnt': 290.0, 'key': 'seeks-parental'}
{'cnt': 289.0, 'key': 'kitty-hawk'}
{'cnt': 282.0, 'key': 'tiny-mix'}
{'cnt': 281.0, 'key': 'scores-hearing'}
{'cnt': 278.0, 'key': 'hay-fever'}
{'cnt': 277.0, 'key': 'attention-span'}
{'cnt': 277.0, 'key': 'ww-norton'}
{'cnt': 273.0, 'key': 'jim-beam'}
{'cnt': 272.0, 'key': 'slide-rule'}
{'cnt': 271.0, 'key': 'kitty-empire'}
{'cnt': 271.0, 'key': 'willow-bunch'}
{'cnt': 270.0, 'key': 'alternatives-sunshine'}
{'cnt': 270.0, 'key': 'reasonably-priced'}
{'cnt': 269.0, 'key': 'bible-belt'}
{'cnt': 269.0, 'key': 'oliver-reed'}
{'cnt': 266.0, 'key': 'ak-bars'}
{'cnt': 264.0, 'key': 'funk-railroad'}
{'cnt': 264.0, 'key': 'miracle-network'}
{'cnt': 262.0, 'key': 'caroline-sullivan'}
{'cnt': 259.0, 'key': 'sufficiently-impressed'}
{'cnt': 258.0, 'key': 'fisher-cats'}
{'cnt': 247.0, 'key': 'meal-scheme'}
{'cnt': 246.0, 'key': 'global-hawk'}
{'cnt': 245.0, 'key': 'scoring-summary'}
{'cnt': 242.0, 'key': 'pro-shops'}
{'cnt': 239.0, 'key': 'demographic-breakdown'}
{'cnt': 238.0, 'key': 'distribute-animal'}
{'cnt': 234.0, 'key': 'roll-rank'}
{'cnt': 234.0, 'key': 'velvet-worm'}
{'cnt': 231.0, 'key': 'caroline-garcia'}
{'cnt': 229.0, 'key': 'citizen-deaths'}
{'cnt': 227.0, 'key': 'classification-symbol'}
{'cnt': 226.0, 'key': 'latin-lover'}
{'cnt': 224.0, 'key': 'sick-bay'}
{'cnt': 223.0, 'key': 'asking-alexandria'}
{'cnt': 223.0, 'key': 'bedford-blues'}
{'cnt': 221.0, 'key': 'berlin-thunder'}
{'cnt': 220.0, 'key': 'samsung-lions'}
{'cnt': 219.0, 'key': 'fully-laden'}
{'cnt': 219.0, 'key': 'curtis-cup'}
{'cnt': 213.0, 'key': 'kid-ink'}
{'cnt': 212.0, 'key': 'qt-interval'}
{'cnt': 211.0, 'key': 'holy-soap'}
{'cnt': 211.0, 'key': 'ace-hood'}
{'cnt': 210.0, 'key': 'slide-hampton'}
{'cnt': 209.0, 'key': 'anytime-soon'}
{'cnt': 206.0, 'key': 'dying-bride'}
{'cnt': 206.0, 'key': 'heavily-implied'}
{'cnt': 205.0, 'key': 'millennium-challenge'}
{'cnt': 204.0, 'key': 'monster-magnet'}
{'cnt': 203.0, 'key': 'badly-drawn'}
{'cnt': 201.0, 'key': 'beaver-wars'}
{'cnt': 201.0, 'key': 'fin-disc'}
{'cnt': 200.0, 'key': 'lawyers-guild'}
{'cnt': 200.0, 'key': 'gonna-miss'}

In [13]:
# Report the 100 word pairs with the largest entries in the count matrix,
# together with their average distance.
top_k = 100
count_flat = count.flatten()

# argpartition yields the flat positions of the top_k largest counts (in no
# particular order). Working with positions rather than values keeps pairs
# that share the same count distinct, and avoids re-scanning the whole
# matrix once per value to find where it came from.
top_pos = np.argpartition(count_flat, -top_k)[-top_k:]
val = count_flat[top_pos]

# Convert flat positions back to (row, col) indices of the 2-D matrix.
rows, cols = np.unravel_index(top_pos, count.shape)

words_cnt = []
for row, col in zip(rows, cols):
    words_cnt.append({
        'key': '%s-%s' % (vocab.get_word(row), vocab.get_word(col)),
        'cnt': count[row, col],
        'avgdist': avgdist[row, col],
    })

# Highest count first.
words_cnt.sort(key=lambda rec: rec['cnt'], reverse=True)

for rec in words_cnt:
    print(rec)


{'cnt': 1102693.0, 'key': 'united-states', 'avgdist': 1.0093471165591874}
{'cnt': 760376.0, 'key': 'new-york', 'avgdist': 1.0403366229339168}
{'cnt': 468649.0, 'key': 'high-school', 'avgdist': 1.0688297638531181}
{'cnt': 392738.0, 'key': 'world-war', 'avgdist': 1.0189693892620526}
{'cnt': 276908.0, 'key': 'may-refer', 'avgdist': 1.1273744348303407}
{'cnt': 265955.0, 'key': 'also-known', 'avgdist': 1.0779455170987573}
{'cnt': 211612.0, 'key': 'new-zealand', 'avgdist': 1.0130852692663932}
{'cnt': 201213.0, 'key': 'war-ii', 'avgdist': 1.0226227927618992}
{'cnt': 200990.0, 'key': 'los-angeles', 'avgdist': 1.0055077367033185}
{'cnt': 196407.0, 'key': 'world-ii', 'avgdist': 2.0065985428217936}
{'cnt': 195522.0, 'key': 'new-city', 'avgdist': 2.0166477429649858}
{'cnt': 192153.0, 'key': 'first-time', 'avgdist': 1.0718802204493294}
{'cnt': 190443.0, 'key': 'took-place', 'avgdist': 1.0701942313448118}
{'cnt': 187915.0, 'key': 'york-city', 'avgdist': 1.0241864672857408}
{'cnt': 177112.0, 'key': 'two-years', 'avgdist': 1.1502100365870185}
{'cnt': 165171.0, 'key': 'united-kingdom', 'avgdist': 1.0456496600492822}
{'cnt': 156134.0, 'key': 'made-debut', 'avgdist': 2.7813800965837037}
{'cnt': 147599.0, 'key': 'years-later', 'avgdist': 1.0655695499292002}
{'cnt': 144342.0, 'key': 'air-force', 'avgdist': 1.0444846267891534}
{'cnt': 135557.0, 'key': 'national-team', 'avgdist': 1.6866779288417419}
{'cnt': 130534.0, 'key': 'football-league', 'avgdist': 1.0818024422755756}
{'cnt': 128255.0, 'key': 'prime-minister', 'avgdist': 1.0330045612256833}
{'cnt': 122196.0, 'key': 'summer-olympics', 'avgdist': 1.0257373400111296}
{'cnt': 122174.0, 'key': 'world-cup', 'avgdist': 1.0515412444546304}
{'cnt': 120763.0, 'key': 'new-jersey', 'avgdist': 1.1022829840265644}
{'cnt': 119705.0, 'key': 'years-age', 'avgdist': 2.0365732425546135}
{'cnt': 119549.0, 'key': 'de-la', 'avgdist': 1.1842591740625183}
{'cnt': 118006.0, 'key': 'median-income', 'avgdist': 1.0587004050641493}
{'cnt': 117101.0, 'key': 'san-francisco', 'avgdist': 1.0201193841213996}
{'cnt': 114191.0, 'key': 'three-years', 'avgdist': 1.1805045931815992}
{'cnt': 112655.0, 'key': 'south-africa', 'avgdist': 1.0598553104611423}
{'cnt': 110945.0, 'key': 'civil-war', 'avgdist': 1.0329081977556447}
{'cnt': 110460.0, 'key': 'north-america', 'avgdist': 1.1023085279739273}
{'cnt': 110385.0, 'key': 'rural-district', 'avgdist': 1.6351315849073698}
{'cnt': 109746.0, 'key': 'village-district', 'avgdist': 3.8112915277094381}
{'cnt': 109451.0, 'key': 'hong-kong', 'avgdist': 1.015312788371052}
{'cnt': 109029.0, 'key': 'railway-station', 'avgdist': 1.1718533601151988}
{'cnt': 108805.0, 'key': 'head-coach', 'avgdist': 1.1326685354533339}
{'cnt': 108643.0, 'key': 'best-known', 'avgdist': 1.0095818414439954}
{'cnt': 106882.0, 'key': 'football-team', 'avgdist': 1.1508579555023297}
{'cnt': 103638.0, 'key': 'following-year', 'avgdist': 1.0483992358015399}
{'cnt': 102599.0, 'key': 'world-championships', 'avgdist': 1.6250060916773068}
{'cnt': 100959.0, 'key': 'state-university', 'avgdist': 1.1624223694767182}
{'cnt': 100894.0, 'key': 'national-historic', 'avgdist': 2.6839653497730289}
{'cnt': 100845.0, 'key': 'census-population', 'avgdist': 2.4456839704497}
{'cnt': 100527.0, 'key': 'studio-album', 'avgdist': 1.0718016055388104}
{'cnt': 99717.0, 'key': 'became-first', 'avgdist': 2.1817343080919001}
{'cnt': 99680.0, 'key': 'supreme-court', 'avgdist': 1.0484249598715891}
{'cnt': 99118.0, 'key': 'years-older', 'avgdist': 3.8679250993765009}
{'cnt': 99025.0, 'key': 'average-size', 'avgdist': 1.9921938904317091}
{'cnt': 98915.0, 'key': 'school-school', 'avgdist': 3.4493858363241165}
{'cnt': 98811.0, 'key': 'can-also', 'avgdist': 1.0773598081185294}
{'cnt': 98530.0, 'key': 'also-used', 'avgdist': 1.4339186034710241}
{'cnt': 97546.0, 'key': 'school-district', 'avgdist': 1.1931191437885715}
{'cnt': 97451.0, 'key': 'district-county', 'avgdist': 3.0161619685790808}
{'cnt': 97159.0, 'key': 'one-two', 'avgdist': 2.6380880824215978}
{'cnt': 96703.0, 'key': 'film-directed', 'avgdist': 1.5585038726823366}
{'cnt': 96575.0, 'key': 'may-also', 'avgdist': 1.1641729225990163}
{'cnt': 95517.0, 'key': 'age-older', 'avgdist': 2.0184888553869991}
{'cnt': 94680.0, 'key': 'north-carolina', 'avgdist': 1.0608259400084494}
{'cnt': 94484.0, 'key': 'years-old', 'avgdist': 1.0466322340290419}
{'cnt': 93606.0, 'key': 'national-register', 'avgdist': 1.0252334252077859}
{'cnt': 91420.0, 'key': 'african-american', 'avgdist': 1.9708160140013127}
{'cnt': 91377.0, 'key': 'film-festival', 'avgdist': 1.0901211464591747}
{'cnt': 90195.0, 'key': 'one-first', 'avgdist': 3.062486834081712}
{'cnt': 89197.0, 'key': 'album-released', 'avgdist': 2.5591443658419006}
{'cnt': 87747.0, 'key': 'district-district', 'avgdist': 3.2255461725187184}
{'cnt': 85007.0, 'key': 'two-later', 'avgdist': 2.0740527250697003}
{'cnt': 84918.0, 'key': 'national-league', 'avgdist': 1.7398666949292259}
{'cnt': 84478.0, 'key': 'historic-places', 'avgdist': 1.0030303747721301}
{'cnt': 84372.0, 'key': 'register-historic', 'avgdist': 2.0008652159484188}
{'cnt': 84282.0, 'key': 'television-series', 'avgdist': 1.1532474312427328}
{'cnt': 84107.0, 'key': 'first-season', 'avgdist': 1.7450866158583709}
{'cnt': 83373.0, 'key': 'new-south', 'avgdist': 1.2435200844398067}
{'cnt': 83246.0, 'key': 'register-places', 'avgdist': 2.9999639622324197}
{'cnt': 83244.0, 'key': 'general-election', 'avgdist': 1.0228364807073183}
{'cnt': 82775.0, 'key': 'national-places', 'avgdist': 3.9974267592872246}
{'cnt': 81946.0, 'key': 'south-wales', 'avgdist': 1.0285797964513217}
{'cnt': 81269.0, 'key': 'early-century', 'avgdist': 2.1853105120033467}
{'cnt': 80301.0, 'key': 'music-video', 'avgdist': 1.0784797200532994}
{'cnt': 79868.0, 'key': 'world-championship', 'avgdist': 1.6569840236390043}
{'cnt': 79737.0, 'key': 'first-two', 'avgdist': 1.5420193887404843}
{'cnt': 79282.0, 'key': 'first-round', 'avgdist': 1.1063797583310209}
{'cnt': 78219.0, 'key': 'four-years', 'avgdist': 1.1289456525908028}
{'cnt': 77397.0, 'key': 'can-used', 'avgdist': 2.2278512087031799}
{'cnt': 77107.0, 'key': 'soviet-union', 'avgdist': 1.0200241223235245}
{'cnt': 77043.0, 'key': 'washington-dc', 'avgdist': 1.0245966538166997}
{'cnt': 75994.0, 'key': 'human-rights', 'avgdist': 1.0461483801352738}
{'cnt': 75987.0, 'key': 'debut-album', 'avgdist': 1.2821403661152566}
{'cnt': 75756.0, 'key': 'two-one', 'avgdist': 2.9820740271397645}
{'cnt': 74052.0, 'key': 'every-females', 'avgdist': 2.0007832334035545}
{'cnt': 73820.0, 'key': 'can-found', 'avgdist': 2.1565564887564346}
{'cnt': 73732.0, 'key': 'many-years', 'avgdist': 1.1092470026582759}
{'cnt': 73710.0, 'key': 'roman-catholic', 'avgdist': 1.0117351784018451}
{'cnt': 73566.0, 'key': 'second-war', 'avgdist': 2.0189897507000518}
{'cnt': 73307.0, 'key': 'hall-fame', 'avgdist': 2.0010230946567176}
{'cnt': 72635.0, 'key': 'award-best', 'avgdist': 2.1954567357334618}
{'cnt': 72531.0, 'key': 'took-part', 'avgdist': 1.1381478264466227}
{'cnt': 71868.0, 'key': 'several-including', 'avgdist': 2.7977959592586408}
{'cnt': 71749.0, 'key': 'five-years', 'avgdist': 1.1184128001783997}

In [ ]: