In [1]:

    
import __init__

import pandas as pd

import cpLib.conceptDB as db
import cpLib.conceptExtraction as cpe

import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline

In this notebook, we'll study the relation between dimensions and the semantic field of concepts.

To do so, we'll study how the values of each dimension are distributed over a domain-specific corpus.

Workflow

build domain



In [6]:

    
def _readWordList(path):
    """Read `path` and return its lines as a list of strings, without line endings.

    The file is opened in a `with` block so its handle is closed as soon as
    the content has been read, rather than being left open until garbage
    collection.
    """
    with open(path) as wordFile:
        return wordFile.read().splitlines()


# One word list per domain file, in this order: animal, plant, vehicle.
domainWordList = [_readWordList('../../data/domain/luu_animal.txt'),
                  _readWordList('../../data/domain/luu_plant.txt'),
                  _readWordList('../../data/domain/luu_vehicle.txt')]

def buildCptDf(d, domain, polar=False):
    """Build a DataFrame with one row per concept and one column per vector component.

    The concepts come from `cpe.buildConceptList(d, domain, True)`; each one
    contributes its `vect` as a row and its `word` as the row label.

    d      -- passed through unchanged to `cpe.buildConceptList`
    domain -- passed through unchanged to `cpe.buildConceptList`
    polar  -- when truthy, the first component of every vector is dropped
              (presumably the norm of the polar representation, so that only
              the angles remain -- TODO confirm against cpLib)
    """
    concepts = cpe.buildConceptList(d, domain, True)
    labels = [concept.word for concept in concepts]
    rows = [concept.vect[1:] if polar else concept.vect for concept in concepts]
    return pd.DataFrame(rows, index=labels)

# Build the frame for the first word list from text8_polar.npy
# (polar=True drops the first component), then preview its first five rows.
cptDf = buildCptDf(
    d=db.DB('../../data/voc/npy/text8_polar.npy'),
    domain=domainWordList[0],
    polar=True,
)
cptDf.head(5)









    



71291 loaded from text8_polar
mem usage 54.4MiB
loaded time 0.124932050705 s






    Out[6]:






  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      189
      190
      191
      192
      193
      194
      195
      196
      197
      198
    
  
  
    
      aardvark
      1.570333
      1.607254
      1.676376
      1.520162
      1.540573
      1.533382
      1.468733
      1.591881
      1.570375
      1.459761
      ...
      1.701919
      1.585012
      0.937317
      0.887349
      1.702887
      1.798437
      0.721342
      1.845871
      1.389567
      4.032353
    
    
      abalone
      1.548824
      1.468633
      1.590599
      1.533940
      1.670923
      1.611977
      1.564944
      1.615798
      1.485073
      1.562848
      ...
      1.725316
      1.954154
      0.974342
      0.904230
      1.846595
      1.810580
      1.416131
      2.073856
      2.096982
      4.612715
    
    
      adder
      1.650479
      1.525432
      1.626109
      1.573190
      1.595186
      1.553065
      1.421742
      1.410038
      1.691598
      1.463953
      ...
      1.958727
      1.555460
      1.449096
      1.119370
      1.116009
      1.365480
      2.235364
      1.873697
      2.031797
      6.139337
    
    
      agouti
      1.659057
      1.491807
      1.627996
      1.527551
      1.599855
      1.544088
      1.524868
      1.578460
      1.608976
      1.536019
      ...
      2.264453
      1.377309
      2.035051
      1.394019
      1.411589
      2.755652
      1.461268
      2.385505
      2.810244
      3.158008
    
    
      albatross
      1.594904
      1.518474
      1.660378
      1.389571
      1.638179
      1.625463
      1.490517
      1.568753
      1.488399
      1.572214
      ...
      2.130862
      1.263646
      1.616922
      1.153882
      1.110804
      2.280854
      1.685167
      2.164539
      1.570621
      4.188432
    
  

5 rows × 199 columns

Std study

We plot here:

the density (KDE) of the values of each dimension
the density (KDE) of the per-dimension standard deviations, followed by their summary statistics



In [7]:

    
def stdDim(cptDf):
    """Plot per-dimension densities and summarise the per-dimension std.

    A KDE of the values of every column of `cptDf` is drawn on the current
    figure, which is then shown. A KDE of the column standard deviations is
    drawn afterwards, without an explicit `plt.show()`.

    Returns the `describe()` summary of the column standard deviations,
    which are collected in column order.
    """
    dimensions = [cptDf[label] for label in cptDf.columns]
    spreads = pd.Series([dimension.std() for dimension in dimensions])

    # First figure: one density curve per dimension.
    for dimension in dimensions:
        dimension.plot(kind='kde')
    plt.show()

    # Second figure: density of the standard deviations themselves.
    spreads.plot(kind='kde')
    return spreads.describe()

# Draw the KDE plots for `cptDf` and display the summary of its per-dimension std.
stdDim(cptDf)









    












    Out[7]:





count    199.000000
mean       0.123752
std        0.131581
min        0.058355
25%        0.073311
50%        0.088591
75%        0.122904
max        1.618373
dtype: float64

Domain selection

Here we run the experiment on the animal domain, but it can be run on any domain-specific corpus.



In [8]:

    
# Index 0 of domainWordList is the word list read from luu_animal.txt.
domain = domainWordList[0]

Text8 corpus - Cartesian



In [9]:

    
# Vectors from text8.npy with every component kept (polar=False).
cptDf = buildCptDf(
    d=db.DB('../../data/voc/npy/text8.npy'),
    domain=domain,
    polar=False,
)
stdDim(cptDf)









    



71291 loaded from text8
mem usage 54.4MiB
loaded time 0.169363021851 s






    












    Out[9]:





count    200.000000
mean       0.946024
std        0.069843
min        0.795924
25%        0.896379
50%        0.940876
75%        0.986517
max        1.171432
dtype: float64

Text8 corpus - Polar



In [10]:

    
# Vectors from text8_polar.npy with the first component dropped (polar=True).
cptDf = buildCptDf(
    d=db.DB('../../data/voc/npy/text8_polar.npy'),
    domain=domain,
    polar=True,
)
stdDim(cptDf)









    



71291 loaded from text8_polar
mem usage 54.4MiB
loaded time 0.15756893158 s






    












    Out[10]:





count    199.000000
mean       0.123752
std        0.131581
min        0.058355
25%        0.073311
50%        0.088591
75%        0.122904
max        1.618373
dtype: float64

Wikipedia corpus - Cartesian



In [11]:

    
# Vectors from wikiEn-skipgram.npy with every component kept (polar=False).
cptDf = buildCptDf(
    d=db.DB('../../data/voc/npy/wikiEn-skipgram.npy'),
    domain=domain,
    polar=False,
)
stdDim(cptDf)









    



1388424 loaded from wikiEn-skipgram
mem usage 1.6GiB
loaded time 6.04700517654 s






    












    Out[11]:





count    300.000000
mean       0.234917
std        0.017345
min        0.198909
25%        0.221880
50%        0.231710
75%        0.245841
max        0.299620
dtype: float64

Wikipedia corpus - Polar



In [12]:

    
# Vectors from wikiEn-skipgram_polar.npy with the first component dropped (polar=True).
cptDf = buildCptDf(
    d=db.DB('../../data/voc/npy/wikiEn-skipgram_polar.npy'),
    domain=domain,
    polar=True,
)
stdDim(cptDf)









    



1388424 loaded from wikiEn-skipgram_polar
mem usage 1.6GiB
loaded time 6.00778484344 s






    












    Out[12]:





count    299.000000
mean       0.097820
std        0.131787
min        0.042743
25%        0.056948
50%        0.070294
75%        0.100965
max        2.104748
dtype: float64

Conclusion

Regardless of the selected domain, we observe that:

A few dimensions are more spread out or centered differently, which seems to characterize the particularities of each domain-specific concept
This effect is amplified when we study only the angles, i.e. the norm acts as a form of noise when defining the semantic 'direction' of a given concept

	0	1	2	3	4	5	6	7	8	9	...	189	190	191	192	193	194	195	196	197	198
aardvark	1.570333	1.607254	1.676376	1.520162	1.540573	1.533382	1.468733	1.591881	1.570375	1.459761	...	1.701919	1.585012	0.937317	0.887349	1.702887	1.798437	0.721342	1.845871	1.389567	4.032353
abalone	1.548824	1.468633	1.590599	1.533940	1.670923	1.611977	1.564944	1.615798	1.485073	1.562848	...	1.725316	1.954154	0.974342	0.904230	1.846595	1.810580	1.416131	2.073856	2.096982	4.612715
adder	1.650479	1.525432	1.626109	1.573190	1.595186	1.553065	1.421742	1.410038	1.691598	1.463953	...	1.958727	1.555460	1.449096	1.119370	1.116009	1.365480	2.235364	1.873697	2.031797	6.139337
agouti	1.659057	1.491807	1.627996	1.527551	1.599855	1.544088	1.524868	1.578460	1.608976	1.536019	...	2.264453	1.377309	2.035051	1.394019	1.411589	2.755652	1.461268	2.385505	2.810244	3.158008
albatross	1.594904	1.518474	1.660378	1.389571	1.638179	1.625463	1.490517	1.568753	1.488399	1.572214	...	2.130862	1.263646	1.616922	1.153882	1.110804	2.280854	1.685167	2.164539	1.570621	4.188432