In [1]:
import __init__
import pandas as pd
import cpLib.conceptDB as db
import cpLib.conceptExtraction as cpe
import matplotlib.pyplot as plt
from pylab import rcParams
%matplotlib inline
In this notebook, we study the relationship between vector dimensions and the semantic field of concepts.
To do so, we examine how the values of each dimension are distributed over a domain-specific word list.
In [6]:
# Read one word list per domain, in this order: animal, plant, vehicle.
# Each file is opened in a `with` block so its handle is closed as soon as the
# content has been read, instead of staying open until garbage collection.
domainWordList = []
for domainPath in ('../../data/domain/luu_animal.txt',
                   '../../data/domain/luu_plant.txt',
                   '../../data/domain/luu_vehicle.txt'):
    with open(domainPath) as domainFile:
        domainWordList.append(domainFile.read().splitlines())
def buildCptDf(d, domain, polar=False):
    """Build a DataFrame with one row per concept extracted for `domain`.

    The concepts come from ``cpe.buildConceptList(d, domain, True)``. Each row
    holds a concept's ``vect`` and is labelled with the concept's ``word``.

    d      -- database object forwarded to ``cpe.buildConceptList``
    domain -- word list forwarded to ``cpe.buildConceptList``
    polar  -- when true, the first component of every vector is dropped
              (presumably the radial coordinate of a polar encoding -- TODO confirm)
    """
    concepts = cpe.buildConceptList(d, domain, True)
    rows = []
    labels = []
    for concept in concepts:
        # Polar vectors lose their leading component; others are kept whole.
        rows.append(concept.vect[1:] if polar else concept.vect)
        labels.append(concept.word)
    return pd.DataFrame(rows, index=labels)
# Vectors loaded from text8_polar.npy, restricted to the first word list
# (the one read from luu_animal.txt); polar=True drops the first component.
polarVocPath = '../../data/voc/npy/text8_polar.npy'
cptDf = buildCptDf(db.DB(polarVocPath), domainWordList[0], polar=True)
cptDf.head(5)  # preview the first five rows
Out[6]:
In [7]:
def stdDim(cptDf):
    """Plot the spread of every dimension and summarise their standard deviations.

    Two figures are produced:
      1. the kernel density estimate (KDE) of each column of `cptDf`, overlaid;
      2. the KDE of the per-column standard deviations.

    Each figure gets its own explicitly created axes, so the second plot never
    lands on top of the first, whatever the matplotlib backend does with
    already-shown figures.

    cptDf -- DataFrame with one column per dimension (as built by buildCptDf)

    Returns the ``describe()`` summary of the per-column standard deviations.
    """
    _, dimAx = plt.subplots()
    dimStds = []
    for dim in cptDf.columns:
        dimensionSerie = cptDf[dim]
        dimStds.append(dimensionSerie.std())
        dimensionSerie.plot(kind='kde', ax=dimAx)
    dimAx.set_title('Value density of each dimension')
    dimAx.set_xlabel('component value')
    plt.show()

    stdSerie = pd.Series(dimStds)
    _, stdAx = plt.subplots()
    stdSerie.plot(kind='kde', ax=stdAx)
    stdAx.set_title('Density of the per-dimension standard deviations')
    stdAx.set_xlabel('standard deviation')
    return stdSerie.describe()
# Plot and summarise the per-dimension spread of the frame currently bound to cptDf.
stdDim(cptDf)
Out[7]:
In [8]:
# Use the first word list (read from luu_animal.txt) as the domain for the cells below.
domain = domainWordList[0]
In [9]:
# text8.npy vectors, every component kept (polar=False).
vocPath = '../../data/voc/npy/text8.npy'
cptDf = buildCptDf(db.DB(vocPath), domain, polar=False)
stdDim(cptDf)
Out[9]:
In [10]:
# text8_polar.npy vectors; polar=True drops the first component of each vector.
vocPath = '../../data/voc/npy/text8_polar.npy'
cptDf = buildCptDf(db.DB(vocPath), domain, polar=True)
stdDim(cptDf)
Out[10]:
In [11]:
# wikiEn-skipgram.npy vectors, every component kept (polar=False).
vocPath = '../../data/voc/npy/wikiEn-skipgram.npy'
cptDf = buildCptDf(db.DB(vocPath), domain, polar=False)
stdDim(cptDf)
Out[11]:
In [12]:
# wikiEn-skipgram_polar.npy vectors; polar=True drops the first component of each vector.
vocPath = '../../data/voc/npy/wikiEn-skipgram_polar.npy'
cptDf = buildCptDf(db.DB(vocPath), domain, polar=True)
stdDim(cptDf)
Out[12]:
Regardless of the selected domain, we observe: