In [9]:
from src import dataset
from collections import defaultdict
import pandas as pd
import logging
from sklearn.svm import LinearSVR
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
# Raise the root logger to INFO so the per-trait progress messages
# emitted during cross-validation (below) show up in the notebook output.
logging.getLogger().setLevel(logging.INFO)
In [3]:
# Load the corpus via the project helper and preview the first rows.
# Later cells rely on columns: lang, text, and the five trait scores
# ext, sta, agr, con, opn.
corpus = dataset.read_corpus()
corpus.head()
Out[3]:
In [4]:
# TF-IDF n-gram features feeding a linear support-vector regressor.
vectorizer = TfidfVectorizer()
regressor = LinearSVR()
pipeline = Pipeline(steps=[('ngram', vectorizer), ('clf', regressor)])
In [17]:
# 10-fold cross-validation of the TF-IDF + LinearSVR pipeline, run
# separately for every language and every personality trait. sklearn's
# 'neg_mean_squared_error' is negated MSE (higher is better); each entry
# in `scores` summarizes the folds as "mean (+/- 2*std)".
TRAIT_COLUMNS = ['ext', 'sta', 'agr', 'con', 'opn']

scores = defaultdict(dict)
for language, subset in corpus.groupby('lang'):
    for trait_name in TRAIT_COLUMNS:
        # Pass format args to logging lazily — the string is only built
        # if the record is actually emitted.
        logging.info("running cv on pipeline for %s on %s",
                     trait_name, language)
        predictions = cross_val_score(pipeline,
                                      subset.text,          # X: raw documents
                                      subset[trait_name],   # y: trait score
                                      cv=10,
                                      verbose=0,            # parameter expects an int
                                      scoring='neg_mean_squared_error',
                                      n_jobs=-1)
        scores[language][trait_name] = ("%0.2f (+/- %0.2f)"
                                        % (predictions.mean(),
                                           predictions.std() * 2))
In [15]:
# Summary table built above: {language: {trait: "mean (+/- 2*std)"}}.
scores
Out[15]:
In [4]:
# Pairwise correlations between the five personality trait scores.
trait_columns = ['ext', 'sta', 'agr', 'con', 'opn']
corpus[trait_columns].corr()
Out[4]:
Loss function (mean squared error, matching the `neg_mean_squared_error` scoring used above):
$$L(\theta) = \frac{1}{n}\sum_{i=1}^{n}\left(y_{s_{i}} - \hat{y}_{s_{i}}\right)^{2}$$
In [ ]:
from keras import losses
# NOTE(review): `model` is not defined anywhere in this notebook, and the
# cell was never executed (In [ ]) — this reads as a sketch for a planned
# Keras variant using the same MSE loss as the sklearn CV above. compile()
# is normally also given an optimizer; confirm before relying on this cell.
model.compile(loss=losses.mean_squared_error)