In [9]:
from src import dataset
from collections import defaultdict
import pandas as pd
import logging
from sklearn.svm import LinearSVR
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
# Set the root logger to INFO so the logging.info() progress messages emitted
# during the cross-validation loop are shown in the cell output.
logging.getLogger().setLevel(logging.INFO)

In [3]:
# Load the corpus through the project-local `dataset` module and preview the
# first rows. Later cells rely on the `lang` and `text` columns and on the five
# trait score columns (ext, sta, agr, con, opn).
corpus = dataset.read_corpus()
corpus.head()


Out[3]:
author lang text gender age ext sta agr con opn
0 e5b59ccc-2053-4e8b-ba11-dfeeff92dd5d en @username @username ay friend, q te fumasteSSS... F 35-49 0.0 0.2 0.2 0.3 0.2
1 ed970294-8f36-4008-a82e-183ac9abf6ac en “@username: @username "you can't have your cak... M 18-24 0.1 0.2 0.2 0.0 0.1
2 4b05f4e0-2b12-48f1-94c0-c55b4caf534c en I should probably go to bed considering I have... F 18-24 0.5 0.0 0.3 0.3 0.4
3 de7f0515-32b0-4b1e-9fb9-91a66cd434a3 en @username the sameee\n@username Great!!\nRT @u... M 25-34 0.2 -0.1 0.2 0.0 0.1
4 a71c93ed-a929-4f59-8728-6639a6d31975 en On my very last Nerve!\nI am nothing and I hav... F 25-34 0.2 0.0 0.0 0.3 0.4

In [4]:
# Text-regression pipeline: TF-IDF features computed from the raw text are fed
# to a linear support-vector regressor. Both steps keep their library defaults.
# NOTE(review): the first step is named 'ngram', but a default TfidfVectorizer
# only extracts word unigrams — confirm that is intended.
pipeline = Pipeline(steps=[
    ('ngram', TfidfVectorizer()),
    ('clf', LinearSVR()),
])

In [17]:
# Cross-validate the shared pipeline once per (language, trait) pair and store a
# formatted "mean (+/- 2 * std)" summary of the fold scores for each pair.
scores = defaultdict(dict)
for language, subset in corpus.groupby('lang'):
    for trait_name in ('ext', 'sta', 'agr', 'con', 'opn'):
        target = subset[trait_name]
        logging.info("running cv on pipeline for %s on %s",
                     trait_name, language)
        # The scorer returns negated mean squared error, so every fold score is
        # <= 0 and values closer to zero are better.
        fold_scores = cross_val_score(
            pipeline,
            subset.text,  # X: raw text
            target,       # y: trait score
            cv=10,
            verbose=False,
            scoring='neg_mean_squared_error',
            n_jobs=-1,
        )
        summary = "%0.2f (+/- %0.2f)" % (fold_scores.mean(),
                                         fold_scores.std() * 2)
        scores[language][trait_name] = summary


INFO:root:running cv on pipeline for ext on en
INFO:root:running cv on pipeline for sta on en
INFO:root:running cv on pipeline for agr on en
INFO:root:running cv on pipeline for con on en
INFO:root:running cv on pipeline for opn on en
INFO:root:running cv on pipeline for ext on es
INFO:root:running cv on pipeline for sta on es
INFO:root:running cv on pipeline for agr on es
INFO:root:running cv on pipeline for con on es
INFO:root:running cv on pipeline for opn on es
INFO:root:running cv on pipeline for ext on it
INFO:root:running cv on pipeline for sta on it
INFO:root:running cv on pipeline for agr on it
INFO:root:running cv on pipeline for con on it
INFO:root:running cv on pipeline for opn on it
INFO:root:running cv on pipeline for ext on nl
INFO:root:running cv on pipeline for sta on nl
INFO:root:running cv on pipeline for agr on nl
INFO:root:running cv on pipeline for con on nl
INFO:root:running cv on pipeline for opn on nl

In [15]:
# Display the per-language, per-trait cross-validation summaries collected in
# the loop above. Each entry is a "mean (+/- 2 * std)" string of negated MSE.
scores


Out[15]:
defaultdict(dict,
            {'en': {'agr': '-0.02 (+/- 0.02)',
              'con': '-0.02 (+/- 0.02)',
              'ext': '-0.02 (+/- 0.02)',
              'opn': '-0.02 (+/- 0.01)',
              'sta': '-0.04 (+/- 0.04)'},
             'es': {'agr': '-0.02 (+/- 0.02)',
              'con': '-0.02 (+/- 0.03)',
              'ext': '-0.02 (+/- 0.03)',
              'opn': '-0.02 (+/- 0.03)',
              'sta': '-0.03 (+/- 0.03)'},
             'it': {'agr': '-0.02 (+/- 0.04)',
              'con': '-0.01 (+/- 0.01)',
              'ext': '-0.02 (+/- 0.05)',
              'opn': '-0.02 (+/- 0.03)',
              'sta': '-0.03 (+/- 0.04)'},
             'nl': {'agr': '-0.03 (+/- 0.04)',
              'con': '-0.01 (+/- 0.03)',
              'ext': '-0.02 (+/- 0.03)',
              'opn': '-0.01 (+/- 0.02)',
              'sta': '-0.03 (+/- 0.06)'}})

In [4]:
# Pairwise correlation matrix between the five trait score columns, computed
# over the whole corpus (all languages pooled).
corpus.loc[:, ['ext', 'sta', 'agr', 'con', 'opn']].corr()


Out[4]:
ext sta agr con opn
ext 1.000000 0.214601 0.144430 0.133626 -0.010024
sta 0.214601 1.000000 0.345466 -0.018955 0.024044
agr 0.144430 0.345466 1.000000 -0.072996 -0.041911
con 0.133626 -0.018955 -0.072996 1.000000 -0.011676
opn -0.010024 0.024044 -0.041911 -0.011676 1.000000

Loss function

$$L(\theta) = \frac{1}{n}\sum_{i=1}^{n}\left(y_{s_{i}} - \hat{y}_{s_{i}}\right)^{2} $$

In [ ]:
# Sketch: compile a Keras model with mean squared error as the training loss.
# NOTE(review): `model` is not defined in any cell visible in this notebook and
# this cell has no execution count — a model must be built before this can run.
# NOTE(review): compile() is called without an optimizer — confirm the installed
# Keras version accepts that, or pass one explicitly.
from keras import losses
model.compile(loss=losses.mean_squared_error)

Evaluation method:

- cross-validated $\mathrm{RMSE}_{\text{tweet}}$

- cross-validated $\mathrm{RMSE}_{\text{user}}$