In [ ]:
from IPython.core.display import Markdown

# Toy corpus: six short Spanish documents used throughout the examples.
docs = ["buen dia microtc", "excelente dia", "odio el trafico",
        "la computadora", "la mesa", "la ventana"]
# Render the corpus as a markdown bullet list.
bullets = "\n".join("* " + doc for doc in docs)
Markdown("# Corpus\n" + bullets)
In [ ]:
from microtc.textmodel import TextModel
import numpy as np
import pandas as pd

# Term-weight matrix: one row per vocabulary term (11 terms for this
# corpus), one column per document.
M = np.zeros((11, len(docs)))
model = TextModel(docs, token_list=[-1])

# Map each term id in the model's dictionary back to its surface word.
words = {}
for doc in docs:
    for tok in doc.split():
        term_id = model.dictionary.doc2bow([tok])[0][0]
        words[term_id] = tok

# Fill the matrix with the model's weights, rounded to 4 decimals
# (the string is converted back to float on assignment).
for col, doc in enumerate(docs):
    for term_id, weight in model[doc]:
        M[term_id, col] = "%0.4f" % weight

# Prepend the word labels as the first column and show as a DataFrame.
labels = np.atleast_2d([words[i] for i in range(11)]).T
M = np.concatenate((labels, M), axis=1)
pd.DataFrame(M, columns=['Palabras'] + ["Doc. %s" % (i + 1) for i in range(len(docs))])
# Transform the training set into text-model vectors using an already
# trained microtc model (the next listing shows one output line).
microtc-textModel -m supervised/microtc.model -o supervised/train.textmodel datasets/train.json.gz
{"11268": 0.08856512767171724, "text": "dear @Microsoft the newOoffice for Mac is great and all, but no Lync update? C'mon.", "7686": 0.055381812446489244, "157703": 0.08856512767171724, "330924": 0.08856512767171724, "432651": 0.08856512767171724, "226829": 0.08856512767171724, "435728": 0.08856512767171724, "154130": 0.08856512767171724, "341529": 0.08856512767171724, "80923": 0.07490376701241866, "23586": 0.08856512767171724, "70693": 0.08856512767171724, "376359": 0.08856512767171724, "415788": 0.08856512767171724, "415792": 0.08856512767171724, "198711": 0.08856512767171724, "27316": 0.07490376701241866, "411487": 0.08856512767171724, "39488": 0.05325102265860315, "84036": 0.08856512767171724, "44102": 0.08856512767171724, "109640": 0.08856512767171724, "124698": 0.08856512767171724, "12813": 0.08856512767171724, "268883": 0.08856512767171724, "156247": 0.08856512767171724,..., "num_terms": 464328
In [ ]:
import numpy as np

# Synthetic regression data: a quadratic x**2 + 12*x + 3.2 plus
# uniform noise in [-1, 1], sampled at 10 points in [-10, 10].
x = np.linspace(-10, 10, 10)
noise = np.random.uniform(-1, 1, x.shape[0])
y = x ** 2 + 12 * x + 3.2 + noise
pd.DataFrame(np.column_stack((x, y)), columns=['x', 'y'])
In [ ]:
# IPython line magic: pulls numpy + matplotlib names into the namespace
# and renders figures inline. NOTE(review): %pylab is deprecated in
# favor of "%matplotlib inline" plus explicit imports.
%pylab inline
# Scatter plot of the noisy quadratic samples from the previous cell.
pylab.plot(x, y, 'o')
In [ ]:
import numpy as np

# Design matrix for the quadratic model a*x^2 + b*x + c.
X = np.vstack((x**2, x, np.ones_like(x))).T
# Least-squares fit of [a, b, c]. rcond=None opts into NumPy's modern
# machine-precision rank cutoff and silences the FutureWarning emitted
# when rcond is left unspecified.
coef = np.linalg.lstsq(X, y, rcond=None)[0]
print(coef)
In [ ]:
# Overlay the data points (circles) with the fitted quadratic X @ coef.
pylab.plot(x, y, 'o')
pylab.plot(x, np.dot(X, coef))
In [ ]:
# Extrapolate well beyond the training range [-10, 10] to compare the
# fitted model against the true generating polynomial.
x = np.linspace(-100, 100, 50)
X = np.vstack((x**2, x, np.ones_like(x))).T
pylab.plot(x, np.dot(X, coef))
# True coefficients used to generate y: x**2 + 12*x + 3.2.
pylab.plot(x, np.dot(X, np.array([1, 12, 3.2])))
pylab.legend(['Modelo', 'Real'])
# For every distant-supervision dataset: run microtc parameter search
# and train a model, skipping any step whose output already exists.
# All expansions are quoted and $(...) replaces legacy backticks so the
# loop survives unusual path names.
for ds in data/data-*.json.gz;
do
    base=$(basename "$ds")
    base=distant_supervision/$(basename "$base" .json.gz)
    param=$base.params
    model=$base.model
    predict=$base.predict
    gold=$base.gold
    if [ ! -f "$param" ]
    then
        microtc-params -o "$param" -k 0:1 -S avgf1:positive:negative -n 32 -s 32 -H static:"$ds" datasets/train.json.gz
    fi
    if [ ! -f "$model" ]
    then
        microtc-train -o "$model" -m "$param" "$ds"
    fi
done
# Stand-alone copy of the parameter-search call used inside the loop
# above (its JSON result is shown below); relies on $param and $ds
# still being set in the current shell.
microtc-params -o $param -k 0:1 -S avgf1:positive:negative -n 32 -s 32 -H static:$ds datasets/train.json.gz
{
"_accuracy": 0.7150208906552532,
"_avgf1:positive:negative": 0.7048618911050835,
"_macrof1": 0.469907927403389,
"_microf1": 0.7150208906552531,
"_score": 0.7048618911050835,
"_time": 123.33291339874268,
"_weightedf1": 0.6499001443154221,
"del_diac": false,
"del_dup": true,
"del_punc": false,
"emo_option": "none",
"lc": true,
"num_option": "group",
"tfidf": true,
"token_list": [
-3,
2,
3,
5,
9
],
"token_max_filter": 1.0,
"token_min_filter": -1,
"url_option": "group",
"usr_option": "group"
}
# Predict with every distant-supervision model over both the training
# and the gold set, skipping predictions that already exist. Expansions
# are quoted and $(...) replaces backticks for robustness.
for ds in data/data-*.json.gz;
do
    base=$(basename "$ds")
    base=distant_supervision/$(basename "$base" .json.gz)
    param=$base.params
    model=$base.model
    predict=$base.predict
    gold=$base.gold
    if [ ! -f "$predict" ]
    then
        microtc-predict -o "$predict" -m "$model" datasets/train.json.gz
    fi
    if [ ! -f "$gold" ]
    then
        microtc-predict -o "$gold" -m "$model" datasets/gold.json.gz
    fi
done
{"voc_affinity": 0.6414342629482072, "klass": "negative", "id": "628949369883000832", "text": "dear @Microsoft the newOoffice for Mac is great and all, but no Lync update? C'mon.", "decision_function": -0.038332115769748785}
{"voc_affinity": 0.7912844036697247, "klass": "negative", "id": "628976607420645377", "text": "@Microsoft how about you make a system that doesn't eat my friggin discs. This is the 2nd time this has happened and I am so sick of it!", "decision_function": -0.21017788877835167}
In [ ]:
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# model_selection.KFold is the replacement (n_splits + .split(X)).
from sklearn.model_selection import KFold

# Visualize a 5-fold split of 10 samples: each column is a fold, each
# cell says whether the sample is used for training ('ent.') or
# validation ('val.') in that fold.
folds = []
for train_idx, val_idx in KFold(n_splits=5, shuffle=True,
                                random_state=1).split(np.arange(10)):
    row = ['ent.'] * 10          # default everything to training...
    for j in val_idx:
        row[j] = 'val.'          # ...then mark the validation indices
    folds.append(row)
d = np.array(folds).T
pd.DataFrame(d, columns=['Fold %s' % x for x in range(1, 6)])
import gzip
import json
import sys

from sklearn.model_selection import KFold
from b4msa.utils import tweet_iterator
from tqdm import tqdm

# Split the input corpus (path given on the command line, JSON-lines)
# into 10 train/validation folds and write each fold as a gzipped
# JSON-lines file under kfolds/.
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# KFold now takes n_splits and yields index arrays via .split().
D = list(tweet_iterator(sys.argv[1]))
for c, (tr, val) in enumerate(KFold(n_splits=10).split(D)):
    for output, index in zip(['kfolds/train-%s.json.gz' % c,
                              'kfolds/validation-%s.json.gz' % c],
                             [tr, val]):
        with gzip.open(output, 'wb') as fpt:
            for i in tqdm(index):
                fpt.write(bytes(json.dumps(D[i]) + '\n', encoding='UTF-8'))
# Per-fold microtc pipeline: parameter search, training, and prediction
# on the matching validation split. Each step is skipped when its
# output file already exists, so the loop is safely re-runnable.
# Expansions are quoted and $(...) replaces legacy backticks.
for i in $(seq 0 9);
do
    base=kfolds/microtc-$i
    param=$base.params
    model=$base.model
    predict=$base.predict
    train=kfolds/train-$i.json.gz
    val=kfolds/validation-$i.json.gz
    if [ ! -f "$param" ]
    then
        microtc-params -o "$param" -k 0.7 -n 32 -s 32 -H "$train"
    fi
    if [ ! -f "$model" ]
    then
        microtc-train -o "$model" -m "$param" "$train"
    fi
    if [ ! -f "$predict" ]
    then
        microtc-predict -o "$predict" -m "$model" "$val"
    fi
done
# Supervised baseline run via SLURM (srun): parameter search on 32
# cores, then single-core training and gold-set prediction. Each step
# is skipped when its output already exists; expansions are quoted.
base=supervised/microtc
param=$base.params
model=$base.model
gold=$base.gold
if [ ! -f "$param" ]
then
    srun --mem-per-cpu=2048 -c32 microtc-params -o "$param" -k 0.7 -n 32 -s 32 -H datasets/train.json.gz
fi
if [ ! -f "$model" ]
then
    srun --mem-per-cpu=2048 -c1 microtc-train -o "$model" -m "$param" datasets/train.json.gz
fi
if [ ! -f "$gold" ]
then
    srun --mem-per-cpu=2048 -c1 microtc-predict -o "$gold" -m "$model" datasets/gold.json.gz
fi
# Assemble the stacked feature matrices fed to EvoDAG: the per-fold
# supervised microtc decision functions plus the distant-supervision
# features, then dump both splits to CSV.
train_klass = [t['klass'] for t in tweet_iterator('datasets/train.json.gz')]
gold_klass = [t['klass'] for t in tweet_iterator('datasets/gold.json.gz')]

# Out-of-fold decision functions from the 10 supervised microtc models.
train_micro = []
for fold in range(10):
    preds = tweet_iterator('kfolds/microtc-%s.predict' % fold)
    train_micro.append([t['decision_function'] for t in preds])
train_micro = np.concatenate(train_micro, axis=0)
test_micro = np.array([t['decision_function']
                       for t in tweet_iterator('supervised/microtc.gold')])

# Distant-supervision models: collect (decision_function, voc_affinity)
# pairs, probing data-<i>.predict files until one is missing.
train_ds = []
test_ds = []
i = 0
while True:
    train = 'distant_supervision/data-%s.predict' % i
    test = 'distant_supervision/data-%s.gold' % i
    if not os.path.isfile(train):
        break
    train_ds.append([(t['decision_function'], t['voc_affinity'])
                     for t in tweet_iterator(train)])
    test_ds.append([(t['decision_function'], t['voc_affinity'])
                    for t in tweet_iterator(test)])
    i += 1
train_ds = distant_supervision(train_ds)
test_ds = distant_supervision(test_ds)

# Features per sample: distant-supervision columns first, microtc last;
# both splits must end up with the same width.
TRAIN = np.concatenate((train_ds, train_micro), axis=1)
GOLD = np.concatenate((test_ds, test_micro), axis=1)
assert TRAIN.shape[1] == GOLD.shape[1]
save_csv(TRAIN, train_klass, 'train.csv')
save_csv(GOLD, gold_klass, 'gold.csv')
from microtc.utils import tweet_iterator
import numpy as np
import os
def save_csv(D, klass, output):
    """Write one CSV row per sample: feature values followed by the label.

    D: iterable of feature rows (any values convertible via str());
    klass: iterable of label strings, aligned with D;
    output: destination file path (overwritten).
    """
    with open(output, 'w') as fpt:
        for row, label in zip(D, klass):
            fields = [str(v) for v in row] + [label]
            fpt.write(",".join(fields) + '\n')
def distant_supervision(D):
    """Per-sample decision functions ordered by vocabulary affinity.

    D is a list with one entry per distant-supervision model, each a
    list of (decision_function, voc_affinity) pairs, one per sample.
    Returns an (n_samples, n_models) array where each row holds the
    decision functions sorted by increasing affinity of their model.
    """
    pairs = np.array(D)               # (n_models, n_samples, 2)
    decisions = pairs[:, :, 0].T      # (n_samples, n_models)
    affinities = pairs[:, :, 1].T
    order = np.argsort(affinities, axis=1)
    return np.take_along_axis(decisions, order, axis=1)
0.114636783784,0.179505189563,0.2490083552,0.607012341309,0.0727107215932, 0.37516196334,0.472766213229,0.15847305556,0.449212004631,0.210050939743, 0.291776476193,0.163773511676,0.437736026062,0.0878780952512,0.355921536945, 0.171149258385,0.370602925167,0.183431922788,0.0250394972413,0.488403253384, 0.258532009063,0.118452932218,0.113392212855,0.149002886753,0.00379138978131, 0.458240438281,0.300322307891,0.31664688375,0.247096677077,-0.0477388224443, -0.930914412857,-0.896643529031,0.689744786641,positive
# EvoDAG stacking stage over the CSV features built above; each step
# is skipped when its output already exists, so the script re-runs
# safely after interruption.
if [ ! -f evodag.params ]
then
# Parameter search on train.csv (multiple output classes enabled).
EvoDAG-params --multiple-outputs -u 16 -P evodag.params -C train.csv
fi
if [ ! -f evodag.model ]
then
# Train with the parameters found above; -n 30 presumably sets the
# ensemble size — confirm against the EvoDAG CLI documentation.
EvoDAG-train -n 30 -u 16 -P evodag.params -m evodag.model train.csv
fi
if [ ! -f evodag.gold ]
then
# Final predictions on the stacked gold-set features.
EvoDAG-predict -u 16 -m evodag.model -o evodag.gold gold.csv
fi
Competencia | Algoritmo | Macro | Micro | Positivo | Negativo | Neutro |
---|---|---|---|---|---|---|
SEMEVAL2016 | $\mu$TC | 0.4638 | 0.5199 | 0.5907 | 0.2979 | 0.5026 |
SEMEVAL2016 | $\mu$TC+EvoDAG | 0.5144 | 0.5261 | 0.6153 | 0.4472 | 0.4807 |
SEMEVAL2015 | $\mu$TC | 0.58149 | 0.6261 | 0.6430 | 0.4333 | 0.6680 |
SEMEVAL2015 | $\mu$TC+EvoDAG | 0.5756 | 0.5929 | 0.6292 | 0.4768 | 0.6207 |
TASS2015 | $\mu$TC | 0.6269 | 0.637 | 0.7217 | 0.6402 | 0.5189 |
TASS2015 | $\mu$TC+EvoDAG | 0.6351 | 0.638 | 0.7138 | 0.6412 | 0.5504 |