In [1]:
# import difflib
# # from difflib_data import *

# text1 = "fghjjkk"
# text2 = "fghjkref"
# d = difflib.Differ()
# diff = d.compare(text1, text2)
# print('\n'.join(diff))

In [50]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection, metrics, linear_model
# from numba import vectorize, u8

In [64]:
def lcs(s1, s2):
    """Return the longest common substring of s1 and s2.

    Dynamic programming over character pairs: lengths[(i, j)] is the length
    of the common substring ending at s1[i] and s2[j].  On ties the first
    (leftmost in s1) match found is kept.  Returns '' when nothing matches.
    """
    lengths = {}
    best_len = 0
    best = ''
    for i, a in enumerate(s1):
        for j, b in enumerate(s2):
            if a != b:
                continue
            # Extend the run ending at the previous pair of characters.
            run = lengths.get((i - 1, j - 1), 0) + 1
            lengths[(i, j)] = run
            if run > best_len:
                best_len = run
                best = s1[i - best_len + 1:i + 1]
    return best

def parser_w(w, inf):
    """Split an inflected word and its lemma into [root, form suffix, lemma suffix].

    The root is the longest common substring of the pair; each suffix is
    whatever follows the first occurrence of the root in the respective string.
    """
    root = lcs(w, inf)
    suff_form = w[w.find(root) + len(root):]
    suff_inf = inf[inf.find(root) + len(root):]
    return [root, suff_form, suff_inf]

def openDS(path):
    """Read a comma-separated dataset from `path` into a pandas DataFrame."""
    frame = pd.read_csv(path, sep=',')
    return frame

def parserDF(DF):
    """Placeholder for DataFrame-level parsing; currently always returns 0."""
    return 0

def setforms(forms):
    """Collect the distinct form suffixes into a set for O(1) membership tests."""
    return {form for form in forms}

def get_forms(w, train_forms):
    """Return the longest suffix of `w` (including `w` itself) present in
    `train_forms`, scanning from the longest suffix down; '' if none match."""
    for start in range(len(w)):
        suffix = w[start:]
        if suffix in train_forms:
            return suffix
    # No suffix of w was seen during training.
    return ''
def get_ans(w, form, inf):
    """Replace the known suffix `form` of word `w` with the lemma suffix `inf`.

    Parameters: w — surface word; form — a suffix of w (possibly '', the
    fallback value produced by get_forms); inf — lemma suffix to attach.
    Returns the lemma string, or None when `form` is not a suffix of `w`.
    """
    # range(len(w) + 1) lets i == len(w) test the empty suffix, so an empty
    # `form` yields w + inf.  The original range(len(w)) never matched '' and
    # silently fell off the end, returning None into the vectorized output.
    for i in range(len(w) + 1):
        formcur = w[i:]
        if formcur == form:
            return w[:i] + inf
    # Explicit sentinel instead of an implicit fall-through.
    return None
def hashD(y):
    """Build forward/inverse maps between the distinct labels in `y` and the
    integers 1..n.  Returns (label -> int, int -> label)."""
    Dh = {}
    Dunh = {}
    for code, label in enumerate(set(y), start=1):
        Dh[label] = code
        Dunh[code] = label
    return Dh, Dunh
def hashY(Y, Dh):
    """Encode each label of the 1-D array `Y` to its integer code from `Dh`.

    Output is float64 (np.zeros default), matching what sklearn accepts as y.
    """
    encoded = np.zeros(Y.shape)
    for idx, label in enumerate(Y):
        encoded[idx] = Dh[label]
    return encoded
def unhashY(hY, Dunh):
    """Decode integer codes back to their labels; inverse of hashY.

    Builds a Python list first so the result can hold string labels (the
    traceback below shows the old version that wrote into a float array).
    """
    return np.array([Dunh[code] for code in hY])
# Round-trip sanity check: encode 15..19 to integer codes and decode back;
# the printed output below confirms the values survive unchanged.
D1, D2 = hashD(np.arange(15, 20))
hy = hashY(np.arange(15, 20), D1)
uhy = unhashY(hy, D2)
print(uhy)


[15 16 17 18 19]

In [19]:
# Quick check of parser_w on a duplicated (form, lemma) pair — expected
# split: root 'trabaja', form suffix 'do', lemma suffix 'r' (Out[19] below).
a = np.array(['trabajado', 'trabajar']*2).reshape((2, 2))
pd.DataFrame(list(map(parser_w, a.T[0], a.T[1])))


Out[19]:
0 1 2
0 trabaja do r
1 trabaja do r

In [20]:
# Load the training set (columns X = word form, y = lemma) and split every
# pair into [root, form suffix, lemma suffix] feature rows.
DS = openDS('task2_lemmas_train_2.csv')
words = DS.X.values
infs = DS.y.values
features = pd.DataFrame(list(map(parser_w, words, infs)))
# X = features.drop([0])

In [21]:
# features[features[1] == 'vate']
# Baseline model: predict the lemma suffix (column 2) from a single integer
# feature — Python's hash() of the form suffix (column 1) — with a decision
# tree.  Predictions below are in-sample only (fit and predict on the same X).
hh = np.vectorize(hash)
X = features[1].values
# X = X
Xh = hh(X.reshape((X.size, 1)))
Y = features[2].values
clf = DecisionTreeClassifier()
clf.fit(Xh, Y)
pred = clf.predict(Xh)
res = pd.DataFrame({0: X, 1: pred})
res
# res[res[0] == 'erei']


Out[21]:
0 1
0 erete are+V
1 vate re+V
2 +N
3 vamo re+V
4 erei are+V
5 sti re+V
6 hereste are+V
7 eran are+V
8 sser re+V
9 ndo re+V
10 sser re+V
11 anno e+V
12 i re+V
13 ndo re+V
14 erebbero are+V
15 herebbero are+V
16 sser re+V
17 van re+V
18 ta re+V
19 ssero re+V
20 n re+V
21 n re+V
22 te re+V
23 erebbero are+V
24 ssi re+V
25 sti re+V
26 +N
27 eremmo are+V
28 bbe +V
29 ssi re+V
... ... ...
118610 eremmo are+V
118611 i re+V
118612 erai are+V
118613 i re+V
118614 ta re+V
118615 i re+V
118616 van re+V
118617 ereste are+V
118618 eranno are+V
118619 +N
118620 on e+V
118621 ser ndere+V
118622 no re+V
118623 vo re+V
118624 mmo re+V
118625 ssero re+V
118626 te re+V
118627 ente ire+A
118628 ssero re+V
118629 sser re+V
118630 vo re+V
118631 te re+V
118632 +N
118633 ò are+V
118634 sser re+V
118635 ereste are+V
118636 ste re+V
118637 bbero +V
118638 ssimo re+V
118639 ste re+V

118640 rows × 2 columns


In [62]:
# Full test-set pipeline: load test words, map each word to its best-matching
# known form suffix, vectorize the suffixes into character n-grams, and
# classify the lemma suffix with logistic regression.
DS = openDS('task2_lemmas_test.txt')
wordstt = DS.X.values

sf = setforms(X)
# NOTE(review): excluded=['train_forms'] only shields the *keyword* argument,
# but sf is passed positionally below; this works only because a set becomes a
# 0-d object array under broadcasting — pass train_forms=sf or excluded={1}.
get_f = np.vectorize(get_forms, excluded=['train_forms'])
f_ttr = get_f(words, sf)
f_ttt = get_f(wordstt, sf)
# NOTE(review): CountVectorizer is imported only in a later cell (In [48]);
# this cell fails on a fresh Restart & Run All — move the import to the top.
cv = CountVectorizer(ngram_range=(1, 10), lowercase=False, analyzer='char_wb')
model_cv = cv.fit(f_ttr)
x_train = model_cv.transform(f_ttr)
x_test = model_cv.transform(f_ttt)
# print(x_train.shape)
# Labels must be numeric for the regression, hence the hash/unhash detour.
Dh, Dunh = hashD(Y)
y_train = hashY(Y, Dh)
# model = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', C=1.)
# print(model_selection.cross_val_score(model, x_train, y_train, 
#                                       scoring='accuracy', cv=model_selection.StratifiedKFold(shuffle=True)).mean())
model = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', C=1.)
# pred = 
model.fit(x_train, y_train)
infs_test = model.predict(x_test)
infs_test = unhashY(infs_test, Dunh)
get_a = np.vectorize(get_ans)
res = get_a(wordstt, f_ttt, infs_test)
# Write the submission file: 1-based Id plus the reconstructed lemma.
ans = pd.DataFrame()
ans["Id"] = range(1,len(res)+1)
ans["Category"] = res
ans.to_csv('hw4_task2.txt', sep=',', index=None)
# DF
# infs_train = clf.predict(hh(f_ttr.reshape((f_ttr.size,1))))
# get_a = np.vectorize(get_ans)
# ans = get_a(words, f_ttr, infs_test)

# Res = pd.DataFrame({0:words,1:infs,2:ans})
# Res
# from sklearn.metrics import accuracy_score
# print(accuracy_score(infs, ans))
# pd.DataFrame({0:X, 1:f_test, 2:pred, 3:infs_test})


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-62-40159812f7e1> in <module>()
     20 model.fit(x_train, y_train)
     21 infs_test = model.predict(x_test)
---> 22 infs_test = unhashY(infs_test, Dunh)
     23 get_a = np.vectorize(get_ans)
     24 res = get_a(wordstt, f_ttt, infs_test)

<ipython-input-60-5495217a8d98> in unhashY(hY, Dunh)
     60     res = np.zeros(hY.shape)
     61     for i in range(len(hY)):
---> 62         res[i] = Dunh[hY[i]]
     63     return res
     64 D1, D2 = hashD(np.arange(15, 20))

ValueError: could not convert string to float: 're+V'

In [65]:
# Manual re-run of the tail of In [62] after unhashY was redefined: the
# ValueError traceback above came from a stale version that wrote string
# labels into a float array (np.zeros).  Relies on kernel state from In [62].
infs_test = unhashY(infs_test, Dunh)
get_a = np.vectorize(get_ans)
res = get_a(wordstt, f_ttt, infs_test)
ans = pd.DataFrame()
ans["Id"] = range(1,len(res)+1)
ans["Category"] = res
ans.to_csv('hw4_task2.txt', sep=',', index=None)

In [57]:
pred = model.predict(x_test)


---------------------------------------------------------------------------
NotFittedError                            Traceback (most recent call last)
<ipython-input-57-756536b260cb> in <module>()
----> 1 pred = model.predict(x_test)

/home/daniel/anaconda3/lib/python3.5/site-packages/sklearn/linear_model/base.py in predict(self, X)
    334             Predicted class label per sample.
    335         """
--> 336         scores = self.decision_function(X)
    337         if len(scores.shape) == 1:
    338             indices = (scores > 0).astype(np.int)

/home/daniel/anaconda3/lib/python3.5/site-packages/sklearn/linear_model/base.py in decision_function(self, X)
    308         if not hasattr(self, 'coef_') or self.coef_ is None:
    309             raise NotFittedError("This %(name)s instance is not fitted "
--> 310                                  "yet" % {'name': type(self).__name__})
    311 
    312         X = check_array(X, accept_sparse='csr')

NotFittedError: This LogisticRegression instance is not fitted yet

In [10]:
# class Lemm():
#     def __init__(self):
#         self.D_freq = {}
#         self.Roots = pd.DataFrame
#         self.Suffs = {}
#     def parser_w(self, w, inf):
#         root = lcs(w, inf)
#         i_r_w = w.find(root)
#         suff_form = w[i_r_w+len(root):]
#         i_r_inf = inf.find(root)
#         suff_inf = inf[i_r_inf+len(root):]
        
#         return [root, suff_form, suff_inf]

In [34]:
# Decision-tree variant of the test-set pipeline: reuses clf, hh and X from
# the In [21] cell (hidden kernel state — breaks on a fresh Run All if those
# cells were not executed first).
DS = openDS('task2_lemmas_test.txt')
wordstt = DS.X.values

sf = setforms(X)
get_f = np.vectorize(get_forms, excluded=['train_forms'])
f_ttt = get_f(wordstt, sf)
# cv = CountVectorizer(lowercase=False, analyzer='char_wb')
# model_cv = cv.fit(f_ttr)
# DF = pd.DataFrame(f_test)
# DF
# Hash the matched test suffixes the same way the tree was trained.
infs_test = clf.predict(hh(f_ttt.reshape((f_ttt.size,1))))
get_a = np.vectorize(get_ans)
res = get_a(wordstt, f_ttt, infs_test)
ans = pd.DataFrame()
ans["Id"] = range(1,len(res)+1)
ans["Category"] = res
ans.to_csv('hw4_task2.txt', sep=',', index=None)
# Res = pd.DataFrame({0:words,1:infs,2:ans})
# Res
# from sklearn.metrics import accuracy_score
# print(accuracy_score(infs, ans))
# pd.DataFrame({0:X, 1:f_test, 2:pred, 3:infs_test})

In [48]:
# Scratch cell: demonstrate a char n-gram CountVectorizer on two toy strings.
# NOTE(review): this import is also needed by the In [62] cell above — it
# belongs in the top import cell for a clean top-to-bottom run.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range=(1, 6), lowercase=False, analyzer='char_wb')
DD = pd.DataFrame(['fhg', 'dfskl'])[0]
DD  # NOTE(review): bare mid-cell expression displays nothing; only the last one renders
model_cv = cv.fit(DD)
r = model_cv.transform(DD.values)
# ss = pd.DataFrame(r)
# r.head()
r


Out[48]:
<2x38 sparse matrix of type '<class 'numpy.int64'>'
	with 40 stored elements in Compressed Sparse Row format>

In [ ]: