In [1]:
# import difflib
# # from difflib_data import *
# text1 = "fghjjkk"
# text2 = "fghjkref"
# d = difflib.Differ()
# diff = d.compare(text1, text2)
# print('\n'.join(diff))
In [50]:
import numpy as np
import pandas as pd

from sklearn import model_selection, metrics, linear_model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
# from numba import vectorize, u8
In [64]:
def lcs(s1, s2):
    """Return the longest common substring of ``s1`` and ``s2``.

    Classic dynamic-programming scheme: ``run_len[(i, j)]`` holds the length
    of the common substring ending at ``s1[i]`` / ``s2[j]``.  The first
    maximum encountered wins ties.  Returns '' when nothing is shared.
    """
    run_len = {}
    best_len = 0
    best = ''
    for i, a in enumerate(s1):
        for j, b in enumerate(s2):
            if a != b:
                continue
            cur = run_len.get((i - 1, j - 1), 0) + 1
            run_len[(i, j)] = cur
            if cur > best_len:
                best_len = cur
                best = s1[i + 1 - cur:i + 1]
    return best
def parser_w(w, inf):
    """Split a word form ``w`` and its lemma ``inf`` around their shared root.

    The root is the longest common substring; everything after its first
    occurrence in each string is that string's suffix.
    Returns ``[root, form_suffix, lemma_suffix]``.
    """
    root = lcs(w, inf)
    suff_form = w[w.find(root) + len(root):]
    suff_inf = inf[inf.find(root) + len(root):]
    return [root, suff_form, suff_inf]
def openDS(path):
    """Load the comma-separated dataset at ``path`` into a pandas DataFrame."""
    frame = pd.read_csv(path, sep=',')
    return frame
def parserDF(DF):
    # Stub: presumably meant to parse a whole DataFrame of (form, lemma)
    # pairs — TODO confirm; currently always returns 0 and is never called
    # in the visible cells.
    return 0
def setforms(forms):
    """Collect the unique word forms into a set (O(1) membership tests)."""
    unique_forms = set(forms)
    return unique_forms
def get_forms(w, train_forms):
    """Return the longest suffix of ``w`` that appears in ``train_forms``.

    Suffixes are tried from longest (the whole word) down to the last single
    character; '' is returned when no suffix of ``w`` is a known form.

    Bug fix: the original had ``else: return ''`` INSIDE the loop, so it
    bailed out on the very first miss and only ever tested the whole word.
    The fallback now sits after the loop so every suffix is tried.
    """
    for i in range(len(w)):
        suffix = w[i:]
        if suffix in train_forms:
            return suffix
    return ''
def get_ans(w, form, inf):
    """Rebuild a lemma: strip the suffix ``form`` from ``w``, append ``inf``.

    Scans cut points left-to-right and returns ``root + inf`` at the first
    index where the tail of ``w`` equals ``form``.

    Bug fix: the range now extends to ``len(w)`` (inclusive cut after the
    last character) so an empty ``form`` — the value ``get_forms`` returns
    when no suffix is known — yields ``w + inf`` instead of silently
    returning None, which crashed the vectorized caller downstream.
    Returns None only when ``form`` is not a suffix of ``w`` at all.
    """
    for i in range(len(w) + 1):
        if w[i:] == form:
            return w[:i] + inf
    return None
def hashD(y):
    """Build forward/inverse label codebooks.

    Maps each distinct label in ``y`` to an integer code starting at 1.
    Returns ``(label -> code, code -> label)``.  Code assignment follows
    set iteration order, so it is not guaranteed stable across runs.
    """
    Dh = {}
    Dunh = {}
    for code, label in enumerate(set(y), start=1):
        Dh[label] = code
        Dunh[code] = label
    return Dh, Dunh
def hashY(Y, Dh):
    """Encode the 1-D label array ``Y`` as a float ndarray of codes from ``Dh``."""
    coded = np.zeros(Y.shape)
    for idx, label in enumerate(Y):
        coded[idx] = Dh[label]
    return coded
def unhashY(hY, Dunh):
    """Invert ``hashY``: map integer codes back to labels as an ndarray.

    Works with float codes too, since ``hash(1.0) == hash(1)`` makes them
    valid keys for the int-keyed codebook.
    """
    return np.array([Dunh[code] for code in hY])
# Sanity check: hashY followed by unhashY should round-trip the labels 15..19.
D1, D2 = hashD(np.arange(15, 20))
hy = hashY(np.arange(15, 20), D1)
uhy = unhashY(hy, D2)
print(uhy)
In [19]:
# Quick demo: split ('trabajado', 'trabajar') into [root, form-suffix, lemma-suffix].
a = np.array(['trabajado', 'trabajar']*2).reshape((2, 2))
pd.DataFrame(list(map(parser_w, a.T[0], a.T[1])))
Out[19]:
In [20]:
# Load training pairs (word form X, lemma y) and derive per-pair
# [root, form-suffix, lemma-suffix] features.
DS = openDS('task2_lemmas_train_2.csv')
words = DS.X.values
infs = DS.y.values
features = pd.DataFrame(list(map(parser_w, words, infs)))
# X = features.drop([0])
In [21]:
# features[features[1] == 'vate']
# Baseline: decision tree mapping hashed form-suffix -> lemma-suffix.
# NOTE(review): Python's str hash is salted per process (PYTHONHASHSEED),
# so hh(...) features are NOT reproducible across runs — confirm this is
# acceptable or fix the seed / use a stable hash.
hh = np.vectorize(hash)
X = features[1].values
# X = X
Xh = hh(X.reshape((X.size, 1)))
Y = features[2].values
clf = DecisionTreeClassifier()
clf.fit(Xh, Y)
# Training-set predictions only — no held-out evaluation here.
pred = clf.predict(Xh)
res = pd.DataFrame({0: X, 1: pred})
res
# res[res[0] == 'erei']
Out[21]:
In [62]:
# Main pipeline: predict lemma suffixes for the test words with logistic
# regression over char n-gram counts of each word's longest known form-suffix.
DS = openDS('task2_lemmas_test.txt')
wordstt = DS.X.values
# sf is built from X (train form-suffixes), which must come from the In [21] cell.
sf = setforms(X)
# NOTE(review): excluded=['train_forms'] shields only keyword calls, but
# get_f is invoked positionally below — verify the set is treated as a
# scalar and not broadcast element-wise.
get_f = np.vectorize(get_forms, excluded=['train_forms'])
f_ttr = get_f(words, sf)
f_ttt = get_f(wordstt, sf)
cv = CountVectorizer(ngram_range=(1, 10), lowercase=False, analyzer='char_wb')
model_cv = cv.fit(f_ttr)
x_train = model_cv.transform(f_ttr)
x_test = model_cv.transform(f_ttt)
# print(x_train.shape)
# Encode string lemma-suffix labels as integers for the classifier.
Dh, Dunh = hashD(Y)
y_train = hashY(Y, Dh)
# model = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', C=1.)
# print(model_selection.cross_val_score(model, x_train, y_train,
# scoring='accuracy', cv=model_selection.StratifiedKFold(shuffle=True)).mean())
model = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', C=1.)
# pred =
model.fit(x_train, y_train)
infs_test = model.predict(x_test)
infs_test = unhashY(infs_test, Dunh)
# Stitch each test word's root together with its predicted lemma-suffix.
get_a = np.vectorize(get_ans)
res = get_a(wordstt, f_ttt, infs_test)
# Write the submission: 1-based Id plus predicted lemma.
ans = pd.DataFrame()
ans["Id"] = range(1,len(res)+1)
ans["Category"] = res
ans.to_csv('hw4_task2.txt', sep=',', index=None)
# DF
# infs_train = clf.predict(hh(f_ttr.reshape((f_ttr.size,1))))
# get_a = np.vectorize(get_ans)
# ans = get_a(words, f_ttr, infs_test)
# Res = pd.DataFrame({0:words,1:infs,2:ans})
# Res
# from sklearn.metrics import accuracy_score
# print(accuracy_score(infs, ans))
# pd.DataFrame({0:X, 1:f_test, 2:pred, 3:infs_test})
In [65]:
# NOTE(review): this cell repeats the tail of In [62]. infs_test was already
# decoded there, so re-running unhashY on the decoded labels raises a
# KeyError — confirm this cell is only ever run straight after model.predict.
infs_test = unhashY(infs_test, Dunh)
get_a = np.vectorize(get_ans)
res = get_a(wordstt, f_ttt, infs_test)
# Overwrites the submission file written by the In [62] cell.
ans = pd.DataFrame()
ans["Id"] = range(1,len(res)+1)
ans["Category"] = res
ans.to_csv('hw4_task2.txt', sep=',', index=None)
In [57]:
# Re-predict encoded lemma-suffix codes; result is not used by any later visible cell.
pred = model.predict(x_test)
In [10]:
# class Lemm():
# def __init__(self):
# self.D_freq = {}
# self.Roots = pd.DataFrame
# self.Suffs = {}
# def parser_w(self, w, inf):
# root = lcs(w, inf)
# i_r_w = w.find(root)
# suff_form = w[i_r_w+len(root):]
# i_r_inf = inf.find(root)
# suff_inf = inf[i_r_inf+len(root):]
# return [root, suff_form, suff_inf]
In [34]:
# Alternative pipeline: reuse the decision tree from In [21] (hashed
# suffix features) instead of the logistic-regression model.
DS = openDS('task2_lemmas_test.txt')
wordstt = DS.X.values
sf = setforms(X)
get_f = np.vectorize(get_forms, excluded=['train_forms'])
f_ttt = get_f(wordstt, sf)
# cv = CountVectorizer(lowercase=False, analyzer='char_wb')
# model_cv = cv.fit(f_ttr)
# DF = pd.DataFrame(f_test)
# DF
# NOTE(review): hash() is salted per process, so clf must have been fitted
# in THIS kernel session for these hashed features to line up.
infs_test = clf.predict(hh(f_ttt.reshape((f_ttt.size,1))))
get_a = np.vectorize(get_ans)
res = get_a(wordstt, f_ttt, infs_test)
# Writes the same file name as In [62] / In [65] — whichever runs last wins.
ans = pd.DataFrame()
ans["Id"] = range(1,len(res)+1)
ans["Category"] = res
ans.to_csv('hw4_task2.txt', sep=',', index=None)
# Res = pd.DataFrame({0:words,1:infs,2:ans})
# Res
# from sklearn.metrics import accuracy_score
# print(accuracy_score(infs, ans))
# pd.DataFrame({0:X, 1:f_test, 2:pred, 3:infs_test})
In [48]:
# Scratch cell: sanity-check CountVectorizer char n-grams on two toy strings.
# NOTE(review): this import should live in the top imports cell — other
# cells above use CountVectorizer and would fail on a fresh Run-All.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1, 6), lowercase=False, analyzer='char_wb')
DD = pd.DataFrame(['fhg', 'dfskl'])[0]
DD
model_cv = cv.fit(DD)
r = model_cv.transform(DD.values)
# ss = pd.DataFrame(r)
# r.head()
r
Out[48]:
In [ ]: