In [1]:
import pandas as pd
import numpy as np
In [2]:
# Parse the training file: each line is "Id,word,lemma+tag[,lemma+tag...]".
# A line carrying several lemma variants is expanded into one row per variant.
data = []
with open('task2_lemmas_train') as f:
    for line in f:
        fields = line.strip().split(',')
        for variant in fields[2:]:
            data.append([fields[0], fields[1], variant])
data[:10]
Out[2]:
In [3]:
# Build the training frame; row 0 of the parsed records is the CSV header, skip it.
data = pd.DataFrame(data[1:], columns=['Id', 'X', 'y'])
# The Id column carries no signal for lemmatisation — keep only word X and target y.
data = data.drop(['Id'], axis=1)
In [4]:
# Show the parsed training frame (full dump; .head() would be lighter output).
data
Out[4]:
In [5]:
def get_form(data):
    """Lemma part of a 'lemma+tag' target Series (first '+'-separated field)."""
    return data.apply(lambda target: target.split('+')[0])
def get_part(data):
    """POS-tag part of a 'lemma+tag' target Series (second '+'-separated field)."""
    return data.apply(lambda target: target.split('+')[1])
In [6]:
# Split the target 'y' (e.g. 'venire+V') into lemma ('form') and POS tag ('part'),
# then drop the combined column.
data['form'] = get_form(data['y'])
data['part'] = get_part(data['y'])
data = data.drop(['y'], axis=1)
data
Out[6]:
In [7]:
# Class balance of the POS tag: verbs / adjectives / nouns.
print sum(data['part']=='V')
print sum(data['part']=='A')
print sum(data['part']=='N')
In [8]:
cnt = 0
for i in xrange(len(data)):
if data['X'][i][0] != data['form'][i][0]:
cnt += 1
print cnt
In [9]:
def find_prefixes(a, b):
    """Return the longest common prefix of strings a and b.

    Bug fix: the original never broke out of the loop, so it kept appending
    every position where the two strings happened to agree even after the
    first mismatch (e.g. 'abxd' vs 'abyd' yielded 'abd', which is not a
    prefix of either string).  Stop at the first differing character.
    """
    pref = ''
    for x, y in zip(a, b):
        if x != y:
            break
        pref += x
    return pref
def find_all_pref(data):
    """Group each training row's (lemma, part) pair by the common prefix of
    its surface word and its lemma.

    Returns a dict mapping prefix -> list of (lemma, part) tuples.

    Fixes: `d.get(pref, []) + [...]` rebuilt the whole list on every hit
    (quadratic for frequent prefixes) — use setdefault(...).append(...)
    instead; iterate the columns directly rather than positional chained
    indexing with xrange.
    """
    d = {}
    for word, form, part in zip(data['X'], data['form'], data['part']):
        pref = find_prefixes(word, form)
        d.setdefault(pref, []).append((form, part))
    return d
In [10]:
d = find_all_pref(data)
cnt = 0
for k, v in d.items():
if len(v) > 1:
cnt += 1
print cnt
In [11]:
# Parse the test file ("Id,word" per line; the first parsed row is the header).
data_test = []
with open('task2_lemmas_test') as f:
    for row in f:
        data_test.append(row.strip().split(','))
data_test = pd.DataFrame(data_test[1:])
data_test.columns = ['Id', 'X']
data_test
Out[11]:
In [12]:
def get_cut_and_suffix(a, b):
    """Rewrite rule that turns word a into lemma b.

    Returns a pair: (number of trailing characters of a beyond the common
    prefix, the part of b that follows the common prefix).
    """
    common = find_prefixes(a, b)
    keep = len(common)
    return len(a) - keep, b[keep:]
In [13]:
# Example rule: (trailing chars to cut from the word, suffix to append).
get_cut_and_suffix('impoverimenti', 'impoveron')
Out[13]:
In [14]:
def get_extra(data):
    """Compute the (cut, suffix) rule for every (word, lemma) training pair.

    Returns two parallel lists: how many characters to cut from each word,
    and the suffix to append afterwards.
    """
    cuts = []
    suffs = []
    for word, form in zip(data['X'], data['form']):
        cut, suffix = get_cut_and_suffix(word, form)
        cuts.append(cut)
        suffs.append(suffix)
    return cuts, suffs
In [15]:
# Derive the (cut, suffix) rewrite rule for every training pair.
new_f = get_extra(data)
In [16]:
# Attach the rule components as separate target columns.
data['cut'] = new_f[0]
data['suf'] = new_f[1]
In [17]:
# Per-target views: the word plus one label column each.
# NOTE(review): these three frames appear unused later in this notebook — confirm.
data_part = data[['X', 'part']]
data_cut = data[['X', 'cut']]
data_suf = data[['X', 'suf']]
In [18]:
from sklearn.feature_extraction.text import FeatureHasher, TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn import model_selection, metrics, linear_model, svm
In [19]:
# Char n-gram counts (1..5-grams) fitted on the train+test vocabulary together.
cv = CountVectorizer(ngram_range=(1, 5), analyzer='char_wb')
# pd.concat replaces DataFrame.append, which was deprecated and then removed
# in pandas 2.0; the result here is identical (rows of data, then data_test).
cv_fit = cv.fit(pd.concat([data, data_test])['X'])
matrix = cv_fit.transform(data['X'])
In [20]:
def convert(x):
    """Map a POS tag ('V'/'N'/'A') to its integer class id (None otherwise)."""
    return {'V': 0, 'N': 1, 'A': 2}.get(x)
def convert_back(x):
    """Inverse of convert: integer class id back to its POS tag (None otherwise)."""
    return {0: 'V', 1: 'N', 2: 'A'}.get(x)
# Encode the POS tag as an integer class label for the classifier.
num_part = data['part'].apply(convert)
In [21]:
# Linear SVM over the char n-gram counts predicting the POS tag class.
# NOTE(review): no train/validation split here — the fit is on all training data.
model_part = svm.LinearSVC()
model_part.fit(matrix, num_part)
Out[21]:
In [22]:
# Predict POS tags for the test words and decode ids back to 'V'/'N'/'A'.
# list(...) keeps the result indexable under Python 3, where map() is lazy;
# under Python 2 it is a no-op because map() already returns a list.
predicter_part = list(map(convert_back, model_part.predict(cv_fit.transform(data_test['X']))))
In [23]:
# Regress the number of characters to cut off the end of the word.
# NOTE(review): 'cut' is a small non-negative integer; a classifier (or
# rounding plus clipping at 0) may suit this target better than plain OLS — confirm.
model_cut = linear_model.LinearRegression()
model_cut.fit(matrix, data['cut'])
Out[23]:
In [24]:
# Rounded predicted cut lengths for the test words.
# list(...) keeps this a concrete list under Python 3 too (map() is lazy there);
# under Python 2 it is a no-op.
# NOTE(review): pred_cut is never consumed below — get_anses ignores it; confirm.
pred_cut = list(map(round, model_cut.predict(cv_fit.transform(data_test['X']))))
In [27]:
def get_ans(d, x):
    """Return the (lemma, part) candidates for the longest prefix of x found in d.

    Tries x itself first, then progressively shorter non-empty prefixes;
    returns [] when no prefix of x is a key of d.

    Fix: `cur in d.keys()` built a list and scanned it on every probe under
    Python 2 — O(len(d)) per prefix.  Membership on the dict itself is O(1).
    """
    for i in range(len(x), 0, -1):
        prefix = x[:i]
        if prefix in d:
            return d[prefix]
    return []
def get_anses(data, predicter_part):
    """Build a 'lemma+part' answer string for every test word.

    Candidates come from get_ans against the module-level prefix table `d`.
    Selection: no candidates -> the word itself tagged 'V'; one candidate ->
    used as-is; several -> the first whose part matches the classifier's
    prediction for this row, else the last candidate.

    Cleanup: the original's `counts`, `poss_anses`, inner `cnt` and the
    never-reported `not_found` counter were dead locals — removed; the
    match/fallback logic is otherwise unchanged.
    """
    anses = []
    for i, word in enumerate(data['X']):
        cur_ans = get_ans(d, word)
        if len(cur_ans) == 0:
            # No known prefix: fall back to the word itself as a verb.
            anses.append(word + '+' + 'V')
        elif len(cur_ans) == 1:
            form, part = cur_ans[0]
            anses.append(form + '+' + part)
        else:
            chosen = None
            for form, part in cur_ans:
                if part == predicter_part[i]:
                    chosen = (form, part)
                    break
            if chosen is None:
                chosen = cur_ans[-1]
            anses.append(chosen[0] + '+' + chosen[1])
    return anses
In [28]:
# Final 'lemma+part' answer for every test word.
data_test['Category'] = get_anses(data_test, predicter_part)
data_test
Out[28]:
In [29]:
# Write Id,Category pairs; drop the raw word column before saving.
# NOTE(review): index=False is the conventional spelling — confirm that
# index=None behaves identically in this pandas version.
data_test.drop(['X'], axis=1).to_csv('submit2.txt', index=None)
In [30]:
def check():
    """Compare this submission against a previous one on disk.

    Prints the line counts of 'submit.txt' and 'submit2.txt', then returns
    the number of positions where the two files' lines differ (zip stops at
    the shorter file, so extra trailing lines are not counted).
    """
    cnt = 0
    with open('submit.txt') as f:
        data1 = f.readlines()
    with open('submit2.txt') as g:
        data2 = g.readlines()
    print len(data1), len(data2)
    for x, y in zip(data1, data2):
        if x != y:
            cnt += 1
    return cnt
check()
Out[30]:
In [ ]: