In [7]:
import numpy as np
import pandas as pd
from sklearn import model_selection, metrics, linear_model, svm
from sklearn.feature_extraction.text import FeatureHasher, TfidfVectorizer, TfidfTransformer, CountVectorizer
import scipy.sparse as ss

# Word classification with character n-gram features built from word suffixes.
# Input files: one word per row in column 0; the train file additionally has
# the binary label in column 1.
raw_train = pd.read_csv('linear_train.txt', header=None)
fff_test = pd.read_csv('linear_test.txt', header=None)

f = np.vectorize(lambda x: x[-7:])  # all the useful information is at the end of the word
slow = np.vectorize(str.lower)      # inconsistent letter case hurts n-gram matching

suffs = slow(f(raw_train[0].values))

# Feature generation: presence of character n-grams (1..6 chars) inside the
# lowercased 7-character suffix of each word.  lowercase=False because we
# lowercase explicitly above.
cv = CountVectorizer(ngram_range=(1, 6), lowercase=False, analyzer='char_wb')
model_cv = cv.fit(suffs)
data_train = model_cv.transform(suffs)
# BUG FIX: test words must go through the same preprocessing as the training
# words (7-char suffix + lowercasing).  Previously the raw test words were
# vectorized directly, so train and test feature spaces were inconsistent.
data_test = model_cv.transform(slow(f(fff_test[0].values)))


def _case_flags(words):
    """Return an (n, 2) float array of indicators: [is Title-cased, is ALL CAPS]."""
    return np.column_stack((
        [1.0 if w.istitle() else 0.0 for w in words],
        [1.0 if w.isupper() else 0.0 for w in words],
    ))


# Extra binary features: Title-cased word, ALL-CAPS word (same column order as
# before: title flag first, caps flag second).
new_train = ss.hstack((data_train, _case_flags(raw_train[0].values)))
new_test = ss.hstack((data_test, _case_flags(fff_test[0].values)))

# Classification with a linear model; report the mean ROC-AUC over a
# shuffled stratified K-fold cross-validation.
new_y_train = raw_train[1]
model = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', C=1.)
print(model_selection.cross_val_score(model, new_train, new_y_train,
                                      scoring='roc_auc',
                                      cv=model_selection.StratifiedKFold(shuffle=True)).mean())

# Predictions on the test set.
model = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', C=1.)
model.fit(new_train, new_y_train)
# NOTE(review): column 0 of predict_proba is P(model.classes_[0]), i.e. the
# probability of the *first* label value — confirm the submission really
# expects that rather than P(class 1) (column 1).
pred = model.predict_proba(new_test)[:, 0]
ans = pd.DataFrame()
ans["Id"] = range(len(pred))
ans["Answer"] = pred
# index=False (was index=None): the documented way to omit the row index.
ans.to_csv('res4.txt', sep=',', index=False)
In [ ]:
# file = open('text.txt', 'r')
# lo = list(map(lambda x: [x[0], float(x[1])], map(lambda x: x.split(', '), map(str.rstrip, file.readlines()))))
# file.close()
# lo
# raw = pd.DataFrame(lo)
# f = np.vectorize(lambda x: x[-5:])
# slow = np.vectorize(str.lower)
# suffs = slow(f(raw[0].values))
# clses = raw[1].values
# # list(map(lambda x: x[-4:], raw[0].values))
# suffs
In [ ]:
# import pandas as pd
# import numpy as np
In [ ]:
# # from numba import vectorize
# class Features():
# def __init__(self, nsn, nal):
# self.D = {}
# self.nsn = nsn
# self.nal = nal
# self.nt = nal - nsn
# # @vectorize
# def dicinc(self, x, cl):
# # x, cl = w
# try:
# y = self.D[x]
# y[0] += 1
# if cl == 1.0:
# y[1] += 1 #if cl == 1.0 else -1
# except KeyError:
# self.D[x] = [1, 1] #if cl == 1.0 else [1, -1]
# # @vectorize
# def spliter(self, x, cl, ps=2):
# l = len(x)
# for i in range(0, l - ps + 1):
# self.dicinc(x[i:i+ps], cl)
# def dicsort(self):
# for key in self.D:
# y = self.D[key]
# a1 = y[1]
# a0 = y[0] - y[1]
# b1 = self.nsn - y[1]
# b0 = self.nt - a0
# y[1] = (a1*b0 - a0*b1) / self.nal / self.nal
# def tolist(self):
# l = list(map(lambda x: (self.D[x])[1], self.D))
# # print(l)
# l.sort()
# return l, l[0:5], l[-5:-1]
# def get_feature(self, x, ps=2):
# wl = []
# # x.lower()
# l = len(x)
# for i in range(0, l - ps + 1):
# fr = x[i:i+ps]
# try:
# y = self.D[fr]
# wl.append(y[1])
# except KeyError:
# continue
# # wl.sort()
# if len(wl) == 0:
# return 0.0
# # ma = np.mean(np.array(wl))
# ma = max(wl)
# return ma if ma >= 0.0 else min(wl)
# spliter('ва')
In [ ]:
# file = open('linear_train.txt', 'r')
# lo = list(map(lambda x: [x[0], float(x[1])], map(lambda x: x.split(', '), map(str.rstrip, file.readlines()))))
# file.close()
# raw = pd.DataFrame(lo)
# f = np.vectorize(lambda x: x[-5:])
# slow = np.vectorize(str.lower)
# suffs = slow(f(raw[0].values))
# # suffs = f(raw[0].values)
# clses = raw[1].values
# F = Features(np.sum(clses), clses.size)
# # F.spliter(suffs, clses)
# list(map(lambda x, y: F.spliter(x, y), suffs, clses))
# # F.D
# F.dicsort()
# l, mi, ma = F.tolist()
# print(mi, ma)
# # F.D['ин']
# # print(F.nsn, F.nal)
# gf = np.array(list(map(F.get_feature, suffs)))
# ma = np.amax(gf)
# mi = np.amin(gf)
# # print(ma)
# feres = pd.DataFrame({'5_2': gf})
In [ ]:
# for key in F.D:
# y = F.D[key]
# if y[1] >= 0.001:
# print(key, y)
In [ ]:
# fff = pd.read_csv('linear_test.txt', header=None)
# fff
# # file = open('linear_test.txt', 'r')
# # lo = list(map(lambda x: [x[0], float(x[1])], map(lambda x: x.split(', '), map(str.rstrip, file.readlines()))))
# # file.close()
# # raw = pd.DataFrame(lo)
# f = np.vectorize(lambda x: x[-5:])
# slow = np.vectorize(str.lower)
# # suffs = slow(f(raw[0].values))
# # f = np.vectorize(lambda x: x[-5:])
# suffs = slow(f(fff[0].values))
# # clses = fff[1].values
In [ ]:
# from sklearn import linear_model
# from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold
# from sklearn.metrics import accuracy_score, roc_curve, auc
# from sklearn.svm import SVC
# import matplotlib.pyplot as plt
# X = gf.reshape((gf.size, 1))
# Y = clses
# # X_test = np.array(list(map(F.get_feature, suffs)))
# X_train, X_test, y_train, y_test = train_test_split(
# X, Y, test_size=0.1, random_state=42)
# clf = linear_model.SGDClassifier(loss='log', penalty='none')
# # clf = SVC()
# # clf.fit(X_train, y_train)
# # y_score = clf.decision_function(X_test.reshape((X_test.size, 1)))
# # print(y_score)
# # fpr, tpr, _ = roc_curve(y_test, y_score)
# # roc_auc = auc(fpr, tpr)
# # print(roc_auc)
# # plt.plot(fpr, tpr)
# # plt.show()
# print(cross_val_score(clf, X_train, y_train,
# scoring='roc_auc', cv=StratifiedKFold(shuffle=True)).min())
# # y_pred = cross_val_predict(clf, X_train, y_train, cv=10)
# # accuracy_score(y_test, y_pred)
# # res = pd.DataFrame(y_pred)
# # res.to_csv('res1.txt')
# # print(y_test.size, y_pred.size)
# # print(y_pred)
In [ ]:
# file = open('linear_train.txt', 'r')
# lo = list(map(lambda x: [x[0], float(x[1])], map(lambda x: x.split(', '), map(str.rstrip, file.readlines()))))
# file.close()
# raw = pd.DataFrame(lo)
# clses = raw[1].values
# slow = np.vectorize(str.lower)
# fff_test = pd.read_csv('linear_test.txt', header=None)
# for j in [2, 3, 4, 5]:
# for i in range(6, j - 1, -1):
# f = np.vectorize(lambda x: x[-i:])
# suffs = slow(f(raw[0].values))
# suffs_t = slow(f(fff_test[0].values))
# F = Features(np.sum(clses), clses.size)
# list(map(lambda x, y: F.spliter(x, y, ps=j), suffs, clses))
# F.dicsort()
# # l, mi, ma = F.tolist()
# # print(mi, ma)
# gf = np.array(list(map(lambda x: F.get_feature(x, ps=j), suffs)))
# gf_t = np.array(list(map(lambda x: F.get_feature(x, ps=j), suffs_t)))
# raw['r'+str(i)+'_'+str(j)] = pd.DataFrame(gf)
# fff_test['r'+str(i)+'_'+str(j)] = pd.DataFrame(gf_t)
# tit = np.array(list(map(lambda x: 1.0 if x.istitle() else 0.0, raw[0].values)))
# raw['tit'] = pd.DataFrame(tit)
# tit = np.array(list(map(lambda x: 1.0 if x.istitle() else 0.0, fff_test[0].values)))
# fff_test['tit'] = pd.DataFrame(tit)
# caps = np.array(list(map(lambda x: 1.0 if x.isupper() else 0.0, raw[0].values)))
# raw['caps'] = pd.DataFrame(caps)
# caps = np.array(list(map(lambda x: 1.0 if x.isupper() else 0.0, fff_test[0].values)))
# fff_test['caps'] = pd.DataFrame(caps)
# raw.head()
# fff_test.head()
In [ ]:
# from sklearn import linear_model
# from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
# from sklearn.metrics import accuracy_score, roc_curve, auc
# from sklearn.ensemble import GradientBoostingClassifier
# X = raw.drop([0, 1], axis=1).values
# Y = clses
# # X_test = np.array(list(map(F.get_feature, suffs)))
# X_train, X_test, y_train, y_test = train_test_split(
# X, Y, test_size=0.1)#, random_state=42)
# clf = linear_model.SGDClassifier(loss='log', penalty='l1')
# # clf = linear_model.LogisticRegression(penalty='l2')
# print(cross_val_score(clf, X_train, y_train,
# scoring='roc_auc', cv=StratifiedKFold(shuffle=True)).min())
# clf = GradientBoostingClassifier(n_estimators=150, random_state=42, max_depth=5)
# clf.fit(X_train, y_train)
# y_score = clf.decision_function(X_test)
# # print(y_score)
# fpr, tpr, _ = roc_curve(y_test, y_score)
# roc_auc = auc(fpr, tpr)
# print(roc_auc)
# plt.plot(fpr, tpr)
# plt.show()
In [ ]:
# fff = pd.read_csv('linear_test.txt', header=None)
# # fff
# # file = open('linear_test.txt', 'r')
# # lo = list(map(lambda x: [x[0], float(x[1])], map(lambda x: x.split(', '), map(str.rstrip, file.readlines()))))
# # file.close()
# # raw = pd.DataFrame(lo)
# f = np.vectorize(lambda x: x[-5:])
# slow = np.vectorize(str.lower)
# # suffs = slow(f(raw[0].values))
# # f = np.vectorize(lambda x: x[-5:])
# suffs = slow(f(fff[0].values))
# # clses = fff[1].values
In [ ]:
# from sklearn import linear_model
# from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
# from sklearn.metrics import accuracy_score, roc_curve, auc
# X = raw.drop([0, 1], axis=1).values
# X_t = fff_test.drop([0], axis=1).values
# Y = clses
# # clf = linear_model.SGDClassifier(loss='log', penalty='l1')
# # clf = linear_model.LogisticRegression(penalty='l2', solver='lbfgs')
# # clf = GradientBoostingClassifier(n_estimators=200, random_state=42, max_depth=5)
# # clf.fit(X, Y)
# # y_pred = clf.predict_proba(X_t)#[:, 0]
# # res = pd.DataFrame({y_pred})
# # res.to_csv('res3.txt')
# # print(y_test.size, y_pred.size)
# model = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', C=0.75)
# model.fit(X, Y)
# pred = model.predict_proba(X_t)[:, 0]
# ans = pd.DataFrame()
# ans["Id"] = range(len(pred))
# ans["Answer"] = pred
# ans.to_csv('res5.txt', sep=',', index=None)
In [ ]:
# import numpy as np
# import pandas as pd
# from sklearn import model_selection, metrics, linear_model, svm
# from sklearn.feature_extraction.text import FeatureHasher, TfidfVectorizer, TfidfTransformer, CountVectorizer
# import scipy.sparse as ss
# raw_train = pd.read_csv('linear_train.txt', header=None)
# fff_test = pd.read_csv('linear_test.txt', header=None)
# f = np.vectorize(lambda x: x[-5:])
# slow = np.vectorize(str.lower)
# suffs = slow(f(raw_train[0].values))
# cv = CountVectorizer(ngram_range=(2, 4), lowercase=False, analyzer='char_wb')
# model_cv = cv.fit(suffs)
# data_train = model_cv.transform(suffs)
# data_test = model_cv.transform(fff_test[0].values)
# # data_train.shape
# # tit = np.array(list(map(lambda x: 1.0 if x.istitle() else 0.0, raw_train[0].values)))
# # data_train['tit'] = pd.DataFrame(tit)
# new_train = ss.hstack((data_train, pd.DataFrame(list(map(lambda x: 1.0 if x.istitle() else 0.0, raw_train[0].values)))))
# new_test = ss.hstack((data_test, pd.DataFrame(list(map(lambda x: 1.0 if x.istitle() else 0.0, fff_test[0].values)))))
# new_train = ss.hstack((new_train, pd.DataFrame(list(map(lambda x: 1.0 if x.isupper() else 0.0, raw_train[0].values)))))
# new_test = ss.hstack((new_test, pd.DataFrame(list(map(lambda x: 1.0 if x.isupper() else 0.0, fff_test[0].values)))))
# new_y_train = raw_train[1]
# model = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', C=1.)
# # model = GradientBoostingClassifier()
# print(model_selection.cross_val_score(model, new_train, new_y_train,
# scoring='roc_auc', cv=model_selection.StratifiedKFold(shuffle=True)).mean())
# model = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', C=1.)
# model.fit(new_train, new_y_train)
# pred = model.predict_proba(new_test)[:, 0]
# ans = pd.DataFrame()
# ans["Id"] = range(len(pred))
# ans["Answer"] = pred
# ans.to_csv('res4.txt', sep=',', index=None)
In [ ]: