In [7]:
import numpy as np
import pandas as pd
from sklearn import model_selection, metrics, linear_model, svm
from sklearn.feature_extraction.text import FeatureHasher, TfidfVectorizer, TfidfTransformer, CountVectorizer
import scipy.sparse as ss

# Load labelled training words (col 0 = word, col 1 = label) and unlabelled test words.
raw_train = pd.read_csv('linear_train.txt', header=None)
fff_test = pd.read_csv('linear_test.txt', header=None)

f = np.vectorize(lambda x: x[-7:])  # all the useful information is at the end of the words (suffix)
slow = np.vectorize(str.lower)      # inconsistent letter case spoils n-gram matching
suffs = slow(f(raw_train[0].values))

# Feature generation: presence of character n-grams (length 1..6) inside the word suffix.
cv = CountVectorizer(ngram_range=(1, 6), lowercase=False, analyzer='char_wb')
model_cv = cv.fit(suffs)
data_train = model_cv.transform(suffs)
# BUG FIX: the test words must go through the SAME preprocessing as the training
# words (lowercased 7-char suffix). Previously the raw test words were transformed,
# so capitalised test words could not match the lowercase vocabulary built on train.
data_test = model_cv.transform(slow(f(fff_test[0].values)))


def _with_case_features(matrix, words):
    """Append two binary columns to a sparse feature matrix.

    Column 1: 1.0 if the word is Title-cased, else 0.0.
    Column 2: 1.0 if the word is ALL CAPS, else 0.0.
    Case features are computed from the raw (un-lowercased) words.
    """
    flags = pd.DataFrame({
        'title': [1.0 if w.istitle() else 0.0 for w in words],
        'caps': [1.0 if w.isupper() else 0.0 for w in words],
    })
    return ss.hstack((matrix, flags))


new_train = _with_case_features(data_train, raw_train[0].values)
new_test = _with_case_features(data_test, fff_test[0].values)

# Classification with a linear model: report mean ROC-AUC over stratified CV folds.
new_y_train = raw_train[1]
model = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', C=1.)
print(model_selection.cross_val_score(model, new_train, new_y_train,
                                      scoring='roc_auc', cv=model_selection.StratifiedKFold(shuffle=True)).mean())

# Refit on the full training set and write test-set predictions.
model = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', C=1.)
model.fit(new_train, new_y_train)
# NOTE(review): column 0 of predict_proba is P(class == model.classes_[0]);
# presumably the submission expects the probability of that first class — confirm.
pred = model.predict_proba(new_test)[:, 0]

ans = pd.DataFrame()
ans["Id"] = range(len(pred))
ans["Answer"] = pred
ans.to_csv('res4.txt', sep=',', index=False)  # index=False is the documented way to omit the row index


0.907896663061

In [ ]:
# file = open('text.txt', 'r')
# lo = list(map(lambda x: [x[0], float(x[1])], map(lambda x: x.split(', '), map(str.rstrip, file.readlines()))))
# file.close()
# lo

# raw = pd.DataFrame(lo)
# f = np.vectorize(lambda x: x[-5:])
# slow = np.vectorize(str.lower)
# suffs = slow(f(raw[0].values))
# clses = raw[1].values
# # list(map(lambda x: x[-4:], raw[0].values))
# suffs

In [ ]:
# import pandas as pd
# import numpy as np

In [ ]:
# # from numba import vectorize
# class Features():
#     def __init__(self, nsn, nal):
#         self.D = {}
#         self.nsn = nsn
#         self.nal = nal
#         self.nt = nal - nsn
        
# #     @vectorize
#     def dicinc(self, x, cl):
# #         x, cl = w
#         try:
#             y = self.D[x]
#             y[0] += 1
#             if cl == 1.0:
#                 y[1] += 1 #if cl == 1.0 else -1
#         except KeyError:
#             self.D[x] = [1, 1] #if cl == 1.0 else [1, -1]
# #     @vectorize
#     def spliter(self, x, cl, ps=2):
#         l = len(x)
#         for i in range(0, l - ps + 1):
#              self.dicinc(x[i:i+ps], cl)
#     def dicsort(self):
#         for key in self.D:
#             y = self.D[key]
#             a1 = y[1]
#             a0 = y[0] - y[1]
#             b1 = self.nsn - y[1]
#             b0 = self.nt - a0
#             y[1] = (a1*b0 - a0*b1) / self.nal / self.nal
#     def tolist(self):
#         l = list(map(lambda x: (self.D[x])[1], self.D))
# #         print(l)
#         l.sort()
#         return l, l[0:5], l[-5:-1]
#     def get_feature(self, x, ps=2):
#         wl = []
# #         x.lower()
#         l = len(x)
#         for i in range(0, l - ps + 1):
#             fr = x[i:i+ps]
#             try:
#                 y = self.D[fr]
#                 wl.append(y[1])
#             except KeyError:
#                 continue
# #         wl.sort()
#         if len(wl) == 0:
#             return 0.0
# #         ma = np.mean(np.array(wl))
#         ma = max(wl)
#         return ma if ma >= 0.0 else min(wl)
    
            
# spliter('ва')

In [ ]:
# file = open('linear_train.txt', 'r')
# lo = list(map(lambda x: [x[0], float(x[1])], map(lambda x: x.split(', '), map(str.rstrip, file.readlines()))))
# file.close()
# raw = pd.DataFrame(lo)
# f = np.vectorize(lambda x: x[-5:])
# slow = np.vectorize(str.lower)
# suffs = slow(f(raw[0].values))
# # suffs = f(raw[0].values)
# clses = raw[1].values

# F = Features(np.sum(clses), clses.size)
# # F.spliter(suffs, clses)
# list(map(lambda x, y: F.spliter(x, y), suffs, clses))
# # F.D
# F.dicsort()
# l, mi, ma = F.tolist()
# print(mi, ma)
# # F.D['ин']
# # print(F.nsn, F.nal)
# gf = np.array(list(map(F.get_feature, suffs)))
# ma = np.amax(gf)
# mi = np.amin(gf)
# # print(ma)
# feres = pd.DataFrame({'5_2': gf})

In [ ]:
# for key in F.D:
#     y = F.D[key]
#     if y[1] >= 0.001:
#         print(key, y)

In [ ]:
# fff = pd.read_csv('linear_test.txt', header=None)
# fff
# # file = open('linear_test.txt', 'r')
# # lo = list(map(lambda x: [x[0], float(x[1])], map(lambda x: x.split(', '), map(str.rstrip, file.readlines()))))
# # file.close()
# # raw = pd.DataFrame(lo)
# f = np.vectorize(lambda x: x[-5:])
# slow = np.vectorize(str.lower)
# # suffs = slow(f(raw[0].values))
# # f = np.vectorize(lambda x: x[-5:])
# suffs = slow(f(fff[0].values))
# # clses = fff[1].values

In [ ]:
# from sklearn import linear_model
# from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold
# from sklearn.metrics import accuracy_score, roc_curve, auc
# from sklearn.svm import SVC
# import matplotlib.pyplot as plt
# X = gf.reshape((gf.size, 1))
# Y = clses
# # X_test = np.array(list(map(F.get_feature, suffs)))
# X_train, X_test, y_train, y_test = train_test_split(
#     X, Y, test_size=0.1, random_state=42)

# clf = linear_model.SGDClassifier(loss='log', penalty='none')
# # clf = SVC()

# # clf.fit(X_train, y_train)
# # y_score = clf.decision_function(X_test.reshape((X_test.size, 1)))
# # print(y_score)
# # fpr, tpr, _ = roc_curve(y_test, y_score)
# # roc_auc = auc(fpr, tpr)
# # print(roc_auc)
# # plt.plot(fpr, tpr)
# # plt.show()
# print(cross_val_score(clf, X_train, y_train, 
#                                       scoring='roc_auc', cv=StratifiedKFold(shuffle=True)).min())
# # y_pred = cross_val_predict(clf, X_train, y_train, cv=10)
# # accuracy_score(y_test, y_pred)
# # res = pd.DataFrame(y_pred)
# # res.to_csv('res1.txt')
# # print(y_test.size, y_pred.size)
# # print(y_pred)

In [ ]:
# file = open('linear_train.txt', 'r')
# lo = list(map(lambda x: [x[0], float(x[1])], map(lambda x: x.split(', '), map(str.rstrip, file.readlines()))))
# file.close()
# raw = pd.DataFrame(lo)
# clses = raw[1].values
# slow = np.vectorize(str.lower)

# fff_test = pd.read_csv('linear_test.txt', header=None)

# for j in [2, 3, 4, 5]:
#     for i in range(6, j - 1, -1):
#         f = np.vectorize(lambda x: x[-i:])
#         suffs = slow(f(raw[0].values))
#         suffs_t = slow(f(fff_test[0].values))
#         F = Features(np.sum(clses), clses.size)
#         list(map(lambda x, y: F.spliter(x, y, ps=j), suffs, clses))
#         F.dicsort()
#         # l, mi, ma = F.tolist()
#         # print(mi, ma)
#         gf = np.array(list(map(lambda x: F.get_feature(x, ps=j), suffs)))
#         gf_t = np.array(list(map(lambda x: F.get_feature(x, ps=j), suffs_t)))
#         raw['r'+str(i)+'_'+str(j)] = pd.DataFrame(gf)
#         fff_test['r'+str(i)+'_'+str(j)] = pd.DataFrame(gf_t)

# tit = np.array(list(map(lambda x: 1.0 if x.istitle() else 0.0, raw[0].values)))
# raw['tit'] = pd.DataFrame(tit)
# tit = np.array(list(map(lambda x: 1.0 if x.istitle() else 0.0, fff_test[0].values)))
# fff_test['tit'] = pd.DataFrame(tit)

# caps = np.array(list(map(lambda x: 1.0 if x.isupper() else 0.0, raw[0].values)))
# raw['caps'] = pd.DataFrame(caps)
# caps = np.array(list(map(lambda x: 1.0 if x.isupper() else 0.0, fff_test[0].values)))
# fff_test['caps'] = pd.DataFrame(caps)

# raw.head()
# fff_test.head()

In [ ]:
# from sklearn import linear_model
# from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
# from sklearn.metrics import accuracy_score, roc_curve, auc
# from sklearn.ensemble import GradientBoostingClassifier
# X = raw.drop([0, 1], axis=1).values
# Y = clses
# # X_test = np.array(list(map(F.get_feature, suffs)))
# X_train, X_test, y_train, y_test = train_test_split(
#     X, Y, test_size=0.1)#, random_state=42)

# clf = linear_model.SGDClassifier(loss='log', penalty='l1')
# # clf = linear_model.LogisticRegression(penalty='l2')
# print(cross_val_score(clf, X_train, y_train, 
#                                       scoring='roc_auc', cv=StratifiedKFold(shuffle=True)).min())

# clf = GradientBoostingClassifier(n_estimators=150, random_state=42, max_depth=5)
# clf.fit(X_train, y_train)
# y_score = clf.decision_function(X_test)
# # print(y_score)
# fpr, tpr, _ = roc_curve(y_test, y_score)
# roc_auc = auc(fpr, tpr)
# print(roc_auc)
# plt.plot(fpr, tpr)
# plt.show()

In [ ]:
# fff = pd.read_csv('linear_test.txt', header=None)
# # fff
# # file = open('linear_test.txt', 'r')
# # lo = list(map(lambda x: [x[0], float(x[1])], map(lambda x: x.split(', '), map(str.rstrip, file.readlines()))))
# # file.close()
# # raw = pd.DataFrame(lo)
# f = np.vectorize(lambda x: x[-5:])
# slow = np.vectorize(str.lower)
# # suffs = slow(f(raw[0].values))
# # f = np.vectorize(lambda x: x[-5:])
# suffs = slow(f(fff[0].values))
# # clses = fff[1].values

In [ ]:
# from sklearn import linear_model
# from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
# from sklearn.metrics import accuracy_score, roc_curve, auc
# X = raw.drop([0, 1], axis=1).values
# X_t = fff_test.drop([0], axis=1).values
# Y = clses

# # clf = linear_model.SGDClassifier(loss='log', penalty='l1')
# # clf = linear_model.LogisticRegression(penalty='l2', solver='lbfgs')
# # clf = GradientBoostingClassifier(n_estimators=200, random_state=42, max_depth=5)
# # clf.fit(X, Y)
# # y_pred = clf.predict_proba(X_t)#[:, 0]
# # res = pd.DataFrame({y_pred})
# # res.to_csv('res3.txt')
# # print(y_test.size, y_pred.size)
# model = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', C=0.75)
# model.fit(X, Y)
# pred = model.predict_proba(X_t)[:, 0]
# ans = pd.DataFrame()
# ans["Id"] = range(len(pred))
# ans["Answer"] = pred
# ans.to_csv('res5.txt', sep=',', index=None)

In [ ]:
# import numpy as np
# import pandas as pd
# from sklearn import model_selection, metrics, linear_model, svm
# from sklearn.feature_extraction.text import FeatureHasher, TfidfVectorizer, TfidfTransformer, CountVectorizer
# import scipy.sparse as ss

# raw_train = pd.read_csv('linear_train.txt', header=None)
# fff_test = pd.read_csv('linear_test.txt', header=None)
# f = np.vectorize(lambda x: x[-5:])
# slow = np.vectorize(str.lower)
# suffs = slow(f(raw_train[0].values))
# cv = CountVectorizer(ngram_range=(2, 4), lowercase=False, analyzer='char_wb')
# model_cv = cv.fit(suffs)
# data_train = model_cv.transform(suffs)
# data_test = model_cv.transform(fff_test[0].values)
# # data_train.shape
# # tit = np.array(list(map(lambda x: 1.0 if x.istitle() else 0.0, raw_train[0].values)))
# # data_train['tit'] = pd.DataFrame(tit)
# new_train = ss.hstack((data_train, pd.DataFrame(list(map(lambda x: 1.0 if x.istitle() else 0.0, raw_train[0].values)))))
# new_test = ss.hstack((data_test, pd.DataFrame(list(map(lambda x: 1.0 if x.istitle() else 0.0, fff_test[0].values)))))
# new_train = ss.hstack((new_train, pd.DataFrame(list(map(lambda x: 1.0 if x.isupper() else 0.0, raw_train[0].values)))))
# new_test = ss.hstack((new_test, pd.DataFrame(list(map(lambda x: 1.0 if x.isupper() else 0.0, fff_test[0].values)))))

# new_y_train = raw_train[1]
# model = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', C=1.)
# # model = GradientBoostingClassifier()
# print(model_selection.cross_val_score(model, new_train, new_y_train, 
#                                       scoring='roc_auc', cv=model_selection.StratifiedKFold(shuffle=True)).mean())
# model = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', C=1.)
# model.fit(new_train, new_y_train)
# pred = model.predict_proba(new_test)[:, 0]
# ans = pd.DataFrame()
# ans["Id"] = range(len(pred))
# ans["Answer"] = pred
# ans.to_csv('res4.txt', sep=',', index=None)

In [ ]: