In [1]:
import re

import numpy as np
import pandas as pd
import scipy.sparse as ss
import sklearn.pipeline as pipe
import vowpalwabbit as vw
from sklearn import model_selection, metrics, linear_model, svm
from sklearn.feature_extraction.text import FeatureHasher, TfidfVectorizer, TfidfTransformer, CountVectorizer

In [2]:
# Raw competition files, read without a header row so columns are named 0, 1, ...
train = pd.read_csv('Data - ФИВТ МФТИ - Линейные модели, часть 1_ фамилии _ Kaggle in Class.txt', header=None)
test = pd.read_csv('Data - ФИВТ МФТИ - Линейные модели, часть 1_ фамилии _ Kaggle in Class (2).txt', header=None)

# Stack train on top of test with a fresh 0..n-1 index. pd.concat is used
# because DataFrame.append was deprecated and later removed from pandas; the
# resulting frame is the same.
data = pd.concat([train, test], ignore_index=True)

In [3]:
# Runs of non-word characters; compiled once instead of on every call.
_NON_WORD_RE = re.compile(r'\W+', re.UNICODE)


def clear(s):
    """Return ``s`` as text with every non-word character removed.

    Parameters
    ----------
    s : bytes or str
        Raw value. Byte strings are decoded as UTF-8; values that are
        already text are used as they are, so the function is safe to apply
        more than once to the same column.

    Returns
    -------
    str
        ``s`` without whitespace, punctuation and other characters matched
        by ``\\W``. Letters, digits and underscores are kept.

    Raises
    ------
    UnicodeDecodeError
        If ``s`` is a byte string that is not valid UTF-8.
    """
    # Only byte strings need decoding; calling .decode on text fails outright
    # on Python 3 and on non-ASCII unicode under Python 2.
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    return _NON_WORD_RE.sub('', s)

In [4]:
# Preview the first five rows of the test frame.
test.head(n=5)


Out[4]:
0
0 Аалто
1 ААР
2 Аара
3 Ааре
4 Аарон

In [5]:
# Run every entry of column 0 through clear() and store the result back in place.
data[0] = data[0].map(clear)

In [6]:
# Preview the first five rows of the combined frame after cleaning.
data.head(n=5)


Out[6]:
0 1
0 Аалтонен 1.0
1 Аар 0.0
2 Аарон 0.0
3 ААРОН 0.0
4 Аарона 0.0

In [7]:
def get_feature(data):
    """Flag strings whose first character is an uppercase letter.

    Parameters
    ----------
    data : pandas.Series
        Series of strings.

    Returns
    -------
    pandas.Series
        ``1.0`` where the string starts with an uppercase character and
        ``0.0`` otherwise, including for empty strings.
    """
    # Slice with s[:1] rather than index with s[0]: an empty string then
    # yields 0.0 instead of raising IndexError.
    extra = data.apply(lambda s: 1. if s[:1].isupper() else 0.)
    return extra

In [19]:
# Candidate values kept under the name Cs.
# NOTE(review): the cross-validation cell assigns Cs again before looping over
# it, so this list is never read as the notebook is laid out.
Cs = [
    0.37,
    0.4,
    0.43,
    0.47,
]

In [8]:
def caps_feature(data):
    """Flag strings that are written entirely in capitals.

    Surrounding whitespace is ignored. A string counts as all-caps when it
    has at least one cased character and none of them is lowercase.

    Parameters
    ----------
    data : pandas.Series
        Series of strings.

    Returns
    -------
    pandas.Series
        ``1.0`` for all-caps strings, ``0.0`` for everything else.
    """
    def _all_caps_flag(text):
        # float(True) == 1.0 and float(False) == 0.0
        return float(text.strip().isupper())

    return data.apply(_all_caps_flag)

In [13]:
# Learn the character n-gram vocabulary (1- to 6-grams, case preserved) from
# train and test rows together, so both matrices share one column layout.
train_size = len(train)
cv = CountVectorizer(ngram_range=(1, 6), lowercase=False, analyzer='char_wb')
model_cv = cv.fit(data[0])

# Rows with a label form the training set; rows with a missing label are the
# test set.
X_train = data[0][data[1].notnull()]
X_test = data[0][data[1].isnull()]
# A single pre-formatted argument prints identically on Python 2 and Python 3.
print('%d %d' % (len(X_train), len(X_test)))


def _sparse_column(feature):
    """Convert a 1-D numeric feature into an (n_rows, 1) sparse column."""
    return ss.csr_matrix(np.asarray(feature, dtype=float).reshape(-1, 1))


def _design_matrix(counts, surnames):
    """Append the two capitalisation flags to the n-gram count matrix."""
    blocks = (
        counts,
        _sparse_column(get_feature(surnames)),
        _sparse_column(caps_feature(surnames)),
    )
    # CSR keeps the row slicing done by the estimators and CV splitters cheap.
    return ss.hstack(blocks, format='csr')


data_train = model_cv.transform(X_train)
data_test = model_cv.transform(X_test)
new_train = _design_matrix(data_train, X_train)
new_test = _design_matrix(data_test, X_test)


101408 188920

In [20]:
# Inverse regularisation strengths to compare.
Cs = [0.35, 0.4, 0.45, 0.5]

# The labels do not depend on C, so take them once outside the loop.
new_y_train = train[1]

# One seeded splitter shared by every C: each candidate is scored on exactly
# the same shuffled folds, so score differences reflect C rather than the
# random split, and re-running the cell reproduces the numbers.
folds = model_selection.StratifiedKFold(shuffle=True, random_state=0)

for C in Cs:
    model = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', C=C)
    scores = model_selection.cross_val_score(model, new_train, new_y_train,
                                             scoring='roc_auc', cv=folds)
    # Mean ROC AUC over the folds, printed in the same order as Cs.
    print(scores.mean())


0.917252258
0.91695555285
0.917426648684
0.917133827647

In [22]:
# Refit on the full training set with the chosen C. The labels are taken again
# here so this cell does not depend on a variable left over from another cell.
new_y_train = train[1]
model = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', C=0.4)
model.fit(new_train, new_y_train)

# predict_proba columns follow model.classes_, so column 0 is the probability
# of label 0. Cross-validation above scores ROC AUC for label 1, so the
# submission uses the column that belongs to label 1.
# NOTE(review): assumes the competition scores the probability of label 1 -
# confirm against the submission spec.
positive_col = list(model.classes_).index(1)
pred = model.predict_proba(new_test)[:, positive_col]

# Two-column submission: a running row id and the predicted probability.
ans = pd.DataFrame({"Id": np.arange(len(pred)), "Answer": pred},
                   columns=["Id", "Answer"])
ans.to_csv('submission.txt', sep=',', index=False)

In [ ]: