In [1]:
import pandas as pd
import numpy as np
import vowpalwabbit as vw
from sklearn import model_selection, metrics, linear_model, svm
from sklearn.feature_extraction.text import FeatureHasher, TfidfVectorizer, TfidfTransformer, CountVectorizer
import sklearn.pipeline as pipe
import re
In [2]:
# Read both CSV files with header=None, so pandas numbers the columns 0, 1, ...
train = pd.read_csv('Data - ФИВТ МФТИ - Линейные модели, часть 1_ фамилии _ Kaggle in Class.txt', header=None)
test = pd.read_csv('Data - ФИВТ МФТИ - Линейные модели, часть 1_ фамилии _ Kaggle in Class (2).txt', header=None)

# Stack train on top of test with a fresh 0..n-1 index so later cells can clean
# the text and fit the vocabulary on every row at once.
# pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
data = pd.concat([train, test], ignore_index=True)
import scipy.sparse as ss
In [3]:
# Compiled once at definition time; matches runs of non-word characters
# (anything other than letters, digits and underscore, Unicode-aware).
_NON_WORD_RE = re.compile(r'\W+', re.UNICODE)


def clear(s):
    """Return ``s`` with every run of non-word characters removed.

    Parameters
    ----------
    s : bytes or text
        The raw value. Byte strings are decoded as UTF-8 first; values that
        are already text are used as-is, so the function works on both
        Python 2 ``str`` input and already-decoded text, and applying it
        twice gives the same result as applying it once.

    Returns
    -------
    text
        The decoded string with whitespace, punctuation and other non-word
        characters stripped out (underscores are word characters and stay).
    """
    # Only byte strings need decoding; calling .decode on text fails on
    # Python 3 and on non-ASCII unicode in Python 2.
    if isinstance(s, bytes):
        s = s.decode('utf-8')
    return _NON_WORD_RE.sub('', s)
In [4]:
# Display the first rows of the test frame as a quick sanity check of the load.
test.head()
Out[4]:
In [5]:
# Overwrite column 0 of the combined frame with clear() applied to each value.
# NOTE(review): this mutates `data` in place, so the cell is only safe to re-run
# if clear() accepts its own output — confirm before re-executing.
data[0] = data[0].apply(clear)
In [6]:
# Display the first rows of the combined frame after column 0 was cleaned.
data.head()
Out[6]:
In [7]:
def get_feature(data):
    """Flag strings whose first character is an uppercase letter.

    Parameters
    ----------
    data : pandas.Series
        Series of strings.

    Returns
    -------
    pandas.Series
        Same index as ``data``; ``1.0`` where the first character is
        uppercase, ``0.0`` otherwise. Empty strings map to ``0.0``.
    """
    # s[:1] instead of s[0]: slicing yields '' for an empty string (and
    # ''.isupper() is False), whereas indexing would raise IndexError.
    return data.apply(lambda s: 1. if s[:1].isupper() else 0.)
In [19]:
# Candidate values for the `C` argument of LogisticRegression.
# NOTE(review): `Cs` is assigned again at the top of the cross-validation loop
# further down, so this list is overwritten before the loop reads it.
Cs = [0.37, 0.4, 0.43, 0.47]
In [8]:
def caps_feature(data):
    """Flag strings that are written entirely in capital letters.

    Each element has surrounding whitespace stripped and is then tested with
    ``str.isupper``. The result is a Series aligned with ``data`` holding
    ``1.0`` for all-caps entries and ``0.0`` for everything else.
    """
    def _all_caps_flag(text):
        trimmed = text.strip()
        if trimmed.isupper():
            return 1.
        return 0.

    flags = data.apply(_all_caps_flag)
    return flags
In [13]:
train_size = len(train)

# Character n-grams (1 to 6 characters, bounded by word edges, case preserved).
# The vocabulary is fitted on column 0 of the combined frame, i.e. on train and
# test rows together.
cv = CountVectorizer(ngram_range=(1, 6), lowercase=False, analyzer='char_wb')
model_cv = cv.fit(data[0])

# Rows with a value in column 1 go to the training split; rows where column 1
# is missing go to the test split.
has_label = data[1].notnull()
X_train = data[0][has_label]
X_test = data[0][~has_label]
# Single pre-formatted argument so the line prints "n m" on Python 2 and 3 alike.
print('%d %d' % (len(X_train), len(X_test)))

data_train = model_cv.transform(X_train)
data_test = model_cv.transform(X_test)

# TF-IDF reweighting was tried here and left disabled. If it is re-enabled, the
# transformer must be fitted on the training matrix only and then applied to the
# test matrix with .transform(), not re-fitted with .fit_transform().


def _with_case_features(ngram_counts, words):
    """Append the two capitalisation flags as extra columns to ``ngram_counts``."""
    case_flags = np.column_stack((get_feature(words), caps_feature(words))).astype(float)
    # CSR output supports the row indexing that cross-validation needs.
    return ss.hstack((ngram_counts, ss.csr_matrix(case_flags)), format='csr')


new_train = _with_case_features(data_train, X_train)
new_test = _with_case_features(data_test, X_test)
In [20]:
# Grid of inverse-regularisation strengths to compare.
Cs = [0.35, 0.4, 0.45, 0.5]

# Target column of the training frame; loop-invariant, so assigned once.
new_y_train = train[1]

# Seeded shuffling: every C is scored on the same folds, so the scores are
# directly comparable and the cell gives the same numbers on every run.
folds = model_selection.StratifiedKFold(shuffle=True, random_state=0)

for C in Cs:
    model = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', C=C)
    mean_auc = model_selection.cross_val_score(
        model, new_train, new_y_train, scoring='roc_auc', cv=folds).mean()
    # Single pre-formatted argument so the line prints the same on Python 2 and 3.
    print('C=%s: %s' % (C, mean_auc))
In [22]:
# Re-derive the target here so this cell does not rely on a variable that only
# exists as a side effect of the cross-validation loop.
new_y_train = train[1]

# Final model with the chosen regularisation strength, fitted on all training rows.
model = linear_model.LogisticRegression(penalty='l2', solver='lbfgs', C=0.4)
model.fit(new_train, new_y_train)

# predict_proba columns follow model.classes_ (labels in sorted order), so
# column 1 is the probability of the larger label. That is the class the
# 'roc_auc' scorer treats as positive; column 0 would submit the probability of
# the negative class and invert the ranking.
# NOTE(review): assumes binary labels where the larger one is the positive class — confirm.
pred = model.predict_proba(new_test)[:, 1]

# Explicit column order so "Id" always precedes "Answer" in the file.
ans = pd.DataFrame({"Id": np.arange(len(pred)), "Answer": pred}, columns=["Id", "Answer"])
ans.to_csv('submission.txt', sep=',', index=False)
In [ ]: