In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer shared by all cells below:
#   min_df=3       -> drop terms that appear in fewer than 3 documents
#   max_df=0.95    -> drop terms that appear in more than 95% of documents
#   sublinear_tf   -> use 1 + log(tf) instead of raw term frequency
tfv = TfidfVectorizer(min_df=3,max_df=0.95,sublinear_tf=True)
In [2]:
# Load the pickled experts table; replace every missing value with the
# literal string 'none' so the text columns can be vectorized directly.
experts_count = pd.read_pickle('./input/experts_count.pkl').fillna('none')
In [3]:
# Preview the first five rows of the loaded table.
experts_count.head()
Out[3]:
In [4]:
# Column dtypes, non-null counts, and memory usage.
experts_count.info()
In [7]:
# Target labels and the raw title text used as features.
Y = experts_count['Label']
X_title = experts_count['Title']
print(type(Y), type(X_title))
In [8]:
# Learn the TF-IDF vocabulary from every title and transform them into a
# sparse document-term matrix.
# NOTE(review): fitting happens before the train/test split in the next
# cell, so IDF statistics are computed over the test titles as well.
X_title = tfv.fit_transform(list(X_title))
In [11]:
# Inspect the sparse matrix repr (shape and number of stored elements).
X_title
Out[11]:
In [12]:
# for fun
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# the same helper now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows; fix random_state so the split (and the
# accuracies reported below) is reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_title, Y, test_size=.10, random_state=42)
In [14]:
# Baseline: logistic regression on the TF-IDF title features (C=2).
clf = LogisticRegression(C=2)
clf.fit(X_train, Y_train)
predicted = clf.predict(X_test)
# Fraction of held-out rows predicted correctly.
accuracy_score(Y_test, predicted)
Out[14]:
Now, let's begin the main experiments.
First, keep only training rows with Count > 10.
In [16]:
# Training rows = first 80k rows with Count > 10.  Compute the mask on
# the slice itself: indexing a sliced frame with a full-length boolean
# Series relies on index alignment (a UserWarning in older pandas and
# fragile in newer releases).
first_80k = experts_count[:80000]
X1_train = first_80k[first_80k.Count > 10]
In [18]:
# Refit TF-IDF on the filtered training titles plus ALL remaining titles.
# NOTE(review): rows 80000+ are used as the test set below, so the
# vocabulary/IDF statistics leak test-set information; fitting on the
# training titles only and merely transforming the rest would avoid it.
X1_title=tfv.fit_transform(list(X1_train['Title']) \
+list(experts_count[80000:]['Title']))
In [19]:
# Sanity check: sparse matrix type and (n_train + n_test, vocab_size) shape.
print (type(X1_title))
print (X1_title.shape)
In [21]:
# Rows [0, len(X1_train)) of X1_title are the filtered training titles;
# the remaining rows are the titles of rows 80000+ (the test set).
X_train = X1_title[:X1_train.shape[0]]
# Compute the label mask on the slice itself rather than indexing a
# sliced Series with a full-length boolean Series (fragile alignment,
# UserWarning in older pandas).
first_80k = experts_count[:80000]
Y_train = first_80k['Label'][first_80k.Count > 10]
X_test = X1_title[X1_train.shape[0]:]
Y_test = experts_count['Label'][80000:]
In [22]:
# for C=2
model = LogisticRegression(C=2)
model.fit(X_train, Y_train)
preds = model.predict(X_test)
# Accuracy on rows 80000+ for the title features, Count > 10 filter.
accuracy_score(Y_test, preds)
Out[22]:
We can also tune the regularization parameter C. Let's see what happens!
In [23]:
# for C=3
model = LogisticRegression(C=3)
model.fit(X_train, Y_train)
preds = model.predict(X_test)
# Same setup as above with weaker regularization (larger C).
accuracy_score(Y_test, preds)
Out[23]:
Next, let's see what happens when experts_count.Count > 20.
Conclusion for title: from the results above, filtering with Count > ? and using C = ? gives the best result, an accuracy of 0.???.
In [25]:
# Rebuild the Count > 10 training subset (now used for the Tag column).
# Mask computed on the slice itself to avoid indexing a sliced frame
# with a full-length boolean Series (fragile index alignment).
first_80k = experts_count[:80000]
X1_train = first_80k[first_80k.Count > 10]
In [26]:
# Refit TF-IDF on the filtered training tags plus ALL remaining tags.
# NOTE(review): as with the titles, rows 80000+ form the test set below,
# so the vocabulary/IDF statistics include test-set information.
X1_tag=tfv.fit_transform(list(X1_train['Tag']) \
+list(experts_count[80000:]['Tag']))
In [27]:
# Sanity check: sparse matrix type and (n_train + n_test, vocab_size) shape.
print (type(X1_tag))
print (X1_tag.shape)
In [28]:
# Rows [0, len(X1_train)) of X1_tag are the filtered training tags;
# the remaining rows are the tags of rows 80000+ (the test set).
X_train = X1_tag[:X1_train.shape[0]]
# Mask computed on the slice itself rather than indexing a sliced
# Series with a full-length boolean Series (fragile alignment).
first_80k = experts_count[:80000]
Y_train = first_80k['Label'][first_80k.Count > 10]
X_test = X1_tag[X1_train.shape[0]:]
Y_test = experts_count['Label'][80000:]
In [29]:
# for C=2
model = LogisticRegression(C=2)
model.fit(X_train, Y_train)
preds = model.predict(X_test)
# Accuracy on rows 80000+ for the tag features, Count > 10 filter.
accuracy_score(Y_test, preds)
Out[29]:
Next, let's see what happens when experts_count.Count > 20.
In [30]:
# Stricter training subset: first 80k rows with Count > 20.  Mask is
# computed on the slice itself to avoid indexing a sliced frame with a
# full-length boolean Series (fragile index alignment).
first_80k = experts_count[:80000]
X2_train = first_80k[first_80k.Count > 20]
In [32]:
# Refit TF-IDF on the Count > 20 training tags plus all remaining tags.
# NOTE(review): same test-set leakage as above -- the IDF statistics
# include rows 80000+, which serve as the test set below.
X2_tag=tfv.fit_transform(list(X2_train['Tag']) \
+list(experts_count[80000:]['Tag']))
# Sanity check: sparse matrix type and (n_train + n_test, vocab_size) shape.
print (type(X2_tag))
print (X2_tag.shape)
In [37]:
# Rows [0, len(X2_train)) of X2_tag are the filtered training tags;
# the remaining rows are the tags of rows 80000+ (the test set).
X_train = X2_tag[:X2_train.shape[0]]
# Mask computed on the slice itself rather than indexing a sliced
# Series with a full-length boolean Series (fragile alignment).
first_80k = experts_count[:80000]
Y_train = first_80k['Label'][first_80k.Count > 20]
X_test = X2_tag[X2_train.shape[0]:]
Y_test = experts_count['Label'][80000:]
In [38]:
# for C=2
model = LogisticRegression(C=2)
model.fit(X_train, Y_train)
preds = model.predict(X_test)
# Accuracy on rows 80000+ for the tag features, Count > 20 filter.
accuracy_score(Y_test, preds)
Out[38]:
We can also tune the regularization parameter C. Let's see what happens!
In [39]:
# for C=3
model = LogisticRegression(C=3)
model.fit(X_train, Y_train)
preds = model.predict(X_test)
# Same setup with weaker regularization (larger C).
print ("The test accuracy is : %r" % accuracy_score(Y_test, preds))
And more thresholds to try...
In [ ]:
# Candidate threshold Count > 30 (features/labels not built yet).
# Mask computed on the slice itself to avoid indexing a sliced frame
# with a full-length boolean Series (fragile index alignment).
first_80k = experts_count[:80000]
X3_train = first_80k[first_80k.Count > 30]
In [ ]:
# Candidate threshold Count > 40 (features/labels not built yet).
# Mask computed on the slice itself to avoid indexing a sliced frame
# with a full-length boolean Series (fragile index alignment).
first_80k = experts_count[:80000]
X4_train = first_80k[first_80k.Count > 40]
Conclusion for tag: from the results above, filtering with Count > ? and using C = ? gives the best result, an accuracy of 0.???.
In [ ]: