In [11]:
%%writefile test.csv
number,group,text,positive
110,C,people corporation problem problem think think transport science go transport place,True
114,A,you problem government you,True
130,B,you people place corporation problem science,True
185,C,have think corporation fact corporation transport transport,True
149,A,people have science science go go corporation you you fact,True
115,C,science people go transport corporation,True
156,A,transport corporation,True
190,C,transport you,True
160,A,the problem think transport place government problem go place the have place place,True
142,B,fact science government go,True
196,A,the fact,False
166,A,problem the corporation you you the transport have work go work problem you,True
168,B,have science science place the science problem,False
167,A,government place think you corporation corporation government the go,True
143,C,place science have fact science corporation science,True
185,C,place the fact transport the transport work science fact people the think problem,True
196,C,place government work science the have go work fact place the fact,True
102,A,go government think people the have the,True
104,C,work transport problem fact people,True
177,C,work problem you think think you transport go think fact go go,True
197,B,government,True
104,A,corporation corporation fact corporation work work,True
151,C,corporation science you people corporation government people go,True
105,A,go fact fact,False
165,B,think fact government the work problem work corporation transport,True
107,B,work transport transport fact people science corporation government,True
177,B,science people you corporation transport people corporation corporation science think have problem government,True
186,B,you government problem you government science fact,True
162,B,go have go go corporation fact government,True
107,A,think have the the science problem fact think go you,False
166,B,transport place place you go government have fact fact you the people think,True
163,C,work problem place have the you you problem people fact science place go think,False
121,B,place government problem,True
118,B,work corporation people the transport corporation corporation government problem the transport have fact people,True
106,A,the problem place work place corporation go work transport,True
138,C,transport science,True
111,B,fact transport corporation place corporation you place corporation you government place people transport government,True
123,A,have work place fact science think work think go place place have problem,False
181,C,corporation transport think,True
156,B,have problem have transport have go the,True
109,B,people you have have go place transport you science corporation people,True
157,C,transport the fact have,True
192,C,have,False
183,C,the you problem think place think you transport people go problem fact go,True
134,A,people place you the corporation science think government transport,True
115,B,transport problem place problem think transport go corporation corporation you place,True
154,A,government government think corporation have science corporation transport,True
106,A,you go science transport problem problem the corporation problem transport problem think,True
138,C,place transport you people work go science government transport people,True
159,B,think have the fact think you,False
194,B,have government the place science you,True
127,C,work think you science science,False
102,C,go corporation you corporation transport place place think people go fact fact,True
103,C,have think corporation problem work go science place fact problem problem problem people,True
127,B,corporation people think go transport fact you go transport science,True
101,C,people science people have place corporation think,True
149,B,the the you fact place people science problem think go think place,False
136,A,place,False
109,A,science problem transport,True
156,C,work place think have work go transport you problem go,True
168,B,fact science problem have think,False
189,C,fact problem problem go government corporation,True
102,B,fact the corporation have science place the have,True
161,A,corporation you fact,True
161,B,work corporation corporation have place work you transport science problem government,True
177,B,fact fact,False
123,B,government,True
146,C,the fact you work you corporation place corporation fact,True
136,A,work think you,False
184,A,corporation transport go place have government go you you corporation,True
185,A,transport go fact corporation have have government think people,True
125,A,place place people work science think fact fact,False
191,A,science work government fact the problem you,True
170,B,you transport work corporation work place government people corporation you the,True
186,A,work science think corporation think fact,True
142,A,work people go think go fact transport go you,True
187,A,you problem problem work,False
183,C,corporation corporation work place fact government think work work have you have the,True
128,A,place problem transport the,True
118,C,transport think work you transport you work corporation have have have place,True
151,C,work problem you go,False
136,A,you the you think,False
178,C,corporation you the people science people think,True
171,C,transport you government you corporation the problem you have place place have think,True
104,C,transport work go corporation think,True
157,A,go place people place the,False
114,C,science go corporation corporation government place problem the,True
141,C,think government corporation go the government science go corporation problem place have people problem,True
113,B,transport science fact think fact corporation corporation work,True
170,B,transport government think you go go,True
137,B,science people go government have,True
170,C,science people place,False
154,B,place go transport the government corporation fact transport fact go you corporation,True
180,B,go transport fact government government work you science,True
166,A,work go work people science the place people problem work go work,False
157,C,problem fact fact the have fact go you government work people,True
198,C,place think transport government,True
153,A,government place problem work go the work have fact have people have work fact,True
138,A,people,False
111,B,transport science people think think government people fact,True
In [12]:
import pandas as pd
In [13]:
# Load the CSV written to test.csv by the %%writefile cell into a DataFrame.
df = pd.read_csv('test.csv')
In [15]:
# Preview the first rows to check that the columns parsed as expected.
df.head()
Out[15]:
In [34]:
# Class balance of the target: share of rows for each value of `positive`.
df.positive.value_counts(normalize=True)
Out[34]:
In [16]:
from sklearn.model_selection import train_test_split
In [20]:
# Hold out one third of the rows for evaluation.
# - random_state pins the shuffle so every "Restart & Run All" yields the same
#   split (and therefore the same scores in the cells below).
# - stratify keeps the True/False ratio of `positive` the same in both splits,
#   which matters because the target is imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    df[df.columns.difference(['positive'])],
    df['positive'],
    test_size=1/3,
    random_state=42,
    stratify=df['positive'],
)
In [21]:
# (rows, columns) of the training features.
x_train.shape
Out[21]:
In [23]:
# Training labels; the row count should match x_train.
y_train.shape
Out[23]:
In [24]:
# (rows, columns) of the held-out features (test_size=1/3 of the data).
x_test.shape
Out[24]:
In [25]:
from sklearn.linear_model import LogisticRegression
In [31]:
# Baseline model: logistic regression trained on the `number` column alone.
# Double brackets keep the selection two-dimensional (a one-column DataFrame).
clf = LogisticRegression()
clf.fit(X=x_train.loc[:, ['number']], y=y_train)
Out[31]:
In [32]:
# For classifiers, `score` reports accuracy on the given data.
clf.score(X=x_test.loc[:, ['number']], y=y_test)
Out[32]:
In [35]:
from sklearn.metrics import roc_auc_score
In [45]:
# Inspect the label order learned by the classifier; the columns returned by
# `predict_proba` follow this order.
clf.classes_
Out[45]:
In [46]:
# ROC AUC needs scores rather than hard class predictions, so ask the model
# for probabilities and keep the column at index 1 (the True class).
x_test_probabilities = clf.predict_proba(X=x_test[['number']])[:, 1]
roc_auc_score(y_true=y_test, y_score=x_test_probabilities)
Out[46]:
In [47]:
from sklearn.metrics import precision_score, recall_score
In [49]:
# Precision of the predicted labels on the held-out rows.
precision_score(y_true=y_test, y_pred=clf.predict(X=x_test[['number']]))
Out[49]:
In [50]:
# Recall of the predicted labels on the held-out rows.
recall_score(y_true=y_test, y_pred=clf.predict(X=x_test[['number']]))
Out[50]:
In [52]:
# One-hot encode the categorical `group` column of the training split;
# all other columns are passed through unchanged.
x_train_with_categorical = pd.get_dummies(data=x_train, columns=['group'])
In [53]:
# Preview the encoded frame: `group` is replaced by one indicator column per value.
x_train_with_categorical.head()
Out[53]:
In [54]:
# One-hot encode the test split, then align it to the TRAINING columns.
# get_dummies only creates columns for the values it sees, so encoding the two
# splits independently can give them different columns when a `group` value is
# missing from one of them. Reindexing adds any absent indicator as all zeros
# and drops indicators the model was never trained on.
x_test_with_categorical = (
    pd.get_dummies(x_test, columns=['group'])
    .reindex(columns=x_train_with_categorical.columns, fill_value=0)
)
In [55]:
# Second model: numeric + one-hot encoded columns, leaving out the raw `text`
# column (free text cannot be fed to the model directly).
clf = LogisticRegression()
clf.fit(
    X=x_train_with_categorical[x_train_with_categorical.columns.difference(['text'])],
    y=y_train,
)
Out[55]:
In [59]:
# Accuracy on the held-out rows, using the same column selection as in training.
clf.score(
    X=x_test_with_categorical[x_test_with_categorical.columns.difference(['text'])],
    y=y_test,
)
Out[59]:
In [60]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
In [67]:
# Bag-of-words encoder; the vocabulary is learned from the training text only.
x_train_vectorizer = CountVectorizer()
x_train_vectorizer.fit(raw_documents=x_train['text'])
Out[67]:
In [68]:
# Turn each training document into a row of token counts.
x_train_vectorized = x_train_vectorizer.transform(raw_documents=x_train['text'])
In [69]:
# Display the vectorized training text (its repr shows shape and storage format).
x_train_vectorized
Out[69]:
In [70]:
# Third model: logistic regression on the token counts alone.
clf = LogisticRegression()
clf.fit(X=x_train_vectorized, y=y_train)
Out[70]:
In [71]:
# Accuracy on the held-out text, encoded with the vectorizer fitted on the
# training split (transform only - no refitting on test data).
clf.score(X=x_train_vectorizer.transform(raw_documents=x_test['text']), y=y_test)
Out[71]:
In [ ]:
# The same process can be repeated with the TF-IDF and hashing vectorizers (TfidfVectorizer, HashingVectorizer).
In [74]:
import numpy as np
In [76]:
# Re-display the vectorized training text before combining it with the other features.
x_train_vectorized
Out[76]:
In [88]:
# Combine the token counts with the numeric and one-hot encoded columns into a
# single training matrix (one row per training example).
# - The tabular columns are selected with the TRAIN frame's own column index.
#   The original used the test frame's columns here, which raises a KeyError or
#   selects the wrong set whenever the two frames' dummy columns differ.
# - `.toarray()` returns a plain ndarray; `.todense()` returns np.matrix, which
#   recent scikit-learn releases refuse as estimator input.
# - The tabular block is cast to float so mixed bool/int columns do not produce
#   an object-dtype array.
features = np.concatenate(
    [
        x_train_vectorized.toarray(),
        x_train_with_categorical[
            x_train_with_categorical.columns.difference(['text'])
        ].values.astype(float),
    ],
    axis=1,
)
In [89]:
# Display the combined training feature matrix.
features
Out[89]:
In [72]:
# Fresh, unfitted logistic regression; rebinding `clf` discards the previously fitted model.
clf = LogisticRegression()
In [90]:
# Train on the combined text + tabular feature matrix.
clf.fit(X=features, y=y_train)
Out[90]:
In [92]:
# Build the held-out feature matrix the same way as the training one: token
# counts from the vectorizer fitted on the training text, followed by the
# numeric and one-hot encoded columns.
# - `.toarray()` returns a plain ndarray; `.todense()` returns np.matrix, which
#   recent scikit-learn releases refuse as estimator input.
# - The tabular block is cast to float so mixed bool/int columns do not produce
#   an object-dtype array.
features_test = np.concatenate(
    [
        x_train_vectorizer.transform(x_test['text']).toarray(),
        x_test_with_categorical[
            x_test_with_categorical.columns.difference(['text'])
        ].values.astype(float),
    ],
    axis=1,
)
In [93]:
# Accuracy of the combined-feature model on the held-out rows.
clf.score(X=features_test, y=y_test)
Out[93]:
To try a different model, just swap the classifier; the interface is the same.
List of scikit-learn classifiers: https://scikit-learn.org/stable/supervised_learning.html
In [94]:
from sklearn.ensemble import RandomForestClassifier
In [95]:
# Same fit/score interface, different estimator. A random forest is built with
# randomness (bootstrap samples and feature subsets), so pin the seed to make
# the reported score reproducible across kernel restarts.
clf = RandomForestClassifier(random_state=42)
clf.fit(features, y_train)
Out[95]:
In [96]:
# Accuracy of the random forest on the held-out rows.
clf.score(X=features_test, y=y_test)
Out[96]: