In [16]:
import pandas as pd;
import numpy as np;
import matplotlib.pyplot as plt
%matplotlib inline
In [17]:
# Load the full labeled tweet corpus and the precomputed top-1000 unigram features.
master = pd.read_csv('all_tweets_df.csv')
unigram_features = pd.read_csv('top_1000_unigram_features.csv')
Working subset: the first 8,000 tweets of each class (16,000 total), kept small due to RAM constraints.
In [18]:
# Balanced working subset: first 8,000 sarcastic + first 8,000 genuine tweets.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0 — use pd.concat.
subset = pd.concat([
    master[master['type'] == 'sarcastic'][:8000],
    master[master['type'] == 'genuine'][:8000],
])
# test_subset = pd.concat([master[master['type']=='sarcastic'][6000:8000], master[master['type']=='genuine'][6000:8000]])
In [19]:
from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction import DictVectorizer
In [114]:
# Bag-of-words counts limited to the 5,000 most frequent tokens.
count_vec = CountVectorizer(max_features=5000)
# Column '0' holds the raw tweet text.
vector = count_vec.fit_transform(subset['0'])
# vector = DictVectorizer().fit_transform(master['0'])
varr = vector.toarray()  # dense count matrix, one row per tweet
In [117]:
# Unigrams: vocabulary learned by the CountVectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement.
count_vec.get_feature_names_out()
Out[117]:
In [123]:
# count_vec.transform()
In [99]:
# Preview the master frame instead of dumping every row into the notebook output.
master.head()
Out[99]:
In [21]:
# Dense unigram counts plus three hand-crafted features from the subset.
# Wrapping in list(...) drops the subset's original index so the values align
# positionally with the fresh RangeIndex of `unigrams`.
unigrams = pd.DataFrame(varr)
for extra_col in ('ToUser', 'Hashtags', 'AllCapsCount'):
    unigrams[extra_col] = list(subset[extra_col])
Tweets vectorized by Top-5000 Unigram vocabulary
In [22]:
unigrams
Out[22]:
In [23]:
# 60/40 train/test split with a fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(unigrams, subset['type'], test_size=0.4, random_state=0)
In [78]:
# RBF-kernel support vector classifier with default hyperparameters.
from sklearn.svm import SVC
clf = SVC()
clf.fit(X_train, y_train)
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
# print(clf.predict([[-0.8, -1]]))
Out[78]:
In [79]:
# Mean accuracy of the SVC on the held-out split.
clf.score(X_test,y_test)
Out[79]:
In [80]:
# Hard class labels ('sarcastic'/'genuine') predicted by the fitted SVC.
svc_predictions = clf.predict(X_test)
In [83]:
# Import locally so this cell works on a fresh top-to-bottom run —
# roc_curve is otherwise only imported in a cell further down the notebook.
from sklearn.metrics import roc_curve

# Binarize string labels: sarcastic -> 1, genuine -> 0.
svc_preds_numeric = [1 if x=='sarcastic' else 0 for x in svc_predictions]
y_test_numeric = [1 if x=='sarcastic' else 0 for x in y_test]
fpr_svc, tpr_svc, thresh_svc = roc_curve(y_test_numeric, svc_preds_numeric)
In [98]:
# Accuracy of the SVC predictions on the held-out split.
from sklearn.metrics import accuracy_score
accuracy_score(y_test, svc_predictions)
Out[98]:
In [84]:
# ROC curve from the SVC's hard predictions (a single threshold, so the
# curve is one elbow point). Added axis labels and legend — a figure
# should stand alone when the notebook is skimmed.
plt.figure()
lw = 2
plt.plot(fpr_svc, tpr_svc, color='blue', lw=lw, label='ROC curve')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.title("SVM ROC")
Out[84]:
In [86]:
# Inspect the SVC true-positive rates computed above.
tpr_svc
Out[86]:
In [87]:
# Inspect the SVC false-positive rates computed above.
fpr_svc
Out[87]:
In [26]:
# Logistic regression with every hyperparameter spelled out explicitly,
# one per line for readability.
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression(
    penalty='l2',
    dual=False,
    tol=0.0001,
    C=1.0,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=None,
    solver='liblinear',
    max_iter=100,
    multi_class='ovr',
    verbose=0,
    warm_start=False,
    n_jobs=1,
)
In [27]:
# Fit the logistic model on the training split.
lr_clf.fit(X_train,y_train)
Out[27]:
In [29]:
# Mean accuracy of the logistic model on the held-out split.
lr_clf.score(X_test,y_test)
Out[29]:
In [30]:
# 3-fold shuffled cross-validation of the logistic model on the whole subset.
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
scores = cross_val_score(lr_clf, unigrams, subset['type'], cv=cv)
scores
Out[30]:
In [31]:
# Per-class probability estimates (one column per class) on the held-out split.
lr_clf.predict_proba(X_test)
Out[31]:
In [75]:
# Hard predictions from the fitted logistic model.
lr_predictions = lr_clf.predict(X_test)
# NOTE(review): cells with lower line positions also use roc_curve — these
# imports belong in the import cell at the top so a fresh run works in order.
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn import metrics
# roc_auc_score(y_test,lr_clf.predict_proba(X_test))
# roc_curve(y_test, lr_predictions, pos_label="sarcastic")
In [49]:
# Encode string labels as integers: 1 = sarcastic, 0 = genuine.
lr_preds_numeric = [int(label == 'sarcastic') for label in lr_predictions]
y_test_numeric = [int(label == 'sarcastic') for label in y_test]
In [69]:
# ROC points from the logistic model's hard (0/1) predictions.
fpr_lr, tpr_lr, thresh_lr = roc_curve(y_test_numeric, lr_preds_numeric)
In [51]:
# Distribution of predicted labels — sanity check for class balance.
pd.Series(lr_predictions).value_counts()
Out[51]:
In [72]:
# ROC curve of the logistic model (hard predictions -> single elbow point).
# Added axis labels and legend so the figure stands alone.
plt.figure()
lw = 2
plt.plot(fpr_lr, tpr_lr, color='red', lw=lw, label='ROC curve')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.title('LogReg ROC')
Out[72]:
In [52]:
# Decision tree with default settings, fitted on the training split.
from sklearn import tree

# fit() returns the estimator itself, so construct-and-fit chains cleanly.
dt_clf = tree.DecisionTreeClassifier().fit(X_train, y_train)
In [53]:
# Gaussian Naive Bayes on the same split.
# NOTE(review): this rebinds `clf`, shadowing the SVC fitted earlier —
# every later use of `clf` (score, predict_proba, sigma_) refers to this
# GaussianNB model, not the SVC.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train, y_train)
Out[53]:
In [54]:
# Held-out accuracy of the Naive Bayes model (clf is GaussianNB here).
NB_results = clf.score(X_test, y_test)
In [97]:
# Display the Naive Bayes accuracy computed above.
NB_results
Out[97]:
In [55]:
# Probability of the positive ('sarcastic') class: column 1 of predict_proba.
nb_predictions_positive = clf.predict_proba(X_test)[:, 1]
In [56]:
# ROC from the NB class-probabilities; 'sarcastic' is the positive class,
# so string labels can be passed directly via pos_label.
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, nb_predictions_positive, pos_label="sarcastic")
In [57]:
# Inspect the positive-class probabilities.
nb_predictions_positive
Out[57]:
In [58]:
# 1/0 encoding of the test labels (1 = sarcastic).
y_test_1 = [int(label == 'sarcastic') for label in y_test]
In [59]:
# Convert the 0/1 label list to a numpy array.
y_test_1 = np.array(y_test_1)
In [61]:
# NOTE(review): exact duplicate of the roc_curve call two cells up — harmless
# but redundant; consider deleting this cell.
fpr, tpr, thresholds = roc_curve(y_test, nb_predictions_positive, pos_label="sarcastic")
In [73]:
# ROC curve of the Naive Bayes model — probability scores give a proper
# multi-threshold curve here. Added axis labels and legend.
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.title("NaiveBayes ROC")
Out[73]:
In [ ]:
# Per-feature variance learned by GaussianNB for class index 1.
# `sigma_` was renamed to `var_` in scikit-learn 1.0 and removed in 1.2;
# fall back to the old attribute name for older versions.
class_variances = getattr(clf, 'var_', None)
if class_variances is None:
    class_variances = clf.sigma_
# Index of the highest-variance feature for class 1 (the last, appended
# hand-crafted column in this feature matrix).
class_variances[1].argmax()
class_variances[1]
The last feature, "AllCapsCount", has the largest class-conditional variance in the Naive Bayes model (largest `sigma_` entry) — suggesting, though not proving, that it is the most informative of the hand-crafted features.
In [151]:
# Load the held-out evaluation tweets and their labels.
test_tweets = pd.read_csv('test_tweets_df.csv')
test_labels = test_tweets['label']
In [155]:
# Vectorize the held-out tweets with the vocabulary fitted on the training
# subset, then append the same three hand-crafted features.
test_unigrams = pd.DataFrame(count_vec.transform(test_tweets['0']).toarray())
for feature in ('ToUser', 'Hashtags', 'AllCapsCount'):
    test_unigrams[feature] = test_tweets[feature]
In [156]:
# 1/0 encoding of the held-out labels (1 = sarcastic).
test_labels_numeric = [1 if x=='sarcastic' else 0 for x in test_labels]
In [163]:
# NOTE(review): this splits the held-out file again and only scores the 40%
# "test" side, discarding the rest; also rebinds X_train/X_test/y_train/y_test
# from the earlier split. Confirm the re-split is intended.
# X_train, X_test, y_train, y_test = train_test_split(test_unigrams, test_labels_numeric, test_size=0.4, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(test_unigrams, test_labels, test_size=0.4, random_state=0)
In [171]:
# Evaluate the logistic model on the held-out tweets and keep its predictions.
# The original discarded the score (it was followed by an assignment, so it
# was neither bound nor displayed) — surface it as the cell's output.
lr_test_score = lr_clf.score(X_test, y_test)
test_preds = lr_clf.predict(X_test)
lr_test_score
In [168]:
# Held-out accuracy of `clf` — GaussianNB at this point (see shadowing note).
clf.score(X_test, y_test)
Out[168]:
In [170]:
# Held-out accuracy of the decision tree.
dt_clf.score(X_test, y_test)
Out[170]:
In [175]:
# Binarize predictions and ground truth (1 = sarcastic), then compute
# the ROC points for the logistic model on the held-out tweets.
test_preds_numeric = [int(pred == 'sarcastic') for pred in test_preds]
y_test_numeric = [int(label == 'sarcastic') for label in y_test]
fpr_test, tpr_test, thresh_test = roc_curve(y_test_numeric, test_preds_numeric)
In [179]:
# ROC of the logistic model on the held-out test tweets; labeled axes so
# the figure stands alone.
plt.plot(fpr_test, tpr_test, color='green')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title("LogReg on test data")
Out[179]: