In [1]:
# supress the warning message
import warnings
warnings.filterwarnings('ignore')
In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.cross_validation import train_test_split, cross_val_score
in_file_name = 'SMSSpamCollection.txt'
df = pd.read_csv(in_file_name, delimiter='\t', header=None)
In [3]:
# Show the dataset dimensions as (number of rows, number of columns)
(len(df.index), len(df.columns))
Out[3]:
In [4]:
# Preview the first 5 rows of the data
df.head(5)
Out[4]:
In [5]:
# Number of rows whose label (column 0) is 'ham'.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(df[df[0] == 'ham'][0].count())
In [6]:
# Split into train and test, convert to Tfidf vectors
# Fit the model and run predict to get the result
# A fixed random_state makes the split, and every metric computed from it,
# reproducible across kernel restarts.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df[1], df[0], random_state=42)
# Fit the vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text.
vec = TfidfVectorizer()
X_train = vec.fit_transform(X_train_raw)
X_test = vec.transform(X_test_raw)
clf = LogisticRegression()
clf.fit(X_train, y_train)
# Predicted label for every test message
pred = clf.predict(X_test)
In [7]:
# Dimensions of the prediction array
np.shape(pred)
Out[7]:
In [12]:
# Validate the performance on the test split
# confusion_matrix is used to evaluate a classifier; it takes a pair of label
# sequences (true, predicted) like [1,0,1,0] and [1,1,1,1]
# The function is imported under an alias so that the result variable below
# does not replace the only reference to it; the function stays callable as
# compute_confusion_matrix for the rest of the notebook.
from sklearn.metrics import confusion_matrix as compute_confusion_matrix
import matplotlib.pyplot as plt
# Keep the name ``confusion_matrix`` for the resulting array: the plotting
# cell reads it under that name.
confusion_matrix = compute_confusion_matrix(y_test, pred)
# print(...) with a single argument works on both Python 2 and Python 3
print(confusion_matrix)
In [13]:
# Draw the confusion matrix as an image with a colour scale beside it
heatmap = plt.matshow(confusion_matrix)
plt.title("Confusion Matrix")
plt.colorbar(heatmap)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()
In [26]:
from sklearn.metrics import precision_score, recall_score, accuracy_score
# The labels are strings, so the default pos_label=1 does not match any class
# and precision_score raises a ValueError. Name the positive class explicitly.
# NOTE(review): assumes the non-'ham' label in column 0 is 'spam' -- confirm
# with df[0].unique().
precision_score(y_test.values, pred, pos_label='spam')
In [28]:
# Number of true labels in the test split
len(y_test)
Out[28]:
In [29]:
# Number of predictions; should equal the number of test labels
pred.shape[0]
Out[29]:
In [30]:
# Display the predicted labels returned by clf.predict for the test messages
pred
Out[30]:
In [31]:
In [ ]: