notebook.community

Edit and run



In [1]:

    
# supress the warning message
import warnings
warnings.filterwarnings('ignore')



In [2]:

    
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.cross_validation import train_test_split, cross_val_score

in_file_name = 'SMSSpamCollection.txt'
df = pd.read_csv(in_file_name, delimiter='\t', header=None)



In [3]:

    
# Check the shape of the data
df.shape









    Out[3]:





(5572, 2)



In [4]:

    
# Check the sample 5 samples
df.head()









    Out[4]:






  
    
      
      0
      1
    
  
  
    
      0
        ham
       Go until jurong point, crazy.. Available only ...
    
    
      1
        ham
                           Ok lar... Joking wif u oni...
    
    
      2
       spam
       Free entry in 2 a wkly comp to win FA Cup fina...
    
    
      3
        ham
       U dun say so early hor... U c already then say...
    
    
      4
        ham
       Nah I don't think he goes to usf, he lives aro...
    
  

5 rows × 2 columns



In [5]:

    
print df[df[0] == 'ham'][0].count()



In [6]:

    
# Split into train and test, convert to Tfidf vectors
# Fit the model and run predict to get the result
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df[1], df[0])
vec = TfidfVectorizer()
X_train = vec.fit_transform(X_train_raw)
X_test = vec.transform(X_test_raw)
clf = LogisticRegression()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)



In [7]:

    
pred.shape









    Out[7]:





(1393,)



In [12]:

    
# Validate the perform
# confusion_matrix used to test binary classification problem, input pair like [1,0,1,0] and [1,1,1,1]
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

confusion_matrix = confusion_matrix(y_test, pred)
print confusion_matrix









    



[[1199    0]
 [  46  148]]



In [13]:

    
plt.matshow(confusion_matrix)
plt.title("Confusion Matrix")
plt.colorbar()
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.show()



In [26]:

    
from sklearn.metrics import precision_score, recall_score, accuracy_score
precision_score(y_test.values, pred)









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-26-8e600fd54d74> in <module>()
      1 from sklearn.metrics import precision_score, recall_score, accuracy_score
----> 2 precision_score(y_test.values, pred)

/usr/local/lib/python2.7/dist-packages/sklearn/metrics/classification.pyc in precision_score(y_true, y_pred, labels, pos_label, average, sample_weight)
   1201                                                  average=average,
   1202                                                  warn_for=('precision',),
-> 1203                                                  sample_weight=sample_weight)
   1204     return p
   1205 

/usr/local/lib/python2.7/dist-packages/sklearn/metrics/classification.pyc in precision_recall_fscore_support(y_true, y_pred, beta, labels, pos_label, average, warn_for, sample_weight)
    982                 else:
    983                     raise ValueError("pos_label=%r is not a valid label: %r" %
--> 984                                      (pos_label, present_labels))
    985             labels = [pos_label]
    986     if labels is None:

ValueError: pos_label=1 is not a valid label: array(['ham', 'spam'], 
      dtype='|S4')



In [28]:

    
len(y_test.values)









    Out[28]:





1393



In [29]:

    
len(pred)









    Out[29]:





1393



In [30]:

    
pred









    Out[30]:





array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype=object)



In [31]:









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-31-fe057c2600f7> in <module>()
----> 1 y_test.labels

/usr/lib/python2.7/dist-packages/pandas/core/generic.pyc in __getattr__(self, name)
   1813                 return self[name]
   1814             raise AttributeError("'%s' object has no attribute '%s'" %
-> 1815                                  (type(self).__name__, name))
   1816 
   1817     def __setattr__(self, name, value):

AttributeError: 'Series' object has no attribute 'labels'



In [ ]:

	0	1
0	ham	Go until jurong point, crazy.. Available only ...
1	ham	Ok lar... Joking wif u oni...
2	spam	Free entry in 2 a wkly comp to win FA Cup fina...
3	ham	U dun say so early hor... U c already then say...
4	ham	Nah I don't think he goes to usf, he lives aro...