In [1]:
import numpy as np
import pandas as pd
In [2]:
df = pd.read_csv('../TextFiles/smsspamcollection.tsv',sep='\t')
In [3]:
df.head()
Out[3]:
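The later cells rely on length and punct columns alongside label and message. If a copy of the TSV only ships with label and message, those two features can be derived directly from the text; a fallback sketch (the column names follow the rest of this notebook):
import string
## character count of each message
df['length'] = df['message'].apply(len)
## number of punctuation characters in each message
df['punct'] = df['message'].apply(lambda s: sum(ch in string.punctuation for ch in s))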
In [4]:
df.isnull().sum()
Out[4]:
In [5]:
df['label'].value_counts()
Out[5]:
In [6]:
df['label'].unique()
Out[6]:
In [7]:
df.describe()
Out[7]:
In [8]:
## message length distribution: ham vs spam (log-scaled x-axis)
import matplotlib.pyplot as plt
plt.xscale('log')
bins = 1.15 ** (np.arange(0,50))   ## geometric bins so the bars look evenly spaced on the log axis
plt.hist(df[df['label']=='ham']['length'], bins=bins, alpha=0.7)
plt.hist(df[df['label']=='spam']['length'], bins=bins, alpha=0.9)
plt.legend(('ham','spam'))
plt.xlabel('message length')
plt.show()
In [9]:
## punctuation count distribution: ham vs spam (log-scaled x-axis)
plt.xscale('log')
bins = 1.15 ** (np.arange(0,15))
plt.hist(df[df['label']=='ham']['punct'], bins=bins, alpha=0.7)
plt.hist(df[df['label']=='spam']['punct'], bins=bins, alpha=0.9)
plt.legend(('ham','spam'))
plt.xlabel('punctuation count')
plt.show()
In [10]:
## train/test split on the two numeric baseline features (length, punct)
from sklearn.model_selection import train_test_split
X = df[['length','punct']]
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
In [11]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs')
model.fit(X_train,y_train)
Out[11]:
In [12]:
## test accuracy
from sklearn import metrics
prediction = model.predict(X_test)
metrics.accuracy_score(y_test, prediction)
Out[12]:
In [13]:
metrics.confusion_matrix(y_test, prediction)
Out[13]:
In [14]:
## rows are the actual labels, columns the predicted labels (scikit-learn convention)
cm_df = pd.DataFrame(metrics.confusion_matrix(y_test, prediction),
                     index=['ham','spam'], columns=['ham','spam'])
cm_df
Out[14]:
In [15]:
print(metrics.classification_report(y_test, prediction))
In [16]:
### Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
modelnb = MultinomialNB()
modelnb.fit(X_train,y_train)
Out[16]:
In [17]:
prediction_nb = modelnb.predict(X_test)
print(metrics.classification_report(y_test,prediction_nb))
print(f'Accuracy Score : {metrics.accuracy_score(y_test,prediction_nb)}')
In [18]:
## support vector machine (SVC, default RBF kernel)
from sklearn.svm import SVC
modelsvm = SVC()
modelsvm.fit(X_train, y_train)
prediction_svc = modelsvm.predict(X_test)
print(metrics.classification_report(y_test,prediction_svc))
print(f'Accuracy Score : {metrics.accuracy_score(y_test,prediction_svc)}')
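The three baselines above all use only the length and punct features; a quick side-by-side of their test accuracies, reusing the prediction variables already computed, makes the comparison easier to read:
for name, preds in [('LogisticRegression', prediction),
                    ('MultinomialNB', prediction_nb),
                    ('SVC', prediction_svc)]:
    print(f'{name:20s} accuracy: {metrics.accuracy_score(y_test, preds):.4f}')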
In [19]:
## re-split, this time using the raw message text as the feature
from sklearn.model_selection import train_test_split
X = df['message']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
In [20]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
In [21]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_cv = count_vect.fit_transform(X_train)   ## sparse document-term matrix of raw counts
X_train_cv.shape
Out[21]:
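The second dimension of X_train_cv is the size of the learned vocabulary; the fitted CountVectorizer exposes it through its vocabulary_ attribute (and, in recent scikit-learn releases, get_feature_names_out()):
len(count_vect.vocabulary_)            ## token -> column-index mapping
## count_vect.get_feature_names_out()  ## column labels, scikit-learn >= 1.0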
In [22]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_model = TfidfTransformer()
Xtrain_tfidf = tfidf_model.fit_transform(X_train_cv)   ## re-weight the raw counts into TF-IDF scores
Xtrain_tfidf.shape
Out[22]:
In [23]:
## TfidfVectorizer combines CountVectorizer and TfidfTransformer in a single step
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec_model = TfidfVectorizer()
Xtrain_vectfidf = tfidf_vec_model.fit_transform(X_train)
Xtrain_vectfidf.shape
Out[23]:
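With default settings, TfidfVectorizer should reproduce the CountVectorizer + TfidfTransformer result from the previous two cells; a quick sanity check on the element-wise difference (assuming both were fit on the same X_train with default parameters):
abs(Xtrain_tfidf - Xtrain_vectfidf).max()   ## expected to be (near) zero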
In [24]:
from sklearn.svm import SVC
model = SVC()
model.fit(Xtrain_vectfidf, y_train)   ## fit the SVC on the TF-IDF features
Out[24]:
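To evaluate this model, the test messages must be passed through the same fitted vectorizer (transform, not fit_transform) before predicting. The variable names below are illustrative, not from the original notebook:
Xtest_vectfidf = tfidf_vec_model.transform(X_test)
svm_tfidf_predictions = model.predict(Xtest_vectfidf)
print(metrics.classification_report(y_test, svm_tfidf_predictions))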
In [25]:
from sklearn.pipeline import Pipeline
text_clf_pipeline = Pipeline([('tfidfvector', TfidfVectorizer()),
                              ('SVCModel', SVC())])
text_clf_pipeline.fit(X_train,y_train)
Out[25]:
In [26]:
predictions = text_clf_pipeline.predict(X_test)
metrics.accuracy_score(y_test, predictions)
Out[26]:
In [27]:
print(metrics.classification_report(y_test, predictions))
In [30]:
text_clf_pipeline.predict(['hello, Rishu you have won a Free entry in 2 a wkly comp to win FA Cup'])
Out[30]:
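For contrast, a plain conversational message run through the same pipeline (an illustrative example; the expected label is 'ham'):
text_clf_pipeline.predict(['hey, are we still on for lunch tomorrow?'])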