In [7]:
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn
import sklearn

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
# grid_search, cross_validation and learning_curve were consolidated into
# sklearn.model_selection in scikit-learn 0.18
from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
                                     cross_val_score, train_test_split,
                                     learning_curve)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
In [2]:
messages = pd.read_csv('SMSSpamCollection.txt', sep='\t', quoting=csv.QUOTE_NONE,
                       names=["label", "message"])
messages.head()
Out[2]:
In [9]:
messages.groupby('label').describe()
Out[9]:
In [3]:
X = messages.message
y = messages.label.apply(lambda x: 1 if x == 'spam' else 0)  # spam -> 1, ham -> 0
In [15]:
clf = CountVectorizer()  # bag-of-words vectorizer (named `clf` throughout this notebook)
X_desc = clf.fit_transform(X).toarray()  # densified for inspection; the raw output is sparse
In [19]:
X_desc  # the result is a highly sparse matrix
Out[19]:
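Since `X_desc` was densified with `.toarray()`, the sparsity is easier to quantify on the underlying sparse matrix. A minimal sketch, reusing the fitted vectorizer `clf` from the cell above:
In [ ]:
X_sparse = clf.transform(X)  # scipy CSR matrix, kept sparse
density = X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1])
print('fraction of non-zero entries: {:.4%}'.format(density))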
In [22]:
print(X.shape, X_desc.shape)
In [26]:
model = LogisticRegression().fit(X_desc, y)
In [28]:
np.random.seed(2)  # seed NumPy's RNG for reproducibility
cross_val_score(LogisticRegression(), X_desc, y, cv=10, scoring='f1').mean()
Out[28]:
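Note that a NumPy-level seed does not affect `cross_val_score` with an integer `cv`: the default splitter does not shuffle, so the folds are deterministic either way. To make shuffled, reproducible folds explicit, a seeded `StratifiedKFold` can be passed instead (a sketch using the classes imported above):
In [ ]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
cross_val_score(LogisticRegression(), X_desc, y, cv=skf, scoring='f1').mean()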
In [40]:
str1 = "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB"
print(model.predict(clf.transform([str1]))[0])
In [4]:
str1 = 'FreeMsg: Txt: claim your reward of 3 hours talk time'
print(model.predict(clf.transform([str1]))[0])
str1 = "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$"
print(model.predict(clf.transform([str1]))[0])
str1 = "Have you visited the last lecture on physics?"
print(model.predict(clf.transform([str1]))[0])
str1 = "Only 99$"
print(model.predict(clf.transform([str1]))[0])
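The hard 0/1 labels hide how close each message is to the decision boundary; `predict_proba` exposes the model's confidence (a quick sketch with the same fitted `model` and vectorizer `clf`; class order follows `model.classes_`, here [0, 1]):
In [ ]:
probs = model.predict_proba(clf.transform(["Only 99$"]))[0]
print('P(ham) = {:.3f}, P(spam) = {:.3f}'.format(probs[0], probs[1]))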
In [44]:
clf = CountVectorizer(ngram_range=(2, 2))
X_desc = clf.fit_transform(X).toarray()
np.random.seed(2)
cross_val_score(LogisticRegression(), X_desc, y, cv=10,
                scoring='f1').mean()
Out[44]:
In [ ]:
clf = CountVectorizer(ngram_range=(3, 3))
X_desc = clf.fit_transform(X).toarray()
np.random.seed(2)
cross_val_score(LogisticRegression(), X_desc, y, cv=10,
                scoring='f1').mean()
In [ ]:
clf = CountVectorizer(ngram_range=(1, 3))
X_desc = clf.fit_transform(X).toarray()
np.random.seed(2)
cross_val_score(LogisticRegression(), X_desc, y, cv=10,
                scoring='f1').mean()
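Fitting the vectorizer on all of `X` before cross-validation leaks the full vocabulary into every fold. Wrapping both steps in the already-imported `Pipeline` keeps the vectorizer fitted on the training folds only (a sketch, not one of the original runs):
In [ ]:
pipe = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('clf', LogisticRegression()),
])
cross_val_score(pipe, X, y, cv=10, scoring='f1').mean()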
In [ ]:
def diffParamsForCountVectorizerWithMulti(my_ngram_range):
    my_count_vectorizer = CountVectorizer(ngram_range=my_ngram_range)
    my_features = my_count_vectorizer.fit_transform(X)
    my_Multinominal = MultinomialNB()
    my_scores = cross_val_score(my_Multinominal, my_features, y, cv=10, scoring='f1_macro')
    return np.average(my_scores)
# Naive Bayes really does suffer from the lack of statistics on bigrams and trigrams
print(round(diffParamsForCountVectorizerWithMulti((2, 2)), 2),
      round(diffParamsForCountVectorizerWithMulti((3, 3)), 2),
      round(diffParamsForCountVectorizerWithMulti((1, 3)), 2))
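One way to see why higher-order n-grams starve MultinomialNB of statistics is to compare vocabulary sizes: the feature space explodes while the count mass per feature shrinks (a quick sketch):
In [ ]:
for rng in [(1, 1), (2, 2), (3, 3)]:
    vec = CountVectorizer(ngram_range=rng).fit(X)
    print(rng, len(vec.vocabulary_), 'features')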
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
my_features = tfidf_vectorizer.fit_transform(X)
my_logReg = LogisticRegression()
scores = cross_val_score(my_logReg, my_features, y, cv=10, scoring='f1_macro')
print(np.average(scores))
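Since `GridSearchCV` is already imported, the TF-IDF settings could be tuned in the same cross-validation framework (a hedged sketch; the grid values below are illustrative, not taken from the original notebook):
In [ ]:
pipe = Pipeline([('tfidf', TfidfVectorizer()),
                 ('clf', LogisticRegression())])
param_grid = {'tfidf__min_df': [1, 2, 5],
              'tfidf__ngram_range': [(1, 1), (1, 2)]}
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='f1_macro')
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)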
In [ ]: