In [7]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import csv
#from textblob import TextBlob

import sklearn
import seaborn

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold, cross_val_score, train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.learning_curve import learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-7-74e314d1e592> in <module>()
     13 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
     14 from sklearn.naive_bayes import MultinomialNB
---> 15 from sklearn.svm import SVC, LinearSVC
     16 from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
     17 from sklearn.pipeline import Pipeline

~/anaconda3/lib/python3.6/site-packages/sklearn/svm/__init__.py in <module>()
     11 # License: BSD 3 clause (C) INRIA 2010
     12 
---> 13 from .classes import SVC, NuSVC, SVR, NuSVR, OneClassSVM, LinearSVC, \
     14         LinearSVR
     15 from .bounds import l1_min_c

~/anaconda3/lib/python3.6/site-packages/sklearn/svm/classes.py in <module>()
      4 from .base import _fit_liblinear, BaseSVC, BaseLibSVM
      5 from ..base import BaseEstimator, RegressorMixin
----> 6 from ..linear_model.base import LinearClassifierMixin, SparseCoefMixin, \
      7     LinearModel
      8 from ..utils import check_X_y

~/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/__init__.py in <module>()
     10 # complete documentation.
     11 
---> 12 from .base import LinearRegression
     13 
     14 from .bayes import BayesianRidge, ARDRegression

~/anaconda3/lib/python3.6/importlib/_bootstrap.py in _find_and_load(name, import_)

~/anaconda3/lib/python3.6/importlib/_bootstrap.py in _find_and_load_unlocked(name, import_)

~/anaconda3/lib/python3.6/importlib/_bootstrap.py in _load_unlocked(spec)

~/anaconda3/lib/python3.6/importlib/_bootstrap_external.py in exec_module(self, module)

~/anaconda3/lib/python3.6/importlib/_bootstrap_external.py in get_code(self, fullname)

~/anaconda3/lib/python3.6/importlib/_bootstrap_external.py in get_data(self, path)

KeyboardInterrupt: 

In [2]:
# Load the SMS corpus: tab-separated, two unnamed columns (label, text).
# QUOTE_NONE because the raw messages contain unescaped quote characters.
messages = pd.read_csv(
    'SMSSpamCollection.txt',
    sep='\t',
    quoting=csv.QUOTE_NONE,
    names=["label", "message"],
)
messages.head()


Out[2]:
label message
0 ham Go until jurong point, crazy.. Available only ...
1 ham Ok lar... Joking wif u oni...
2 spam Free entry in 2 a wkly comp to win FA Cup fina...
3 ham U dun say so early hor... U c already then say...
4 ham Nah I don't think he goes to usf, he lives aro...

In [9]:
# Per-class summary of the message column: count / unique / top / freq
# for each label (ham vs spam) — shows the ~6.5:1 class imbalance.
messages.groupby('label').describe()


Out[9]:
message
count unique top freq
label
ham 4827 4518 Sorry, I'll call later 30
spam 747 653 Please call our customer service representativ... 4

In [3]:
# Features are the raw message texts; target is a 0/1 spam indicator.
X = messages['message']
y = (messages['label'] == 'spam').astype(int)

In [15]:
# Bag-of-words encoding of the messages.
# NOTE(review): .toarray() densifies a large, mostly-zero matrix
# (5574 x 8713 — see the shape check below); sklearn estimators accept
# the sparse matrix directly, which would save memory.
clf = CountVectorizer()
X_desc = clf.fit_transform(X).toarray()

In [19]:
X_desc # the resulting document-term matrix is highly sparse (shown here densified)


Out[19]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [22]:
# Sanity check: 5574 messages, 8713-term vocabulary.
print (X.shape, X_desc.shape)


(5574,) (5574, 8713)

In [26]:
# Baseline: logistic regression on the unigram bag-of-words features.
model = LogisticRegression().fit(X_desc,y )

In [28]:
# BUG FIX: `np.random_state = 2` only created a new attribute on the numpy
# module object — it seeded nothing.  np.random.seed is the intended call.
# (With an integer cv, cross_val_score uses StratifiedKFold without
# shuffling, so the score is deterministic either way.)
np.random.seed(2)
cross_val_score(LogisticRegression(), X_desc, y, cv=10, scoring='f1').mean()


Out[28]:
0.93334852685794145

In [40]:
# Score an obviously spammy message with the trained model (1 = spam).
str1 = "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB"
prediction = model.predict(clf.transform([str1]))[0]
print(prediction)


1

In [4]:
# The four probes below were copy-pasted cells; loop over them instead.
# Printed output is unchanged: one 0/1 prediction per line, same order.
# (NOTE: this cell originally failed with NameError because `model` was
# defined in a cell that had not been re-run on this kernel — it depends
# on the fit cell above having executed first.)
probe_messages = [
    'FreeMsg: Txt: claim your reward of 3 hours talk time',
    "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$",
    "Have you visited the last lecture on physics?",
    "Only 99$",
]
for str1 in probe_messages:
    print(model.predict(clf.transform([str1]))[0])


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-003839113713> in <module>()
      1 str1 = 'FreeMsg: Txt: claim your reward of 3 hours talk time'
----> 2 print(model.predict(clf.transform([str1]))[0])
      3 
      4 str1 = "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$"
      5 print(model.predict(clf.transform([str1]))[0])

NameError: name 'model' is not defined

In [44]:
# Bigrams only.  BUG FIX: `np.random_state = 2` was a no-op (it merely set
# an attribute on the numpy module); np.random.seed is the intended call.
clf = CountVectorizer(ngram_range=(2, 2))
X_desc = clf.fit_transform(X).toarray()
np.random.seed(2)
cross_val_score(LogisticRegression(), X_desc, y, cv=10,
                scoring='f1').mean()


Out[44]:
0.82242206641871329

In [ ]:
# Trigrams only.  BUG FIX: `np.random_state = 2` was a no-op (it merely set
# an attribute on the numpy module); np.random.seed is the intended call.
clf = CountVectorizer(ngram_range=(3, 3))
X_desc = clf.fit_transform(X).toarray()
np.random.seed(2)
cross_val_score(LogisticRegression(), X_desc, y, cv=10,
                scoring='f1').mean()

In [ ]:
# Unigrams through trigrams combined.  BUG FIX: `np.random_state = 2` was a
# no-op (it merely set an attribute on the numpy module); use np.random.seed.
clf = CountVectorizer(ngram_range=(1, 3))
X_desc = clf.fit_transform(X).toarray()
np.random.seed(2)
cross_val_score(LogisticRegression(), X_desc, y, cv=10,
                scoring='f1').mean()

In [ ]:
def diffParamsForCountVectorizerWithMulti(my_ngram_range):
    """Mean 10-fold macro-F1 of MultinomialNB over bag-of-n-gram features.

    Parameters
    ----------
    my_ngram_range : tuple of (int, int)
        Passed straight to CountVectorizer(ngram_range=...).

    Returns
    -------
    float
        Average 'f1_macro' cross-validation score.
    """
    # CountVectorizer and MultinomialNB are already imported at the top of
    # the notebook; no need for the duplicate import or the fully-qualified
    # sklearn.feature_extraction.text path the original used.
    my_count_vectorizer = CountVectorizer(ngram_range=my_ngram_range)
    my_features = my_count_vectorizer.fit_transform(X)

    # BUG FIX: the original scored against `list_marks`, a name that is
    # never defined anywhere in this notebook (NameError on call).  The
    # labels live in `y`, built from messages.label above.
    my_scores = cross_val_score(MultinomialNB(), my_features, y,
                                cv=10, scoring='f1_macro')
    return np.average(my_scores)

# Naive Bayes really does suffer from the lack of statistics on bigrams
# and trigrams.
print(round(diffParamsForCountVectorizerWithMulti((2, 2)), 2),
      round(diffParamsForCountVectorizerWithMulti((3, 3)), 2),
      round(diffParamsForCountVectorizerWithMulti((1, 3)), 2))

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features instead of raw counts.
# BUG FIX: the original used `list_phrases` / `list_marks`, names that are
# never defined anywhere in this notebook (NameError on run).  The texts
# live in `X` and the 0/1 labels in `y`, both defined above.
tfidf_vectorizer = TfidfVectorizer()
my_features = tfidf_vectorizer.fit_transform(X)
my_logReg = LogisticRegression()

scores = cross_val_score(my_logReg, my_features, y, cv=10, scoring='f1_macro')

print(np.average(scores))

In [ ]: