In [7]:
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn
import sklearn

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
# grid_search, cross_validation and learning_curve were consolidated into
# sklearn.model_selection in scikit-learn 0.18
from sklearn.model_selection import (GridSearchCV, StratifiedKFold,
                                     cross_val_score, train_test_split,
                                     learning_curve)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
In [2]:
messages = pd.read_csv('SMSSpamCollection.txt', sep='\t', quoting=csv.QUOTE_NONE,
                       names=["label", "message"])
messages.head()
Out[2]:
In [9]:
messages.groupby('label').describe()
Out[9]:
In [3]:
X = messages.message
y = messages.label.apply(lambda x: 1 if x == 'spam' else 0)  # spam -> 1, ham -> 0
In [15]:
clf = CountVectorizer()  # bag-of-words vectorizer (named `clf` throughout this notebook)
X_desc = clf.fit_transform(X).toarray()  # densified for inspection; the raw output is sparse
In [19]:
X_desc  # the result is a highly sparse matrix
Out[19]:
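Since `X_desc` was densified with `.toarray()`, the sparsity is easier to quantify on the underlying sparse matrix. A minimal sketch, reusing the fitted vectorizer `clf` from the cell above:
In [ ]:
X_sparse = clf.transform(X)  # scipy CSR matrix, kept sparse
density = X_sparse.nnz / (X_sparse.shape[0] * X_sparse.shape[1])
print('fraction of non-zero entries: {:.4%}'.format(density))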
In [22]:
print(X.shape, X_desc.shape)
In [26]:
model = LogisticRegression().fit(X_desc, y)
In [28]:
np.random.seed(2)  # seed NumPy's RNG for reproducibility
cross_val_score(LogisticRegression(), X_desc, y, cv=10, scoring='f1').mean()
Out[28]:
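Note that a NumPy-level seed does not affect `cross_val_score` with an integer `cv`: the default splitter does not shuffle, so the folds are deterministic either way. To make shuffled, reproducible folds explicit, a seeded `StratifiedKFold` can be passed instead (a sketch using the classes imported above):
In [ ]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
cross_val_score(LogisticRegression(), X_desc, y, cv=skf, scoring='f1').mean()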
In [40]:
str1 = "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB"
print(model.predict(clf.transform([str1]))[0])
In [4]:
str1 = 'FreeMsg: Txt: claim your reward of 3 hours talk time'
print(model.predict(clf.transform([str1]))[0])
str1 = "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$"
print(model.predict(clf.transform([str1]))[0])
str1 = "Have you visited the last lecture on physics?"
print(model.predict(clf.transform([str1]))[0])
str1 = "Only 99$"
print(model.predict(clf.transform([str1]))[0])
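The hard 0/1 labels hide how close each message is to the decision boundary; `predict_proba` exposes the model's confidence (a quick sketch with the same fitted `model` and vectorizer `clf`; class order follows `model.classes_`, here [0, 1]):
In [ ]:
probs = model.predict_proba(clf.transform(["Only 99$"]))[0]
print('P(ham) = {:.3f}, P(spam) = {:.3f}'.format(probs[0], probs[1]))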
In [44]:
clf = CountVectorizer(ngram_range=(2, 2))
X_desc = clf.fit_transform(X).toarray()
np.random.seed(2)
cross_val_score(LogisticRegression(), X_desc, y, cv=10,
                scoring='f1').mean()
Out[44]:
In [ ]:
clf = CountVectorizer(ngram_range=(3, 3))
X_desc = clf.fit_transform(X).toarray()
np.random.seed(2)
cross_val_score(LogisticRegression(), X_desc, y, cv=10,
                scoring='f1').mean()
In [ ]:
clf = CountVectorizer(ngram_range=(1, 3))
X_desc = clf.fit_transform(X).toarray()
np.random.seed(2)
cross_val_score(LogisticRegression(), X_desc, y, cv=10,
                scoring='f1').mean()
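Fitting the vectorizer on all of `X` before cross-validation leaks the full vocabulary into every fold. Wrapping both steps in the already-imported `Pipeline` keeps the vectorizer fitted on the training folds only (a sketch, not one of the original runs):
In [ ]:
pipe = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('clf', LogisticRegression()),
])
cross_val_score(pipe, X, y, cv=10, scoring='f1').mean()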
In [ ]:
def diffParamsForCountVectorizerWithMulti(my_ngram_range):
    my_count_vectorizer = CountVectorizer(ngram_range=my_ngram_range)
    my_features = my_count_vectorizer.fit_transform(X)
    my_Multinominal = MultinomialNB()
    my_scores = cross_val_score(my_Multinominal, my_features, y, cv=10, scoring='f1_macro')
    return np.average(my_scores)
# Naive Bayes really does suffer from the lack of statistics on bigrams and trigrams
print(round(diffParamsForCountVectorizerWithMulti((2, 2)), 2),
      round(diffParamsForCountVectorizerWithMulti((3, 3)), 2),
      round(diffParamsForCountVectorizerWithMulti((1, 3)), 2))
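One way to see why higher-order n-grams starve MultinomialNB of statistics is to compare vocabulary sizes: the feature space explodes while the count mass per feature shrinks (a quick sketch):
In [ ]:
for rng in [(1, 1), (2, 2), (3, 3)]:
    vec = CountVectorizer(ngram_range=rng).fit(X)
    print(rng, len(vec.vocabulary_), 'features')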
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
my_features = tfidf_vectorizer.fit_transform(X)
my_logReg = LogisticRegression()
scores = cross_val_score(my_logReg, my_features, y, cv=10, scoring='f1_macro')
print(np.average(scores))
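Since `GridSearchCV` is already imported, the TF-IDF settings could be tuned in the same cross-validation framework (a hedged sketch; the grid values below are illustrative, not taken from the original notebook):
In [ ]:
pipe = Pipeline([('tfidf', TfidfVectorizer()),
                 ('clf', LogisticRegression())])
param_grid = {'tfidf__min_df': [1, 2, 5],
              'tfidf__ngram_range': [(1, 1), (1, 2)]}
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='f1_macro')
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)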
In [ ]: