notebook.community



In [1]:

    
# Based on
# https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/



In [2]:

    
import warnings
warnings.filterwarnings('ignore')



In [3]:

    
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): %pylab pulls numpy and matplotlib names into the global
# namespace (hence the "Populating the interactive namespace" message below).
# Nothing in the visible cells appears to use those names -- confirm before
# removing this line.
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [18]:

    
# Toy corpus: three short sentences, each used as one document to vectorize.
text = [
    "Ronaldo did the free kick, yes Ronaldo",
    "Messi did the penalty",
    "A striker did the penalty",
]

Count Vectorizer (bag-of-words token counts)



In [19]:

    
from sklearn.feature_extraction.text import CountVectorizer
# Default settings: tokens are lowercased and must be at least two word
# characters long (see `lowercase` and `token_pattern` in the repr below).
vectorizer = CountVectorizer()
# fit() learns the vocabulary from the corpus; the value displayed for this
# cell is the fitted estimator itself.
vectorizer.fit(text)









    Out[19]:





CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)



In [20]:

    
# Token -> column-index mapping learned by fit(). Indices follow alphabetical
# token order, and the one-letter token "a" is absent.
vectorizer.vocabulary_









    Out[20]:





{'did': 0,
 'free': 1,
 'kick': 2,
 'messi': 3,
 'penalty': 4,
 'ronaldo': 5,
 'striker': 6,
 'the': 7,
 'yes': 8}



In [21]:

    
# Encode the three documents against the fitted vocabulary.
vector = vectorizer.transform(text)



In [22]:

    
# One row per document, one column per vocabulary entry.
vector.shape









    Out[22]:





(3, 9)



In [23]:

    
# Inspect the container type: transform() returned a SciPy sparse matrix.
type(vector)









    Out[23]:





scipy.sparse.csr.csr_matrix



In [24]:

    
# Convert to a dense array so the per-document token counts are readable;
# columns are ordered by the vocabulary indices.
vector.toarray()









    Out[24]:





array([[1, 1, 1, 0, 0, 2, 0, 1, 1],
       [1, 0, 0, 1, 1, 0, 0, 1, 0],
       [1, 0, 0, 0, 1, 0, 1, 1, 0]])

TF-IDF (term frequency–inverse document frequency)



In [29]:

    
from sklearn.feature_extraction.text import TfidfVectorizer
# norm=None turns off row normalisation, so the matrix keeps the raw
# tf * idf weights (values can exceed 1).
# NOTE(review): this rebinds `vectorizer`, replacing the CountVectorizer
# fitted earlier; any earlier cell re-run after this one will use the TF-IDF
# model instead.
vectorizer = TfidfVectorizer(norm=None)
# fit() learns the vocabulary and the idf weights from the corpus.
vectorizer.fit(text)









    Out[29]:





TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=None, preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)



In [30]:

    
# Re-encode the same documents with the TF-IDF model; `vector` is overwritten.
vector = vectorizer.transform(text)
# In the dense view, columns that are non-zero in every row carry weight 1.0,
# while terms found in fewer documents receive larger weights.
vector.toarray()









    Out[30]:





array([[1.        , 1.69314718, 1.69314718, 0.        , 0.        ,
        3.38629436, 0.        , 1.        , 1.69314718],
       [1.        , 0.        , 0.        , 1.69314718, 1.28768207,
        0.        , 0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        , 0.        , 1.28768207,
        0.        , 1.69314718, 1.        , 0.        ]])



In [0]: