In [1]:
# Based on
# https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
%matplotlib inline
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [18]:
# Toy corpus: each string is one document to be vectorized below.
text = [
    "Ronaldo did the free kick, yes Ronaldo",
    "Messi did the penalty",
    "A striker did the penalty",
]

Count Vectorizer


In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the corpus. fit() returns the estimator itself, so
# construction and fitting are chained; the bare name on the last line displays
# the fitted estimator's parameters.
vectorizer = CountVectorizer().fit(text)
vectorizer


Out[19]:
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [20]:
# Learned token -> column-index mapping. With the default settings shown in the
# estimator repr (lowercase=True, token_pattern requiring two or more word
# characters), tokens are lowercased and the one-letter word "A" is dropped.
vectorizer.vocabulary_


Out[20]:
{'did': 0,
 'free': 1,
 'kick': 2,
 'messi': 3,
 'penalty': 4,
 'ronaldo': 5,
 'striker': 6,
 'the': 7,
 'yes': 8}

In [21]:
# Encode every document as a row of per-token counts over the fitted vocabulary.
vector = vectorizer.transform(text)

In [22]:
# (number of documents, vocabulary size): 3 documents, 9 distinct tokens.
vector.shape


Out[22]:
(3, 9)

In [23]:
# transform() returns a SciPy sparse matrix, not a dense NumPy array.
type(vector)


Out[23]:
scipy.sparse.csr.csr_matrix

In [24]:
# Densify for display. Columns follow the indices in vocabulary_; for example
# column 5 is "ronaldo", which occurs twice in the first document.
vector.toarray()


Out[24]:
array([[1, 1, 1, 0, 0, 2, 0, 1, 1],
       [1, 0, 0, 1, 1, 0, 0, 1, 0],
       [1, 0, 0, 0, 1, 0, 1, 1, 0]])

TF-IDF


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

# norm=None switches off per-row normalization so the raw tf * idf weights are
# visible in the output. Note that this rebinds `vectorizer`, replacing the
# CountVectorizer fitted earlier. fit() returns the estimator, and the bare
# name on the last line displays its parameters.
vectorizer = TfidfVectorizer(norm=None).fit(text)
vectorizer


Out[29]:
TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=None, preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [30]:
# Re-encode the corpus with TF-IDF weights and show the dense matrix. Tokens
# present in all three documents ("did", "the") get weight 1.0, while rarer
# tokens are weighted higher (e.g. "ronaldo": 2 occurrences * 1.693 = 3.386).
vector = vectorizer.transform(text)
vector.toarray()


Out[30]:
array([[1.        , 1.69314718, 1.69314718, 0.        , 0.        ,
        3.38629436, 0.        , 1.        , 1.69314718],
       [1.        , 0.        , 0.        , 1.69314718, 1.28768207,
        0.        , 0.        , 1.        , 0.        ],
       [1.        , 0.        , 0.        , 0.        , 1.28768207,
        0.        , 1.69314718, 1.        , 0.        ]])

In [0]: