In [2]:
import os
from collections import Counter

X = []  # raw source code of each file
y = []  # language label, taken from the file extension

# Build the dataset: one document per file under codes/,
# labelled with its extension (e.g. "py", "java").
for file in os.listdir('codes/'):
    with open(os.path.join('codes', file)) as f:
        code = f.read()
    _, language = os.path.splitext(file)  # splitext keeps the leading dot
    X.append(code)
    y.append(language[1:])               # drop the dot: ".py" -> "py"

# Counter(y).most_common()
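Running the commented-out Counter call as its own cell shows how many files the corpus has per language; a minimal sketch:

In [ ]:
# Label distribution: number of files per language extension
Counter(y).most_common()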

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

def mi_tokenizador(code):
    # Naive tokenizer: split on whitespace only, so punctuation
    # stays glued to its token (e.g. "algo,malo" is a single token).
    palabras = code.split()
    return palabras

c = TfidfVectorizer(tokenizer=mi_tokenizador)
# c.fit(X[:1])
# Sanity check on a toy document before fitting the real corpus
c.fit(["hola mundo 99 algo,malo !"])
print(c.vocabulary_)
r = c.transform(['hola hola malo'])
r.todense()


{'hola': 3, '99': 1, 'algo,malo': 2, 'mundo': 4, '!': 0}
Out[44]:
matrix([[ 0.,  0.,  0.,  1.,  0.]])
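
Only the 'hola' column is non-zero: 'malo' never entered the vocabulary on its own (the whitespace tokenizer kept "algo,malo" as one token), and with a single matching term the L2-normalised tf-idf weight is exactly 1.0. A tokenizer that splits punctuation off avoids this; one possible sketch (regex_tokenizer is illustrative, not from the original notebook):

In [ ]:
import re

def regex_tokenizer(code):
    # Runs of word characters, or any single non-space symbol, become
    # separate tokens: "algo,malo" -> ["algo", ",", "malo"]
    return re.findall(r'\w+|[^\w\s]', code)

c2 = TfidfVectorizer(tokenizer=regex_tokenizer)
c2.fit(["hola mundo 99 algo,malo !"])
print(c2.vocabulary_)
c2.transform(['hola hola malo']).todense()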

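
With the tokenizer settled, the vectorizer can be fitted on the corpus loaded at the top; a minimal sketch, assuming X still holds the file contents:

In [ ]:
# Fit on the real corpus; rows are files, columns are distinct tokens
vec = TfidfVectorizer(tokenizer=mi_tokenizador)
X_tfidf = vec.fit_transform(X)
X_tfidf.shape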