In [2]:
import os
from collections import Counter

X = []
y = []
for file in os.listdir('codes/'):
    with open(os.path.join('codes', file)) as f:
        code = f.read()
    # Use the file extension (minus the leading dot) as the language label.
    _, language = os.path.splitext(file)
    X.append(code)
    y.append(language[1:])
# Counter(y).most_common()
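The commented-out Counter line is worth running as its own cell to sanity-check the class balance before vectorizing; the labels come straight from the file extensions, so a stray extension shows up here immediately. The output depends entirely on the contents of your codes/ directory.

In [ ]:
# Label distribution of the collected corpus.
Counter(y).most_common()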
In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

def mi_tokenizador(code):
    # Naive tokenizer: split on whitespace only, keeping punctuation
    # attached to its token (e.g. "algo,malo" stays one token).
    palabras = code.split()
    return palabras

# token_pattern is ignored when a custom tokenizer is supplied;
# setting it to None silences sklearn's warning about that.
c = TfidfVectorizer(tokenizer=mi_tokenizador, token_pattern=None)
# c.fit(X[:1])
c.fit(["hola mundo 99 algo,malo !"])
print(c.vocabulary_)
r = c.transform(['hola hola malo'])
r.todense()
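From here, a minimal sketch of the natural next step, assuming the goal is to vectorize the corpus collected in the first cell. This step is not in the original; the names X and c are reused from the cells above.

In [ ]:
# Re-fit the whitespace-token TF-IDF vectorizer on the full corpus
# of source files rather than on the toy sentence.
X_tfidf = c.fit_transform(X)
# One row per file, one column per distinct whitespace token.
X_tfidf.shape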