In [2]:
import os
from collections import Counter

X = []  # raw source code of each file
y = []  # language label, taken from the file extension

# Build the dataset: one document per file under codes/,
# labelled with its extension (e.g. "py", "java").
for file in os.listdir('codes/'):
    with open(os.path.join('codes', file)) as f:
        code = f.read()
    _, language = os.path.splitext(file)  # splitext keeps the leading dot
    X.append(code)
    y.append(language[1:])               # drop the dot: ".py" -> "py"

# Counter(y).most_common()
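Running the commented-out Counter call as its own cell shows how many files the corpus has per language; a minimal sketch:

In [ ]:
# Label distribution: number of files per language extension
Counter(y).most_common()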

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

def mi_tokenizador(code):
    # Naive tokenizer: split on whitespace only, so punctuation
    # stays glued to its token (e.g. "algo,malo" is a single token).
    palabras = code.split()
    return palabras

c = TfidfVectorizer(tokenizer=mi_tokenizador)
# c.fit(X[:1])
# Sanity check on a toy document before fitting the real corpus
c.fit(["hola mundo 99 algo,malo !"])
print(c.vocabulary_)
r = c.transform(['hola hola malo'])
r.todense()


{'hola': 3, '99': 1, 'algo,malo': 2, 'mundo': 4, '!': 0}
Out[44]:
matrix([[ 0.,  0.,  0.,  1.,  0.]])
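
Only the 'hola' column is non-zero: 'malo' never entered the vocabulary on its own (the whitespace tokenizer kept "algo,malo" as one token), and with a single matching term the L2-normalised tf-idf weight is exactly 1.0. A tokenizer that splits punctuation off avoids this; one possible sketch (regex_tokenizer is illustrative, not from the original notebook):

In [ ]:
import re

def regex_tokenizer(code):
    # Runs of word characters, or any single non-space symbol, become
    # separate tokens: "algo,malo" -> ["algo", ",", "malo"]
    return re.findall(r'\w+|[^\w\s]', code)

c2 = TfidfVectorizer(tokenizer=regex_tokenizer)
c2.fit(["hola mundo 99 algo,malo !"])
print(c2.vocabulary_)
c2.transform(['hola hola malo']).todense()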

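
With the tokenizer settled, the vectorizer can be fitted on the corpus loaded at the top; a minimal sketch, assuming X still holds the file contents:

In [ ]:
# Fit on the real corpus; rows are files, columns are distinct tokens
vec = TfidfVectorizer(tokenizer=mi_tokenizador)
X_tfidf = vec.fit_transform(X)
X_tfidf.shape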