notebook.community

Edit and run



In [16]:

    
import os
import re

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module
    from sklearn.cross_validation import train_test_split

def load_data(folder='codes'):
    """Load code samples and their language labels from a folder.

    Each regular file directly inside ``folder`` becomes one sample: the
    file's text is the feature and its lower-cased extension (the part of
    the file name after the last ``.``) is the label.

    Args:
        folder: Directory holding the sample files. Defaults to ``'codes'``.

    Returns:
        A ``(X, y)`` tuple of two parallel lists: file contents and labels.
    """
    X = []
    y = []
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any later split) stable across runs and machines.
    for name in sorted(os.listdir(folder)):
        path = os.path.join(folder, name)
        # Skip sub-directories, which cannot be opened as text files.
        if not os.path.isfile(path):
            continue
        # The context manager closes the handle even if read() fails.
        with open(path) as fh:
            X.append(fh.read())
        y.append(name.split('.')[-1].lower())
    return (X, y)

# Load the samples (X) and their labels (y) once; later cells reuse both.
X, y = load_data()



In [45]:

    
def preprocessor(x):
    """Mask literals in a source-code string before tokenization.

    Every run of digits is replaced by the token ``NUMBER`` and every
    double-quoted span that sits on a single line is replaced by ``STRING``.

    Args:
        x: Raw text of one sample.

    Returns:
        The text with numeric and double-quoted literals masked.
    """
    # Raw strings keep the regex escapes intact; '\d' in a plain literal is
    # an invalid string escape and triggers a warning on modern Python.
    x = re.sub(r'\d+', 'NUMBER', x)
    # Non-greedy so that two strings on the same line are masked separately.
    x = re.sub(r'".*?"', 'STRING', x)
    return x



In [44]:

    
# Quick check of the string-masking regex: the non-greedy match replaces the
# double-quoted span with STRING.
re.sub('\".*?\"', 'STRING', 'algo "tiene que pasar" ooo ')









    Out[44]:





'algo STRING ooo '



In [56]:

    
# TF-IDF over unigrams and bigrams feeding a linear SVM.
# The token pattern keeps words of two or more characters plus a few single
# characters (: ; " ' # and tab). It is written as a raw string so that the
# regex escapes are not treated as (invalid) string escapes.
pipe = make_pipeline(
    TfidfVectorizer(
        ngram_range=(1, 2),
        token_pattern=r"(?u)\b\w\w+\b|[:;\"'#\t]",
        preprocessor=preprocessor
    ),
    LinearSVC(random_state=0)
)

# Fixed seeds so the split, and therefore the reported score, is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0
)
pipe.fit(X_train, y_train)

p = pipe.predict(X_val)

# accuracy_score expects (y_true, y_pred).
print(accuracy_score(y_val, p))









    



0.815730337079

Para probar/mejorar:

Probar agregar cosas en el preprocessor (comentarios, otras formas de strings, números...)
Probar agregar más símbolos en el tokenizer
Probar con distintos ngrams
Ver los parámetros max_df=1.0, min_df=1, max_features=None

Para ver:

¿Dónde predice mal?
Los ejemplos que predice mal, ¿son entendibles por una persona?
Hacer alguna gráfica de la matriz de confusión (por ej., por cada lenguaje, la cantidad de veces que predice bien)



In [55]:

    
# Toy corpus to inspect how the token pattern and preprocessor behave.
# Swap in CountVectorizer(ngram_range=(1, 1)) to look at raw counts instead.
# The pattern is a raw string so the regex escapes are not treated as
# (invalid) string escapes; this variant also keeps '!' as a token.
c = TfidfVectorizer(ngram_range=(1, 1),
                    token_pattern=r"(?u)\b\w\w+\b|[:;\"'#!\t]",
                    preprocessor=preprocessor
                    )
corpus = ['Hola 667 10 como va', 'Bien, \t 9987 : :y vos?!']
c.fit(corpus)
r = c.transform(corpus)
# Vocabulary sorted by column index, so it lines up with the matrix below.
print(sorted(c.vocabulary_.items(), key=lambda x: x[1]))
r.toarray()









    



[('\t', 0), ('!', 1), (':', 2), ('Bien', 3), ('Hola', 4), ('NUMBER', 5), ('como', 6), ('va', 7), ('vos', 8)]






    Out[55]:





array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.44610081,
         0.6348088 ,  0.44610081,  0.44610081,  0.        ],
       [ 0.34287126,  0.34287126,  0.68574252,  0.34287126,  0.        ,
         0.24395573,  0.        ,  0.        ,  0.34287126]])