In [16]:
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD
import re

def load_data():
    """Read every file in the ``codes`` folder into a labelled dataset.

    The label of a sample is the text after the last ``.`` of its file name,
    lower-cased (a name without a dot is used whole as its label).

    Returns:
        tuple: ``(X, y)`` — two parallel lists, where ``X`` holds the file
        contents and ``y`` the corresponding lower-cased labels.
    """
    X = []
    y = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # dataset order identical between runs and machines.
    for f in sorted(os.listdir('codes')):
        path = os.path.join('codes', f)
        if not os.path.isfile(path):
            # Sub-directories cannot be read as samples; open() would raise.
            continue
        # The context manager closes each handle right away instead of
        # leaking one per file. Decoding uses the platform default encoding.
        with open(path) as source_file:
            text = source_file.read()
        syntax = f.split('.')[-1]
        X.append(text)
        y.append(syntax.lower())
    return (X, y)

# Load the dataset once: X holds the file contents, y the extension labels.
X, y = load_data()

In [45]:
def preprocessor(x):
    """Normalize literals in a piece of source text before tokenization.

    Every run of digits is replaced by the token ``NUMBER`` and every
    double-quoted span that fits on one line is replaced by ``STRING``.
    Digits are replaced first, so numbers inside quoted spans disappear
    together with the span.

    Args:
        x (str): raw text of one document.

    Returns:
        str: the text with numeric and double-quoted literals replaced.
    """
    # Raw string literals: '\d' inside a plain literal is an invalid escape
    # sequence, which current Python versions warn about.
    x = re.sub(r'\d+', 'NUMBER', x)
    # Non-greedy so each quoted span is replaced on its own; '.' does not
    # match newlines, so spans crossing a line break are left untouched.
    x = re.sub(r'".*?"', 'STRING', x)
    return x

In [44]:
# Sanity check: the non-greedy pattern collapses the whole quoted span into one STRING token.
re.sub('\".*?\"', 'STRING', 'algo "tiene que pasar" ooo ')


Out[44]:
'algo STRING ooo '

In [56]:
# Fixed seed so the split, the fitted model and the reported accuracy are
# the same on every run.
SEED = 42

pipe = make_pipeline(
    TfidfVectorizer(
        ngram_range=(1, 2),
        # Tokens: words of 2+ characters, plus a few punctuation marks and
        # the tab character as single-character tokens. Written as a raw
        # string so the regex escapes are not parsed as (invalid) string
        # escapes.
        token_pattern=r'(?u)\b\w\w+\b|:|;|"|\'|#|\t',
        preprocessor=preprocessor
    ),
    LinearSVC(random_state=SEED)
)

# NOTE(review): `sklearn.cross_validation` (imported at the top) was removed
# in newer scikit-learn releases; there `train_test_split` lives in
# `sklearn.model_selection`.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)
pipe.fit(X_train, y_train)

p = pipe.predict(X_val)

# accuracy_score expects (y_true, y_pred); the value is the same either way,
# but the conventional order matters for most other metrics.
print(accuracy_score(y_val, p))


0.815730337079

Para probar/mejorar:

  • Probar agregar más casos en el preprocessor (comentarios, otras formas de strings, números...)
  • Probar agregar más símbolos en el tokenizer
  • Probar con distintos ngrams
  • Ver los parámetros max_df=1.0, min_df=1, max_features=None

Para ver:

  • ¿Dónde predice mal?
  • Los ejemplos que predice mal, ¿son entendibles por una persona?
  • Hacer alguna gráfica de la matriz de confusión (por ej., por cada lenguaje, la cantidad de veces que predice bien)

In [55]:
# Toy corpus to inspect what the tokenizer and the preprocessor produce.
# The pattern is a raw string so the regex escapes are not parsed as
# (invalid) string escapes; it matches words of 2+ characters plus a few
# single-character symbols.
c = TfidfVectorizer(
    ngram_range=(1, 1),
    token_pattern=r'(?u)\b\w\w+\b|:|;|"|\'|#|!|\t',
    preprocessor=preprocessor
)
corpus = ['Hola 667 10 como va', 'Bien, \t 9987 : :y vos?!']
c.fit(corpus)
r = c.transform(corpus)
# Vocabulary sorted by column index, so it lines up with the matrix below.
print(sorted(c.vocabulary_.items(), key=lambda x: x[1]))
r.toarray()


[('\t', 0), ('!', 1), (':', 2), ('Bien', 3), ('Hola', 4), ('NUMBER', 5), ('como', 6), ('va', 7), ('vos', 8)]
Out[55]:
array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.44610081,
         0.6348088 ,  0.44610081,  0.44610081,  0.        ],
       [ 0.34287126,  0.34287126,  0.68574252,  0.34287126,  0.        ,
         0.24395573,  0.        ,  0.        ,  0.34287126]])