In [16]:
import os
import re

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

try:
    # Home of train_test_split since scikit-learn 0.18.
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn that only ships the legacy module
    from sklearn.cross_validation import train_test_split
def load_data(folder='codes'):
    """Read every entry of ``folder`` and return the texts with their labels.

    The label of a file is the lower-cased text after the last ``.`` in its
    name; for a name without a dot that is the whole name.

    Parameters
    ----------
    folder : str, optional
        Directory to read from. Defaults to ``'codes'``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X[i]`` is the content of one file and ``y[i]`` is
        its label. Entries come in the order ``os.listdir`` yields them.
    """
    X = []
    y = []
    for f in os.listdir(folder):
        # The context manager closes the handle even if read() raises, so
        # handles are no longer left open for the garbage collector.
        with open(os.path.join(folder, f)) as source:
            text = source.read()
        syntax = f.split('.')[-1]
        X.append(text)
        y.append(syntax.lower())
    return (X, y)
# Load the corpus once for the cells below: X holds the file contents and
# y the matching lower-cased file-extension labels.
X, y = load_data()
In [45]:
def preprocessor(x):
    """Collapse literals in ``x`` into placeholder tokens.

    Every run of digits is replaced by ``NUMBER`` first; afterwards every
    double-quoted span (shortest match, not crossing a newline) is replaced
    by ``STRING``.

    Parameters
    ----------
    x : str
        Text to normalise.

    Returns
    -------
    str
        ``x`` with the substitutions applied.
    """
    # Raw literals: '\d' in a plain string is an invalid escape sequence that
    # newer Python versions warn about. The compiled patterns are unchanged.
    x = re.sub(r'\d+', 'NUMBER', x)
    x = re.sub(r'".*?"', 'STRING', x)
    return x
In [44]:
# Sanity check of the quoted-string pattern used by preprocessor: the quoted
# span should collapse to STRING, giving 'algo STRING ooo '.
re.sub('\".*?\"', 'STRING', 'algo "tiene que pasar" ooo ')
Out[44]:
In [56]:
# Fixed seed so the train/validation split, the solver and therefore the
# printed accuracy are the same on every run.
RANDOM_SEED = 42

# TF-IDF over unigrams and bigrams feeding a linear SVM.
pipe = make_pipeline(
    TfidfVectorizer(
        ngram_range=(1, 2),
        # Tokens: words of two or more word characters, plus the single
        # characters : ; " ' # and tab. Written as a raw literal because
        # '\:' and '\;' are invalid escape sequences in a plain string.
        token_pattern=r'''(?u)\b\w\w+\b|:|;|"|'|#|\t''',
        preprocessor=preprocessor
    ),
    LinearSVC(random_state=RANDOM_SEED)
)

# Hold out 20% of the samples for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)
pipe.fit(X_train, y_train)
p = pipe.predict(X_val)

# accuracy_score takes (y_true, y_pred); the value is the same either way,
# but the conventional order keeps this safe if the metric is swapped later.
print(accuracy_score(y_val, p))
In [55]:
# Scratch check of the tokenizer on a tiny corpus: fit a unigram TF-IDF
# vectorizer, print the vocabulary ordered by column index, then display the
# resulting matrix.
c = TfidfVectorizer(
    ngram_range=(1, 1),
    # Raw literal with the same alternatives as before (words of two or more
    # word characters, or one of : ; " ' # ! or tab); '\:' and '\;' were
    # invalid escape sequences in a plain string.
    token_pattern=r'''(?u)\b\w\w+\b|:|;|"|'|#|!|\t''',
    preprocessor=preprocessor
)
corpus = ['Hola 667 10 como va', 'Bien, \t 9987 : :y vos?!']

# Fit and transform in one step; the bare `c.vocabulary_` expression that sat
# mid-cell was never displayed and has been dropped.
r = c.fit_transform(corpus)
print(sorted(c.vocabulary_.items(), key=lambda x: x[1]))
r.toarray()
Out[55]: