In [91]:
import chardet
import csv
import matplotlib.pyplot as plt
import numpy
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.utils.extmath import density
from sklearn import metrics
import sys
from time import time
In [10]:
fileReader = csv.reader(open('/home/kref/final-project/data/training.csv'))
label = []
corpus = []
for row in fileReader:
    corpus.append(row[0])
    label.append(row[1])
In [11]:
len(corpus)
Out[11]:
In [165]:
myStopwords = ['Ollanta']
In [166]:
customStopwords = stopwords.words('spanish') + myStopwords
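The NLTK stopword lists are not bundled by default; a quick check, assuming the 'stopwords' corpus may still need downloading:
In [ ]:
# download the stopword corpus if it is not already present, then
# inspect a few of the Spanish entries that get merged with myStopwords
nltk.download('stopwords')
stopwords.words('spanish')[:10]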
In [169]:
countVectorizer = CountVectorizer(min_df=1)
In [170]:
counts = countVectorizer.fit_transform(corpus)
In [171]:
print('number of features: %d' % len(countVectorizer.get_feature_names()))
In [9]:
analyze = countVectorizer.build_analyzer()
In [10]:
analyze('#OllantaHumala es traidor')
Out[10]:
In [173]:
counts.toarray()[0]
Out[173]:
In [174]:
countVectorizer.transform(['Ollanta tiene la culpa']).toarray()
Out[174]:
In [175]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
In [176]:
bigramAnalyzer = bigram_vectorizer.build_analyzer()
In [177]:
bigramAnalyzer('Bi-grams are cool!!')
Out[177]:
In [178]:
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
In [179]:
X_2
Out[179]:
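To see which unigrams and bigrams the (1, 2) n-gram range actually produced, the fitted vocabulary can be listed; a small sketch using the same get_feature_names() accessor as above:
In [ ]:
# each column of X_2 corresponds to one of these unigram/bigram features
bigram_vectorizer.get_feature_names()[:20]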
In [180]:
transformer = TfidfTransformer()
In [15]:
transformer
Out[15]:
In [181]:
tfidf = transformer.fit_transform(counts)
In [182]:
tfidf.toarray()[0]
Out[182]:
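CountVectorizer followed by TfidfTransformer is equivalent to TfidfVectorizer with matching parameters; a quick sanity check of that equivalence on this corpus:
In [ ]:
# the one-step vectorizer should reproduce the two-step counts + tf-idf result
one_step = TfidfVectorizer(min_df=1).fit_transform(corpus)
numpy.allclose(one_step.toarray(), tfidf.toarray())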
In [187]:
tfidVectorizer = TfidfVectorizer(min_df=1,stop_words=customStopwords)
In [188]:
analyzer = tfidVectorizer.build_analyzer()
In [189]:
analyzer('contento en Lima')
Out[189]:
In [190]:
tfid = tfidVectorizer.fit_transform(corpus)
In [191]:
tfid.shape
Out[191]:
In [192]:
tfidVectorizer.vocabulary_.get('culpa')
Out[192]:
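The integer returned by vocabulary_.get() is a column index; the reverse mapping goes through get_feature_names(). A quick round trip, assuming 'culpa' is in the fitted vocabulary:
In [ ]:
# map the feature index back to the term it represents
idx = tfidVectorizer.vocabulary_.get('culpa')
tfidVectorizer.get_feature_names()[idx]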
In [118]:
tfidVectorizer.get_stop_words();
In [119]:
X = tfid.toarray()
In [120]:
tfidVectorizer.transform(['Ollanta tiene la culpa']).toarray()
Out[120]:
In [195]:
classifier = LinearSVC()
In [196]:
classifier.fit(X,label)
Out[196]:
In [197]:
predicted = classifier.predict(tfidVectorizer.transform(corpus).toarray())
In [198]:
numpy.mean(predicted == label)
Out[198]:
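The accuracy above is measured on the same tweets the classifier was trained on, so it is optimistic. A sketch of a fairer estimate on a held-out split, assuming a scikit-learn version that provides sklearn.model_selection:
In [ ]:
from sklearn.model_selection import train_test_split

# hold out 20% of the corpus and evaluate on documents the model never saw
docs_train, docs_test, y_train2, y_test2 = train_test_split(
    corpus, label, test_size=0.2, random_state=42)
heldout_vectorizer = TfidfVectorizer(min_df=1, stop_words=customStopwords)
heldout_clf = LinearSVC().fit(heldout_vectorizer.fit_transform(docs_train), y_train2)
numpy.mean(heldout_clf.predict(heldout_vectorizer.transform(docs_test)) == y_test2)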
In [129]:
tfidVectorizer.transform(['Estoy contento']).toarray()
Out[129]:
In [199]:
classifier.predict(tfidVectorizer.transform(['Ollanta es corrupto','Ollanta genera confianza']))
Out[199]:
In [80]:
text1 = b"Sei mir gegr\xc3\xbc\xc3\x9ft mein Sauerkraut"
In [83]:
text2 = b"holdselig sind deine Ger\xfcche"
In [84]:
text3 = b"\xff\xfeA\x00u\x00f\x00 \x00F\x00l\x00\xfc\x00g\x00e\x00l\x00n\x00 \x00d\x00e\x00s\x00 \x00G\x00e\x00s\x00a\x00n\x00g\x00e\x00s\x00,\x00 \x00H\x00e\x00r\x00z\x00l\x00i\x00e\x00b\x00c\x00h\x00e\x00n\x00,\x00 \x00t\x00r\x00a\x00g\x00 \x00i\x00c\x00h\x00 \x00d\x00i\x00c\x00h\x00 \x00f\x00o\x00r\x00t\x00"
In [85]:
decoded = [x.decode(chardet.detect(x)['encoding'])
           for x in (text1, text2, text3)]
In [86]:
v = CountVectorizer().fit(decoded).vocabulary_
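chardet returns its guess together with a confidence score; printing the full detection result shows how reliable each guess is:
In [ ]:
# chardet.detect() returns a dict with the guessed encoding and a confidence value
for x in (text1, text2, text3):
    print(chardet.detect(x))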
In [97]:
hv = HashingVectorizer(n_features=20)
In [98]:
hv.transform(corpus)
Out[98]:
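Unlike CountVectorizer, HashingVectorizer is stateless: it has no vocabulary_ and needs no fit, though with only 20 features distinct words will collide in the same hash bucket. A quick illustration:
In [ ]:
# no fit() required; unseen documents map straight into the 20 hash buckets
hv.transform(['Ollanta tiene la culpa']).toarray()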
In [101]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
In [102]:
vect = CountVectorizer(tokenizer=LemmaTokenizer())
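Exercising the lemmatizing tokenizer requires the NLTK 'punkt' and 'wordnet' data packages; a small sketch, assuming they can be downloaded on the fly:
In [ ]:
# fetch the tokenizer model and the WordNet data, then run the full analyzer
nltk.download('punkt')
nltk.download('wordnet')
vect.build_analyzer()('the children are running')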
In [110]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
In [118]:
remove = ('headers', 'footers', 'quotes')
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)
In [123]:
# extract the target labels for the training and test sets
y_train, y_test = data_train.target, data_test.target
In [125]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)
In [126]:
print("n_samples: %d, n_features: %d" % X_train.shape)
In [130]:
print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
duration = time() - t0
data_test_size_mb = sum(len(s.encode('utf-8')) for s in data_test.data) / 1e6
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()
In [131]:
feature_names = vectorizer.get_feature_names()
In [133]:
def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."
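feature_names and trim() are defined but never exercised below; in the scikit-learn text classification example this notebook appears to follow, they serve to print the highest-weighted terms per class. A sketch of that usage with a freshly fitted linear model:
In [ ]:
# fit a linear classifier, then show its ten top-weighted terms per category
clf = LinearSVC().fit(X_train, y_train)
feature_names_arr = numpy.asarray(feature_names)
for i, category in enumerate(data_train.target_names):
    top10 = numpy.argsort(clf.coef_[i])[-10:]
    print(trim("%s: %s" % (category, " ".join(feature_names_arr[top10]))))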
In [156]:
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    print(metrics.classification_report(y_test, pred,
                                        target_names=data_train.target_names))

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
In [157]:
results = []
for penalty in ["l2", "l1"]:
    results.append(benchmark(LinearSVC(loss='squared_hinge', penalty=penalty,
                                       dual=False, tol=1e-3)))
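MultinomialNB and SGDClassifier are imported at the top but never exercised; appending them to the same benchmark keeps the comparison plot below interesting. A sketch with commonly used settings:
In [ ]:
# benchmark two more of the imported classifiers on the same tf-idf features
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(SGDClassifier(penalty='l2')))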
In [158]:
indices = numpy.arange(len(results))
In [159]:
# transpose results into parallel lists: names, scores, train times, test times
results = [[x[i] for x in results] for i in range(4)]
In [161]:
clf_names, score, training_time, test_time = results
training_time = numpy.array(training_time) / numpy.max(training_time)
test_time = numpy.array(test_time) / numpy.max(test_time)
In [163]:
plt.figure(figsize=(12, 8))
plt.title("Score")
plt.barh(indices, score, .2, label="score", color='r')
plt.barh(indices + .3, training_time, .2, label="training time", color='g')
plt.barh(indices + .6, test_time, .2, label="test time", color='b')
plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25)
plt.subplots_adjust(top=.95)
plt.subplots_adjust(bottom=.05)
In [166]:
for i, c in zip(indices, clf_names):
    plt.text(-.3, i, c)
In [168]:
plt.show()
In [83]:
tfidVectorizer.vocabulary_.get('ollanta')
Out[83]:
In [84]:
tfidVectorizer.vocabulary_.get('tiene')
Out[84]:
In [85]:
tfidVectorizer.vocabulary_.get('la')
Out[85]:
In [86]:
tfidVectorizer.vocabulary_.get('culpa')
Out[86]:
In [ ]: