In [2]:
import chardet
import csv
import matplotlib.pyplot as plt
import numpy as np
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.svm import LinearSVC
from sklearn.utils.extmath import density
from sklearn import metrics
import sys
from time import time
In [54]:
fileReader = csv.reader(open('/Users/keyvhinng/final-year-project/data/training.csv'))
label = []
corpus = []
for row in fileReader:
    print(row)
    corpus.append(row[0])   # column 0: tweet text
    label.append(row[1])    # column 1: sentiment label
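A quick sanity check on the loaded data (a minimal sketch; `Counter` just tallies the label column read above):
In [ ]:
from collections import Counter
# distribution of sentiment labels in the training file
Counter(label)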
In [57]:
len(corpus)
Out[57]:
In [4]:
vectorizer = CountVectorizer(min_df=1)
In [5]:
vectorizer
Out[5]:
In [6]:
X = vectorizer.fit_transform(corpus)
In [7]:
X
Out[7]:
In [8]:
print('number of features: %d' % len(vectorizer.get_feature_names()))
In [9]:
analyze = vectorizer.build_analyzer()
In [10]:
analyze('#OllantaHumala es traidor')
Out[10]:
In [11]:
type(X)
Out[11]:
In [12]:
X.toarray()
Out[12]:
In [13]:
vectorizer.transform(['Ollanta tiene la culpa']).toarray()
Out[13]:
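To see which vocabulary entries a new document actually hits, the nonzero columns can be mapped back to their terms (a sketch against the vectorizer fit above, using the same older `get_feature_names` API as the rest of the notebook):
In [ ]:
row = vectorizer.transform(['Ollanta tiene la culpa'])
# map nonzero column indices back to the terms they count
[vectorizer.get_feature_names()[i] for i in row.nonzero()[1]]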
In [50]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
In [51]:
analyze = bigram_vectorizer.build_analyzer()
In [52]:
analyze('Bi-grams are cool!!')
Out[52]:
In [53]:
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
In [54]:
X_2
Out[54]:
In [56]:
feature_index = bigram_vectorizer.vocabulary_.get('is this')  # None if the bigram never occurs in the corpus
In [57]:
X_2[:, feature_index]
Out[57]:
In [14]:
transformer = TfidfTransformer()
In [15]:
transformer
Out[15]:
In [16]:
counts = X.toarray()
In [17]:
type(counts)
Out[17]:
In [18]:
tfidf = transformer.fit_transform(counts)
In [19]:
tfidf
Out[19]:
In [20]:
tfidf.toarray()
Out[20]:
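TfidfVectorizer, introduced next, is equivalent to a CountVectorizer followed by a TfidfTransformer; a quick check that the two routes agree on this corpus (sketch):
In [ ]:
# the one-step vectorizer should reproduce the two-step pipeline above
np.allclose(TfidfVectorizer(min_df=1).fit_transform(corpus).toarray(), tfidf.toarray())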
In [114]:
transformer.idf_
Out[114]:
In [21]:
vectorizer = TfidfVectorizer(min_df=1)
In [25]:
tfid = vectorizer.fit_transform(corpus)
In [29]:
X = tfid.toarray()
In [30]:
type(X)
Out[30]:
In [23]:
vectorizer.transform(['Ollanta tiene la culpa']).toarray()
Out[23]:
In [48]:
classifier = LinearSVC(dual=False, tol=1e-3)
In [49]:
classifier.fit(X, label)
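With the classifier fit on the tf-idf matrix, a new tweet can be scored by passing it through the same vectorizer first (a minimal sketch; the predicted label depends on the training data):
In [ ]:
# vectorize with the fitted TfidfVectorizer, then predict a sentiment label
classifier.predict(vectorizer.transform(['Ollanta tiene la culpa']).toarray())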
In [80]:
text1 = b"Sei mir gegr\xc3\xbc\xc3\x9ft mein Sauerkraut"
In [83]:
text2 = b"holdselig sind deine Ger\xfcche"
In [84]:
text3 = b"\xff\xfeA\x00u\x00f\x00 \x00F\x00l\x00\xfc\x00g\x00e\x00l\x00n\x00 \x00d\x00e\x00s\x00 \x00G\x00e\x00s\x00a\x00n\x00g\x00e\x00s\x00,\x00 \x00H\x00e\x00r\x00z\x00l\x00i\x00e\x00b\x00c\x00h\x00e\x00n\x00,\x00 \x00t\x00r\x00a\x00g\x00 \x00i\x00c\x00h\x00 \x00d\x00i\x00c\x00h\x00 \x00f\x00o\x00r\x00t\x00"
In [85]:
decoded = [x.decode(chardet.detect(x)['encoding'])
           for x in (text1, text2, text3)]
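chardet should detect UTF-8, Latin-1, and UTF-16 respectively for the three byte strings; printing the decoded list makes the round trip visible:
In [ ]:
# each byte string decoded with its detected encoding
for text in decoded:
    print(text)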
In [86]:
v = CountVectorizer().fit(decoded).vocabulary_
In [97]:
hv = HashingVectorizer(n_features=20)
In [98]:
hv.transform(corpus)
Out[98]:
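HashingVectorizer is stateless: there is no fit step and no vocabulary_, so a column index cannot be mapped back to a term. A single document hashed into the 20 buckets configured above (sketch):
In [ ]:
# the hash of each token determines its column; collisions are possible with only 20 features
hv.transform(['Ollanta tiene la culpa']).toarray()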
In [101]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
In [102]:
vect = CountVectorizer(tokenizer=LemmaTokenizer())
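The custom tokenizer needs NLTK's punkt and wordnet data packages; a quick look at what the resulting analyzer does to an illustrative English sentence (not from the corpus):
In [ ]:
# import nltk; nltk.download('punkt'); nltk.download('wordnet')  # one-time setup
vect.build_analyzer()('the children are running')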
In [110]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
In [113]:
from sklearn.datasets import fetch_20newsgroups
In [117]:
remove = ('headers', 'footers', 'quotes')  # strip metadata that makes the task artificially easy
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)
In [118]:
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)
In [123]:
# target labels for the training and test splits
y_train, y_test = data_train.target, data_test.target
In [125]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)
In [126]:
print("n_samples: %d, n_features: %d" % X_train.shape)
In [130]:
print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()
In [131]:
feature_names = vectorizer.get_feature_names()
In [133]:
def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."
In [156]:
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    # use data_train.target_names so the names line up with the integer targets
    print(metrics.classification_report(y_test, pred,
                                        target_names=data_train.target_names))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))
    print()

    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
In [157]:
results = []
for penalty in ["l2", "l1"]:
    # 'squared_hinge' is the current name for the old loss='l2' option
    results.append(benchmark(LinearSVC(loss='squared_hinge', penalty=penalty,
                                       dual=False, tol=1e-3)))
In [158]:
indices = np.arange(len(results))
In [159]:
results = [[x[i] for x in results] for i in range(4)]  # regroup per field: names, scores, train times, test times
In [161]:
clf_names, score, training_time, test_time = results
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)
In [163]:
plt.figure(figsize=(12, 8))
plt.title("Score")
plt.barh(indices, score, .2, label="score", color='r')
plt.barh(indices + .3, training_time, .2, label="training time", color='g')
plt.barh(indices + .6, test_time, .2, label="test time", color='b')
plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25)
plt.subplots_adjust(top=.95)
plt.subplots_adjust(bottom=.05)
In [166]:
for i, c in zip(indices, clf_names):
    plt.text(-.3, i, c)
In [168]:
plt.show()
In [83]:
vectorizer.vocabulary_.get('ollanta')
Out[83]:
In [84]:
vectorizer.vocabulary_.get('tiene')
Out[84]:
In [85]:
vectorizer.vocabulary_.get('la')
Out[85]:
In [86]:
vectorizer.vocabulary_.get('culpa')
Out[86]:
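Since vocabulary_ is a plain dict, .get returns None rather than raising for unseen terms (one-line check with a made-up token):
In [ ]:
# 'xyzzy' is a hypothetical out-of-vocabulary token
vectorizer.vocabulary_.get('xyzzy') is None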
In [ ]: