Opinion Mining on Twitter

(developed by Keyvhinng Espinoza)


In [91]:
import chardet
import csv
import numpy
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.utils.extmath import density
from sklearn import metrics
import sys
from time import time

Data


In [10]:
fileReader = csv.reader(open('/home/kref/final-project/data/training.csv'))
label = []
corpus = []
for row in fileReader:
    corpus.append(row[0])
    label.append(row[1])

In [11]:
len(corpus)


Out[11]:
50

stopwords


In [165]:
myStopwords = ['Ollanta']

In [166]:
customStopwords = stopwords.words('spanish') + myStopwords

Vectorization


In [169]:
countVectorizer = CountVectorizer(min_df=1)

In [170]:
counts = countVectorizer.fit_transform(corpus)

In [171]:
print('number of features: %d ' % len(vectorizer.get_feature_names()))


number of features: 182 

In [9]:
analyze = vectorizer.build_analyzer()

In [10]:
analyze('#OllantaHumala es traidor')


Out[10]:
['ollantahumala', 'es', 'traidor']

In [173]:
counts.toarray()[0]


Out[173]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [174]:
countVectorizer.transform(['Ollanta tiene la culpa']).toarray()


Out[174]:
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0]])

bigram vectorizer


In [175]:
bigram_vectorizer = CountVectorizer(ngram_range=(1,2),token_pattern=r'\b\w+\b',min_df=1)

In [176]:
brigramAnalyzer = bigram_vectorizer.build_analyzer()

In [177]:
brigramAnalyzer('Bi-grams are cool!!')


Out[177]:
['bi', 'grams', 'are', 'cool', 'bi grams', 'grams are', 'are cool']

In [178]:
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()

In [179]:
X_2


Out[179]:
array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

Tf-if term weighting


In [180]:
transformer = TfidfTransformer()

In [15]:
transformer


Out[15]:
TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [181]:
tfidf = transformer.fit_transform(counts)

In [182]:
tfidf.toarray()[0]


Out[182]:
array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.42721946,
        0.        ,  0.        ,  0.        ,  0.3951567 ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.19546145,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.13309113,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.3951567 ,  0.        ,
        0.        ,  0.        ,  0.42721946,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.13309113,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.3951567 ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.30477687,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ])

Fit + Transform


In [187]:
tfidVectorizer = TfidfVectorizer(min_df=1,stop_words=customStopwords)

In [188]:
analyzer = tfidVectorizer.build_analyzer()

In [189]:
analyzer('contento en Lima')


Out[189]:
['contento', 'lima']

In [190]:
tfid = tfidVectorizer.fit_transform(corpus)

In [191]:
tfid.shape


Out[191]:
(50, 182)

In [192]:
tfidVectorizer.vocabulary_.get('culpa')


Out[192]:
33

In [118]:
tfidVectorizer.get_stop_words();

In [119]:
X = tfid.toarray()

In [120]:
tfidVectorizer.transform(['Ollanta tiene la culpa']).toarray()


Out[120]:
array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.96253084,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.27117225,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ]])

Classification


In [195]:
classifier = LinearSVC()

In [196]:
classifier.fit(X,label)


Out[196]:
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [197]:
predicted = classifier.predict(tfidVectorizer.transform(corpus).toarray())

In [198]:
numpy.mean(predicted == label)


Out[198]:
1.0

In [129]:
tfidVectorizer.transform('Estoy contento').toarray()


Out[129]:
array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [199]:
classifier.predict(tfidVectorizer.transform(['Ollanta es corrupto','Ollanta genera confianza']))


Out[199]:
array(['A', 'C'], 
      dtype='<U1')

text encoding


In [80]:
text1 = b"Sei mir gegr\xc3\xbc\xc3\x9ft mein Sauerkraut"

In [83]:
text2 = b"holdselig sind deine Ger\xfcche"

In [84]:
text3 = b"\xff\xfeA\x00u\x00f\x00 \x00F\x00l\x00\xfc\x00g\x00e\x00l\x00n\x00 \x00d\x00e\x00s\x00 \x00G\x00e\x00s\x00a\x00n\x00g\x00e\x00s\x00,\x00 \x00H\x00e\x00r\x00z\x00l\x00i\x00e\x00b\x00c\x00h\x00e\x00n\x00,\x00 \x00t\x00r\x00a\x00g\x00 \x00i\x00c\x00h\x00 \x00d\x00i\x00c\x00h\x00 \x00f\x00o\x00r\x00t\x00"

In [85]:
decoded = [x.decode(chardet.detect(x)['encoding'])
          for x in (text1,text2,text3)]

In [86]:
v = CountVectorizer().fit(decoded).vocabulary_

In [97]:
hv = HashingVectorizer(n_features=20)

In [98]:
hv.transform(corpus)


Out[98]:
<4x20 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [101]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self,doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [102]:
vect = CountVectorizer(tokenizer=LemmaTokenizer())

Classification of text documents using sparse features


In [110]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space'
]

In [113]:
from sklearn.datasets import fetch_20newsgroups

In [118]:
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)

In [123]:
# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

In [125]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)

In [126]:
print("n_samples: %d, n_features: %d" % X_train.shape)


n_samples: 2034, n_features: 33810

In [130]:
print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()


Extracting features from the test data using the same vectorizer
done in 0.679856s at 4.218MB/s
n_samples: 1353, n_features: 33810


In [131]:
feature_names = vectorizer.get_feature_names()

In [133]:
def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."

In [156]:
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    
    print("classification report:")
    print(metrics.classification_report(y_test, pred, target_names=categories))

    
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time

In [157]:
results = []
for penalty in ["l2", "l1"]:
    results.append(benchmark(LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)))


________________________________________________________________________________
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.001, verbose=0)
train time: 0.369s
test time:  0.059s
/usr/local/lib/python3.4/site-packages/sklearn/svm/classes.py:192: DeprecationWarning: loss='l2' has been deprecated in favor of loss='squared_hinge' as of 0.16. Backward compatibility for the loss='l2' will be removed in 1.0
  DeprecationWarning)
/usr/local/lib/python3.4/site-packages/sklearn/svm/classes.py:192: DeprecationWarning: loss='l2' has been deprecated in favor of loss='squared_hinge' as of 0.16. Backward compatibility for the loss='l2' will be removed in 1.0
  DeprecationWarning)
accuracy:   0.900
dimensionality: 33810
density: 1.000000
classification report:
                    precision    recall  f1-score   support

       alt.atheism       0.87      0.83      0.85       319
talk.religion.misc       0.91      0.98      0.95       389
     comp.graphics       0.96      0.95      0.95       394
         sci.space       0.83      0.79      0.81       251

       avg / total       0.90      0.90      0.90      1353

confusion matrix:
[[266   7   8  38]
 [  2 381   3   3]
 [  1  20 373   0]
 [ 38   9   6 198]]

________________________________________________________________________________
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', max_iter=1000, multi_class='ovr',
     penalty='l1', random_state=None, tol=0.001, verbose=0)
train time: 0.741s
test time:  0.004s
accuracy:   0.873
dimensionality: 33810
density: 0.005553
classification report:
                    precision    recall  f1-score   support

       alt.atheism       0.85      0.75      0.80       319
talk.religion.misc       0.89      0.97      0.93       389
     comp.graphics       0.94      0.94      0.94       394
         sci.space       0.76      0.78      0.77       251

       avg / total       0.87      0.87      0.87      1353

confusion matrix:
[[238  14  11  56]
 [  0 378   7   4]
 [  2  22 369   1]
 [ 39  12   4 196]]


In [158]:
indices = numpy.arange(len(results))

In [159]:
results = [[x[i] for x in results] for i in range(4)]

In [161]:
clf_names, score, training_time, test_time = results
training_time = numpy.array(training_time) / numpy.max(training_time)
test_time = numpy.array(test_time) / numpy.max(test_time)

In [163]:
matplotlib.pyplot.figure(figsize=(12, 8))
matplotlib.pyplot.title("Score")
matplotlib.pyplot.barh(indices, score, .2, label="score", color='r')
matplotlib.pyplot.barh(indices + .3, training_time, .2, label="training time", color='g')
matplotlib.pyplot.barh(indices + .6, test_time, .2, label="test time", color='b')
matplotlib.pyplot.yticks(())
matplotlib.pyplot.legend(loc='best')
matplotlib.pyplot.subplots_adjust(left=.25)
matplotlib.pyplot.subplots_adjust(top=.95)
matplotlib.pyplot.subplots_adjust(bottom=.05)

In [166]:
for i, c in zip(indices, clf_names):
    matplotlib.pyplot.text(-.3, i, c)

In [168]:
matplotlib.pyplot.show()

In [83]:
vectorizer.vocabulary_.get('ollanta')


Out[83]:
143

In [84]:
vectorizer.vocabulary_.get('tiene')


Out[84]:
205

In [85]:
vectorizer.vocabulary_.get('la')


Out[85]:
111

In [86]:
vectorizer.vocabulary_.get('culpa')


Out[86]:
39

In [ ]: