In [91]:
import chardet
import csv
import matplotlib.pyplot as plt
import numpy
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.utils.extmath import density
from sklearn import metrics
import sys
from time import time
In [10]:
fileReader = csv.reader(open('/home/kref/final-project/data/training.csv'))
label = []
corpus = []
for row in fileReader:
    corpus.append(row[0])
    label.append(row[1])
In [11]:
len(corpus)
Out[11]:
In [165]:
myStopwords = ['Ollanta']
In [166]:
customStopwords = stopwords.words('spanish') + myStopwords
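The NLTK stopword lists are not bundled by default; a quick check, assuming the 'stopwords' corpus may still need downloading:
In [ ]:
# download the stopword corpus if it is not already present, then
# inspect a few of the Spanish entries that get merged with myStopwords
nltk.download('stopwords')
stopwords.words('spanish')[:10]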
In [169]:
countVectorizer = CountVectorizer(min_df=1)
In [170]:
counts = countVectorizer.fit_transform(corpus)
In [171]:
print('number of features: %d' % len(countVectorizer.get_feature_names()))
In [9]:
analyze = countVectorizer.build_analyzer()
In [10]:
analyze('#OllantaHumala es traidor')
Out[10]:
In [173]:
counts.toarray()[0]
Out[173]:
In [174]:
countVectorizer.transform(['Ollanta tiene la culpa']).toarray()
Out[174]:
In [175]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
In [176]:
bigramAnalyzer = bigram_vectorizer.build_analyzer()
In [177]:
bigramAnalyzer('Bi-grams are cool!!')
Out[177]:
In [178]:
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
In [179]:
X_2
Out[179]:
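To see which unigrams and bigrams the (1, 2) n-gram range actually produced, the fitted vocabulary can be listed; a small sketch using the same get_feature_names() accessor as above:
In [ ]:
# each column of X_2 corresponds to one of these unigram/bigram features
bigram_vectorizer.get_feature_names()[:20]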
In [180]:
transformer = TfidfTransformer()
In [15]:
transformer
Out[15]:
In [181]:
tfidf = transformer.fit_transform(counts)
In [182]:
tfidf.toarray()[0]
Out[182]:
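CountVectorizer followed by TfidfTransformer is equivalent to TfidfVectorizer with matching parameters; a quick sanity check of that equivalence on this corpus:
In [ ]:
# the one-step vectorizer should reproduce the two-step counts + tf-idf result
one_step = TfidfVectorizer(min_df=1).fit_transform(corpus)
numpy.allclose(one_step.toarray(), tfidf.toarray())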
In [187]:
tfidVectorizer = TfidfVectorizer(min_df=1,stop_words=customStopwords)
In [188]:
analyzer = tfidVectorizer.build_analyzer()
In [189]:
analyzer('contento en Lima')
Out[189]:
In [190]:
tfid = tfidVectorizer.fit_transform(corpus)
In [191]:
tfid.shape
Out[191]:
In [192]:
tfidVectorizer.vocabulary_.get('culpa')
Out[192]:
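The integer returned by vocabulary_.get() is a column index; the reverse mapping goes through get_feature_names(). A quick round trip, assuming 'culpa' is in the fitted vocabulary:
In [ ]:
# map the feature index back to the term it represents
idx = tfidVectorizer.vocabulary_.get('culpa')
tfidVectorizer.get_feature_names()[idx]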
In [118]:
tfidVectorizer.get_stop_words();
In [119]:
X = tfid.toarray()
In [120]:
tfidVectorizer.transform(['Ollanta tiene la culpa']).toarray()
Out[120]:
In [195]:
classifier = LinearSVC()
In [196]:
classifier.fit(X,label)
Out[196]:
In [197]:
predicted = classifier.predict(tfidVectorizer.transform(corpus).toarray())
In [198]:
numpy.mean(predicted == label)
Out[198]:
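The accuracy above is measured on the same tweets the classifier was trained on, so it is optimistic. A sketch of a fairer estimate on a held-out split, assuming a scikit-learn version that provides sklearn.model_selection:
In [ ]:
from sklearn.model_selection import train_test_split

# hold out 20% of the corpus and evaluate on documents the model never saw
docs_train, docs_test, y_train2, y_test2 = train_test_split(
    corpus, label, test_size=0.2, random_state=42)
heldout_vectorizer = TfidfVectorizer(min_df=1, stop_words=customStopwords)
heldout_clf = LinearSVC().fit(heldout_vectorizer.fit_transform(docs_train), y_train2)
numpy.mean(heldout_clf.predict(heldout_vectorizer.transform(docs_test)) == y_test2)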
In [129]:
tfidVectorizer.transform(['Estoy contento']).toarray()
Out[129]:
In [199]:
classifier.predict(tfidVectorizer.transform(['Ollanta es corrupto','Ollanta genera confianza']))
Out[199]:
In [80]:
text1 = b"Sei mir gegr\xc3\xbc\xc3\x9ft mein Sauerkraut"
In [83]:
text2 = b"holdselig sind deine Ger\xfcche"
In [84]:
text3 = b"\xff\xfeA\x00u\x00f\x00 \x00F\x00l\x00\xfc\x00g\x00e\x00l\x00n\x00 \x00d\x00e\x00s\x00 \x00G\x00e\x00s\x00a\x00n\x00g\x00e\x00s\x00,\x00 \x00H\x00e\x00r\x00z\x00l\x00i\x00e\x00b\x00c\x00h\x00e\x00n\x00,\x00 \x00t\x00r\x00a\x00g\x00 \x00i\x00c\x00h\x00 \x00d\x00i\x00c\x00h\x00 \x00f\x00o\x00r\x00t\x00"
In [85]:
decoded = [x.decode(chardet.detect(x)['encoding'])
           for x in (text1, text2, text3)]
In [86]:
v = CountVectorizer().fit(decoded).vocabulary_
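chardet returns its guess together with a confidence score; printing the full detection result shows how reliable each guess is:
In [ ]:
# chardet.detect() returns a dict with the guessed encoding and a confidence value
for x in (text1, text2, text3):
    print(chardet.detect(x))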
In [97]:
hv = HashingVectorizer(n_features=20)
In [98]:
hv.transform(corpus)
Out[98]:
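Unlike CountVectorizer, HashingVectorizer is stateless: it has no vocabulary_ and needs no fit, though with only 20 features distinct words will collide in the same hash bucket. A quick illustration:
In [ ]:
# no fit() required; unseen documents map straight into the 20 hash buckets
hv.transform(['Ollanta tiene la culpa']).toarray()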
In [101]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
In [102]:
vect = CountVectorizer(tokenizer=LemmaTokenizer())
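Exercising the lemmatizing tokenizer requires the NLTK 'punkt' and 'wordnet' data packages; a small sketch, assuming they can be downloaded on the fly:
In [ ]:
# fetch the tokenizer model and the WordNet data, then run the full analyzer
nltk.download('punkt')
nltk.download('wordnet')
vect.build_analyzer()('the children are running')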
In [110]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
In [118]:
remove = ('headers', 'footers', 'quotes')
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)
In [123]:
# extract the target labels for the training and test sets
y_train, y_test = data_train.target, data_test.target
In [125]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)
In [126]:
print("n_samples: %d, n_features: %d" % X_train.shape)
In [130]:
print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
duration = time() - t0
data_test_size_mb = sum(len(s.encode('utf-8')) for s in data_test.data) / 1e6
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()
In [131]:
feature_names = vectorizer.get_feature_names()
In [133]:
def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."
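feature_names and trim() are defined but never exercised below; in the scikit-learn text classification example this notebook appears to follow, they serve to print the highest-weighted terms per class. A sketch of that usage with a freshly fitted linear model:
In [ ]:
# fit a linear classifier, then show its ten top-weighted terms per category
clf = LinearSVC().fit(X_train, y_train)
feature_names_arr = numpy.asarray(feature_names)
for i, category in enumerate(data_train.target_names):
    top10 = numpy.argsort(clf.coef_[i])[-10:]
    print(trim("%s: %s" % (category, " ".join(feature_names_arr[top10]))))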
In [156]:
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    print(metrics.classification_report(y_test, pred,
                                        target_names=data_train.target_names))

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
In [157]:
results = []
for penalty in ["l2", "l1"]:
    results.append(benchmark(LinearSVC(loss='squared_hinge', penalty=penalty,
                                       dual=False, tol=1e-3)))
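MultinomialNB and SGDClassifier are imported at the top but never exercised; appending them to the same benchmark keeps the comparison plot below interesting. A sketch with commonly used settings:
In [ ]:
# benchmark two more of the imported classifiers on the same tf-idf features
results.append(benchmark(MultinomialNB(alpha=.01)))
results.append(benchmark(SGDClassifier(penalty='l2')))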
In [158]:
indices = numpy.arange(len(results))
In [159]:
# transpose results into parallel lists: names, scores, train times, test times
results = [[x[i] for x in results] for i in range(4)]
In [161]:
clf_names, score, training_time, test_time = results
training_time = numpy.array(training_time) / numpy.max(training_time)
test_time = numpy.array(test_time) / numpy.max(test_time)
In [163]:
plt.figure(figsize=(12, 8))
plt.title("Score")
plt.barh(indices, score, .2, label="score", color='r')
plt.barh(indices + .3, training_time, .2, label="training time", color='g')
plt.barh(indices + .6, test_time, .2, label="test time", color='b')
plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25)
plt.subplots_adjust(top=.95)
plt.subplots_adjust(bottom=.05)
In [166]:
for i, c in zip(indices, clf_names):
    plt.text(-.3, i, c)
In [168]:
plt.show()
In [83]:
tfidVectorizer.vocabulary_.get('ollanta')
Out[83]:
In [84]:
tfidVectorizer.vocabulary_.get('tiene')
Out[84]:
In [85]:
tfidVectorizer.vocabulary_.get('la')
Out[85]:
In [86]:
tfidVectorizer.vocabulary_.get('culpa')
Out[86]:
In [ ]: