In [2]:
import chardet
import csv
import matplotlib.pyplot as plt
import numpy as np
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.svm import LinearSVC
from sklearn.utils.extmath import density
from sklearn import metrics
import sys
from time import time
In [54]:
fileReader = csv.reader(open('/Users/keyvhinng/final-year-project/data/training.csv'))
label = []
corpus = []
for row in fileReader:
    print(row)
    corpus.append(row[0])   # column 0: tweet text
    label.append(row[1])    # column 1: sentiment label
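A quick sanity check on the loaded data (a minimal sketch; `Counter` just tallies the label column read above):
In [ ]:
from collections import Counter
# distribution of sentiment labels in the training file
Counter(label)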
In [57]:
len(corpus)
Out[57]:
In [4]:
vectorizer = CountVectorizer(min_df=1)
In [5]:
vectorizer
Out[5]:
In [6]:
X = vectorizer.fit_transform(corpus)
In [7]:
X
Out[7]:
In [8]:
print('number of features: %d' % len(vectorizer.get_feature_names()))
In [9]:
analyze = vectorizer.build_analyzer()
In [10]:
analyze('#OllantaHumala es traidor')
Out[10]:
In [11]:
type(X)
Out[11]:
In [12]:
X.toarray()
Out[12]:
In [13]:
vectorizer.transform(['Ollanta tiene la culpa']).toarray()
Out[13]:
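To see which vocabulary entries a new document actually hits, the nonzero columns can be mapped back to their terms (a sketch against the vectorizer fit above, using the same older `get_feature_names` API as the rest of the notebook):
In [ ]:
row = vectorizer.transform(['Ollanta tiene la culpa'])
# map nonzero column indices back to the terms they count
[vectorizer.get_feature_names()[i] for i in row.nonzero()[1]]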
In [50]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
In [51]:
analyze = bigram_vectorizer.build_analyzer()
In [52]:
analyze('Bi-grams are cool!!')
Out[52]:
In [53]:
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
In [54]:
X_2
Out[54]:
In [56]:
feature_index = bigram_vectorizer.vocabulary_.get('is this')  # None if the bigram never occurs in the corpus
In [57]:
X_2[:, feature_index]
Out[57]:
In [14]:
transformer = TfidfTransformer()
In [15]:
transformer
Out[15]:
In [16]:
counts = X.toarray()
In [17]:
type(counts)
Out[17]:
In [18]:
tfidf = transformer.fit_transform(counts)
In [19]:
tfidf
Out[19]:
In [20]:
tfidf.toarray()
Out[20]:
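TfidfVectorizer, introduced next, is equivalent to a CountVectorizer followed by a TfidfTransformer; a quick check that the two routes agree on this corpus (sketch):
In [ ]:
# the one-step vectorizer should reproduce the two-step pipeline above
np.allclose(TfidfVectorizer(min_df=1).fit_transform(corpus).toarray(), tfidf.toarray())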
In [114]:
transformer.idf_
Out[114]:
In [21]:
vectorizer = TfidfVectorizer(min_df=1)
In [25]:
tfid = vectorizer.fit_transform(corpus)
In [29]:
X = tfid.toarray()
In [30]:
type(X)
Out[30]:
In [23]:
vectorizer.transform(['Ollanta tiene la culpa']).toarray()
Out[23]:
In [48]:
classifier = LinearSVC(dual=False, tol=1e-3)
In [49]:
classifier.fit(X, label)
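With the classifier fit on the tf-idf matrix, a new tweet can be scored by passing it through the same vectorizer first (a minimal sketch; the predicted label depends on the training data):
In [ ]:
# vectorize with the fitted TfidfVectorizer, then predict a sentiment label
classifier.predict(vectorizer.transform(['Ollanta tiene la culpa']).toarray())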
In [80]:
text1 = b"Sei mir gegr\xc3\xbc\xc3\x9ft mein Sauerkraut"
In [83]:
text2 = b"holdselig sind deine Ger\xfcche"
In [84]:
text3 = b"\xff\xfeA\x00u\x00f\x00 \x00F\x00l\x00\xfc\x00g\x00e\x00l\x00n\x00 \x00d\x00e\x00s\x00 \x00G\x00e\x00s\x00a\x00n\x00g\x00e\x00s\x00,\x00 \x00H\x00e\x00r\x00z\x00l\x00i\x00e\x00b\x00c\x00h\x00e\x00n\x00,\x00 \x00t\x00r\x00a\x00g\x00 \x00i\x00c\x00h\x00 \x00d\x00i\x00c\x00h\x00 \x00f\x00o\x00r\x00t\x00"
In [85]:
decoded = [x.decode(chardet.detect(x)['encoding'])
           for x in (text1, text2, text3)]
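chardet should detect UTF-8, Latin-1, and UTF-16 respectively for the three byte strings; printing the decoded list makes the round trip visible:
In [ ]:
# each byte string decoded with its detected encoding
for text in decoded:
    print(text)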
In [86]:
v = CountVectorizer().fit(decoded).vocabulary_
In [97]:
hv = HashingVectorizer(n_features=20)
In [98]:
hv.transform(corpus)
Out[98]:
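HashingVectorizer is stateless: there is no fit step and no vocabulary_, so a column index cannot be mapped back to a term. A single document hashed into the 20 buckets configured above (sketch):
In [ ]:
# the hash of each token determines its column; collisions are possible with only 20 features
hv.transform(['Ollanta tiene la culpa']).toarray()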
In [101]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]
In [102]:
vect = CountVectorizer(tokenizer=LemmaTokenizer())
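The custom tokenizer needs NLTK's punkt and wordnet data packages; a quick look at what the resulting analyzer does to an illustrative English sentence (not from the corpus):
In [ ]:
# import nltk; nltk.download('punkt'); nltk.download('wordnet')  # one-time setup
vect.build_analyzer()('the children are running')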
In [110]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
In [113]:
from sklearn.datasets import fetch_20newsgroups
In [117]:
remove = ('headers', 'footers', 'quotes')  # strip metadata that makes the task artificially easy
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)
In [118]:
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)
In [123]:
# target labels for the training and test splits
y_train, y_test = data_train.target, data_test.target
In [125]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)
In [126]:
print("n_samples: %d, n_features: %d" % X_train.shape)
In [130]:
print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()
In [131]:
feature_names = vectorizer.get_feature_names()
In [133]:
def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."
In [156]:
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy: %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    # use data_train.target_names so the names line up with the integer targets
    print(metrics.classification_report(y_test, pred,
                                        target_names=data_train.target_names))
    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))
    print()

    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time
In [157]:
results = []
for penalty in ["l2", "l1"]:
    # 'squared_hinge' is the current name for the old loss='l2' option
    results.append(benchmark(LinearSVC(loss='squared_hinge', penalty=penalty,
                                       dual=False, tol=1e-3)))
In [158]:
indices = np.arange(len(results))
In [159]:
results = [[x[i] for x in results] for i in range(4)]  # regroup per field: names, scores, train times, test times
In [161]:
clf_names, score, training_time, test_time = results
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)
In [163]:
plt.figure(figsize=(12, 8))
plt.title("Score")
plt.barh(indices, score, .2, label="score", color='r')
plt.barh(indices + .3, training_time, .2, label="training time", color='g')
plt.barh(indices + .6, test_time, .2, label="test time", color='b')
plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25)
plt.subplots_adjust(top=.95)
plt.subplots_adjust(bottom=.05)
In [166]:
for i, c in zip(indices, clf_names):
    plt.text(-.3, i, c)
In [168]:
plt.show()
In [83]:
vectorizer.vocabulary_.get('ollanta')
Out[83]:
In [84]:
vectorizer.vocabulary_.get('tiene')
Out[84]:
In [85]:
vectorizer.vocabulary_.get('la')
Out[85]:
In [86]:
vectorizer.vocabulary_.get('culpa')
Out[86]:
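Since vocabulary_ is a plain dict, .get returns None rather than raising for unseen terms (one-line check with a made-up token):
In [ ]:
# 'xyzzy' is a hypothetical out-of-vocabulary token
vectorizer.vocabulary_.get('xyzzy') is None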
In [ ]: