Opinion Mining on Twitter

(developed by Keyvhinng Espinoza)


In [2]:
import chardet
import csv
import matplotlib.pyplot as plt
import numpy as np
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.svm import LinearSVC
from sklearn.utils.extmath import density
from sklearn import metrics
from time import time

Data


In [54]:
label = []
corpus = []
with open('/Users/keyvhinng/final-year-project/data/training.csv') as f:
    for row in csv.reader(f):
        corpus.append(row[0])  # tweet text
        label.append(row[1])   # sentiment label

In [57]:
len(corpus)


Out[57]:
41
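
Since the classifier later expects exactly one label per tweet, a defensive check right after loading (not in the original notebook) catches misalignment early:

In [ ]:
assert len(corpus) == len(label), (len(corpus), len(label))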

Vectorization


In [4]:
vectorizer = CountVectorizer(min_df=1)

In [5]:
vectorizer


Out[5]:
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [6]:
X = vectorizer.fit_transform(corpus)

In [7]:
X


Out[7]:
<50x227 sparse matrix of type '<class 'numpy.int64'>'
	with 477 stored elements in Compressed Sparse Row format>

In [8]:
print('number of features: %d ' % len(vectorizer.get_feature_names()))


number of features: 227 

In [9]:
analyze = vectorizer.build_analyzer()

In [10]:
analyze('#OllantaHumala es traidor')


Out[10]:
['ollantahumala', 'es', 'traidor']
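
Note that the default token_pattern '(?u)\b\w\w+\b' discards punctuation, so the hashtag's '#' disappears and '#OllantaHumala' is tokenized as a plain word. A hypothetical variant (not used elsewhere in this notebook) that keeps hashtags distinct:

In [ ]:
hashtag_vectorizer = CountVectorizer(token_pattern=r'(?u)#?\b\w\w+\b')
hashtag_vectorizer.build_analyzer()('#OllantaHumala es traidor')
# expected: ['#ollantahumala', 'es', 'traidor']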

In [11]:
type(X)


Out[11]:
scipy.sparse.csr.csr_matrix

In [12]:
X.toarray()


Out[12]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [13]:
vectorizer.transform(['Ollanta tiene la culpa']).toarray()


Out[13]:
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0]])
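
The four 1s above line up with the vocabulary entries for 'culpa', 'la', 'ollanta' and 'tiene'. A small sketch (not in the original run) that maps the nonzero columns back to terms:

In [ ]:
vec = vectorizer.transform(['Ollanta tiene la culpa'])
print([vectorizer.get_feature_names()[i] for i in vec.nonzero()[1]])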

Bigram vectorizer


In [50]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)

In [51]:
analyze = bigram_vectorizer.build_analyzer()

In [52]:
analyze('Bi-grams are cool!!')


Out[52]:
['bi', 'grams', 'are', 'cool', 'bi grams', 'grams are', 'are cool']

In [53]:
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()

In [54]:
X_2


Out[54]:
array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [56]:
feature_index = bigram_vectorizer.vocabulary_.get('is this')

In [57]:
X_2[:, feature_index]


Out[57]:
array([0, 0, 0, 1], dtype=int64)
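
Word order survives in the bigram columns: 'is this' and 'this is' occupy separate features, and the column above is nonzero only for the document phrased as a question. A quick check (assuming scikit-learn's four-sentence demo corpus was fit here):

In [ ]:
X_2[:, bigram_vectorizer.vocabulary_.get('this is')]
# expected: array([1, 1, 0, 0]), only the two statements beginning "This is ..."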

Tf-idf term weighting


In [14]:
transformer = TfidfTransformer()

In [15]:
transformer


Out[15]:
TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [16]:
counts = X.toarray()

In [17]:
type(counts)


Out[17]:
numpy.ndarray

In [18]:
tfidf = transformer.fit_transform(counts)

In [19]:
tfidf


Out[19]:
<50x227 sparse matrix of type '<class 'numpy.float64'>'
	with 477 stored elements in Compressed Sparse Row format>

In [20]:
tfidf.toarray()


Out[20]:
array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [114]:
transformer.idf_


Out[114]:
array([ 4.23867845,  4.23867845,  4.23867845,  3.54553127,  3.83321334,
        4.23867845,  4.23867845,  4.23867845,  4.23867845,  4.23867845,
        4.23867845,  4.23867845,  4.23867845,  4.23867845,  3.83321334,
        4.23867845,  4.23867845,  4.23867845,  4.23867845,  3.83321334,
        4.23867845,  4.23867845,  4.23867845,  3.54553127,  4.23867845,
        4.23867845,  4.23867845,  4.23867845,  4.23867845,  4.23867845,
        3.14006616,  4.23867845,  4.23867845,  4.23867845,  4.23867845,
        4.23867845,  4.23867845,  3.83321334,  4.23867845,  4.23867845,
        4.23867845,  1.7537718 ,  4.23867845,  4.23867845,  4.23867845,
        4.23867845,  3.54553127,  4.23867845,  3.83321334,  4.23867845,
        4.23867845,  4.23867845,  4.23867845,  4.23867845,  4.23867845,
        4.23867845,  2.62924054,  2.53393036,  4.23867845,  2.73460106,
        4.23867845,  3.83321334,  4.23867845,  3.83321334,  4.23867845,
        4.23867845,  4.23867845,  4.23867845,  4.23867845,  4.23867845,
        4.23867845,  4.23867845,  4.23867845,  4.23867845,  4.23867845,
        4.23867845,  4.23867845,  4.23867845,  3.83321334,  4.23867845,
        3.83321334,  4.23867845,  2.98591548,  3.83321334,  4.23867845,
        4.23867845,  3.54553127,  4.23867845,  4.23867845,  4.23867845,
        4.23867845,  4.23867845,  4.23867845,  4.23867845,  1.19415601,
        4.23867845,  4.23867845,  4.23867845,  4.23867845,  4.23867845,
        4.23867845,  4.23867845,  4.23867845,  4.23867845,  4.23867845,
        4.23867845,  4.23867845,  3.83321334,  4.23867845,  4.23867845,
        4.23867845,  2.36687628,  3.83321334,  3.83321334,  4.23867845,
        4.23867845,  3.54553127,  4.23867845,  3.83321334,  4.23867845,
        4.23867845,  4.23867845,  4.23867845,  4.23867845,  4.23867845,
        3.14006616,  4.23867845,  4.23867845,  4.23867845,  4.23867845,
        4.23867845,  4.23867845,  4.23867845,  3.54553127,  4.23867845,
        4.23867845,  4.23867845,  3.83321334,  4.23867845,  2.2927683 ,
        4.23867845,  4.23867845,  3.83321334,  1.19415601,  2.98591548,
        4.23867845,  4.23867845,  4.23867845,  4.23867845,  3.54553127,
        4.23867845,  4.23867845,  4.23867845,  3.54553127,  4.23867845,
        4.23867845,  4.23867845,  3.83321334,  3.83321334,  4.23867845,
        4.23867845,  4.23867845,  4.23867845,  3.54553127,  2.73460106,
        4.23867845,  4.23867845,  4.23867845,  4.23867845,  3.83321334,
        3.54553127,  2.85238409,  4.23867845,  4.23867845,  4.23867845,
        4.23867845,  4.23867845,  2.44691898,  3.83321334,  4.23867845,
        4.23867845,  3.54553127,  4.23867845,  3.83321334,  4.23867845,
        4.23867845,  4.23867845,  4.23867845,  4.23867845,  3.32238772,
        4.23867845,  3.83321334,  4.23867845,  4.23867845,  4.23867845,
        3.83321334,  4.23867845,  2.73460106,  4.23867845,  4.23867845,
        4.23867845,  4.23867845,  4.23867845,  4.23867845,  4.23867845,
        4.23867845,  4.23867845,  4.23867845,  4.23867845,  4.23867845,
        4.23867845,  4.23867845,  4.23867845,  4.23867845,  4.23867845,
        3.32238772,  4.23867845,  4.23867845,  4.23867845,  4.23867845,
        4.23867845,  4.23867845,  4.23867845,  4.23867845,  4.23867845,
        3.83321334,  3.83321334])
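
With smooth_idf=True the transformer computes idf(t) = ln((1 + n) / (1 + df(t))) + 1. For this 50-document corpus, a term that appears in a single tweet gets ln(51/2) + 1 ≈ 4.2387, exactly the value repeated above. A quick check (not in the original run):

In [ ]:
import math
# idf for document frequencies 1, 2 and 3 out of n = 50 tweets
[math.log((1 + 50) / (1 + df)) + 1 for df in (1, 2, 3)]
# -> [4.23867845, 3.83321334, 3.54553127], matching the array above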

In [21]:
vectorizer = TfidfVectorizer(min_df=1)

In [25]:
tfidf = vectorizer.fit_transform(corpus)

In [29]:
X = tfidf.toarray()

In [30]:
type(X)


Out[30]:
numpy.ndarray

In [23]:
vectorizer.transform(['Ollanta tiene la culpa']).toarray()


Out[23]:
array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.64668647,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.36110946,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.18218993,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.64668647,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ]])

Classification


In [48]:
classifier = LinearSVC(dual=False, tol=1e-3)

In [49]:
classifier.fit(X,label)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-49-21acb826360e> in <module>()
----> 1 classifier.fit(X,label)

/usr/local/lib/python3.4/site-packages/sklearn/svm/classes.py in fit(self, X, y)
    198 
    199         X, y = check_X_y(X, y, accept_sparse='csr',
--> 200                          dtype=np.float64, order="C")
    201         self.classes_ = np.unique(y)
    202 

/usr/local/lib/python3.4/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric)
    452         y = y.astype(np.float64)
    453 
--> 454     check_consistent_length(X, y)
    455 
    456     return X, y

/usr/local/lib/python3.4/site-packages/sklearn/utils/validation.py in check_consistent_length(*arrays)
    172     if len(uniques) > 1:
    173         raise ValueError("Found arrays with inconsistent numbers of samples: "
--> 174                          "%s" % str(uniques))
    175 
    176 

ValueError: Found arrays with inconsistent numbers of samples: [25 50]
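
The traceback reports 25 labels against 50 feature rows: the label list went stale while the corpus was re-read and re-vectorized in a later run. Rebuilding both from a single pass over the file keeps them aligned (a sketch, assuming the two-column layout from the Data section):

In [ ]:
with open('/Users/keyvhinng/final-year-project/data/training.csv') as f:
    rows = list(csv.reader(f))
corpus = [r[0] for r in rows]   # tweet text
label = [r[1] for r in rows]    # sentiment label
X = TfidfVectorizer(min_df=1).fit_transform(corpus)
classifier.fit(X, label)        # sample counts now agree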

Text encoding


In [80]:
text1 = b"Sei mir gegr\xc3\xbc\xc3\x9ft mein Sauerkraut"

In [83]:
text2 = b"holdselig sind deine Ger\xfcche"

In [84]:
text3 = b"\xff\xfeA\x00u\x00f\x00 \x00F\x00l\x00\xfc\x00g\x00e\x00l\x00n\x00 \x00d\x00e\x00s\x00 \x00G\x00e\x00s\x00a\x00n\x00g\x00e\x00s\x00,\x00 \x00H\x00e\x00r\x00z\x00l\x00i\x00e\x00b\x00c\x00h\x00e\x00n\x00,\x00 \x00t\x00r\x00a\x00g\x00 \x00i\x00c\x00h\x00 \x00d\x00i\x00c\x00h\x00 \x00f\x00o\x00r\x00t\x00"

In [85]:
decoded = [x.decode(chardet.detect(x)['encoding'])
           for x in (text1, text2, text3)]
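
chardet returns its guess together with a confidence score; printing it shows which encoding each byte string was detected as (a quick check, not in the original run):

In [ ]:
for x in (text1, text2, text3):
    print(chardet.detect(x))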

In [86]:
v = CountVectorizer().fit(decoded).vocabulary_

In [97]:
hv = HashingVectorizer(n_features=20)

In [98]:
hv.transform(corpus)


Out[98]:
<4x20 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>
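
HashingVectorizer needs no fit and keeps no vocabulary_, so memory stays constant however many tweets stream through; the trade-off is that hashed columns cannot be mapped back to words. A minimal sketch:

In [ ]:
print(hv.transform(['Ollanta tiene la culpa']).shape)  # (1, 20), no fit required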

In [101]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self,doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [102]:
vect = CountVectorizer(tokenizer=LemmaTokenizer())
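
A quick look at what the lemmatizing analyzer produces (WordNet covers English only, so this mainly benefits the newsgroup experiment below rather than the Spanish tweets; assumes the NLTK punkt and wordnet data are installed):

In [ ]:
vect.build_analyzer()('the cats are running')
# expected: ['the', 'cat', 'are', 'running'], the plural noun folded to its lemma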

Classification of text documents using sparse features


In [110]:
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space'
]

In [113]:
from sklearn.datasets import fetch_20newsgroups

In [117]:
remove = ()  # set to ('headers', 'footers', 'quotes') to strip newsgroup metadata
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)


/usr/local/lib/python3.4/site-packages/sklearn/datasets/twenty_newsgroups.py:89: DeprecationWarning: The 'warn' method is deprecated, use 'warning' instead
  logger.warn("Downloading dataset from %s (14 MB)", URL)
WARNING:sklearn.datasets.twenty_newsgroups:Downloading dataset from http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz (14 MB)

In [118]:
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)

In [123]:
# split a training set and a test set
y_train, y_test = data_train.target, data_test.target

In [125]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)

In [126]:
print("n_samples: %d, n_features: %d" % X_train.shape)


n_samples: 2034, n_features: 33810

In [130]:
print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()


Extracting features from the test data using the same vectorizer
done in 0.679856s at 4.218MB/s
n_samples: 1353, n_features: 33810


In [131]:
feature_names = vectorizer.get_feature_names()

In [133]:
def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."

In [156]:
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("classification report:")
    # the label order is alphabetical, so use the fetched target_names
    # rather than the hand-written categories list
    print(metrics.classification_report(y_test, pred,
                                        target_names=data_train.target_names))

    print("confusion matrix:")
    print(metrics.confusion_matrix(y_test, pred))

    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score, train_time, test_time

In [157]:
results = []
for penalty in ["l2", "l1"]:
    # loss='l2' is the pre-0.16 spelling of loss='squared_hinge';
    # it triggers the deprecation warnings shown in the output below
    results.append(benchmark(LinearSVC(loss='l2', penalty=penalty, dual=False, tol=1e-3)))


________________________________________________________________________________
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.001, verbose=0)
train time: 0.369s
test time:  0.059s
/usr/local/lib/python3.4/site-packages/sklearn/svm/classes.py:192: DeprecationWarning: loss='l2' has been deprecated in favor of loss='squared_hinge' as of 0.16. Backward compatibility for the loss='l2' will be removed in 1.0
  DeprecationWarning)
/usr/local/lib/python3.4/site-packages/sklearn/svm/classes.py:192: DeprecationWarning: loss='l2' has been deprecated in favor of loss='squared_hinge' as of 0.16. Backward compatibility for the loss='l2' will be removed in 1.0
  DeprecationWarning)
accuracy:   0.900
dimensionality: 33810
density: 1.000000
classification report:
                    precision    recall  f1-score   support

       alt.atheism       0.87      0.83      0.85       319
     comp.graphics       0.91      0.98      0.95       389
         sci.space       0.96      0.95      0.95       394
talk.religion.misc       0.83      0.79      0.81       251

       avg / total       0.90      0.90      0.90      1353

confusion matrix:
[[266   7   8  38]
 [  2 381   3   3]
 [  1  20 373   0]
 [ 38   9   6 198]]

________________________________________________________________________________
Training: 
LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='l2', max_iter=1000, multi_class='ovr',
     penalty='l1', random_state=None, tol=0.001, verbose=0)
train time: 0.741s
test time:  0.004s
accuracy:   0.873
dimensionality: 33810
density: 0.005553
classification report:
                    precision    recall  f1-score   support

       alt.atheism       0.85      0.75      0.80       319
     comp.graphics       0.89      0.97      0.93       389
         sci.space       0.94      0.94      0.94       394
talk.religion.misc       0.76      0.78      0.77       251

       avg / total       0.87      0.87      0.87      1353

confusion matrix:
[[238  14  11  56]
 [  0 378   7   4]
 [  2  22 369   1]
 [ 39  12   4 196]]
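
The penalty choice shows up directly in the coefficients: L2 keeps all 33810 weights (density 1.0), while L1 zeroes out roughly 99.4% of them (density 0.005553) at a cost of about three points of accuracy (0.900 vs 0.873).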


In [158]:
indices = np.arange(len(results))

In [159]:
results = [[x[i] for x in results] for i in range(4)]

In [161]:
clf_names, score, training_time, test_time = results
training_time = np.array(training_time) / np.max(training_time)
test_time = np.array(test_time) / np.max(test_time)

In [163]:
plt.figure(figsize=(12, 8))
plt.title("Score")
plt.barh(indices, score, .2, label="score", color='r')
plt.barh(indices + .3, training_time, .2, label="training time", color='g')
plt.barh(indices + .6, test_time, .2, label="test time", color='b')
plt.yticks(())
plt.legend(loc='best')
plt.subplots_adjust(left=.25)
plt.subplots_adjust(top=.95)
plt.subplots_adjust(bottom=.05)

In [166]:
for i, c in zip(indices, clf_names):
    plt.text(-.3, i, c)

In [168]:
plt.show()
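
The resulting horizontal bar chart compares accuracy against the max-normalized training and test times for the two LinearSVC runs.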

In [83]:
vectorizer.vocabulary_.get('ollanta')


Out[83]:
143

In [84]:
vectorizer.vocabulary_.get('tiene')


Out[84]:
205

In [85]:
vectorizer.vocabulary_.get('la')


Out[85]:
111

In [86]:
vectorizer.vocabulary_.get('culpa')


Out[86]:
39
