1. Preprocesar el texto


In [1]:
import pandas as pd

In [2]:
# Path to the judge-1 annotated tweet dataset (first CSV column is the index).
DATASET_PATH = "../data/maybe_troll_judge1.csv"
# pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with index_col=0 reproduces the same behaviour.
pd_dataset = pd.read_csv(DATASET_PATH, index_col=0)
pd_dataset.head()


Out[2]:
tweet_text judge_1
0 @1800flowers Ads pay 4 Limbaugh to call us str... FALSE
1 Stop the spread of cultural marxism. #Feminazi... TRUE
2 #VivasNosQueremos\n#NantzinVive en mi corazón\... FALSE
3 RT @jalgete: Todo es "violencia de género" y "... FALSE
4 RT @jalgete: Todo es "violencia de género" y "... FALSE

In [3]:
# Raw dataset dimensions: (rows, columns) before any filtering.
pd_dataset.shape


Out[3]:
(100, 2)

In [4]:
# Drop rows with missing values. Rebinding instead of inplace=True keeps the
# cell idempotent on re-run and avoids the pandas inplace anti-pattern.
pd_dataset = pd_dataset.dropna()
pd_dataset.shape


Out[4]:
(99, 2)

In [5]:
# Keep only tweets where judge 1 gave a definite TRUE/FALSE verdict.
pd_dataset = pd_dataset.loc[pd_dataset['judge_1'] != 'NONE']

In [6]:
pd_dataset.shape


Out[6]:
(90, 2)

In [7]:
# .ix was deprecated in pandas 0.20 and removed in 1.0 — use label-based .loc:
# df.loc[selection criteria, columns I want] = value
# Encode the judge's verdict as numeric classes: 'TRUE' -> 1, 'FALSE' -> 2.
pd_dataset.loc[pd_dataset['judge_1'] == 'TRUE', 'judge_1'] = 1
pd_dataset.loc[pd_dataset['judge_1'] == 'FALSE', 'judge_1'] = 2
pd_dataset.head()


Out[7]:
tweet_text judge_1
0 @1800flowers Ads pay 4 Limbaugh to call us str... 2
1 Stop the spread of cultural marxism. #Feminazi... 1
2 #VivasNosQueremos\n#NantzinVive en mi corazón\... 2
3 RT @jalgete: Todo es "violencia de género" y "... 2
4 RT @jalgete: Todo es "violencia de género" y "... 2

In [8]:
# Pull the texts and the encoded labels out as plain Python lists.
X = list(pd_dataset['tweet_text'])
y_true = list(pd_dataset['judge_1'])

In [9]:
import re

# Normalisation patterns, compiled once at module level so clean_data can be
# applied cheaply to every tweet in the corpus.
_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    re.MULTILINE,
)
_USER_RE = re.compile("@([A-Za-z0-9_]+)")
_MULTISPACE_RE = re.compile(r"\s{2,}")
_REPEAT_RE = re.compile(r'(.)\1{4,}')


def clean_data(tweet):
    """Normalise a raw tweet string for vectorisation.

    Replaces URLs with the token '<url>' and @usernames with '<user>',
    collapses runs of whitespace into a single space, and shortens any
    character repeated 5 or more times in a row to exactly 3 repetitions.
    """
    tweet = _URL_RE.sub('<url>', tweet)
    tweet = _USER_RE.sub('<user>', tweet)
    tweet = _MULTISPACE_RE.sub(' ', tweet)
    return _REPEAT_RE.sub(r'\1\1\1', tweet)

In [10]:
# Run the normalisation over every raw tweet.
X_cleaned = list(map(clean_data, X))

In [11]:
# Hold out everything after the first 80 tweets as the test split.
split_at = 80
X_train, X_test = X_cleaned[:split_at], X_cleaned[split_at:]
y_train, y_test = y_true[:split_at], y_true[split_at:]

2. Representación del texto


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer implements both tokenization and occurrence counting in a single class:
# default settings — lowercasing, word-level token pattern, no stop-word removal.
count_vect = CountVectorizer()

In [13]:
# Learn the vocabulary from the training tweets and build the train
# document-term matrix; the test set is only transformed (no refit),
# so words unseen during training are dropped from the test matrix.
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)
                                          
# shape[0] = number of tweets, shape[1] = vocabulary size.
print("En los {} tweets de entrenamiento habían {} palabras distintas ".format(X_train_counts.shape[0],
                                                                               X_train_counts.shape[1]))


En los 80 tweets de entrenamiento habían 493 palabras distintas 

3. Entrenar un clasificador


In [14]:
import numpy as np
from sklearn.naive_bayes import GaussianNB

# GaussianNB implements the Gaussian Naive Bayes algorithm for classification.
# The likelihood of the features is assumed to be Gaussian.
classifier = GaussianNB()
# GaussianNB requires a dense input. Use .toarray() (plain ndarray) instead of
# .todense(), which returns the deprecated np.matrix type that newer numpy /
# sklearn versions reject.
classifier.fit(X_train_counts.toarray(), np.array(y_train).astype(int))
prediction = classifier.predict(X_test_counts.toarray())
prediction


Out[14]:
array([2, 2, 2, 1, 2, 1, 2, 2, 2, 2])

4. Verificar un clasificador


In [15]:
from sklearn.metrics import classification_report
# classification_report sorts the labels ascending (1, 2), so target_names maps
# 1 -> 'troll' and 2 -> 'no troll' — presumably judge_1 == TRUE marks a troll;
# verify against the dataset's annotation guide.
print(classification_report(y_test, prediction, target_names=['troll','no troll']))


             precision    recall  f1-score   support

      troll       1.00      0.33      0.50         6
   no troll       0.50      1.00      0.67         4

avg / total       0.80      0.60      0.57        10


In [ ]: