1. Preprocesar el texto


In [1]:
import pandas as pd

In [2]:
# Path to the judge-1 annotated tweet dataset (first CSV column is the index).
DATASET_PATH = "../data/maybe_troll_judge1.csv"
# pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with index_col=0 reproduces the same behaviour.
pd_dataset = pd.read_csv(DATASET_PATH, index_col=0)
pd_dataset.head()


Out[2]:
tweet_text judge_1
0 @1800flowers Ads pay 4 Limbaugh to call us str... FALSE
1 Stop the spread of cultural marxism. #Feminazi... TRUE
2 #VivasNosQueremos\n#NantzinVive en mi corazón\... FALSE
3 RT @jalgete: Todo es "violencia de género" y "... FALSE
4 RT @jalgete: Todo es "violencia de género" y "... FALSE

In [3]:
# Raw dataset dimensions: (rows, columns) before any filtering.
pd_dataset.shape


Out[3]:
(100, 2)

In [4]:
# Drop rows with missing values. Rebinding instead of inplace=True keeps the
# cell idempotent on re-run and avoids the pandas inplace anti-pattern.
pd_dataset = pd_dataset.dropna()
pd_dataset.shape


Out[4]:
(99, 2)

In [5]:
# Keep only tweets where judge 1 gave a definite TRUE/FALSE verdict.
pd_dataset = pd_dataset.loc[pd_dataset['judge_1'] != 'NONE']

In [6]:
pd_dataset.shape


Out[6]:
(90, 2)

In [7]:
# .ix was deprecated in pandas 0.20 and removed in 1.0 — use label-based .loc:
# df.loc[selection criteria, columns I want] = value
# Encode the judge's verdict as numeric classes: 'TRUE' -> 1, 'FALSE' -> 2.
pd_dataset.loc[pd_dataset['judge_1'] == 'TRUE', 'judge_1'] = 1
pd_dataset.loc[pd_dataset['judge_1'] == 'FALSE', 'judge_1'] = 2
pd_dataset.head()


Out[7]:
tweet_text judge_1
0 @1800flowers Ads pay 4 Limbaugh to call us str... 2
1 Stop the spread of cultural marxism. #Feminazi... 1
2 #VivasNosQueremos\n#NantzinVive en mi corazón\... 2
3 RT @jalgete: Todo es "violencia de género" y "... 2
4 RT @jalgete: Todo es "violencia de género" y "... 2

In [8]:
# Pull the texts and the encoded labels out as plain Python lists.
X = list(pd_dataset['tweet_text'])
y_true = list(pd_dataset['judge_1'])

In [9]:
import re

# Normalisation patterns, compiled once at module level so clean_data can be
# applied cheaply to every tweet in the corpus.
_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
    re.MULTILINE,
)
_USER_RE = re.compile("@([A-Za-z0-9_]+)")
_MULTISPACE_RE = re.compile(r"\s{2,}")
_REPEAT_RE = re.compile(r'(.)\1{4,}')


def clean_data(tweet):
    """Normalise a raw tweet string for vectorisation.

    Replaces URLs with the token '<url>' and @usernames with '<user>',
    collapses runs of whitespace into a single space, and shortens any
    character repeated 5 or more times in a row to exactly 3 repetitions.
    """
    tweet = _URL_RE.sub('<url>', tweet)
    tweet = _USER_RE.sub('<user>', tweet)
    tweet = _MULTISPACE_RE.sub(' ', tweet)
    return _REPEAT_RE.sub(r'\1\1\1', tweet)

In [10]:
# Run the normalisation over every raw tweet.
X_cleaned = list(map(clean_data, X))

In [11]:
# Hold out everything after the first 80 tweets as the test split.
split_at = 80
X_train, X_test = X_cleaned[:split_at], X_cleaned[split_at:]
y_train, y_test = y_true[:split_at], y_true[split_at:]

2. Representación del texto


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer implements both tokenization and occurrence counting in a single class:
# default settings — lowercasing, word-level token pattern, no stop-word removal.
count_vect = CountVectorizer()

In [13]:
# Learn the vocabulary from the training tweets and build the train
# document-term matrix; the test set is only transformed (no refit),
# so words unseen during training are dropped from the test matrix.
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)
                                          
# shape[0] = number of tweets, shape[1] = vocabulary size.
print("En los {} tweets de entrenamiento habían {} palabras distintas ".format(X_train_counts.shape[0],
                                                                               X_train_counts.shape[1]))


En los 80 tweets de entrenamiento habían 493 palabras distintas 

3. Entrenar un clasificador


In [14]:
import numpy as np
from sklearn.naive_bayes import GaussianNB

# GaussianNB implements the Gaussian Naive Bayes algorithm for classification.
# The likelihood of the features is assumed to be Gaussian.
classifier = GaussianNB()
# GaussianNB requires a dense input. Use .toarray() (plain ndarray) instead of
# .todense(), which returns the deprecated np.matrix type that newer numpy /
# sklearn versions reject.
classifier.fit(X_train_counts.toarray(), np.array(y_train).astype(int))
prediction = classifier.predict(X_test_counts.toarray())
prediction


Out[14]:
array([2, 2, 2, 1, 2, 1, 2, 2, 2, 2])

4. Verificar un clasificador


In [15]:
from sklearn.metrics import classification_report
# classification_report sorts the labels ascending (1, 2), so target_names maps
# 1 -> 'troll' and 2 -> 'no troll' — presumably judge_1 == TRUE marks a troll;
# verify against the dataset's annotation guide.
print(classification_report(y_test, prediction, target_names=['troll','no troll']))


             precision    recall  f1-score   support

      troll       1.00      0.33      0.50         6
   no troll       0.50      1.00      0.67         4

avg / total       0.80      0.60      0.57        10


In [ ]: