In [1]:
import pandas as pd
In [2]:
DATASET_PATH = "../data/maybe_troll_judge1.csv"
# Load the labelled tweets, keeping the first column as the index
pd_dataset = pd.read_csv(DATASET_PATH, index_col=0)
pd_dataset.head()
Out[2]:
In [3]:
pd_dataset.shape
Out[3]:
In [4]:
pd_dataset.dropna(inplace=True)
pd_dataset.shape
Out[4]:
In [5]:
pd_dataset = pd_dataset[pd_dataset['judge_1'] != 'NONE']
In [6]:
pd_dataset.shape
Out[6]:
In [7]:
# df.loc[selection criteria, columns I want] = value
pd_dataset.loc[pd_dataset['judge_1'] == 'TRUE', 'judge_1'] = 1
pd_dataset.loc[pd_dataset['judge_1'] == 'FALSE', 'judge_1'] = 2
pd_dataset.head()
Out[7]:
In [8]:
X = pd_dataset['tweet_text'].tolist()
y_true = pd_dataset['judge_1'].tolist()
In [9]:
import re
def clean_data(tweet):
# Replace URLs with a token
URL_REGEX = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
tweet = re.sub(URL_REGEX, '<url>', tweet, flags=re.MULTILINE)
# Replace @usernames with a token
tweet = re.sub("@([A-Za-z0-9_]+)", "<user>", tweet)
# Remove repeated spaces
tweet = re.sub(r"\s{2,}", " ", tweet)
# If a character is repeated more than 4 times, keep only 3 repetitions.
tweet = re.sub(r'(.)\1{4,}', r'\1\1\1', tweet)
return tweet
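A quick check of the cleaning rules on a made-up tweet (the string below is hypothetical, not taken from the dataset):
In [ ]:
# Hypothetical example: the URL and the @mention become the <url> and <user>
# tokens, extra spaces collapse, and runs of 5+ identical characters shrink to 3.
print(clean_data("@someone look at this http://example.com/page   yesssssss!"))
# -> <user> look at this <url> yesss!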
In [10]:
X_cleaned = [clean_data(tweet) for tweet in X]
In [11]:
X_train, X_test = X_cleaned[:80], X_cleaned[80:]
y_train, y_test = y_true[:80], y_true[80:]
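The split above is positional (first 80 tweets for training). A shuffled alternative with scikit-learn's train_test_split would look like the sketch below; the variable names are hypothetical and the rest of the notebook keeps using the positional split.
In [ ]:
# Optional variant: shuffled split with a fixed seed, keeping 80 training tweets.
# Not used by the cells below, which rely on the slice-based split.
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(X_cleaned, y_true,
                                           train_size=80, random_state=0)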
In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer implements both tokenization and occurrence counting in a single class:
count_vect = CountVectorizer()
In [13]:
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)
print("En los {} tweets de entrenamiento habían {} palabras distintas ".format(X_train_counts.shape[0],
X_train_counts.shape[1]))
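To see which tokens ended up in the bag of words, the fitted vectorizer exposes its vocabulary as a token-to-column-index mapping (the actual entries depend on the tweets):
In [ ]:
# Peek at a few alphabetically-first tokens of the learned vocabulary.
sorted(count_vect.vocabulary_.items())[:10]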
In [14]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
# GaussianNB implements the Gaussian Naive Bayes algorithm for classification.
# The likelihood of the features is assumed to be Gaussian.
classifier = GaussianNB()
# GaussianNB expects dense input, so the sparse count matrices are densified.
classifier.fit(X_train_counts.todense(), np.array(y_train).astype(int))
prediction = classifier.predict(X_test_counts.todense())
prediction
Out[14]:
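GaussianNB estimates one mean and one variance per feature and per class; the fitted model exposes these parameters, which can be inspected directly (attribute names follow scikit-learn's GaussianNB):
In [ ]:
# Class labels, their priors, and the shape of the per-class feature means.
print(classifier.classes_)
print(classifier.class_prior_)
print(classifier.theta_.shape)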
In [15]:
from sklearn.metrics import classification_report
print(classification_report(y_test, prediction, target_names=['troll','no troll']))
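A confusion matrix and the overall accuracy complement the per-class report; a small sketch using scikit-learn's metrics module:
In [ ]:
# Rows correspond to the true labels and columns to the predicted ones,
# in sorted label order (matching the report above).
from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(y_test, prediction))
print("accuracy: {:.3f}".format(accuracy_score(y_test, prediction)))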
In [ ]: