In [57]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import TweetTokenizer
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.cluster.vq import whiten
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from collections import defaultdict
from collections import Counter
In [34]:
# INPUT
f_authorship = '2k_users/authorship.csv'
### PREPROCESSING
df = pd.read_csv(f_authorship)
print(df.shape)
df = df.drop_duplicates()  # drop_duplicates returns a new frame; assign it back
# remove tweets with no words (just emojis)
df = df[ [(len(word_tokenizer.tokenize(x.lower())) > 0) for x in df['text']] ]
print(df.shape)
np.random.seed(1)
df = df.reindex(np.random.permutation(df.index))
X = df.loc[:, 'text'].values
y = df.loc[:, 'user_id'].values
le = LabelEncoder()
y = le.fit_transform(y)
# No held-out test set: the clustering below is unsupervised, so keep every tweet
# (train_test_split rejects test_size=0.0 in recent scikit-learn releases)
X_train, y_train = X, y
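In [ ]:
# Sketch (not in the original notebook): a quick look at the class balance after
# label encoding -- how many tweets each encoded user contributes before clustering.
print(len(le.classes_), 'users')
print(Counter(y_train).most_common())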
In [35]:
df.head(3)
Out[35]:
[output table omitted: first three rows of df, showing the 'user_id' and 'text' columns]
In [61]:
# create feature vectors
num_docs = X_train.shape[0]
# commas/semicolons/colons per sentence,
# words/sentence avg., words/sentence std., unique words/total words
fvs_punct_lexical = np.zeros((num_docs, 6), np.float64)
for i, doc in enumerate(X_train):
    # PUNCTUATION FEATURES
    # note: nltk.word_tokenize keeps punctuation as separate tokens
    tokens = nltk.word_tokenize(doc)
    # LEXICAL FEATURES
    words = word_tokenizer.tokenize(doc)
    sentences = sentence_tokenizer.tokenize(doc)
    vocab = set(words)
    words_per_sentence = np.array([len(word_tokenizer.tokenize(s)) for s in sentences])
    # commas per sentence
    fvs_punct_lexical[i, 0] = tokens.count(',') / float(len(sentences))
    # semicolons per sentence
    fvs_punct_lexical[i, 1] = tokens.count(';') / float(len(sentences))
    # colons per sentence
    fvs_punct_lexical[i, 2] = tokens.count(':') / float(len(sentences))
    # average number of words per sentence
    fvs_punct_lexical[i, 3] = words_per_sentence.mean()
    # sentence length variation
    fvs_punct_lexical[i, 4] = words_per_sentence.std()
    # lexical diversity - proportion of unique words
    fvs_punct_lexical[i, 5] = len(vocab) / float(len(words))
# optionally rescale each feature to unit variance with scipy's whiten
# fvs_punct_lexical = whiten(fvs_punct_lexical)
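In [ ]:
# Sketch (not in the original notebook): if the whitening step above is enabled,
# every column with non-zero spread is rescaled to unit variance; this checks that.
fvs_white = whiten(fvs_punct_lexical)
print(fvs_white.std(axis=0))  # expect values close to 1 for non-constant columns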
In [62]:
# get the most common words across the whole training set of tweets
NUM_TOP_WORDS = 1000
all_text = ' '.join(X_train)
# lowercase before counting so the vocabulary matches CountVectorizer's default lowercasing
all_tokens = nltk.word_tokenize(all_text.lower())
fdist = nltk.FreqDist(all_tokens)
# keep only the terms: CountVectorizer's vocabulary expects a list of words, not (word, count) pairs
vocab = [word for word, count in fdist.most_common(NUM_TOP_WORDS)]
# use sklearn to create the bag-of-words feature vector for each tweet
tweet_token = TweetTokenizer()
vectorizer = CountVectorizer(vocabulary=vocab,
                             tokenizer=tweet_token.tokenize)
fvs_bow = vectorizer.fit_transform(X_train).toarray().astype(np.float64)
# normalise by rows
#from sklearn.preprocessing import normalize
#fvs_bow = normalize(fvs_bow, norm='l2', axis=1)
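In [ ]:
# Sketch (not in the original notebook): a spot-check of the bag-of-words matrix --
# one column per entry of the fixed vocabulary built above.
print(fvs_bow.shape)  # expected: (num_docs, NUM_TOP_WORDS)
print(vocab[:10])     # the ten most frequent tokens driving the first columns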
In [63]:
# get the part-of-speech tag for each token in each tweet
def token_to_pos(doc):
    tweet_token = TweetTokenizer()
    tokens = tweet_token.tokenize(doc)
    return [tag for word, tag in nltk.pos_tag(tokens)]
docs_pos = [token_to_pos(doc) for doc in X_train]
# count frequencies for common POS types
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
poss = [ item for sublist in docs_pos for item in sublist ]
pos_dist = Counter(poss)
pos_list = [ pos for pos, count in pos_dist.most_common() ]
fvs_syntax = np.array([ [doc.count(pos) for pos in pos_list] for doc in docs_pos]).astype(np.float64)
# normalise by dividing each row by the number of tokens in the tweet
# fvs_syntax /= np.c_[np.array([len(doc) for doc in docs_pos])]
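In [ ]:
# Sketch (not in the original notebook): if the row-wise normalisation above is
# enabled, each row becomes a POS-tag distribution; its entries should sum to 1.
fvs_syntax_norm = fvs_syntax / np.c_[np.array([len(doc) for doc in docs_pos])]
print(fvs_syntax_norm.sum(axis=1)[:5])  # expect 1.0 for every document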
In [68]:
# Cluster the documents with KMeans on the given feature vectors and report
# which authors end up in each cluster.
def PredictAuthors(fvs, X_train, y_train):
    sc = StandardScaler()
    fvs_std = sc.fit_transform(fvs)
    km = KMeans(n_clusters=2,
                init='k-means++',
                n_init=300,
                max_iter=1000,
                tol=1e-6,
                verbose=0)
    km.fit_predict(fvs_std)
    c_docs = defaultdict(list)
    c_ids = defaultdict(list)
    # organize documents into their clusters
    for i, label in enumerate(km.labels_):
        c_docs[label].append(X_train[i])
        c_ids[label].append(y_train[i])
    # get the distribution of author ids within each cluster
    for label in c_ids.keys():
        c_ids[label] = Counter(c_ids[label])
    return km, c_docs, c_ids
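In [ ]:
# Sketch (not in the original notebook): the returned KMeans object also exposes
# its centroids and inertia, which can help compare the three feature representations.
km_demo, _, _ = PredictAuthors(fvs_punct_lexical, X_train, y_train)
print(km_demo.cluster_centers_.shape)  # (2, n_features) in standardised space
print(km_demo.inertia_)                # within-cluster sum of squared distances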
In [69]:
km, c_docs, c_ids = PredictAuthors(fvs_punct_lexical, X_train, y_train)
for label in c_ids.keys():
    print(c_ids[label].most_common())
for label, tweet in c_docs.items():
    print('---------------------------------------------------- Cluster {}'.format(label))
    for text in tweet[:3]:
        print(text)
    print('\n\n')
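In [ ]:
# Sketch (not in the original notebook): a crude purity score for the clustering
# above -- the share of tweets that land in their cluster's majority author.
majority = sum(counter.most_common(1)[0][1] for counter in c_ids.values())
print('purity: {:.3f}'.format(majority / float(len(X_train))))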
In [70]:
km, c_docs, c_ids = PredictAuthors(fvs_bow, X_train, y_train)
for label in c_ids.keys():
    print(c_ids[label].most_common())
for label, tweet in c_docs.items():
    print('---------------------------------------------------- Cluster {}'.format(label))
    for text in tweet[:3]:
        print(text)
    print('\n\n')
In [71]:
km, c_docs, c_ids = PredictAuthors(fvs_syntax, X_train, y_train)
for label in c_ids.keys():
    print(c_ids[label].most_common())
for label, tweet in c_docs.items():
    print('---------------------------------------------------- Cluster {}'.format(label))
    for text in tweet[:3]:
        print(text)
    print('\n\n')
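In [ ]:
# Sketch (not in the original notebook): the three feature matrices can also be
# clustered jointly; with precomputed arrays, np.hstack plays the role that the
# (unused) FeatureUnion import would play inside a pipeline.
fvs_all = np.hstack([fvs_punct_lexical, fvs_bow, fvs_syntax])
km, c_docs, c_ids = PredictAuthors(fvs_all, X_train, y_train)
for label in c_ids.keys():
    print(c_ids[label].most_common())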
In [ ]: