In [57]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import TweetTokenizer
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

from sklearn.preprocessing import LabelEncoder                                  
from sklearn.model_selection import train_test_split  
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.cluster.vq import whiten
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

from collections import defaultdict
from collections import Counter

Input Data


In [34]:
# INPUT                                                                                                                                     
f_authorship = '2k_users/authorship.csv'                                                      
                                                                                
### PREPROCESSING                                                               
df = pd.read_csv(f_authorship) 
print(df.shape)
df.drop_duplicates()  
# remove tweets with no words (just emojis)
df = df[ [(len(word_tokenizer.tokenize(x.lower())) > 0) for x in df['text']] ]
print(df.shape)
np.random.seed(1)                                                               
df = df.reindex(np.random.permutation(df.index))                                                                  
                                                                                
X = df.loc[:, 'text'].values                                                    
y = df.loc[:, 'user_id'].values                                                 
le = LabelEncoder()                                                             
y = le.fit_transform(y)                                                         
                                                                                
# Train test split                                                              
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.0, random_state=1)


(3991, 3)
(3988, 3)

In [35]:
df.head(3)


Out[35]:
text id user_id
187 "I like the comfort of knowing that women are ... 304293352017379329 265463749
3080 Best @ATLASexperiment section ever. MT @claran... 502575966347354112 174312391
1811 I feel like a kid learning to walk #StringTheo... 378295513910902784 265463749

Feature Vectors

Lexicanl Features


In [61]:
# create feature vectors
num_docs = X_train.shape[0]

# words/sentence avg., words/sentence std., unique words/total words
# commas/semicolons/periods per sentence
fvs_punct_lexical = np.zeros((num_docs, 6), np.float64)
for i, doc in enumerate(X_train):  
    # PUNCTUATION FEATURES
    # note: the nltk.word_tokenize includes punctuation
    tokens = nltk.word_tokenize(doc)           ###
    # LEXICAL FEATURES
    words = word_tokenizer.tokenize(doc)       ###
    sentences = sentence_tokenizer.tokenize(doc)
    vocab = set(words)
    words_per_sentence = np.array([len(word_tokenizer.tokenize(s)) for s in sentences])
 
    # Commas per sentence
    fvs_punct_lexical[i, 0] = tokens.count(',') / float(len(sentences))
    # Semicolons per sentence
    fvs_punct_lexical[i, 1] = tokens.count(';') / float(len(sentences))
    # Colons per sentence
    fvs_punct_lexical[i, 2] = tokens.count(':') / float(len(sentences))
    
    # average number of words per sentence
    fvs_punct_lexical[i, 3] = words_per_sentence.mean()
    # sentence length variation
    fvs_punct_lexical[i, 4] = words_per_sentence.std()
    # Lexical diversity - proportion of unique words
    fvs_punct_lexical[i, 5] = len(vocab) / float(len(words))
    
    
# apply whitening to decorrelate the features
#fvs_punct_lexical = whiten(fvs_lexical)

Bag of Words


In [62]:
# get most common words in the whole book
NUM_TOP_WORDS = 1000
all_text = ' '.join(X_train)
all_tokens = nltk.word_tokenize(all_text)
fdist = nltk.FreqDist(all_tokens)
vocab = fdist.most_common(NUM_TOP_WORDS)

# use sklearn to create the bag for words feature vector for each chapter
tweet_token = TweetTokenizer()
vectorizer = CountVectorizer(vocabulary=vocab, 
                             tokenizer=tweet_token.tokenize)
fvs_bow = vectorizer.fit_transform(X_train).toarray().astype(np.float64)

# normalise by rows
#from sklearn.preprocessing import normalize
#fvs_bow = normalize(fvs_bow, norm='l2', axis=1)

Syntatic Features


In [63]:
# get part of speech for each token in each chapter
def token_to_pos(doc):
    tweet_token = TweetTokenizer()
    tokens = tweet_token.tokenize(text)
    return [tag for word, tag in nltk.pos_tag(tokens)]

docs_pos = [token_to_pos(doc) for doc in X_train]

# count frequencies for common POS types
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
poss = [ item for sublist in docs_pos for item in sublist ]
pos_dist = Counter(poss)
pos_list = [ pos for pos, count in pos_dist.most_common() ]

fvs_syntax = np.array([ [doc.count(pos) for pos in pos_list] for doc in docs_pos]).astype(np.float64)
 
# normalise by dividing each row by number of tokens in the chapter
# fvs_syntax /= np.c_[np.array([len(doc) for doc in docs_pos])]

Clustering


In [68]:
def PredictAuthors(fvs, X_train, y_train):
    sc = StandardScaler()
    fvs_std = sc.fit_transform(fvs)
    
    km = KMeans(n_clusters=2, 
                init='k-means++', 
                n_init=300, 
                max_iter=1000,
                tol=1e-6,
                verbose=0)
    km.fit_predict(fvs_std)

    c_docs = defaultdict(list)
    c_ids = defaultdict(list)
    # organize documents into their clusters
    for i, label in enumerate(km.labels_):
        c_docs[label].append(X_train[i])
        c_ids[label].append(y_train[i])
    # get distribution of documents for each cluster
    for label in c_ids.keys():
        c_ids[label] = Counter(c_ids[label])
    
    return km, c_docs, c_ids

Results


In [69]:
km, c_docs, c_ids = PredictAuthors(fvs_punct_lexical, X_train, y_train) 

for label in c_ids.keys():
    print(c_ids[label].most_common())
for label, tweet in c_docs.items():
    print('---------------------------------------------------- Cluster {}'.format(label))
    for text in tweet[:3]:
        print(text)
    print('\n\n')


[(1, 1465), (0, 1149)]
[(0, 845), (1, 529)]
---------------------------------------------------- Cluster 1
Graduate school is way better than undergrad!
@Lucretius21c @TariqYasmine It seems that @AstroKatie was missed in the making of this list. :(
@xoxosussyxoxo lol thanks but I don't get no money for that



---------------------------------------------------- Cluster 0
The Louis Lyons #arXiv paper that @jonmbutterworth mentioned in his post on #statistics in particle #physics: http://t.co/gCEN9QQuHP
@AnElizardbreath Welcome to Twitter. It takes some building and getting used to, but is extremely useful.
@roseveleth That people think they actually have an opinion on things they know nothing about confuses me. They just have nothing.




In [70]:
km, c_docs, c_ids = PredictAuthors(fvs_bow, X_train, y_train) 

for label in c_ids.keys():
    print(c_ids[label].most_common())
for label, tweet in c_docs.items():
    print('---------------------------------------------------- Cluster {}'.format(label))
    for text in tweet[:3]:
        print(text)
    print('\n\n')


[(1, 1994), (0, 1994)]
---------------------------------------------------- Cluster 0
Graduate school is way better than undergrad!
The Louis Lyons #arXiv paper that @jonmbutterworth mentioned in his post on #statistics in particle #physics: http://t.co/gCEN9QQuHP
@AnElizardbreath Welcome to Twitter. It takes some building and getting used to, but is extremely useful.




In [71]:
km, c_docs, c_ids = PredictAuthors(fvs_syntax, X_train, y_train) 

for label in c_ids.keys():
    print(c_ids[label].most_common())
for label, tweet in c_docs.items():
    print('---------------------------------------------------- Cluster {}'.format(label))
    for text in tweet[:3]:
        print(text)
    print('\n\n')


[(1, 1994), (0, 1994)]
---------------------------------------------------- Cluster 0
Graduate school is way better than undergrad!
The Louis Lyons #arXiv paper that @jonmbutterworth mentioned in his post on #statistics in particle #physics: http://t.co/gCEN9QQuHP
@AnElizardbreath Welcome to Twitter. It takes some building and getting used to, but is extremely useful.




In [ ]: