In [ ]:
import scipy as sp
import sklearn
import os
import nltk
%pylab inline
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=VisibleDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [ ]:
# BeautifulSoup 3 (Python 2); with bs4 the import would be `from bs4 import BeautifulSoup`
from BeautifulSoup import BeautifulSoup

In [ ]:
from sklearn.preprocessing import scale
import re
from nltk.corpus import stopwords
URL_REGEX = re.compile(r'(https?|ftp)://[^\s]*')
REPLY_REGEX = re.compile(r'@username')

URL_TAG = 'URL'
REPLY_TAG = 'REP'

# PREPROCESS XML DOCUMENTS
def extract_from_xml(xml):
    bs = BeautifulSoup(xml)
    # strip leading/trailing tabs around each document text
    return [document.text.strip('\t') for document in bs.findAll('document')]

def trim_multiple_repeats(string):
    # coooooool -> coool
    # collapse runs of 4+ identical characters down to 3, so they stay
    # distinct from the double letters that are common in some languages
    return re.sub(r'(.)\1{3,}', r'\1\1\1', string)

def preprocess_tweet(text, language):
    text = URL_REGEX.sub(URL_TAG, text)
    text = REPLY_REGEX.sub(REPLY_TAG, text)
    text = trim_multiple_repeats(text)
    
    
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    words = letters_only.lower().split()
    stops = set(stopwords.words(language))
    meaningful_words = [w for w in words if w not in stops]
    return " ".join(meaningful_words)

def get_smile_cnt(string):
    return len(re.findall(r'(?::|;|:\'|=)(?:-)?(?:\)|\(|D|P|d|p|3)|<3|</3|xd|xD|XD',string))

def cnt_long_repeats(string):
    return len(re.findall(r'(\w)\1{3,}', string))

def cnt_replys(string):
    return len(re.findall(r'@username', string))

def cnt_hashtags(string):
    return len(re.findall(r'#(\w+)', string))

def cnt_exclamations(string):
    return len(re.findall(r'!+', string))
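
A quick sanity check of the preprocessing helpers on a made-up tweet (illustrative values only; assumes the NLTK stopwords corpus has been downloaded, e.g. via nltk.download('stopwords')):

In [ ]:
sample = "I loooooove this!!! :) check http://example.com #nlp @username"
print(preprocess_tweet(sample, 'english'))   # tags URL/reply, trims repeats, drops stopwords
print(get_smile_cnt(sample))       # 1 smiley
print(cnt_exclamations(sample))    # 1 run of '!'
print(cnt_hashtags(sample))        # 1 hashtag
print(cnt_replys(sample))          # 1 reply tag
print(cnt_long_repeats(sample))    # 1 long character repeat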

Preprocess data

Structures for storing the dataset


In [ ]:
LABELS = ['userid', 'gender', 'age_group',
          'extroverted', 'stable', 'agreeable',
          'conscientious', 'open']
TYPES = ['string'] * 3 + ['float'] * 5

''' User class used to store parsed data'''
class User(object):

    def __init__(self, line):
        self.labels = LABELS

        parts = map(str.strip, line.split(FIELDS_DELIMITER))
        if len(parts) == 1:
            # only the userid is known - there is no truth entry for this user
            self.userid = parts[0]
            self.gender = None
            self.age_group = None
            self.extroverted = None
            self.stable = None
            self.agreeable = None
            self.conscientious = None
            self.open = None
        
        else:
            self.userid = parts[0]
            self.gender = parts[1]
            self.age_group = parts[2]

            self.extroverted = float(parts[3])
            self.stable = float(parts[4])
            self.agreeable = float(parts[5])
            self.conscientious = float(parts[6])
            self.open = float(parts[7])
        self.documents = []

    def user_details(self):
        return [self.userid, self.gender,
                self.age_group, self.extroverted, self.stable,
                self.agreeable, self.conscientious, self.open]

    def user_documents(self):
        return self.documents
    
    def merge_documents(self):
        return '\n'.join(self.documents)

In [ ]:
TRUTH_FILE = 'truth.txt'
FIELDS_DELIMITER = ':::'
LABELS = ['userid', 'gender', 'age_group',
          'extroverted', 'stable', 'agreeable',
          'conscientious', 'open']

'''Dataset wrapper - parses, cleans and stores user data (documents and truth)
'''
class Dataset(object):

    def __init__(self, path, language):
        path += language
        if not os.path.exists(path) or not os.path.isdir(path):
            raise Exception('No such dir ' + path)
        
        self.language = language
        self.path = path
        self.users = {}
        self.X = []
        self.y = []
        self.load()
        self.labels = LABELS

    def load(self):
        user_files = filter(lambda name: name != TRUTH_FILE, os.listdir(self.path))
        truth = os.path.join(self.path, TRUTH_FILE)
        assert os.path.isfile(truth)

        # load truth
        with open(truth, 'r') as f:
            for line in f:
                user = User(line)
                self.users[user.userid] = user

        # load texts
        for path in user_files:
            user = os.path.splitext(path)[0]
            path = os.path.join(self.path, path)

            with open(path, 'r') as xml:
                content = extract_from_xml(xml.read())
                if user not in self.users:
                    self.users[user] = User(user)
                self.users[user].documents = content
                
    def simplify_documents(self):
        for key, user in self.users.items():
            self.users[key].documents = map(lambda x: preprocess_tweet(x, self.language), self.users[key].documents)
    
    def store_as_samples(self):
        for id, user in self.users.items():
            self.X.append(user.merge_documents())
            self.y.append(np.array(user.user_details()))
             
        self.X = np.array(self.X)
        self.y = np.array(self.y)
        
    def get_documents(self):
        x = []
        for _, user in self.users.items():
            x.append(user.merge_documents())
        return np.array(x)
        
    def get_tweet_len_stats(self):
        avgs = []
        stds = []
        
        for _, user in self.users.items():
            lens = map(len, user.documents)
            avgs.append(np.mean(lens))
            stds.append(np.std(lens))
        return avgs, stds
    
    def get_word_len_stats(self):
        avgs = []
        stds = []
        
        for _, user in self.users.items():
            words = []
            for doc in user.documents:
                words.extend(doc.split())
            
            lens = map(len, words)
            
            
            avgs.append(np.mean(lens))
            stds.append(np.std(lens))
            
        return avgs, stds
        
    def get_samples(self, feature='all'):
        if feature == 'all':
            return self.X, self.y

        feature_col = [i for i, lab in enumerate(self.labels) if lab in feature]
        if len(feature_col)==0:
            raise Exception('Invalid feature %s\nValid features %s' %
                            (feature, ', '.join(self.labels)))

        return self.X, np.array(([ ', '.join(i) for i in self.y[:, feature_col]]))        

def append_numeric_feature(features, new_feature):
    try:
        return np.column_stack((features, scale(map(float, new_feature))))
    except Exception as e:
        print 'ERROR', str(e)
        # report which values could not be parsed as floats
        for f in new_feature:
            try:
                float(f)
            except:
                print f
                print "error"
        return features  # leave the feature matrix unchanged on failure
                
def append_numeric_list_of_features(features, new_features):
    for f in new_features:
        features = append_numeric_feature(features, f)
    return features
                

# per-user stats: tweet length (mean/std), word length (mean/std), smileys, exclamations, hashtags, mentions, repeats
def additional_features(dataset):
    documents = dataset.get_documents()
    
    avg_len, std_len = dataset.get_tweet_len_stats()
    word_len, word_std = dataset.get_word_len_stats()
    
    
    smile_cnt = []
    exclamations = []
    
    hashtags = []
    mentions = []
    
    repeats = []
    
    
    for _, user in dataset.users.items():
        cnts = map(get_smile_cnt, user.documents)
        smile_cnt.append(np.mean(cnts))
        
        cnts = map(cnt_exclamations, user.documents)
        #map(lambda s: s.count('!'), user.documents)
        exclamations.append(np.mean(cnts))
        

        cnts = map(cnt_long_repeats, user.documents)
        repeats.append(np.mean(cnts))
        
        cnts = map(cnt_hashtags, user.documents)
        hashtags.append(np.mean(cnts))
        
        cnts = map(cnt_replys, user.documents)
        mentions.append(np.mean(cnts))
    
    
    return [avg_len, std_len, word_len, word_std, smile_cnt, exclamations, hashtags, mentions, repeats]
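
The User parser above expects truth.txt lines in a userid:::gender:::age_group:::scores layout (inferred from the field order in User.__init__); a quick illustration with made-up values:

In [ ]:
# illustrative only: a truth.txt-style line with invented values
demo_line = 'user123:::M:::25-34:::0.1:::0.2:::0.3:::-0.1:::0.4'
demo_user = User(demo_line)
print(demo_user.user_details())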

In [ ]:
EXTRA = ['avg_len', 'std_len', 'word_len', 'word_std', 'smile_cnt',
         'exclamations', 'hashtags', 'mentions', 'repeats']
def split_samples(X, y):
    ys = sorted(set(y))
    groups = []
    
    for y_val in ys:
        groups.append([i for i, yy in enumerate(y) if yy == y_val])
    return groups, ys

def print_class_stats(groups, labels):
    # per-class counts and mean of every extra feature
    # (relies on the globally computed extra_features)
    for id, feature in enumerate(EXTRA):
        print feature
        for label, ids in zip(labels, groups):
            print "%s; cnt: %d" % (label, len(ids)),
            extras = [extra_features[id][i] for i in ids]
            print '; average: ', np.mean(extras)
        print

def get_class_stats(groups, labels):
    rv = {}
    # same per-class means as print_class_stats, returned as a dict keyed by feature name
    for id, feature in enumerate(EXTRA):
        for label, ids in zip(labels, groups):
            extras = [extra_features[id][i] for i in ids]
            rv.setdefault(feature, []).append('%.3f' % np.mean(extras))
    return rv
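
A tiny self-contained check of split_samples on made-up labels (X is not used for the grouping, so None is passed here):

In [ ]:
# toy labels: indices are grouped per class, classes sorted alphabetically
toy_y = ['M', 'F', 'M', 'F', 'M']
toy_groups, toy_labels = split_samples(None, toy_y)
print(toy_labels)    # ['F', 'M']
print(toy_groups)    # [[1, 3], [0, 2, 4]]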

Helper classes


In [ ]:
from sklearn.base import TransformerMixin
'''Transforms a sparse matrix to a dense one - e.g. needed for GaussianNB'''
class DenseTransformer(TransformerMixin):

    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self
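
A minimal check of the transformer on a small scipy sparse matrix (GaussianNB, used later, requires dense input):

In [ ]:
from scipy.sparse import csr_matrix
# densify a 2x2 sparse matrix
print(DenseTransformer().fit_transform(csr_matrix([[0., 1.], [2., 0.]])))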

In [ ]:
# code below taken and adapted from the example at
# http://scikit-learn.org/stable/auto_examples/plot_learning_curve.html
from sklearn.learning_curve import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 10)):

    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

In [ ]:
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix,precision_recall_curve, mean_squared_error
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

def printScore(y_true,y_pred,average='micro',pos_label=None):
    print average+" scores:"
    print "\t P  = %s" % precision_score(y_true,y_pred,average=average,pos_label=pos_label)
    print "\t R  = %s" % recall_score(y_true,y_pred,average=average,pos_label=pos_label)
    print "\t F1 = %s" % f1_score(y_true,y_pred,average=average,pos_label=pos_label)
        


def modelEvaluator(X, y, model, parameters, classifier=True, scoring=None, num_folds=3, test_size=0.3, ylim=None, train_sizes_lncurv=np.linspace(.1, 1.0, 10), verbose=False):
   
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
    grid_cv = GridSearchCV( model, parameters, scoring = scoring, n_jobs = -1, verbose = 1, cv = num_folds)
    grid_cv.fit(X_train,y_train)
    
    
    estimator = grid_cv.best_estimator_
    if verbose:
        print 'Model %s' % estimator
        print 'Model best_params: %s' % grid_cv.best_params_
        print 'Model score : %s' % estimator.score(X_test,y_test)
    
    y_pred = estimator.predict(X_test)
        
    if not verbose and classifier: return estimator
    
    if classifier:
        print "Confusion matrix:\n %s" % confusion_matrix(y_test,y_pred)
        if len(set(y)) == 2:
            printScore(y_test,y_pred,'binary',list(set(y))[0])
        else:
            printScore(y_test,y_pred,'macro')
            printScore(y_test,y_pred) 
        plot_learning_curve(estimator, "Learning curve " + str(model).split('(')[0], X, y,  ylim=ylim, cv=num_folds,train_sizes=train_sizes_lncurv)
    else:
        rmse = np.sqrt(mean_squared_error(y_test, estimator.predict(X_test)))
        if verbose: print 'RMSE: ', rmse
        return estimator, rmse, grid_cv.best_params_
    return estimator

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import  GaussianNB
from sklearn.pipeline import  Pipeline 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier, DummyRegressor

def evaluateClassifier(sample_name, dataset_path, language, models, parameters, num_folds=10, k=10, verbose=False):
    
    vectorize = TfidfVectorizer(analyzer='char', ngram_range=(3,3))
    vectorize_pipe = Pipeline([('vectorize', vectorize), ('densen', DenseTransformer())])
    dataset = Dataset(dataset_path, language)
    extra_features = additional_features(dataset)
    dataset.simplify_documents()
    dataset.store_as_samples()
    X, y = dataset.get_samples(sample_name)
    Xvec = vectorize_pipe.fit_transform(X)
    Xvec = append_numeric_list_of_features(Xvec, extra_features)
    sel = SelectKBest(f_classif, k=k)
    Xvec = sel.fit_transform(Xvec,y)
    
    for model, parameter in zip(models,parameters):
        if verbose: print '\n', str(model).split('(')[0], language, '\n' 
        modelEvaluator(Xvec, y, model, parameter, num_folds=num_folds, verbose=verbose)

In [ ]:
dataset_path = './dataset/'
d = Dataset(dataset_path,'english')
extra_features = additional_features(d)
d.simplify_documents()
d.store_as_samples()

X, y = d.get_samples('age_group')
groups, labels = split_samples(X, y)
print_class_stats(groups, labels)

X, y = d.get_samples('gender')
groups, labels = split_samples(X, y)
print_class_stats(groups, labels)

In [ ]:
models = [DummyClassifier(), GaussianNB(), LinearSVC(), SVC(), DecisionTreeClassifier(), RandomForestClassifier()]
parameters = [{}, {}, { 'C':linspace(1, 100,10)},{'kernel':('linear', 'rbf','poly','sigmoid'), 'C':linspace(1, 100,10)},{}, {}]

dataset_path = './dataset/'

In [ ]:
language = 'english'

In [ ]:
print '\n' +  'age_group:'
evaluateClassifier('age_group', dataset_path, language, models, parameters, k=2000, verbose=True)

In [ ]:
print '\n' +  'gender:' 
evaluateClassifier('gender', dataset_path, language, models, parameters, k=1500, verbose=True)

In [ ]:
language = 'italian'

In [ ]:
print '\n' +  'age_group:'
evaluateClassifier('age_group', dataset_path, language, models, parameters, k=2000, verbose=True)

In [ ]:
print '\n' +  'gender:' 
evaluateClassifier('gender', dataset_path, language, models, parameters, k=1500, verbose=True)

In [ ]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.naive_bayes import MultinomialNB


def evaluate_model(sample_name, dataset_path, language, models, parameters, k=1500, num_folds=10, scoring='mean_squared_error', classifier=False, verbose=True):
    d = Dataset(dataset_path, language)
    extra_features = additional_features(d)
    d.simplify_documents()
    d.store_as_samples()
    X, y = d.get_samples(sample_name)
    
    
    y = np.array(y, dtype=float)
    vectorize = TfidfVectorizer(analyzer='char', ngram_range=(3,3))
    vectorize_pipe = Pipeline([('vectorize', vectorize), ('densen', DenseTransformer())])

    Xvec = vectorize_pipe.fit_transform(X)
    Xvec = append_numeric_list_of_features(Xvec, extra_features)
    
    sel = SelectKBest(f_classif, k=k)
    Xvec = sel.fit_transform(Xvec,y)
    
    bestModel = None
    bestRMSE = float('inf')
    bestParam = {}
    for model, parameter in zip(models,parameters):
        if verbose: print '\n', str(model).split('(')[0], language, '\n' 
        m, rmse, par = modelEvaluator(Xvec, y, model, parameter, num_folds=num_folds,scoring=scoring,classifier=classifier, verbose=verbose)
        if rmse < bestRMSE:
            bestModel = model
            bestRMSE = rmse
            bestParam = par
    print "\n\n BEST MODEL:\n" + str(bestModel).split('(')[0]+"\n"
    print 'Model %s' % bestModel
    print 'Model RMSE : %s \n' % bestRMSE 
    return bestModel

In [ ]:
models = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor(),SVR()]
parameters = [{},{},{},{'kernel':('linear', 'rbf','poly','sigmoid'), 'C':linspace(1, 100,10)}]
language = 'english'

In [ ]:
evaluate_model('extroverted', dataset_path, language, models, parameters, k=1500)

In [ ]:
evaluate_model('stable',dataset_path, language, models, parameters, k=1500)

In [ ]:
evaluate_model('agreeable',dataset_path, language, models, parameters, k=1500)

In [ ]:
evaluate_model('conscientious', dataset_path, language, models, parameters, k=1500)

In [ ]:
evaluate_model('open',dataset_path, language, models, parameters, k=1500)

Demo

Classifiers


In [ ]:
dataset_path = './dataset/'
language = 'english'

models = [DummyClassifier(), GaussianNB(), LinearSVC(), SVC(), DecisionTreeClassifier(), RandomForestClassifier()]
parameters = [{}, {}, { 'C':linspace(1, 100,10)},{'kernel':('linear', 'rbf','poly','sigmoid'), 'C':linspace(1, 100,10)},{}, {}]

In [ ]:
evaluateClassifier('age_group', dataset_path, language, models, parameters, k=2000, verbose=True)

Regression


In [ ]:
language = 'english'
models = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor(),SVR()]
parameters = [{},{},{},{'kernel':('linear', 'rbf','poly','sigmoid'), 'C':linspace(1, 100,10)}]

In [ ]:
evaluate_model('extroverted', dataset_path, language, models, parameters)

In [ ]: