In [ ]:
import scipy as sp
import sklearn
import os
import nltk
%pylab inline
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=VisibleDeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
In [ ]:
from BeautifulSoup import BeautifulSoup
In [ ]:
from sklearn.preprocessing import scale
import re
from nltk.corpus import stopwords
URL_REGEX = re.compile(r'(https?|ftp)://[^\s]*')
REPLY_REGEX = re.compile(r'@username')
URL_TAG = 'URL'
REPLY_TAG = 'REP'
# PREPROCESS XML DOCUMENTS
def extract_from_xml(xml):
bs = BeautifulSoup(xml)
# documents start with a tab character; strip it
return [document.text.strip('\t') for document in bs.findAll('document')]
def trim_multiple_repeats(string):
# coooooool -> coool
# collapse runs of 4+ identical chars down to 3; keeping three makes
# elongation distinct from the double letters common in some languages
return re.sub(r'(.)\1{3,}', r'\1\1\1', string)
def preprocess_tweet(text, language):
text = URL_REGEX.sub(URL_TAG, text)
text = REPLY_REGEX.sub(REPLY_TAG, text)
text = trim_multiple_repeats(text)
letters_only = re.sub("[^a-zA-Z]", " ", text)
words = letters_only.lower().split()
stops = set(stopwords.words(language))
meaningful_words = [w for w in words if w not in stops]
return " ".join(meaningful_words)
def get_smile_cnt(string):
return len(re.findall(r'(?::|;|:\'|=)(?:-)?(?:\)|\(|D|P|d|p|3)|<3|</3|xd|xD|XD',string))
def cnt_long_repeats(string):
return len(re.findall(r'(\w)\1{3,}', string))
def cnt_replys(string):
return len(re.findall(r'@username', string))
def cnt_hashtags(string):
return len(re.findall(r'#(\w+)', string))
def cnt_exclamations(string):
return len(re.findall(r'!+', string))
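As a quick sanity check, the helpers above can be run on a fabricated tweet (illustrative only; the text below is not from the dataset and the NLTK stopwords corpus is assumed to be downloaded):
In [ ]:
# fabricated example tweet exercising the URL/mention tags, repeat trimming,
# stopword removal and the surface-feature counters defined above
sample = "@username soooooo happy!!! check http://example.com #goodday :)"
print preprocess_tweet(sample, 'english')
print get_smile_cnt(sample), cnt_exclamations(sample), cnt_hashtags(sample), cnt_replys(sample)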
Preprocess data
Structures for storing the dataset
In [ ]:
LABELS = ['userid', 'gender', 'age_group',
'extroverted', 'stable', 'agreeable',
'conscientious', 'open']
TYPES = ['string'] * 3 + ['float'] * 5
''' User class used to store parsed data'''
class User(object):
def __init__(self, line):
self.labels = LABELS
parts = map(str.strip, line.split(FIELDS_DELIMITER))
if len(parts) == 1:
parts = [parts[0]] + [''] * 7
self.userid = parts[0]
self.gender = None
self.age_group = None
self.extroverted = None
self.stable = None
self.agreeable = None
self.conscientious = None
self.open = None
else:
self.userid = parts[0]
self.gender = parts[1]
self.age_group = parts[2]
self.extroverted = float(parts[3])
self.stable = float(parts[4])
self.agreeable = float(parts[5])
self.conscientious = float(parts[6])
self.open = float(parts[7])
self.documents = []
def user_details(self):
return [self.userid, self.gender,
self.age_group, self.extroverted, self.stable,
self.agreeable, self.conscientious, self.open]
def user_documents(self):
return self.documents
def merge_documents(self):
return '\n'.join(self.documents)
In [ ]:
TRUTH_FILE = 'truth.txt'
FIELDS_DELIMITER = ':::'
LABELS = ['userid', 'gender', 'age_group',
'extroverted', 'stable', 'agreeable',
'conscientious', 'open']
'''Dataset wrapper - parses, cleans and stores user data (documents and truth)
'''
class Dataset(object):
def __init__(self, path, language):
path += language
if not os.path.exists(path) or not os.path.isdir(path):
raise Exception('No such dir ' + path)
self.language = language
self.path = path
self.users = {}
self.X = []
self.y = []
self.load()
self.labels = LABELS
def load(self):
user_files = filter(lambda name: name != TRUTH_FILE, os.listdir(self.path))
truth = os.path.join(self.path, TRUTH_FILE)
assert os.path.isfile(truth)
# load truth
with open(truth, 'r') as f:
for line in f:
user = User(line)
self.users[user.userid] = user
# load texts
for path in user_files:
user = os.path.splitext(path)[0]
path = os.path.join(self.path, path)
with open(path, 'r') as xml:
content = extract_from_xml(xml.read())
if user not in self.users:
self.users[user] = User(user)
self.users[user].documents = content
def simplify_documents(self):
for key, user in self.users.items():
self.users[key].documents = map(lambda x: preprocess_tweet(x, self.language), self.users[key].documents)
def store_as_samples(self):
for id, user in self.users.items():
self.X.append(user.merge_documents())
self.y.append(np.array(user.user_details()))
self.X = np.array(self.X)
self.y = np.array(self.y)
def get_documents(self):
x = []
for _, user in self.users.items():
x.append(user.merge_documents())
return np.array(x)
def get_tweet_len_stats(self):
avgs = []
stds = []
for _, user in self.users.items():
lens = map(len, user.documents)
avgs.append(np.mean(lens))
stds.append(np.std(lens))
return avgs, stds
def get_word_len_stats(self):
avgs = []
stds = []
for _, user in self.users.items():
words = []
for doc in user.documents:
words.extend(doc.split())
lens = map(len, words)
avgs.append(np.mean(lens))
stds.append(np.std(lens))
return avgs, stds
def get_samples(self, feature='all'):
if feature == 'all':
return self.X, self.y
feature_col = [i for i, lab in enumerate(self.labels) if lab in feature]
if len(feature_col)==0:
raise Exception('Invalid feature %s\nValid features %s' %
(feature, ', '.join(self.labels)))
return self.X, np.array(([ ', '.join(i) for i in self.y[:, feature_col]]))
def append_numeric_feature(features, new_feature):
try:
return np.column_stack((features, scale(map(float, new_feature))))
except Exception as e:
print 'ERROR', str(e)
for f in new_feature:
try:
float(f)
except:
print f, 'could not be parsed as float'
# on failure return the matrix unchanged so callers never receive None
return features
def append_numeric_list_of_features(features, new_features):
for f in new_features:
features = append_numeric_feature(features, f)
return features
# len, len_std, cim
def additional_features(dataset):
documents = dataset.get_documents()
avg_len, std_len = dataset.get_tweet_len_stats()
word_len, word_std = dataset.get_word_len_stats()
smile_cnt = []
exclamations = []
hashtags = []
mentions = []
repeats = []
for _, user in dataset.users.items():
cnts = map(get_smile_cnt, user.documents)
smile_cnt.append(np.mean(cnts))
cnts = map(cnt_exclamations, user.documents)
#map(lambda s: s.count('!'), user.documents)
exclamations.append(np.mean(cnts))
cnts = map(cnt_long_repeats, user.documents)
repeats.append(np.mean(cnts))
cnts = map(cnt_hashtags, user.documents)
hashtags.append(np.mean(cnts))
cnts = map(cnt_replys, user.documents)
mentions.append(np.mean(cnts))
return [avg_len, std_len, word_len, word_std, smile_cnt, exclamations, hashtags, mentions, repeats]
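A minimal illustration of how a single truth-file line is parsed into a User object; the line below is fabricated and only mirrors the userid:::gender:::age_group:::traits layout assumed by the parser above:
In [ ]:
# fabricated truth line (not real data) in the ':::'-delimited format expected by User
line = 'user123:::M:::25-34:::0.1:::-0.2:::0.3:::0.4:::0.5'
u = User(line)
print zip(u.labels, u.user_details())
u.documents = ['first tweet', 'second tweet']
print u.merge_documents()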
In [ ]:
EXTRA = ['avg_len', 'std_len', 'word_len',
'word_std', 'smile_cnt', 'exclamations',
'hashtags', 'mentions', 'repeats']
def split_samples(X, y):
ys = sorted(set(y))
groups = []
for y_val in ys:
groups.append([i for i, yy in enumerate(y) if yy == y_val])
return groups, ys
def print_class_stats(groups, labels):
#[avg_len, std_len, word_len, word_std, smile_cnt, exclamations, hashtags, mentions, repeats]
for id, feature in enumerate(EXTRA):
print feature
for label, ids in zip(labels, groups):
print "%s; cnt: %d" % (label, len(ids)),
extras = [extra_features[id][i] for i in ids]
print '; average: ', np.mean(extras)
print
def get_class_stats(groups, labels):
rv = {}
#[avg_len, std_len, word_len, word_std, smile_cnt, exclamations, hashtags, mentions, repeats]
for id, feature in enumerate(EXTRA):
for label, ids in zip(labels, groups):
extras = [extra_features[id][i] for i in ids]
rv.setdefault(feature, []).append('%.3f' % np.mean(extras))
return rv
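split_samples only groups sample indices by their label value, so it can be checked on toy labels (illustrative):
In [ ]:
# toy check: indices grouped per class, labels returned in sorted order
groups, labels = split_samples(['doc1', 'doc2', 'doc3', 'doc4'], ['M', 'F', 'M', 'F'])
print labels   # ['F', 'M']
print groups   # [[1, 3], [0, 2]]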
Helper classes
In [ ]:
from sklearn.base import TransformerMixin
'''Transforms a sparse matrix to a dense one - e.g. needed before GaussianNB'''
class DenseTransformer(TransformerMixin):
def transform(self, X, y=None, **fit_params):
return X.todense()
def fit_transform(self, X, y=None, **fit_params):
self.fit(X, y, **fit_params)
return self.transform(X)
def fit(self, X, y=None, **fit_params):
return self
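The transformer only densifies its input, so a tiny sparse matrix is enough to see it at work (illustrative):
In [ ]:
from scipy.sparse import csr_matrix
# DenseTransformer just calls .todense(); useful between TfidfVectorizer and GaussianNB
print DenseTransformer().fit_transform(csr_matrix([[0, 1], [2, 0]]))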
In [ ]:
# the code below is taken and adapted from the example
# @ http://scikit-learn.org/stable/auto_examples/plot_learning_curve.html
from sklearn.learning_curve import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 10)):
plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim)
plt.xlabel("Training examples")
plt.ylabel("Score")
train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.grid()
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std, alpha=0.1,
color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
label="Cross-validation score")
plt.legend(loc="best")
return plt
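For reference, the plotting helper can be exercised on any small classification problem; the toy example below uses the iris dataset and is unrelated to the Twitter data:
In [ ]:
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
# toy sanity check of plot_learning_curve on iris with a simple classifier
iris = load_iris()
plot_learning_curve(GaussianNB(), "Learning curve GaussianNB (iris)", iris.data, iris.target, cv=3)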
In [ ]:
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix,precision_recall_curve, mean_squared_error
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
def printScore(y_true,y_pred,average='micro',pos_label=None):
print average+" scores:"
print "\t P = %s" % precision_score(y_true,y_pred,average=average,pos_label=pos_label)
print "\t R = %s" % recall_score(y_true,y_pred,average=average,pos_label=pos_label)
print "\t F1 = %s" % f1_score(y_true,y_pred,average=average,pos_label=pos_label)
def modelEvaluator(X, y, model, parameters, classifier = True, scoring = None, num_folds = 3,test_size = 0.3,ylim=None,train_sizes_lncurv=np.linspace(.1, 1.0, 10), verbose=False):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
grid_cv = GridSearchCV( model, parameters, scoring = scoring, n_jobs = -1, verbose = 1, cv = num_folds)
grid_cv.fit(X_train,y_train)
estimator = grid_cv.best_estimator_
if verbose:
print 'Model %s' % estimator
print 'Model best_params: %s' % grid_cv.best_params_
print 'Model score : %s' % estimator.score(X_test,y_test)
y_pred = estimator.predict(X_test)
if not verbose and classifier: return estimator
if classifier:
print "Confusion matrix:\n %s" % confusion_matrix(y_test,y_pred)
if len(set(y)) == 2:
printScore(y_test,y_pred,'binary',list(set(y))[0])
else:
printScore(y_test,y_pred,'macro')
printScore(y_test,y_pred)
plot_learning_curve(estimator, "Learning curve " + str(model).split('(')[0], X, y, ylim=ylim, cv=num_folds,train_sizes=train_sizes_lncurv)
else:
sqr = sqrt(mean_squared_error(estimator.predict(X_test), y_test))
if verbose : print 'RMSE: ', sqr
return estimator, sqr, grid_cv.best_params_
return estimator
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier, DummyRegressor
def evaluateClassifier(sample_name, dataset_path, language, models, parameters, num_folds=10, k=10, verbose=False):
vectorize = TfidfVectorizer(analyzer='char', ngram_range=(3,3))
vectorize_pipe = Pipeline([('vectorize', vectorize), ('densen', DenseTransformer())])
dataset = Dataset(dataset_path, language)
extra_features = additional_features(dataset)
dataset.simplify_documents()
dataset.store_as_samples()
X, y = dataset.get_samples(sample_name)
Xvec = vectorize_pipe.fit_transform(X)
Xvec = append_numeric_list_of_features(Xvec, extra_features)
sel = SelectKBest(f_classif, k=k)
Xvec = sel.fit_transform(Xvec,y)
for model, parameter in zip(models,parameters):
if verbose: print '\n', str(model).split('(')[0], language, '\n'
modelEvaluator(Xvec, y, model, parameter, num_folds=num_folds, verbose=verbose)
In [ ]:
dataset_path = './dataset/'
d = Dataset(dataset_path,'english')
extra_features = additional_features(d)
d.simplify_documents()
d.store_as_samples()
X, y = d.get_samples('age_group')
groups, labels = split_samples(X, y)
print_class_stats(groups, labels)
X, y = d.get_samples('gender')
groups, labels = split_samples(X, y)
print_class_stats(groups, labels)
In [ ]:
models = [DummyClassifier(), GaussianNB(), LinearSVC(), SVC(), DecisionTreeClassifier(), RandomForestClassifier()]
parameters = [{}, {}, { 'C':linspace(1, 100,10)},{'kernel':('linear', 'rbf','poly','sigmoid'), 'C':linspace(1, 100,10)},{}, {}]
dataset_path = './dataset/'
In [ ]:
language = 'english'
In [ ]:
print '\n' + 'age_group:'
evaluateClassifier('age_group', dataset_path, language, models, parameters, k=2000, verbose=True)
In [ ]:
print '\n' + 'gender:'
evaluateClassifier('gender', dataset_path, language, models, parameters, k=1500, verbose=True)
In [ ]:
language = 'italian'
In [ ]:
print '\n' + 'age_group:'
evaluateClassifier('age_group', dataset_path, language, models, parameters, k=2000, verbose=True)
In [ ]:
print '\n' + 'gender:'
evaluateClassifier('gender', dataset_path, language, models, parameters, k=1500, verbose=True)
In [ ]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.naive_bayes import MultinomialNB
def evaluate_model(sample_name, dataset_path, language, models, parameters, k=1500, num_folds=10, scoring='mean_squared_error',classifier=False, verbose=True):
d = Dataset(dataset_path, language)
extra_features = additional_features(d)
d.simplify_documents()
d.store_as_samples()
X, y = d.get_samples(sample_name)
y = np.array(y, dtype=float)
vectorize = TfidfVectorizer(analyzer='char', ngram_range=(3,3))
vectorize_pipe = Pipeline([('vectorize', vectorize), ('densen', DenseTransformer())])
Xvec = vectorize_pipe.fit_transform(X)
Xvec = append_numeric_list_of_features(Xvec, extra_features)
sel = SelectKBest(f_classif, k=k)
Xvec = sel.fit_transform(Xvec,y)
bestModel = None
bestRMSE = float('inf')  # start above any attainable RMSE so the first model is always kept
bestParam = {}
for model, parameter in zip(models,parameters):
if verbose: print '\n', str(model).split('(')[0], language, '\n'
m, rmse, par = modelEvaluator(Xvec, y, model, parameter, num_folds=num_folds,scoring=scoring,classifier=classifier, verbose=verbose)
if rmse < bestRMSE:
bestModel = model
bestRMSE = rmse
bestParam = par
print "\n\n BEST MODEL:\n" + str(bestModel).split('(')[0]+"\n"
print 'Model %s' % bestModel
print 'Model RMSE : %s \n' % bestRMSE
return bestModel
In [ ]:
models = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor(),SVR()]
parameters = [{},{},{},{'kernel':('linear', 'rbf','poly','sigmoid'), 'C':linspace(1, 100,10)}]
language = 'english'
In [ ]:
evaluate_model('extroverted', dataset_path, language, models, parameters, k=1500)
In [ ]:
evaluate_model('stable',dataset_path, language, models, parameters, k=1500)
In [ ]:
evaluate_model('agreeable',dataset_path, language, models, parameters, k=1500)
In [ ]:
evaluate_model('conscientious', dataset_path, language, models, parameters, k=1500)
In [ ]:
evaluate_model('open',dataset_path, language, models, parameters, k=1500)
In [ ]:
dataset_path = './dataset/'
language = 'english'
models = [DummyClassifier(), GaussianNB(), LinearSVC(), SVC(), DecisionTreeClassifier(), RandomForestClassifier()]
parameters = [{}, {}, { 'C':linspace(1, 100,10)},{'kernel':('linear', 'rbf','poly','sigmoid'), 'C':linspace(1, 100,10)},{}, {}]
In [ ]:
evaluateClassifier('age_group', dataset_path, language, models, parameters, k=2000, verbose=True)
In [ ]:
language = 'english'
models = [LinearRegression(), DecisionTreeRegressor(), RandomForestRegressor(),SVR()]
parameters = [{},{},{},{'kernel':('linear', 'rbf','poly','sigmoid'), 'C':linspace(1, 100,10)}]
In [ ]:
evaluate_model('extroverted', dataset_path, language, models, parameters)
In [ ]: