Import required libraries


In [3]:
! sudo pip install pandas
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support
from sklearn.cross_validation import train_test_split


Requirement already satisfied (use --upgrade to upgrade): pandas in /usr/local/lib/python3.4/dist-packages
Requirement already satisfied (use --upgrade to upgrade): numpy>=1.7.0 in /usr/lib/python3/dist-packages (from pandas)
Requirement already satisfied (use --upgrade to upgrade): python-dateutil>=2 in /usr/local/lib/python3.4/dist-packages (from pandas)
Requirement already satisfied (use --upgrade to upgrade): pytz>=2011k in /root/.local/lib/python3.4/site-packages (from pandas)
Requirement already satisfied (use --upgrade to upgrade): six>=1.5 in /usr/local/lib/python3.4/dist-packages (from python-dateutil>=2->pandas)
You are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.

 Load train and test data


In [4]:
# Load the stratified 50k draft-quality feature/label table (tab-separated).
df_data = pd.read_csv("./enwiki.draft_quality.50k_stratified.feature_labels.tsv", sep="\t")

# One boolean indicator column per draft-quality class. A vectorized Series
# comparison is clearer and much faster than four `.apply(lambda ...)` calls,
# and yields the identical boolean columns.
for label in ("OK", "spam", "vandalism", "attack"):
    df_data[label] = df_data["draft_quality"] == label

In [15]:
# Peek at the first rows to sanity-check the load and the new label columns.
df_data.head()


Out[15]:
feature.wikitext.revision.chars feature.wikitext.revision.whitespace_chars feature.wikitext.revision.markup_chars feature.wikitext.revision.cjk_chars feature.wikitext.revision.entity_chars feature.wikitext.revision.url_chars feature.wikitext.revision.word_chars feature.wikitext.revision.uppercase_word_chars feature.wikitext.revision.punctuation_chars feature.wikitext.revision.break_chars ... feature.enwiki.revision.cn_templates.1 feature.(enwiki.revision.cn_templates / max(wikitext.revision.content_chars, 1)).1 feature.enwiki.main_article_templates feature.(enwiki.main_article_templates / max(wikitext.revision.content_chars, 1)) feature.(english.stemmed.revision.stems_length / max(wikitext.revision.content_chars, 1)) draft_quality OK spam vandalism attack
0 54 6 4 0 0 0 32 8 0 0 ... 0 0.0 0 0.0 0.653061 OK True False False False
1 61 7 8 0 0 0 45 8 0 0 ... 0 0.0 0 0.0 1.952381 OK True False False False
2 1914 218 106 0 0 224 1170 37 32 10 ... 0 0.0 0 0.0 0.821457 OK True False False False
3 1007 110 96 0 0 0 727 31 22 8 ... 0 0.0 0 0.0 0.904632 OK True False False False
4 687 82 89 0 0 39 407 0 9 7 ... 0 0.0 0 0.0 1.641256 OK True False False False

5 rows × 92 columns


In [16]:
# All columns except the trailing five (draft_quality plus the four boolean
# label columns added above) are model features.
features = df_data.columns[:-5]
# A bare `len(features)` in the middle of a cell is silently discarded;
# print it so the feature count is actually shown.
print(len(features))
data = df_data[features]

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:

Select feature columns


In [ ]:


In [ ]:

Select target columns


In [73]:
# The four draft-quality classes; one binary (one-vs-rest) classifier is
# trained per class below.
targets = [
    "spam",
    "OK",
    "vandalism",
    "attack",
]

Train and Run models

Defining models


In [75]:
# One spec per classifier: display name paired with an (unfitted) estimator.
classifier_specs = [
    ("RandomForestClassifier", RandomForestClassifier(n_jobs=64)),
    ("GradientBoostingClassifier", GradientBoostingClassifier()),
    ("SVC", SVC()),
    ("GaussianNB", GaussianNB()),
]
# Same structure as before: a list of dicts with "model" and "name" keys.
models = [{"model": clf, "name": name} for name, clf in classifier_specs]

Adding model parameters


In [76]:
# Record each estimator's hyper-parameters alongside it for later inspection.
for entry in models:
    entry["params"] = entry["model"].get_params()

Training models


In [77]:
for model in models:
    # Running totals for the macro average across the four one-vs-rest tasks.
    # (Accumulating here removes the redundant second pass over `targets`
    # the original cell made; the computed means are identical.)
    total_precision = 0.0
    total_recall = 0.0
    for category in targets:
        # random_state=0 makes the split deterministic, so every model and
        # every category sees the same 60/40 train/test row partition.
        X_train, X_test, y_train, y_test = train_test_split(
            df_data[features], df_data[category], test_size=0.4, random_state=0)

        # NOTE(review): the same estimator object is refit for each category,
        # so after this loop model["model"] holds the fit for the LAST
        # category only; use sklearn.clone() per category if the fitted
        # models themselves matter.
        model["model"].fit(X_train, y_train)
        y_pred = model["model"].predict(X_test)
        # (precision, recall, f1, support) for the positive (True) class.
        model["metrics_for_" + category] = precision_recall_fscore_support(
            y_test, y_pred, average='binary')
        total_precision += model["metrics_for_" + category][0]
        total_recall += model["metrics_for_" + category][1]
    # Unweighted (macro) average over the four categories.
    model["mean_precision"] = total_precision / len(targets)
    model["mean_recall"] = total_recall / len(targets)

Evaluation

The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative.
The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.

In [78]:
# Report each model's macro-averaged precision and recall.
for model in models:
    summary = "%s\n\tPrecision: %.2f\n\tRecall: %.2f\n\n" % (
        model["name"], model["mean_precision"], model["mean_recall"])
    print(summary)


RandomForestClassifier
	Precision: 0.69
	Recall: 0.54


GradientBoostingClassifier
	Precision: 0.72
	Recall: 0.56


SVC
	Precision: 0.67
	Recall: 0.46


GaussianNB
	Precision: 0.30
	Recall: 0.75



In [80]:
models


Out[80]:
[{'mean_precision': 0.69314216782224825,
  'mean_recall': 0.54143947489662647,
  'metrics_for_OK': (0.95641838351822506,
   0.92234215302321143,
   0.93907123753950905,
   None),
  'metrics_for_attack': (0.36548223350253806,
   0.088452088452088448,
   0.14243323442136496,
   None),
  'metrics_for_spam': (0.84326064686082247,
   0.8130379568223508,
   0.82787356321839078,
   None),
  'metrics_for_vandalism': (0.6074074074074074,
   0.34192570128885519,
   0.43754547659471255,
   None),
  'model': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
              max_depth=None, max_features='auto', max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=64,
              oob_score=False, random_state=None, verbose=0,
              warm_start=False),
  'name': 'RandomForestClassifier',
  'params': {'bootstrap': True,
   'class_weight': None,
   'criterion': 'gini',
   'max_depth': None,
   'max_features': 'auto',
   'max_leaf_nodes': None,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'min_weight_fraction_leaf': 0.0,
   'n_estimators': 10,
   'n_jobs': 64,
   'oob_score': False,
   'random_state': None,
   'verbose': 0,
   'warm_start': False}},
 {'mean_precision': 0.72271138618009134,
  'mean_recall': 0.56430114789910635,
  'metrics_for_OK': (0.94545629758118921,
   0.93714776960550195,
   0.94128369951069757,
   None),
  'metrics_for_attack': (0.47340425531914893,
   0.10933660933660934,
   0.17764471057884232,
   None),
  'metrics_for_spam': (0.83568269762299618,
   0.85325243403414708,
   0.84437617817496347,
   None),
  'metrics_for_vandalism': (0.63630229419703099,
   0.35746777862016682,
   0.45776699029126211,
   None),
  'model': GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
                max_depth=3, max_features=None, max_leaf_nodes=None,
                min_samples_leaf=1, min_samples_split=2,
                min_weight_fraction_leaf=0.0, n_estimators=100,
                presort='auto', random_state=None, subsample=1.0, verbose=0,
                warm_start=False),
  'name': 'GradientBoostingClassifier',
  'params': {'init': None,
   'learning_rate': 0.1,
   'loss': 'deviance',
   'max_depth': 3,
   'max_features': None,
   'max_leaf_nodes': None,
   'min_samples_leaf': 1,
   'min_samples_split': 2,
   'min_weight_fraction_leaf': 0.0,
   'n_estimators': 100,
   'presort': 'auto',
   'random_state': None,
   'subsample': 1.0,
   'verbose': 0,
   'warm_start': False}},
 {'mean_precision': 0.66753987287151406,
  'mean_recall': 0.46079958644687213,
  'metrics_for_OK': (0.97944630872483218,
   0.66911834941255133,
   0.79507405936098963,
   None),
  'metrics_for_attack': (0.47058823529411764,
   0.0098280098280098278,
   0.01925391095066185,
   None),
  'metrics_for_spam': (0.5693312966734555,
   0.946662903908565,
   0.7110381007895713,
   None),
  'metrics_for_vandalism': (0.65079365079365081,
   0.21758908263836241,
   0.32613636363636367,
   None),
  'model': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False),
  'name': 'SVC',
  'params': {'C': 1.0,
   'cache_size': 200,
   'class_weight': None,
   'coef0': 0.0,
   'decision_function_shape': None,
   'degree': 3,
   'gamma': 'auto',
   'kernel': 'rbf',
   'max_iter': -1,
   'probability': False,
   'random_state': None,
   'shrinking': True,
   'tol': 0.001,
   'verbose': False}},
 {'mean_precision': 0.30001932908522894,
  'mean_recall': 0.75231439950948586,
  'metrics_for_OK': (0.51765943590000496,
   0.98700926545037726,
   0.67913243509694365,
   None),
  'metrics_for_attack': (0.042308902170493864,
   0.99140049140049136,
   0.081154465004022527,
   None),
  'metrics_for_spam': (0.34347976478727082,
   0.98080993368138847,
   0.50878348704435672,
   None),
  'metrics_for_vandalism': (0.29662921348314608,
   0.050037907505686124,
   0.085630879013947434,
   None),
  'model': GaussianNB(),
  'name': 'GaussianNB',
  'params': {}}]

In [83]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed
# in 0.23; joblib is a standalone package and should be imported directly.
# This binds the same `joblib` name, so the cells below are unchanged.
import joblib

In [85]:
# Persist the trained models to disk (pickle-based format: only ever load
# .pkl files from a trusted source — loading executes arbitrary code).
joblib.dump(models, 'models.pkl')
# Round-trip check: reload immediately to verify the file is readable.
models2 = joblib.load('models.pkl')

In [ ]: