In [1]:
# Install some more libs
! sudo pip install pandas
! sudo pip install matplotlib
! sudo apt-get -y install python3-tk
In [2]:
# import required libs
from revscoring.dependencies import solve
from revscoring.features import wikitext
import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
In [3]:
# Load dataset
df = pd.read_csv("enwiki.draft_quality.75_not_OK_sample.censored.tsv", sep="\t")
df.head()
Out[3]:
In [4]:
# The number of characters
chars = lambda x:list(solve([ wikitext.revision.chars], cache={'datasource.revision.text': x}))[0]
df["chars"] = df["censored_text"].apply(chars)
# whitespace_chars
whitespace_chars = lambda x:list(solve([ wikitext.revision.whitespace_chars], cache={'datasource.revision.text': x}))[0]
df["whitespace_chars"] = df["censored_text"].apply(whitespace_chars)
# The number of wikitext markup characters
markup_chars = lambda x:list(solve([ wikitext.revision.markup_chars], cache={'datasource.revision.text': x}))[0]
df["markup_chars"] = df["censored_text"].apply(markup_chars)
# The number of Chinese/Japanese/Korean characters
cjk_chars = lambda x:list(solve([ wikitext.revision.cjk_chars], cache={'datasource.revision.text': x}))[0]
df["cjk_chars"] = df["censored_text"].apply(cjk_chars)
# The number of HTML entity characters
entity_chars = lambda x:list(solve([ wikitext.revision.entity_chars], cache={'datasource.revision.text': x}))[0]
df["entity_chars"] = df["censored_text"].apply(entity_chars)
# The number of URL characters
url_chars = lambda x:list(solve([ wikitext.revision.url_chars], cache={'datasource.revision.text': x}))[0]
df["url_chars"] = df["censored_text"].apply(url_chars)
# The number of word characters
word_chars = lambda x:list(solve([ wikitext.revision.word_chars], cache={'datasource.revision.text': x}))[0]
df["word_chars"] = df["censored_text"].apply(word_chars)
# The number of UPPERCASE WORD characters
uppercase_word_chars = lambda x:list(solve([ wikitext.revision.uppercase_word_chars], cache={'datasource.revision.text': x}))[0]
df["uppercase_word_chars"] = df["censored_text"].apply(uppercase_word_chars)
# The number of punctuation characters
punctuation_chars = lambda x:list(solve([ wikitext.revision.punctuation_chars], cache={'datasource.revision.text': x}))[0]
df["punctuation_chars"] = df["censored_text"].apply(punctuation_chars)
# The number of break characters
break_chars = lambda x:list(solve([ wikitext.revision.break_chars], cache={'datasource.revision.text': x}))[0]
df["break_chars"] = df["censored_text"].apply(break_chars)
# The length of the longest character repetition
longest_repeated_char = lambda x:list(solve([ wikitext.revision.longest_repeated_char], cache={'datasource.revision.text': x}))[0]
df["longest_repeated_char"] = df["censored_text"].apply(longest_repeated_char)
In [5]:
# The number of tokens
tokens = lambda x:list(solve([ wikitext.revision.tokens], cache={'datasource.revision.text': x}))[0]
df["tokens"] = df["censored_text"].apply(tokens)
# The number of numeric tokens
numbers = lambda x:list(solve([ wikitext.revision.numbers], cache={'datasource.revision.text': x}))[0]
df["numbers"] = df["censored_text"].apply(numbers)
# The number of whitespace tokens
whitespaces = lambda x:list(solve([ wikitext.revision.whitespaces], cache={'datasource.revision.text': x}))[0]
df["whitespaces"] = df["censored_text"].apply(whitespaces)
# The number of markup tokens
markups = lambda x:list(solve([ wikitext.revision.markups], cache={'datasource.revision.text': x}))[0]
df["markups"] = df["censored_text"].apply(markups)
# The number of Chinese/Japanese/Korean tokens
cjks = lambda x:list(solve([ wikitext.revision.cjks], cache={'datasource.revision.text': x}))[0]
df["cjks"] = df["censored_text"].apply(cjks)
# The number of HTML entity tokens
entities = lambda x:list(solve([ wikitext.revision.entities], cache={'datasource.revision.text': x}))[0]
df["entities"] = df["censored_text"].apply(entities)
# The number of URL tokens
urls = lambda x:list(solve([ wikitext.revision.urls], cache={'datasource.revision.text': x}))[0]
df["urls"] = df["censored_text"].apply(urls)
# The number of word tokens
words = lambda x:list(solve([ wikitext.revision.words], cache={'datasource.revision.text': x}))[0]
df["words"] = df["censored_text"].apply(words)
# The number of UPPERCASE word tokens
uppercase_words = lambda x:list(solve([ wikitext.revision.uppercase_words], cache={'datasource.revision.text': x}))[0]
df["uppercase_words"] = df["censored_text"].apply(uppercase_words)
# The number of punctuation tokens
punctuations = lambda x:list(solve([ wikitext.revision.punctuations], cache={'datasource.revision.text': x}))[0]
df["punctuations"] = df["censored_text"].apply(punctuations)
# The number of break tokens
breaks = lambda x:list(solve([ wikitext.revision.breaks], cache={'datasource.revision.text': x}))[0]
df["breaks"] = df["censored_text"].apply(breaks)
# The length of the longest token
longest_token = lambda x:list(solve([ wikitext.revision.longest_token], cache={'datasource.revision.text': x}))[0]
df["longest_token"] = df["censored_text"].apply(longest_token)
# The length of the longest word-token
longest_word = lambda x:list(solve([ wikitext.revision.longest_word], cache={'datasource.revision.text': x}))[0]
df["longest_word"] = df["censored_text"].apply(longest_word)
In [6]:
# The number of characters of viewable content (no markup or templates)
content_chars = lambda x:list(solve([ wikitext.revision.content_chars], cache={'datasource.revision.text': x}))[0]
df["content_chars"] = df["censored_text"].apply(content_chars)
# The number of headings
headings = lambda x:list(solve([ wikitext.revision.headings], cache={'datasource.revision.text': x}))[0]
df["headings"] = df["censored_text"].apply(headings)
# The number of external links
external_links = lambda x:list(solve([ wikitext.revision.external_links], cache={'datasource.revision.text': x}))[0]
df["external_links"] = df["censored_text"].apply(external_links)
# The number of wikilinks (internal to other pages in the wiki)
wikilinks = lambda x:list(solve([ wikitext.revision.wikilinks], cache={'datasource.revision.text': x}))[0]
df["wikilinks"] = df["censored_text"].apply(wikilinks)
# The number of HTML tags
tags = lambda x:list(solve([ wikitext.revision.tags], cache={'datasource.revision.text': x}))[0]
df["tags"] = df["censored_text"].apply(tags)
# The number of <ref> tags
ref_tags = lambda x:list(solve([ wikitext.revision.ref_tags], cache={'datasource.revision.text': x}))[0]
df["ref_tags"] = df["censored_text"].apply(ref_tags)
# The number of templates
templates = lambda x:list(solve([ wikitext.revision.templates], cache={'datasource.revision.text': x}))[0]
df["templates"] = df["censored_text"].apply(templates)
In [7]:
df["whitespace_chars_norm"] = df["whitespace_chars"] / df["chars"]
df["markup_chars_norm"] = df["markup_chars"] / df["chars"]
df["cjk_chars_norm"] = df["cjk_chars"] / df["chars"]
df["entity_chars_norm"] = df["entity_chars"] / df["chars"]
df["url_chars_norm"] = df["url_chars"] / df["chars"]
df["word_chars_norm"] = df["word_chars"] / df["chars"]
df["uppercase_word_chars_norm"] = df["uppercase_word_chars"] / df["chars"]
df["punctuation_chars_norm"] = df["punctuation_chars"] / df["chars"]
df["break_chars_norm"] = df["break_chars"] / df["chars"]
df["longest_repeated_char_norm"] = df["longest_repeated_char"] / df["chars"]
In [8]:
df["numbers_norm"] = df["numbers"] / df["tokens"]
df["whitespaces_norm"] = df["whitespaces"] / df["tokens"]
df["markups_norm"] = df["markups"] / df["tokens"]
df["cjks_norm"] = df["cjks"] / df["tokens"]
df["entities_norm"] = df["entities"] / df["tokens"]
df["urls_norm"] = df["urls"] / df["tokens"]
df["words_norm"] = df["words"] / df["tokens"]
df["uppercase_words_norm"] = df["uppercase_words"] / df["tokens"]
df["punctuations_norm"] = df["punctuations"] / df["tokens"]
df["breaks_norm"] = df["breaks"] / df["tokens"]
df["longest_token_norm"] = df["longest_token"] / df["tokens"]
In [9]:
### Recap the columns in the main dataframe
df.columns
Out[9]:
In [10]:
### We consider only the features we've defined above
features = df.columns[6:]
### The target is the draft quality label column
target = df.columns[4]
In [11]:
# Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# create a base classifier used to evaluate a subset of attributes
model = LogisticRegression()
# create the RFE model and select 4 attributes
rfe = RFE(model, n_features_to_select=4)
rfe = rfe.fit(df[features], df[target])
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)
features[rfe.support_]
Out[11]:
In [12]:
rfe.score(df[features], df[target])
Out[12]:
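The score above is computed on the same rows used to fit the model, so it is optimistic. A hedged sketch using RFECV, which picks the number of features by cross-validation instead of fixing it at 4:
In [ ]:
# Sketch: cross-validated recursive feature elimination.
from sklearn.feature_selection import RFECV
rfecv = RFECV(LogisticRegression(), step=1, cv=5)
rfecv.fit(df[features], df[target])
print(rfecv.n_features_)
features[rfecv.support_]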
In [13]:
# Feature Importance
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
# fit an Extra Trees model to the data
model = ExtraTreesClassifier()
model.fit(df[features], df[target])
# display the relative importance of each attribute
print(model.feature_importances_)
In [14]:
model.score(df[features], df[target])
Out[14]:
In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.08, random_state=0)
In [16]:
model2 = ExtraTreesClassifier()
model2.fit(X_train, y_train)
Out[16]:
In [17]:
model2.score(X_test,y_test)
Out[17]:
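With test_size=0.08 the held-out set is small, so a single split gives a noisy estimate. A cross-validation sketch over the same features and target:
In [ ]:
# Sketch: average accuracy over 5 stratified folds instead of one small hold-out split.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(ExtraTreesClassifier(), df[features], df[target], cv=5)
print(scores.mean(), scores.std())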
In [18]:
y_test
Out[18]:
In [19]:
df.describe()
Out[19]:
In [20]:
df.boxplot(by='draft_quality', column=['external_links', 'ref_tags', 'whitespaces_norm', 'longest_repeated_char_norm'], figsize=(15,15))
Out[20]:
In [21]:
X = df[features]
y = df[target]
# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
forest.fit(X, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()
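The ranking and the x-axis above use column positions rather than names. A small sketch that maps the indices back to the feature labels defined earlier:
In [ ]:
# Sketch: report the ranked features by name instead of by column index.
for rank, idx in enumerate(indices, start=1):
    print("%d. %s (%f)" % (rank, features[idx], importances[idx]))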
In [22]:
forest.score(X,y)
Out[22]:
In [23]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
modelKbest = SelectKBest(chi2, k=6)
fit = modelKbest.fit(df[features], df[target])
newFeatures = fit.transform(df[features])
print(fit.scores_)
#print(newFeatures[0:5,:])
print(fit.get_support())
features[fit.get_support()]
Out[23]:
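chi2 only accepts non-negative inputs, which these count and ratio features satisfy; if standardised or signed features are ever added, mutual information is a drop-in alternative. A sketch:
In [ ]:
# Sketch: mutual information handles any numeric features, not just non-negative ones.
from sklearn.feature_selection import mutual_info_classif
mi = SelectKBest(mutual_info_classif, k=6).fit(df[features], df[target])
features[mi.get_support()]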
In [28]:
from sklearn.decomposition import PCA
from sklearn import preprocessing
newf = preprocessing.scale(df[features])  # standardise (zero mean, unit variance) before PCA
pca = PCA(n_components=10)
pca.fit(newf)
print(pca.explained_variance_ratio_)
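To judge how many components are worth keeping, the cumulative explained variance is easier to read than the raw ratios. A plotting sketch over the 10 fitted components:
In [ ]:
# Sketch: cumulative explained variance of the fitted PCA components.
cumvar = np.cumsum(pca.explained_variance_ratio_)
plt.plot(range(1, len(cumvar) + 1), cumvar, marker="o")
plt.xlabel("Number of components")
plt.ylabel("Cumulative explained variance")
plt.show()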