In [1]:
import datapot as dp
from datapot import datasets
In [2]:
import pandas as pd
from __future__ import print_function
import sys
import bz2
import time
import xgboost as xgb
from sklearn.model_selection import cross_val_score
Usage example for unstructured textual bzip2-compressed data
The datapot.fit method subsamples the data to detect the language and choose the corresponding stopwords and stemming.
For each review, datapot.transform generates an SVD-compressed 12-dimensional TF-IDF vector representation.
In [3]:
# Load the IMDB example data through datapot's `datasets` helper.
# NOTE(review): the returned object's type is not visible here; it is passed
# unchanged to DataPot.detect / fit / transform below.
data_imdb = datasets.load_imdb()
In [4]:
# Fresh DataPot instance dedicated to the IMDB data; detect(), fit() and
# transform() are called on it in the following cells.
datapot_imdb = dp.DataPot()
In [5]:
# Run DataPot.detect on the IMDB data and report its wall-clock duration.
t0 = time.time()
datapot_imdb.detect(data_imdb)
detect_elapsed = time.time() - t0
print('detect time:', detect_elapsed)
# Trailing bare expression: the notebook displays the DataPot's repr.
datapot_imdb
Out[5]:
In [6]:
# NOTE(review): presumably drops the transformer at index 0 for the
# 'sentiment' field so the target column is not feature-engineered — confirm
# against DataPot.remove_transformer. 'sentiment' is used as the target later.
datapot_imdb.remove_transformer('sentiment', 0)
Out[6]:
In [7]:
t0 = time.time()
datapot_imdb.fit(data_imdb, verbose=True)
Out[7]:
In [8]:
print('fit time:', time.time()-t0)
In [9]:
# Transform the IMDB data with the fitted DataPot and report how long it took.
t0 = time.time()
df_imdb = datapot_imdb.transform(data_imdb)
transform_elapsed = time.time() - t0
print('transform time:', transform_elapsed)
In [10]:
# Preview the first rows of the transformed IMDB frame.
df_imdb.head()
Out[10]:
In [11]:
# Separate the 'sentiment' target from the remaining feature columns.
y = df_imdb['sentiment']
X = df_imdb.drop('sentiment', axis='columns')
In [12]:
# Sanity-check the generated IMDB features: 5-fold cross-validation of a
# default XGBoost classifier, every fold must score above 0.5.
model = xgb.XGBClassifier()
cv_score = cross_val_score(model, X, y, cv=5)
# Print before asserting so the scores remain visible when the check fails.
print('Cross-val score:', cv_score)
assert all(i > 0.5 for i in cv_score), 'Low score!'

# Refit on all rows and list each feature next to its importance.
model.fit(X, y)
fi = model.feature_importances_
print('Feature importance:')
print(*zip(X.columns, fi), sep='\n')
In [13]:
# Load the job-salary example data through datapot's `datasets` helper.
# NOTE(review): the returned object's type is not visible here; it is passed
# unchanged to DataPot.detect / fit / transform below.
data_job = datasets.load_job_salary()
In [14]:
# Separate DataPot instance for the job-salary data, independent of the one
# used for the IMDB data.
datapot_job = dp.DataPot()
In [15]:
# Run DataPot.detect on the job-salary data and report its wall-clock duration.
t0 = time.time()
datapot_job.detect(data_job)
detect_elapsed = time.time() - t0
print('detect time:', detect_elapsed)
# Trailing bare expression: the notebook displays the DataPot's repr.
datapot_job
Out[15]:
In [16]:
# Fit the DataPot on the job-salary data; start and end timestamps are taken
# in this same cell so the printed duration covers only the fit call.
t0 = time.time()
datapot_job.fit(data_job, verbose=True)
fit_elapsed = time.time() - t0
print('fit time:', fit_elapsed)
In [17]:
# Transform the job-salary data with the fitted DataPot and report the duration.
t0 = time.time()
df_job = datapot_job.transform(data_job)
transform_elapsed = time.time() - t0
print('transform time:', transform_elapsed)
In [18]:
# Show the generated column names and the frame's shape (one per line),
# then preview the first rows as the cell output.
print(df_job.columns, df_job.shape, sep='\n')
df_job.head()
Out[18]:
In [19]:
# Features: every column except the salary target and 'Id'
# (presumably a row identifier — confirm against the dataset).
X_job = df_job.drop(['SalaryNormalized', 'Id'], axis=1)
# Turn the salary into a binary target by splitting it into two quantile
# bins (q=2): 0 = lower bin, 1 = upper bin. labels=False makes qcut return
# the integer bin codes directly, so the target's type no longer depends on
# what Categorical.ravel() returns in the installed pandas version.
y_job = pd.qcut(df_job['SalaryNormalized'].values, q=2, labels=False)

# Sanity-check the generated features: 5-fold cross-validation of a default
# XGBoost classifier, every fold must score above 0.5.
model = xgb.XGBClassifier()
cv_score_job = cross_val_score(model, X_job, y_job, cv=5)
print('Cross-val score:', cv_score_job)
assert all(i > 0.5 for i in cv_score_job), 'Low score!'

# Refit on all rows and list each feature next to its importance.
model.fit(X_job, y_job)
fi_job = model.feature_importances_
print('Feature importance:')
print(*zip(X_job.columns, fi_job), sep='\n')
In [ ]: