In [1]:
    
import datapot as dp
from datapot import datasets
    
In [2]:
    
from __future__ import print_function  # __future__ imports must precede any other statement in the cell
import sys
import bz2
import time

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import cross_val_score
    
Usage example for unstructured, bzip2-compressed textual data.
The datapot.fit method subsamples the data to detect the language and choose the corresponding stopwords and stemming.
For each review, datapot.transform generates an SVD-compressed 12-dimensional TF-IDF vector representation.
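In condensed form (the cells below demonstrate each step separately; variable names here are illustrative), the pipeline is roughly:

data = datasets.load_imdb()
pot = dp.DataPot()
pot.detect(data)                # scan a sample and assign transformers to the fields
pot.fit(data)                   # learn the transformations (language, TF-IDF vocabulary, SVD)
features = pot.transform(data)  # pandas DataFrame of numeric features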
In [3]:
    
data_imdb = datasets.load_imdb()
    
In [4]:
    
datapot_imdb = dp.DataPot()
    
In [5]:
    
t0 = time.time()
datapot_imdb.detect(data_imdb)
print('detect time:', time.time()-t0)
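# the cell output (the DataPot repr) summarizes the detected fields and the transformers assigned to them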
datapot_imdb
    
    
In [6]:
    
datapot_imdb.remove_transformer('sentiment', 0)
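# removes the first (index 0) transformer that was auto-detected for the target field 'sentiment',
# so the label is kept as a plain column rather than being transformed into features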
    
In [7]:
    
t0 = time.time()
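# fit subsamples the text to detect the language, choose stopwords/stemming,
# and learn the TF-IDF + SVD transformation described above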
datapot_imdb.fit(data_imdb, verbose=True)
    
    
In [8]:
    
print('fit time:', time.time()-t0)
    
    
In [9]:
    
t0 = time.time()
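# transform applies the fitted transformers and returns a pandas DataFrame of numeric features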
df_imdb = datapot_imdb.transform(data_imdb)
print('transform time:', time.time()-t0)
    
    
    
In [10]:
    
df_imdb.head()
    
In [11]:
    
X = df_imdb.drop(['sentiment'], axis=1)
y = df_imdb['sentiment']
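# 'sentiment' is the binary target; the remaining columns are the SVD-compressed TF-IDF features of the review text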
    
In [12]:
    
model = xgb.XGBClassifier()
cv_score = cross_val_score(model, X, y, cv=5)
assert all(i > 0.5 for i in cv_score), 'Low score!'
print('Cross-val score:', cv_score)
model.fit(X, y)
fi = model.feature_importances_
print('Feature importance:')
print(*(list(zip(X.columns, fi))), sep='\n')
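# the same importances, sorted from most to least important (plain Python, shown only for readability)
for name, score in sorted(zip(X.columns, fi), key=lambda p: p[1], reverse=True):
    print(name, score)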
    
    
In [13]:
    
data_job = datasets.load_job_salary()
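# second example: job postings from the job-salary dataset; the target used below is SalaryNormalized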
    
In [14]:
    
datapot_job = dp.DataPot()
    
In [15]:
    
t0 = time.time()
datapot_job.detect(data_job)
print('detect time:', time.time()-t0)
datapot_job
    
    
In [16]:
    
t0 = time.time()
datapot_job.fit(data_job, verbose=True)
print('fit time:', time.time()-t0)
    
    
In [17]:
    
t0 = time.time()
df_job = datapot_job.transform(data_job)
print('transform time:', time.time()-t0)
    
    
    
In [18]:
    
print(df_job.columns)
print(df_job.shape)
df_job.head()
    
    
In [19]:
    
# drop the target and the Id column, keeping only the generated features
X_job = df_job.drop(['SalaryNormalized', 'Id'], axis=1)
# binary target: is the salary above the median? (qcut with q=2 splits at the median)
y_job = pd.qcut(df_job['SalaryNormalized'], q=2, labels=[0, 1]).astype(int)
model = xgb.XGBClassifier()
cv_score_job = cross_val_score(model, X_job, y_job, cv=5)
print('Cross-val score:', cv_score_job)
assert all(i > 0.5 for i in cv_score_job), 'Low score!'
model.fit(X_job, y_job)
fi_job = model.feature_importances_
print('Feature importance:')
print(*(list(zip(X_job.columns, fi_job))), sep='\n')
    
    
In [ ]: