In [1]:
import datapot as dp
from datapot import datasets
In [2]:
from __future__ import print_function
import sys
import bz2
import time
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from datapot.utils import csv_to_jsonlines
In [3]:
transactions = pd.read_csv('../data/transactions.csv')
transactions.head()
Out[3]:
Creating the DataPot object.
In [4]:
datapot = dp.DataPot()
In [5]:
csv_to_jsonlines('../data/transactions.csv', '../data/transactions.jsonlines')
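csv_to_jsonlines converts the CSV into the JSON Lines format that DataPot consumes. For reference, an equivalent conversion can be sketched with plain pandas (an illustration of the format, not DataPot's implementation):
In [ ]:
# Sketch of an equivalent CSV -> JSON Lines conversion using pandas:
# one JSON object per line, one line per row.
pd.read_csv('../data/transactions.csv').to_json(
    '../data/transactions.jsonlines', orient='records', lines=True)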
In [6]:
data_trns = open('../data/transactions.jsonlines')
data_trns.readline()
Out[6]:
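Each line of a JSON Lines file is a standalone JSON object. A quick way to inspect the field names of the first record, using only the standard library:
In [ ]:
import json
# Parse the first line and list its keys (the fields DataPot will detect transformers for).
with open('../data/transactions.jsonlines') as f:
    print(sorted(json.loads(f.readline()).keys()))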
Let's call the detect method. It automatically finds appropriate transformers for the fields of the jsonlines file. The 'limit' parameter sets how many objects are read to pick the right transformers.
In [7]:
datapot.detect(data_trns, limit=100)
Out[7]:
In [8]:
t0 = time.time()
datapot.fit(data_trns, verbose=True)
print('fit time:', time.time()-t0)
In [9]:
datapot
Out[9]:
Let's remove the SVDOneHotTransformer from the 'merchant_id' field (transformer index 0).
In [10]:
datapot.remove_transformer('merchant_id', 0)
Out[10]:
In [11]:
t0 = time.time()
df_trns = datapot.transform(data_trns)
print('transform time:', time.time()-t0)
In [12]:
df_trns.head()
Out[12]:
Sentiment analysis on IMDB movie reviews: https://www.kaggle.com/c/word2vec-nlp-tutorial/data
The datapot.fit method subsamples the data to detect the language and choose the corresponding stopwords and stemmer.
For each review, datapot.transform generates an SVD-compressed 12-dimensional TF-IDF vector representation.
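Conceptually, this is a TF-IDF vectorizer followed by truncated SVD. A minimal scikit-learn sketch of the idea (the corpus below is illustrative, and the component count is shrunk to fit it; datapot uses 12 components on the real reviews):
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Illustrative only: a tiny corpus standing in for the IMDB reviews.
docs = ['a great movie', 'a terrible movie', 'great acting, great plot']

tfidf = TfidfVectorizer(stop_words='english')  # stopword choice mirrors datapot's language detection step
X_tfidf = tfidf.fit_transform(docs)

# Compress the sparse TF-IDF matrix to a fixed-size dense representation.
svd = TruncatedSVD(n_components=2)             # 2 here because the toy corpus is tiny; datapot uses 12
X_dense = svd.fit_transform(X_tfidf)
print(X_dense.shape)                           # (3, 2): one compressed vector per document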
In [13]:
import datapot as dp
from datapot import datasets
Load data from datapot.datasets
In [14]:
data_imdb = datasets.load_imdb()
Or load directly from file
In [15]:
data_imdb = bz2.BZ2File('data/imdb.jsonlines.bz2')
In [16]:
datapot_imdb = dp.DataPot()
In [17]:
t0 = time.time()
datapot_imdb.detect(data_imdb)
print('detect time:', time.time()-t0)
datapot_imdb
Out[17]:
In [18]:
datapot_imdb.remove_transformer('sentiment', 0)
Out[18]:
In [19]:
t0 = time.time()
datapot_imdb.fit(data_imdb, verbose=True)
Out[19]:
In [20]:
print('fit time:', time.time()-t0)
In [21]:
t0 = time.time()
df_imdb = datapot_imdb.transform(data_imdb)
print('transform time:', time.time()-t0)
In [22]:
df_imdb.head()
Out[22]:
In [23]:
X = df_imdb.drop(['sentiment'], axis=1)
y = df_imdb['sentiment']
In [24]:
model = xgb.XGBClassifier()
cv_score = cross_val_score(model, X, y, cv=5)
assert all(i > 0.5 for i in cv_score), 'Low score!'
print('Cross-val score:', cv_score)
model.fit(X, y)
fi = model.feature_importances_
print('Feature importance:')
print(*zip(X.columns, fi), sep='\n')
In [25]:
from datapot import datasets
data_job = datasets.load_job_salary()
# Or load from file:
# data_job = bz2.BZ2File('datapot/data/job.jsonlines.bz2')
In [26]:
datapot_job = dp.DataPot()
In [27]:
t0 = time.time()
datapot_job.detect(data_job)
print('detect time:', time.time()-t0)
datapot_job
Out[27]:
In [28]:
t0 = time.time()
datapot_job.fit(data_job, verbose=True)
print('fit time:', time.time()-t0)
In [29]:
t0 = time.time()
df_job = datapot_job.transform(data_job)
print('transform time:', time.time()-t0)
In [30]:
print(df_job.columns)
print(df_job.shape)
df_job.head()
Out[30]:
In [31]:
X_job = df_job.drop(['SalaryNormalized', 'Id'], axis=1)
y_job = pd.qcut(df_job['SalaryNormalized'], q=2, labels=[0, 1]).astype(int)  # median split into a binary target
model = xgb.XGBClassifier()
cv_score_job = cross_val_score(model, X_job, y_job, cv=5)
print('Cross-val score:', cv_score_job)
assert all(i > 0.5 for i in cv_score_job), 'Low score!'
model.fit(X_job, y_job)
fi_job = model.feature_importances_
print('Feature importance:')
print(*zip(X_job.columns, fi_job), sep='\n')
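Since pd.qcut splits SalaryNormalized at the median, the two classes should come out close to balanced; a quick sanity check:
In [ ]:
# Expect roughly equal counts for labels 0 and 1.
print(y_job.value_counts())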