DataPot Usage Examples


In [1]:
import datapot as dp
from datapot import datasets

In [2]:
import pandas as pd
from __future__ import print_function
import sys
import bz2
import time
import xgboost as xgb
from sklearn.model_selection import cross_val_score

Bag of Words Meets Bags of Popcorn

Usage example for unstructured textual bzip2-compressed data

datapot.fit method subsamples the data to detect language and choose corresponding stopwords and stemming.

For each review datapot.transform generates an SVD-compressed 12-dimensional tfidf-vector representation.


In [3]:
data_imdb = datasets.load_imdb()
# imdb.jsonlines example: {"id":"5814_8", "sentiment":1, "review":"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay..."}

In [4]:
datapot_imdb = dp.DataPot()

In [5]:
t0 = time.time()
datapot_imdb.detect(data_imdb)
print('detect time:', time.time()-t0)
datapot_imdb


detect time: 0.16987323760986328
Out[5]:
DataPot class instance
 - number of features without transformation: 3
 - number of new features: Unknown
features to transform: 
	('id', [NumericTransformer])
	('sentiment', [SVDOneHotTransformer, NumericTransformer])
	('review', [TfidfTransformer])

In [6]:
datapot_imdb.remove_transformer('sentiment', 0)


Out[6]:
DataPot class instance
 - number of features without transformation: 3
 - number of new features: Unknown
features to transform: 
	('id', [NumericTransformer])
	('sentiment', [NumericTransformer])
	('review', [TfidfTransformer])

In [7]:
t0 = time.time()
datapot_imdb.fit(data_imdb, verbose=True)


fit transformers...
fit: ('id', [NumericTransformer])
fit: ('sentiment', [NumericTransformer])
fit: ('review', [TfidfTransformer])
fit transformers...OK
num of new features: 14
Out[7]:
DataPot class instance
 - number of features without transformation: 3
 - number of new features: 14
features to transform: 
	('id', [NumericTransformer])
	('sentiment', [NumericTransformer])
	('review', [TfidfTransformer])

In [8]:
print('fit time:', time.time()-t0)


fit time: 4.025353908538818

In [9]:
t0 = time.time()
df_imdb = datapot_imdb.transform(data_imdb)
print('transform time:', time.time()-t0)


transform time: 2.889500856399536
/usr/local/lib/python3.6/site-packages/datapot/__init__.py:137: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  return pd.DataFrame(data=np.hstack(columns), columns=names).convert_objects(convert_numeric=True)

In [10]:
df_imdb.head()


Out[10]:
id sentiment review_0 review_1 review_2 review_3 review_4 review_5 review_6 review_7 review_8 review_9 review_10 review_11
0 58148.0 1.0 0.033875 0.066193 0.045845 0.000000 0.030801 0.117007 0.039461 0.034391 0.013357 0.045964 0.110228 0.013034
1 23819.0 1.0 0.063596 0.000000 0.021531 0.005730 0.019549 0.021693 0.042250 0.075540 0.014507 0.000000 0.013740 0.000000
2 77593.0 0.0 0.097346 0.018320 0.003082 0.007248 0.000000 0.000000 0.020481 0.000000 0.173069 0.005666 0.000000 0.000000
3 36304.0 0.0 0.126593 0.035629 0.011719 0.006712 0.000000 0.027863 0.082539 0.053115 0.054410 0.001494 0.008290 0.000000
4 94958.0 1.0 0.064116 0.000286 0.010714 0.039639 0.000355 0.034792 0.001935 0.016130 0.118334 0.045972 0.022172 0.001082

In [11]:
X = df_imdb.drop(['sentiment'], axis=1)
y = df_imdb['sentiment']

In [12]:
model = xgb.XGBClassifier()
cv_score = cross_val_score(model, X, y, cv=5)
assert all(i > 0.5 for i in cv_score), 'Low score!'
print('Cross-val score:', cv_score)

model.fit(X, y)
fi = model.feature_importances_

print('Feature importance:')
print(*(list(zip(X.columns, fi))), sep='\n')


Cross-val score: [ 0.72027972  0.73426573  0.734       0.73273273  0.70770771]
Feature importance:
('id', 0.16571428)
('review_0', 0.047619049)
('review_1', 0.079999998)
('review_2', 0.049523808)
('review_3', 0.051428571)
('review_4', 0.068571426)
('review_5', 0.053333335)
('review_6', 0.068571426)
('review_7', 0.09142857)
('review_8', 0.13142857)
('review_9', 0.085714288)
('review_10', 0.057142857)
('review_11', 0.049523808)

Job Salary Prediction

Usage example for unstructured textual bzip2-compressed data


In [13]:
data_job = datasets.load_job_salary()
# jobs.jsonlines example: {"Id":12612628, "Title":"Engineering Systems Analyst","FullDescription":"Engineering Systems Analyst Dorking Surrey Salary ****K Our client is located in Dorking, Surrey and are looking for Engineering Systems Analyst our client provides specialist software development Keywords Mathematical Modelling, Risk Analysis, System Modelling, Optimisation, MISER, PIONEEER Engineering Systems Analyst Dorking Surrey Salary ****K", "LocationNormalized":"Dorking", "ContractType":null, "ContractTime":"permanent", "Company":"Gregory Martin International", "Category":"Engineering Jobs", "SalaryNormalized":25000}

In [14]:
datapot_job = dp.DataPot()

In [15]:
t0 = time.time()
datapot_job.detect(data_job)
print('detect time:', time.time()-t0)
datapot_job


detect time: 0.04318118095397949
Out[15]:
DataPot class instance
 - number of features without transformation: 9
 - number of new features: Unknown
features to transform: 
	('Id', [NumericTransformer])
	('FullDescription', [TfidfTransformer])
	('ContractType', [SVDOneHotTransformer])
	('ContractTime', [SVDOneHotTransformer])
	('Category', [SVDOneHotTransformer])
	('SalaryNormalized', [NumericTransformer])

In [16]:
t0 = time.time()
datapot_job.fit(data_job, verbose=True)
print('fit time:', time.time()-t0)


fit transformers...
fit: ('Id', [NumericTransformer])
fit: ('FullDescription', [TfidfTransformer])
fit: ('ContractType', [SVDOneHotTransformer])
fit: ('ContractTime', [SVDOneHotTransformer])
fit: ('Category', [SVDOneHotTransformer])
fit: ('SalaryNormalized', [NumericTransformer])
fit transformers...OK
num of new features: 38
fit time: 2.232983112335205

In [17]:
t0 = time.time()
df_job = datapot_job.transform(data_job)
print('transform time:', time.time()-t0)


transform time: 1.4404969215393066
/usr/local/lib/python3.6/site-packages/datapot/__init__.py:137: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  return pd.DataFrame(data=np.hstack(columns), columns=names).convert_objects(convert_numeric=True)

In [18]:
print(df_job.columns)
print(df_job.shape)
df_job.head()


Index(['Id', 'FullDescription_0', 'FullDescription_1', 'FullDescription_2',
       'FullDescription_3', 'FullDescription_4', 'FullDescription_5',
       'FullDescription_6', 'FullDescription_7', 'FullDescription_8',
       'FullDescription_9', 'FullDescription_10', 'FullDescription_11',
       'ContractType_None', 'ContractType_full_time', 'ContractType_part_time',
       'ContractTime_permanent', 'ContractTime_None', 'ContractTime_contract',
       'Category_Engineering Jobs', 'Category_HR & Recruitment Jobs',
       'Category_Accounting & Finance Jobs',
       'Category_Healthcare & Nursing Jobs', 'Category_Other/General Jobs',
       'Category_Hospitality & Catering Jobs', 'Category_IT Jobs',
       'Category_Customer Services Jobs', 'Category_Travel Jobs',
       'Category_Sales Jobs', 'Category_Manufacturing Jobs',
       'Category_Teaching Jobs', 'Category_Creative & Design Jobs',
       'Category_Trade & Construction Jobs', 'Category_Property Jobs',
       'Category_Admin Jobs', 'Category_Legal Jobs', 'Category_Retail Jobs',
       'SalaryNormalized'],
      dtype='object')
(2000, 38)
Out[18]:
Id FullDescription_0 FullDescription_1 FullDescription_2 FullDescription_3 FullDescription_4 FullDescription_5 FullDescription_6 FullDescription_7 FullDescription_8 ... Category_Sales Jobs Category_Manufacturing Jobs Category_Teaching Jobs Category_Creative & Design Jobs Category_Trade & Construction Jobs Category_Property Jobs Category_Admin Jobs Category_Legal Jobs Category_Retail Jobs SalaryNormalized
0 12612628.0 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.149956 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 25000.0
1 12612830.0 0.013061 0.000000 0.0 0.007217 0.010779 0.016543 0.0 0.221581 0.016949 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 30000.0
2 12612844.0 0.040344 0.000187 0.0 0.000000 0.003482 0.000267 0.0 0.097930 0.011786 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 30000.0
3 12613049.0 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.142673 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 27500.0
4 12613647.0 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.0 0.116688 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 25000.0

5 rows × 38 columns


In [19]:
X_job = df_job.drop(['SalaryNormalized', 'Id'], axis=1)
y_job = pd.qcut(df_job['SalaryNormalized'].values, q=2, labels=[0,1]).ravel()

model = xgb.XGBClassifier()
cv_score_job = cross_val_score(model, X_job, y_job, cv=5)
print('Cross-val score:', cv_score_job)
assert all(i > 0.5 for i in cv_score_job), 'Low score!'

model.fit(X_job, y_job)
fi_job = model.feature_importances_

print('Feature importance:')
print(*(list(zip(X_job.columns, fi_job))), sep='\n')


Cross-val score: [ 0.71321696  0.83541147  0.71        0.72932331  0.72681704]
Feature importance:
('FullDescription_0', 0.061016951)
('FullDescription_1', 0.13898306)
('FullDescription_2', 0.084745765)
('FullDescription_3', 0.04576271)
('FullDescription_4', 0.038983051)
('FullDescription_5', 0.094915256)
('FullDescription_6', 0.14745763)
('FullDescription_7', 0.066101693)
('FullDescription_8', 0.079661019)
('FullDescription_9', 0.074576274)
('FullDescription_10', 0.044067796)
('FullDescription_11', 0.055932205)
('ContractType_None', 0.0)
('ContractType_full_time', 0.018644068)
('ContractType_part_time', 0.016949153)
('ContractTime_permanent', 0.010169491)
('ContractTime_None', 0.0016949152)
('ContractTime_contract', 0.0)
('Category_Engineering Jobs', 0.0)
('Category_HR & Recruitment Jobs', 0.0)
('Category_Accounting & Finance Jobs', 0.0)
('Category_Healthcare & Nursing Jobs', 0.0)
('Category_Other/General Jobs', 0.0)
('Category_Hospitality & Catering Jobs', 0.0)
('Category_IT Jobs', 0.0084745763)
('Category_Customer Services Jobs', 0.0084745763)
('Category_Travel Jobs', 0.0033898305)
('Category_Sales Jobs', 0.0)
('Category_Manufacturing Jobs', 0.0)
('Category_Teaching Jobs', 0.0)
('Category_Creative & Design Jobs', 0.0)
('Category_Trade & Construction Jobs', 0.0)
('Category_Property Jobs', 0.0)
('Category_Admin Jobs', 0.0)
('Category_Legal Jobs', 0.0)
('Category_Retail Jobs', 0.0)

In [ ]: