DataPot for Texts Example

Bag of Words Meets Bags of Popcorn

This example shows how datapot works with text data.

https://www.kaggle.com/c/word2vec-nlp-tutorial/data


In [1]:
from time import time

# Start the wall-clock timer before the datapot import, so the import cost is
# part of the elapsed time printed in the last cell.
start = time()
import datapot as dp

In [2]:
# Create the DataPot instance that the following cells configure, fit and apply.
datapot = dp.DataPot()

In [3]:
# Open the training data once; this same handle is passed to detect(), fit()
# and transform() in the cells below and is not closed in any cell shown here.
# NOTE(review): reusing one handle presumably relies on datapot rewinding the
# file between passes - confirm.
train_file = open('../data/imdbHTMLReviewsTrainData.jsonlines')

In [4]:
# Inspect the training data and let datapot propose transformers per field;
# the resulting DataPot summary is displayed as the cell output.
datapot.detect(train_file)


Out[4]:
DataPot class instance
 - number of features without transformation: 3
 - number of new features: Unknown
features to transform: 
	(u'review', [TfidfTransformer])
	(u'sentiment', [SVDOneHotTransformer, NumericTransformer])

To apply word2vec with the word vector model pre-trained on the Google News corpus (3 billion running words), download the model here:

https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download


In [5]:
from datapot.transformer.text_transformer import Word2VecTransformer

In [6]:
# Attach a Word2Vec transformer to the 'review' field, alongside the
# TfidfTransformer that detect() already assigned to it.
# NOTE(review): the argument is a bare relative file name; presumably it must
# point at the downloaded model archive - confirm where it is resolved from.
datapot.add_transformer('review', Word2VecTransformer('GoogleNews-vectors-negative300.bin.gz'))


Out[6]:
DataPot class instance
 - number of features without transformation: 3
 - number of new features: Unknown
features to transform: 
	(u'review', [TfidfTransformer, Word2VecTransformer])
	(u'sentiment', [SVDOneHotTransformer, NumericTransformer])

In [7]:
from datapot.transformer.identity_transformer import IdentityTransformer

# detect() did not list 'id' among the features to transform; register it
# explicitly so that an 'id' column appears in the transformed frame.
datapot.add_transformer('id', IdentityTransformer())


Out[7]:
DataPot class instance
 - number of features without transformation: 3
 - number of new features: Unknown
features to transform: 
	(u'review', [TfidfTransformer, Word2VecTransformer])
	(u'id', [IdentityTransformer])
	(u'sentiment', [SVDOneHotTransformer, NumericTransformer])

In [8]:
# Display the current configuration again (same summary as the previous cell's output).
datapot


Out[8]:
DataPot class instance
 - number of features without transformation: 3
 - number of new features: Unknown
features to transform: 
	(u'review', [TfidfTransformer, Word2VecTransformer])
	(u'id', [IdentityTransformer])
	(u'sentiment', [SVDOneHotTransformer, NumericTransformer])

In [9]:
# Drop the transformer at index 0 for 'sentiment' and for 'review'. Per the
# summaries shown before and after this cell, that removes SVDOneHotTransformer
# and TfidfTransformer, leaving NumericTransformer and Word2VecTransformer.
datapot.remove_transformer('sentiment', 0).remove_transformer('review', 0)


Out[9]:
DataPot class instance
 - number of features without transformation: 3
 - number of new features: Unknown
features to transform: 
	(u'review', [Word2VecTransformer])
	(u'id', [IdentityTransformer])
	(u'sentiment', [NumericTransformer])

In [10]:
# Fit the configured transformers on the training data. After fitting, the
# summary reports 302 new features instead of "Unknown".
# NOTE(review): the long attribute list printed by this cell looks like
# leftover debug output from the library - confirm and remove upstream.
datapot.fit(train_file)


['__class__', '__contains__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__getitem__', '__hash__', '__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_load_specials', '_save_specials', '_smart_save', 'accuracy', 'doesnt_match', 'evaluate_word_pairs', 'index2word', 'init_sims', 'load', 'load_word2vec_format', 'log_accuracy', 'log_evaluate_word_pairs', 'most_similar', 'most_similar_cosmul', 'n_similarity', 'save', 'save_word2vec_format', 'similar_by_vector', 'similar_by_word', 'similarity', 'syn0', 'syn0norm', 'vocab', 'wmdistance', 'word_vec']
Out[10]:
DataPot class instance
 - number of features without transformation: 3
 - number of new features: 302
features to transform: 
	(u'review', [Word2VecTransformer])
	(u'id', [IdentityTransformer])
	(u'sentiment', [NumericTransformer])

In [11]:
# Build the feature DataFrame for the training data with the fitted transformers.
# The FutureWarning shown below originates inside datapot (its use of
# convert_objects), not from this cell's own code.
df = datapot.transform(train_file)


/home/yury/anaconda3/envs/py2.7/lib/python2.7/site-packages/datapot/__init__.py:137: FutureWarning: convert_objects is deprecated.  Use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  return pd.DataFrame(data=np.hstack(columns), columns=names).convert_objects(convert_numeric=True)

In [12]:
# Preview the result: 300 review_w2c_* columns plus 'id' and 'sentiment' (302 columns in total).
df.head()


Out[12]:
review_w2c_0 review_w2c_1 review_w2c_2 review_w2c_3 review_w2c_4 review_w2c_5 review_w2c_6 review_w2c_7 review_w2c_8 review_w2c_9 ... review_w2c_292 review_w2c_293 review_w2c_294 review_w2c_295 review_w2c_296 review_w2c_297 review_w2c_298 review_w2c_299 id sentiment
0 0.052496 0.030112 0.047812 0.116629 -0.063779 0.011063 0.039467 -0.074222 0.079232 0.050276 ... -0.079211 0.026400 -0.063537 -0.017119 0.007378 -0.044263 0.034460 -0.018368 5814_8 1
1 0.055470 0.047159 0.020710 0.103501 -0.051608 -0.006560 0.048293 -0.070138 0.054732 0.058845 ... -0.080714 0.027905 -0.041170 -0.009959 0.015125 -0.031857 0.047146 -0.014142 2381_9 1
2 0.034234 0.053978 0.008341 0.073953 -0.061673 -0.004418 0.014665 -0.075994 0.053972 0.078901 ... -0.075499 0.049075 -0.041116 -0.007737 0.008588 -0.035842 0.023027 0.007629 7759_3 0
3 0.053569 0.026639 0.028236 0.090700 -0.072726 0.004725 0.053708 -0.081408 0.068755 0.075209 ... -0.070612 0.025403 -0.026793 0.004640 0.015853 -0.027212 0.050098 -0.001557 3630_4 0
4 0.059918 0.026791 0.030766 0.078921 -0.072972 0.001130 0.042963 -0.068914 0.067305 0.075777 ... -0.077818 0.022897 -0.046225 -0.000431 0.022938 -0.051477 0.001565 0.006204 9495_8 1

5 rows × 302 columns


In [13]:
# Separate the model inputs from the target: every column except the
# label and the identifier is used as a feature.
non_feature_columns = ['sentiment', 'id']
X = df.drop(non_feature_columns, axis=1)
y = df['sentiment']

In [14]:
from xgboost import XGBClassifier

# Train an XGBoost classifier with its default settings on the word2vec
# features; the value returned by fit() is kept as the trained model.
clf = XGBClassifier().fit(X, y)


/home/yury/anaconda3/envs/py2.7/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [15]:
# Transform the test data with the already-fitted datapot. The handle is only
# needed for this single call (transform returns a fully built DataFrame), so
# it is opened in a `with` block to guarantee that it is closed afterwards.
# NOTE(review): the training file is read from '../data/' while this path is
# 'data/' - confirm which directory is intended.
with open('data/imdbHTMLReviewsTestData.jsonlines') as test_f:
    test_df = datapot.transform(test_f)

In [16]:
# Keep the identifiers aside and strip the same two non-feature columns that
# were removed from the training frame.
test_id = test_df['id']
X_test = test_df.drop(['id', 'sentiment'], axis=1)

In [17]:
import pandas as pd

# Assemble the submission table (identifier + predicted label) in one step
# and write it to CSV without the index column.
answer_df = pd.DataFrame(
    {'id': test_df['id'], 'sentiment': clf.predict(X_test)},
    columns=['id', 'sentiment']
)
answer_df.to_csv("baseline_answer.csv", index=False)

In [18]:
# Total wall-clock seconds elapsed since `start` was set in the first cell.
print(time() - start)


275.767914057