In [3]:
# Notebook setup: IPython extensions and magics only; no Python state is defined here.
# autoreload in mode 2 re-imports modules before each cell runs, so edits to the
# project modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Presumably the source of the "time: ..." line recorded after every cell.
%load_ext autotime

from baselines import load_comments_and_labels, assemble_data, one_hot
from serialization import save_pipeline, load_pipeline
import os
import joblib


Using TensorFlow backend.

In [17]:
# Run configuration:
#   task    -> passed to load_comments_and_labels() when the data is loaded
#   cv_name -> name of the pipeline loaded from the cv directory; also the
#              last component of the results directory
task, cv_name = 'recipient', 'linear_word_oh'


time: 1.09 ms

In [18]:
# Load the comments and labels for the configured task using the project's
# baselines helper.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt raised inside
# the helper's read_csv call, so `data` used by later cells presumably comes from
# an earlier, uninterrupted execution. Re-run this cell to completion before
# trusting anything downstream.
data = load_comments_and_labels(task)


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-18-bb346be17671> in <module>()
      1 #load data
----> 2 data = load_comments_and_labels(task)

/Users/ellerywulczyn/detox/src/modeling/baselines.py in load_comments_and_labels(task)
    230     for split in splits:
    231         path = os.path.join(base_path, split, 'annotations.tsv')
--> 232         df = pd.read_csv(path, sep = '\t')
    233         #print(df.shape)
    234         #print(len(df['rev_id'].unique()))

/Users/ellerywulczyn/miniconda3/lib/python3.5/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    527                     skip_blank_lines=skip_blank_lines)
    528 
--> 529         return _read(filepath_or_buffer, kwds)
    530 
    531     parser_f.__name__ = name

/Users/ellerywulczyn/miniconda3/lib/python3.5/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    303         return parser
    304 
--> 305     return parser.read()
    306 
    307 _parser_defaults = {

/Users/ellerywulczyn/miniconda3/lib/python3.5/site-packages/pandas/io/parsers.py in read(self, nrows)
    761                 raise ValueError('skip_footer not supported for iteration')
    762 
--> 763         ret = self._engine.read(nrows)
    764 
    765         if self.options.get('as_recarray'):

/Users/ellerywulczyn/miniconda3/lib/python3.5/site-packages/pandas/io/parsers.py in read(self, nrows)
   1211     def read(self, nrows=None):
   1212         try:
-> 1213             data = self._reader.read(nrows)
   1214         except StopIteration:
   1215             if self._first_chunk:

KeyboardInterrupt: 
time: 12.4 s

In [ ]:
# Build the (inputs, targets) pair for each split from the loaded data.
# Both calls use the same 'comments' / 'empirical_dist' arguments and differ
# only in the split that is requested.
X_train, y_train = assemble_data(
    data, 'comments', 'empirical_dist', splits=['train']
)
X_dev, y_dev = assemble_data(
    data, 'comments', 'empirical_dist', splits=['dev']
)

In [30]:
# Upper bound on how many examples of each split are kept by the slicing cell.
n_max = 10 ** 4


time: 853 µs

In [31]:
# Keep at most n_max leading examples of each split. Inputs and targets are
# sliced with the same bound, one split per line.
X_train, y_train = X_train[:n_max], y_train[:n_max]
X_dev, y_dev = X_dev[:n_max], y_dev[:n_max]


time: 1.84 ms

In [32]:
# Directory the pipeline named `cv_name` is loaded from.
cv_path = '../../models/cv'
# Directory this notebook writes its saved pipelines to.
results_path = '../../models/%s/%s' % ('test', cv_name)

# exist_ok=True makes directory creation idempotent and removes the
# check-then-create race of `if not os.path.exists(...): os.makedirs(...)`.
# It also fails loudly (FileExistsError) if a regular file is sitting at
# results_path, instead of skipping creation and failing later on save.
os.makedirs(results_path, exist_ok=True)

# Load the pipeline stored under cv_path / cv_name.
# Presumably this is still unfit at this point: the next cell expects
# predict() to fail, and fit() is only called further down. TODO confirm.
model = load_pipeline(cv_path, cv_name)


time: 2.55 ms

In [33]:
# Sanity check, intentionally left disabled: calling predict() on the pipeline
# that was just loaded is expected to raise, because it has not been fit yet.
#model.predict(['hi']) #should error out. model not fit


time: 734 µs

In [34]:
# Save the pipeline into results_path under the name 'scaffold' so the next
# cell can load it back from there.
# Presumably the pipeline is still unfit here (fit() is called in a later cell).
save_pipeline(model, results_path, 'scaffold')


time: 1.64 ms

In [35]:
# Replace `model` with the copy read back from results_path / 'scaffold', so
# every later step runs on the reloaded pipeline, not the original object.
model = load_pipeline(results_path, 'scaffold')


time: 1.45 ms

In [36]:
# Fit the reloaded pipeline on the (truncated) training split.
# Rebinding `model` assumes fit() returns the fitted pipeline - TODO confirm.
model = model.fit(X_train, y_train)


time: 5.07 s

In [37]:
# Spot-check the freshly fit model on a single one-word comment.
# NOTE(review): the recorded output is array([0]); confirm that 0 is the expected
# prediction for this input.
model.predict(['fuck'])


Out[37]:
array([0])
time: 44.8 ms

In [23]:
# Save the fitted pipeline into results_path under the name 'model'.
# NOTE(review): in the saved notebook this cell and the ones after it carry
# execution counts 23-26, lower than the fit cell above (37). Their recorded
# outputs therefore come from an earlier run; restart the kernel and run all
# cells to refresh them.
save_pipeline(model, results_path, 'model')


time: 980 ms

In [24]:
# Read the pipeline saved as 'model' back from results_path and rebind `model`
# to it, so the predictions below exercise the reloaded copy.
model = load_pipeline(results_path, 'model')


time: 1.14 s

In [25]:
# Unlike the disabled pre-fit check, predict() on the reloaded pipeline is
# expected to work now; try it on a short greeting.
model.predict(['hi'])


Out[25]:
array([0])
time: 43.7 ms

In [26]:
# Same input as the spot check made right after fit(), so the two recorded
# outputs can be compared to confirm save/load preserved the model's behaviour.
model.predict(['fuck'])


Out[26]:
array([0])
time: 4.42 ms

In [ ]: