In [1]:
import nltk
import glob
import spacy
from spacy.tokens import Doc
from spacy.vocab import Vocab
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
import numpy as np
from numpy.random import shuffle
nltk.download('punkt')


def nltk_corpus(corpus_name):
    corpus = getattr(nltk.corpus, corpus_name)
    try:
        corpus.ensure_loaded()
    except LookupError:
        # the corpus is not on disk yet; fetch it, then return the lazy loader
        nltk.download(corpus_name)
    return corpus

def flatten(L):
    return [item for sublist in L for item in sublist]


[nltk_data] Downloading package punkt to /home/deploy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
corpus = nltk_corpus('movie_reviews')

In [3]:
def processor():
    # texts arrive as lists of word strings; spaCy expects unicode tokens
    preprocess = lambda x: [unicode(i) for i in x]
    
    # reuse an already-loaded pipeline if one exists in the module globals,
    # otherwise load the English model (this is the slow step)
    if 'nlp' in globals() and 'Doc' in globals():
        nlp = globals()['nlp']
    else:
        import spacy
        from spacy.tokens import Doc
        nlp = spacy.load('en')
    
    vocab = nlp.vocab
    # build a Doc straight from the token list and return its document vector
    process = lambda text: Doc(vocab, words=preprocess(text)).vector
    return process
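
A note on what process returns: constructing Doc(vocab, words=...) builds a spaCy document directly from an already-tokenized word list (no tagger or parser is run), and Doc.vector is the average of the tokens' word vectors, so each review collapses to a single dense vector. A quick sketch of the expected input and output shape; the 300 dimensions assume the default English vectors:

    vec = process([u'a', u'truly', u'great', u'film'])
    print(vec.shape)   # e.g. (300,) with the default 'en' vectors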

In [4]:
process = processor()

In [5]:
ids = np.array(corpus.fileids())
shuffle(ids)
texts, labels = zip(*map(lambda x: (flatten(corpus.sents(x)),x.split("/")[0]), ids))
X = np.array(map(process, texts))
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size = .1)
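
The labels come straight from the corpus layout: movie_reviews file ids are of the form 'neg/cv000_29416.txt' or 'pos/cv000_29590.txt', so splitting on "/" and keeping the first piece yields the class. For example:

    fid = corpus.fileids()[0]     # e.g. 'neg/cv000_29416.txt'
    fid.split("/")[0]             # 'neg'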

In [6]:
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
predictions = gbc.predict(X_test)
print classification_report(y_test, predictions)


             precision    recall  f1-score   support

        neg       0.71      0.83      0.76        87
        pos       0.85      0.73      0.79       113

avg / total       0.79      0.78      0.78       200


In [7]:
import os
from boto.s3.connection import S3Connection
import pickle

AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']


def is_s3_cache_available():
    """
    Return True if a connection can be made to S3 in the current environment
    """
    try:
        S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        return True
    except Exception:
        print("WARNING: Unable to connect to s3")
        return False
        
def s3_key(key, new=False):
    """
    key is the S3 key in the ds_cache bucket.  This function returns a reference
    to the boto.s3.Key object corresponding to the key parameter.
    If new=True, create a new key.  Otherwise return an existing key.
    If the key doesn't exist, return None
    """
    s3 = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    cache_bucket = s3.get_bucket('ds_cache')
    if new:
        return cache_bucket.new_key(key)
    return cache_bucket.get_key(key)


def load_cache(filepath, key):
    """
    Loads file into local cache and returns the path.  Returns None if the file
    is not available.
    filepath is the full local path to the file.
    key is the S3 key to use in ds_cache bucket
    """
    if os.path.exists(filepath):
        print("file exists in cache")
        return filepath
    if is_s3_cache_available():
        remote = s3_key(key)
        if remote is not None:
            print("transferring from s3")
            remote.get_contents_to_filename(filepath)
            return filepath
    return None

def write_obj_to_cache(obj, filepath, key, use_s3=True):
    """
    Writes a python object to a file, and also stores that file in S3.
    filepath is the full local path to the file.
    key is the S3 key to use in ds_cache bucket
    """
    with open(filepath, "wb") as f:
        pickle.dump(obj, f)
    if use_s3:
        add_to_cache(filepath, key)
    
def add_to_cache(filepath, key):
    """
    Saves a local file to S3.
    filepath is the full local path to the file.
    key is the S3 key to use in ds_cache bucket
    """
    if is_s3_cache_available():
        print("saving to s3")
        s3_key(key, new=True).set_contents_from_filename(filepath)
        
        
def read_obj_from_cache(filepath, key):
    """
    Reads object from local cache.  Returns None if the file
    is not available.
    filepath is the full local path to the file.
    key is the S3 key to use in ds_cache bucket
    """
    in_cache = load_cache(filepath, key)
    if in_cache:
        return pickle.load(open(in_cache, "rb"))
    return None
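
Taken together, the helpers give a simple write-through cache: write_obj_to_cache pickles the object locally and pushes the file to the ds_cache bucket, and read_obj_from_cache reads the local copy if present, otherwise pulls the file down from S3 before unpickling. A minimal round trip, with a hypothetical key name:

    write_obj_to_cache({'a': 1}, '/tmp/example.pkl', 'example-key')
    obj = read_obj_from_cache('/tmp/example.pkl', 'example-key')
    # obj == {'a': 1}, served from the local file when available, otherwise from S3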

In [8]:
from ds.models.model import Model
import sklearn
sentiment = Model()
sentiment.package_new('sentiment'
                      ,gbc
                      ,type(gbc) 
                      ,sklearn.__version__
                      ,processor
                     )
sentiment.save()


/usr/local/lib/python2.7/dist-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

In [11]:
sentiment.comments.history


Out[11]:
[{'ID': 0,
  'comment': 'Created Thread',
  'name': 'SYSTEM',
  'ts': datetime.datetime(2017, 1, 6, 23, 37, 37, 551286)}]

In [10]:
sentiment.TextExplain('this is great')


Out[10]:
<lime.explanation.Explanation at 0x7fc4ee4af050>


In [17]:
model_obj['comments'].display_history()


Out[17]:
   ID                                             comment    name                          ts
0   0                                      Created Thread  SYSTEM  2017-01-06 21:20:38.123623
1   1  F1 score on training is 0.7, trained this on t...   Aaron  2017-01-06 21:20:38.123674

In [11]:
from ds.tools.Comment import Thread
c = Thread()
c.add_comment('F1 score on training is 0.7, trained this on the nltk movie reviews corpus', 'Aaron')

In [12]:
import sklearn
model_obj = {'model':gbc
             , 'type':type(gbc)
             , 'package_version':sklearn.__version__
             ,'preprocessing_dag':processor
             ,'comments':c
            }

In [18]:
import dill

# the cache helpers expect the S3 key name as a string
key = 'sentiment-model'
write_obj_to_cache(model_obj, 'sentiment-model.pkl', key)


saving to s3
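
Note that dill is imported in the cell above but write_obj_to_cache serializes with the standard pickle module, so dill only matters if you need to handle objects plain pickle rejects (e.g. lambdas or closures). A sketch of swapping it in, since dill.dump mirrors pickle.dump's signature:

    import dill
    with open('sentiment-model.pkl', 'wb') as f:
        dill.dump(model_obj, f)
    add_to_cache('sentiment-model.pkl', 'sentiment-model')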

In [34]:
gbc.predict_proba(process('I hate this movie'))


/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
Out[34]:
array([[ 0.51269181,  0.48730819]])
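
Two caveats about these spot checks. First, process was built to take a list of tokens (the training texts were flattened sentence/word lists), so passing a raw string iterates over it character by character and averages per-character vectors; a call like process('I hate this movie'.split()) would feed it word tokens instead, and would generally give different probabilities than shown here. Second, the DeprecationWarning is raised because a single 1-d vector is being passed where sklearn expects a 2-d (n_samples, n_features) array; reshaping silences it:

    vec = process('I hate this movie'.split())
    gbc.predict_proba(vec.reshape(1, -1))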

In [35]:
gbc.predict_proba(process('I love this movie'))


/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
Out[35]:
array([[ 0.68489638,  0.31510362]])

In [36]:
gbc.classes_


Out[36]:
array([u'neg', u'pos'], 
      dtype='<U3')
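
The columns returned by predict_proba follow the order of gbc.classes_, so the first column is the probability of 'neg' and the second of 'pos'. A small sketch that makes the pairing explicit:

    probs = gbc.predict_proba(process('The movie was fantastic'.split()).reshape(1, -1))[0]
    print(dict(zip(gbc.classes_, probs)))   # e.g. {u'neg': ..., u'pos': ...}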

In [39]:
gbc.predict_proba(process('The movie was fantastic'))


/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)
Out[39]:
array([[ 0.30757281,  0.69242719]])
