In [1]:
import nltk
import glob
import spacy
from spacy.tokens import Doc
from spacy.vocab import Vocab
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
import numpy as np
from numpy.random import shuffle
nltk.download('punkt')
def nltk_corpus(corpus_name):
    """Return an NLTK corpus reader, downloading the corpus first if it is missing."""
    corpus = getattr(nltk.corpus, corpus_name)
    try:
        corpus.ensure_loaded()
    except LookupError:
        nltk.download(corpus_name)
    return corpus

def flatten(L):
    """Flatten a list of lists into a single list."""
    return [item for sublist in L for item in sublist]
In [2]:
corpus = nltk_corpus('movie_reviews')
In [3]:
def processor():
    """Build a function that maps a list of word tokens to its mean spaCy word vector."""
    preprocess = lambda x: [unicode(i) for i in x]
    # Reuse a spaCy pipeline already loaded in this notebook, if there is one
    if 'spacy' in globals() and 'nlp' in globals() and 'Doc' in globals():
        nlp = globals()['nlp']
    else:
        import spacy
        from spacy.tokens import Doc
        nlp = spacy.load('en')
    vocab = nlp.vocab
    # Doc(vocab, words=...) builds a Doc from pre-tokenized words; .vector is the mean of their embeddings
    process = lambda text: Doc(vocab, words=preprocess(text)).vector
    return process
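Note that process expects a list of already-tokenized words (the training texts below are flattened corpus sentences); a raw string would be iterated character by character. A quick sanity check, assuming the 'en' model's word vectors are installed (typically 300-dimensional, depending on the model):

vec = process(['this', 'movie', 'was', 'great'])
print vec.shape  # mean word vector for the token list, e.g. (300,)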
In [4]:
process = processor()
In [5]:
ids = np.array(corpus.fileids())
shuffle(ids)
# Each fileid looks like 'pos/<filename>.txt'; the directory prefix is the sentiment label
texts, labels = zip(*map(lambda x: (flatten(corpus.sents(x)), x.split("/")[0]), ids))
X = np.array(map(process, texts))
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=.1)
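Each review is now a single dense vector (the mean of its token vectors), so X has one row per review; a quick shape check, with the column count depending on the installed spaCy vectors:

print X.shape              # (2000, vector_dim) -- movie_reviews has 1000 pos and 1000 neg files
print len(X_train), len(X_test)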
In [6]:
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
predictions = gbc.predict(X_test)
print classification_report(y_test, predictions)
In [7]:
import os
import pickle
from boto.s3.connection import S3Connection

AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']

def is_s3_cache_available():
    """
    Return True if a connection can be made to S3 in the current environment.
    """
    try:
        S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
        return True
    except Exception:
        print("WARNING: Unable to connect to s3")
        return False

def s3_key(key, new=False):
    """
    key is the S3 key in the ds_cache bucket. This function returns a reference
    to the boto.s3.Key object corresponding to the key parameter.
    If new=True, create a new key. Otherwise return an existing key.
    If the key doesn't exist, return None.
    """
    s3 = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    cache_bucket = s3.get_bucket('ds_cache')
    if new:
        return cache_bucket.new_key(key)
    return cache_bucket.get_key(key)

def load_cache(filepath, key):
    """
    Load the file into the local cache and return its path. Return None if the
    file is not available locally or on S3.
    filepath is the full local path to the file.
    key is the S3 key to use in the ds_cache bucket.
    """
    if os.path.exists(filepath):
        print("file exists in cache")
        return filepath
    if is_s3_cache_available():
        if s3_key(key) is not None:
            print("transferring from s3")
            s3_key(key).get_contents_to_filename(filepath)
            return filepath
    return None

def write_obj_to_cache(obj, filepath, key, use_s3=True):
    """
    Write a Python object to a local file and, if use_s3 is True, store that
    file in S3 as well.
    filepath is the full local path to the file.
    key is the S3 key to use in the ds_cache bucket.
    """
    pickle.dump(obj, open(filepath, "wb"))
    if use_s3:
        add_to_cache(filepath, key)

def add_to_cache(filepath, key):
    """
    Save a local file to S3.
    filepath is the full local path to the file.
    key is the S3 key to use in the ds_cache bucket.
    """
    if is_s3_cache_available():
        print("saving to s3")
        s3_key(key, new=True).set_contents_from_filename(filepath)

def read_obj_from_cache(filepath, key):
    """
    Read an object back from the local cache, fetching from S3 if needed.
    Return None if the file is not available.
    filepath is the full local path to the file.
    key is the S3 key to use in the ds_cache bucket.
    """
    in_cache = load_cache(filepath, key)
    if in_cache:
        return pickle.load(open(in_cache, "rb"))
    return None
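A minimal round trip through these helpers, assuming write access to the ds_cache bucket and a hypothetical key name demo-object:

demo = {'a': 1, 'b': [2, 3]}
write_obj_to_cache(demo, '/tmp/demo-object.pkl', 'demo-object')
restored = read_obj_from_cache('/tmp/demo-object.pkl', 'demo-object')
assert restored == demo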
In [8]:
from ds.models.model import Model
import sklearn

sentiment = Model()
sentiment.package_new('sentiment',
                      gbc,
                      type(gbc),
                      sklearn.__version__,
                      processor)
sentiment.save()
In [11]:
sentiment.comments.history
Out[11]:
In [10]:
sentiment.TextExplain('this is great')
Out[10]:
In [17]:
model_obj['comments'].display_history()
Out[17]:
In [11]:
from ds.tools.Comment import Thread
c = Thread()
c.add_comment('F1 score on training is 0.7, trained this on the nltk movie reviews corpus', 'Aaron')
In [12]:
import sklearn
model_obj = {'model': gbc,
             'type': type(gbc),
             'package_version': sklearn.__version__,
             'preprocessing_dag': processor,
             'comments': c}
In [18]:
import dill  # pickle stores the processor function by reference only; dill can serialize it by value if needed
# write_obj_to_cache expects the S3 key *name*, not a boto Key object
key = 'sentiment-model'
write_obj_to_cache(model_obj, 'sentiment-model.pkl', key)
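The same helpers can restore the packaged model later; note that unpickling in a fresh process only resolves the processor reference if the same definitions are importable there (one reason dill is sometimes preferred):

restored = read_obj_from_cache('sentiment-model.pkl', 'sentiment-model')
clf = restored['model']
restored['comments'].display_history()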
In [34]:
gbc.predict_proba([process('I hate this movie'.split())])
Out[34]:
In [35]:
gbc.predict_proba([process('I love this movie'.split())])
Out[35]:
In [36]:
gbc.classes_
Out[36]:
In [39]:
gbc.predict_proba([process('The movie was fantastic'.split())])
Out[39]:
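The columns of predict_proba follow the order of gbc.classes_ (here the fileid prefixes 'neg' and 'pos'), so the probabilities can be labeled explicitly; a small sketch:

probs = gbc.predict_proba([process('The movie was fantastic'.split())])[0]
print dict(zip(gbc.classes_, probs))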