In [0]:
# You need TF 1.13.1 to deploy this on AI Platform
!pip install tensorflow==1.13.1
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle
In [0]:
# Authenticate to your cloud account
from google.colab import auth
auth.authenticate_user()
In [0]:
# Download the Stack Overflow data (or replace with your own text data)
!gsutil cp 'gs://cloudml-demo-lcm/SO_ml_tags_avocado_188k_v2.csv' ./
In [0]:
data = pd.read_csv('SO_ml_tags_avocado_188k_v2.csv', names=['tags', 'original_tags', 'text'], header=0)
data = data.drop(columns=['original_tags'])
data = data.dropna()
data = shuffle(data, random_state=22)
data.head()
Out[0]:
In [ ]:
# Encode top tags to multi-hot
tags_split = [tags.split(',') for tags in data['tags'].values]
print(tags_split[0])  # preview the tag list for the first question
In [ ]:
tag_encoder = MultiLabelBinarizer()
tags_encoded = tag_encoder.fit_transform(tags_split)
num_tags = len(tags_encoded[0])
print(data['text'].values[0])
print(tag_encoder.classes_)
print(tags_encoded[0])
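In [0]:
# A minimal, illustrative sketch (not part of the pipeline): how MultiLabelBinarizer
# turns tag lists into multi-hot vectors, using made-up toy tags
toy_tags = [['pandas'], ['keras', 'tensorflow'], ['matplotlib', 'pandas']]
toy_encoder = MultiLabelBinarizer()
print(toy_encoder.fit_transform(toy_tags))  # one row per example, one column per tag
print(toy_encoder.classes_)  # column order: tag names, sorted alphabetically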
In [0]:
# Split our data into train and test sets
train_size = int(len(data) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(data) - train_size))
In [0]:
# Split our labels into train and test sets
train_tags = tags_encoded[:train_size]
test_tags = tags_encoded[train_size:]
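In [0]:
# Optional sanity check: the label splits should line up with the 80/20 text split
print(train_tags.shape, test_tags.shape)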
In [0]:
%%writefile preprocess.py
# Pre-processing: a tokenizer wrapper class we can pickle alongside the model
from tensorflow.keras.preprocessing import text


class TextPreprocessor(object):
    def __init__(self, vocab_size):
        self._vocab_size = vocab_size
        self._tokenizer = None

    def create_tokenizer(self, text_list):
        # Build the vocabulary from the training corpus
        tokenizer = text.Tokenizer(num_words=self._vocab_size)
        tokenizer.fit_on_texts(text_list)
        self._tokenizer = tokenizer

    def transform_text(self, text_list):
        # Convert raw text to a (num_examples, vocab_size) bag-of-words matrix
        text_matrix = self._tokenizer.texts_to_matrix(text_list)
        return text_matrix
In [0]:
# Create vocab from training corpus
from preprocess import TextPreprocessor
VOCAB_SIZE = 400  # This is a hyperparameter; try different values for your dataset
train_qs = data['text'].values[:train_size]
test_qs = data['text'].values[train_size:]
processor = TextPreprocessor(VOCAB_SIZE)
processor.create_tokenizer(train_qs)
body_train = processor.transform_text(train_qs)
body_test = processor.transform_text(test_qs)
In [0]:
# Preview the first input from our training data
print(len(body_train[0]))
print(body_train[0])
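In [0]:
# Illustrative only: texts_to_matrix defaults to 'binary' mode, so each row is a
# VOCAB_SIZE-length vector with a 1 wherever a vocabulary word appears in the text
toy_vec = processor.transform_text(['keras model layer'])
print(toy_vec.shape)  # (1, VOCAB_SIZE)
print(int(toy_vec.sum()))  # number of distinct vocab words matched in the toy string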
In [0]:
# Save the processor state of the tokenizer
import pickle
with open('./processor_state.pkl', 'wb') as f:
    pickle.dump(processor, f)
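In [0]:
# Optional sanity check (a minimal sketch, not required for deployment): reload the
# pickled processor and confirm it transforms text identically to the in-memory one
with open('./processor_state.pkl', 'rb') as f:
    restored_processor = pickle.load(f)
print(np.array_equal(restored_processor.transform_text(train_qs[:1]), body_train[:1]))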
In [0]:
def create_model(vocab_size, num_tags):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(50, input_shape=(vocab_size,), activation='relu'))
    model.add(tf.keras.layers.Dense(25, activation='relu'))
    # Sigmoid output + binary cross-entropy: each tag is an independent yes/no decision
    model.add(tf.keras.layers.Dense(num_tags, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
In [0]:
model = create_model(VOCAB_SIZE, num_tags)
model.summary()
# Train and evaluate the model
model.fit(body_train, train_tags, epochs=3, batch_size=128, validation_split=0.1)
print('Eval loss/accuracy: {}'.format(
    model.evaluate(body_test, test_tags, batch_size=128)))
# Export the model to a file
model.save('keras_saved_model.h5')
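In [0]:
# Quick check (illustrative): reload the exported .h5 file and confirm it reproduces
# the in-memory model's predictions before uploading it anywhere
reloaded_model = tf.keras.models.load_model('keras_saved_model.h5')
print(np.allclose(reloaded_model.predict(body_test[:3]), model.predict(body_test[:3])))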
In [0]:
%%writefile model_prediction.py
# Custom prediction class: bundles the Keras model with the pickled tokenizer
# so that raw text can be preprocessed on AI Platform at serving time
import os
import pickle


class CustomModelPrediction(object):
    def __init__(self, model, processor):
        self._model = model
        self._processor = processor

    def predict(self, instances, **kwargs):
        # Preprocess the raw question text, then run the model on the result
        preprocessed_data = self._processor.transform_text(instances)
        predictions = self._model.predict(preprocessed_data)
        return predictions.tolist()

    @classmethod
    def from_path(cls, model_dir):
        import tensorflow.keras as keras
        model = keras.models.load_model(
            os.path.join(model_dir, 'keras_saved_model.h5'))
        with open(os.path.join(model_dir, 'processor_state.pkl'), 'rb') as f:
            processor = pickle.load(f)
        return cls(model, processor)
In [0]:
test_requests = [
"How to preprocess strings in Keras models Lambda layer? I have the problem that the value passed on to the Lambda layer (at compile time) is a placeholder generated by keras (without values). When the model is compiled, the .eval () method throws the error: You must feed a value for placeholder tensor 'input_1' with dtype string and shape [?, 1] def text_preprocess(x): strings = tf.keras.backend.eval(x) vectors = [] for string in strings: vector = string_to_one_hot(string.decode('utf-8')) vectors.append(vector) vectorTensor = tf.constant(np.array(vectors),dtype=tf.float32) return vectorTensor input_text = Input(shape=(1,), dtype=tf.string) embedding = Lambda(text_preprocess)(input_text) dense = Dense(256, activation='relu')(embedding) outputs = Dense(2, activation='softmax')(dense) model = Model(inputs=[input_text], outputs=outputs) model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy']) model.summary() model.save('test.h5') If I pass a string array into the input layer statically, I can compile the model, but I get the same error if I want to convert the model to tflite. #I replaced this line: input_text = Input(shape=(1,), dtype=tf.string) #by this lines: test = tf.constant(['Hello', 'World']) input_text = Input(shape=(1,), dtype=tf.string, tensor=test) #but calling this ... converter = TFLiteConverter.from_keras_model_file('string_test.h5') tfmodel = converter.convert() #... still leads to this error: InvalidArgumentError: You must feed a value for placeholder tensor 'input_3' with dtype string and shape [2] [[{{node input_3}}]] ",
"Change the bar item name in Pandas I have a test excel file like: df = pd.DataFrame({'name':list('abcdefg'), 'age':[10,20,5,23,58,4,6]}) print (df) name age 0 a 10 1 b 20 2 c 5 3 d 23 4 e 58 5 f 4 6 g 6 I use Pandas and matplotlib to read and plot it: import pandas as pd import numpy as np import matplotlib.pyplot as plt import os excel_file = 'test.xlsx' df = pd.read_excel(excel_file, sheet_name=0) df.plot(kind='bar') plt.show() the result shows: enter image description here it use index number as item name, how can I change it to the name, which stored in column name?"
]
In [0]:
from model_prediction import CustomModelPrediction
classifier = CustomModelPrediction.from_path('.')
results = classifier.predict(test_requests)
print(results)
for i in range(len(results)):
    print('Predicted labels:')
    for idx, val in enumerate(results[i]):
        if val > 0.7:
            print(tag_encoder.classes_[idx])
    print('\n')
In [0]:
%%writefile setup.py
from setuptools import setup

setup(
    name="so_predict",
    version="0.1",
    include_package_data=True,
    scripts=["preprocess.py", "model_prediction.py"]
)
In [0]:
# Replace your_gcs_bucket below with the name of your Cloud Storage bucket
!gsutil cp keras_saved_model.h5 gs://your_gcs_bucket/
!gsutil cp processor_state.pkl gs://your_gcs_bucket/
In [0]:
# Replace with your bucket name below
!python setup.py sdist
!gsutil cp ./dist/so_predict-0.1.tar.gz gs://your_gcs_bucket/packages/so_predict-0.1.tar.gz
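In [0]:
# Optional check: list the files packaged in the sdist to confirm both modules made it in
!tar -tzf ./dist/so_predict-0.1.tar.gz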
In [0]:
# Replace with your Cloud project name
!gcloud config set project your-cloud-project
In [0]:
# Create model if it hasn't been created yet
!gcloud ml-engine models create your_model_name
In [0]:
# To use this custom code feature, fill out this form: bit.ly/cmle-custom-code-signup
!gcloud alpha ml-engine versions create v1 --model your_model_name \
--origin=gs://your_gcs_bucket/ \
--python-version=3.5 \
--runtime-version=1.13 \
--framework='TENSORFLOW' \
--package-uris=gs://your_gcs_bucket/packages/so_predict-0.1.tar.gz \
--model-class=model_prediction.CustomModelPrediction
In [0]:
%%writefile predictions.txt
"How to preprocess strings in Keras models Lambda layer? I have the problem that the value passed on to the Lambda layer (at compile time) is a placeholder generated by keras (without values). When the model is compiled, the .eval () method throws the error: You must feed a value for placeholder tensor 'input_1' with dtype string and shape [?, 1] def text_preprocess(x): strings = tf.keras.backend.eval(x) vectors = [] for string in strings: vector = string_to_one_hot(string.decode('utf-8')) vectors.append(vector) vectorTensor = tf.constant(np.array(vectors),dtype=tf.float32) return vectorTensor input_text = Input(shape=(1,), dtype=tf.string) embedding = Lambda(text_preprocess)(input_text) dense = Dense(256, activation='relu')(embedding) outputs = Dense(2, activation='softmax')(dense) model = Model(inputs=[input_text], outputs=outputs) model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy']) model.summary() model.save('test.h5') If I pass a string array into the input layer statically, I can compile the model, but I get the same error if I want to convert the model to tflite. #I replaced this line: input_text = Input(shape=(1,), dtype=tf.string) #by this lines: test = tf.constant(["Hello","World"]) input_text = Input(shape=(1,), dtype=tf.string, tensor=test) #but calling this ... converter = TFLiteConverter.from_keras_model_file('string_test.h5') tfmodel = converter.convert() #... still leads to this error: InvalidArgumentError: You must feed a value for placeholder tensor 'input_3' with dtype string and shape [2] [[{{node input_3}}]] "
"Change the bar item name in Pandas I have a test excel file like: df = pd.DataFrame({'name':list('abcdefg'), 'age':[10,20,5,23,58,4,6]}) print (df) name age 0 a 10 1 b 20 2 c 5 3 d 23 4 e 58 5 f 4 6 g 6 I use Pandas and matplotlib to read and plot it: import pandas as pd import numpy as np import matplotlib.pyplot as plt import os excel_file = 'test.xlsx' df = pd.read_excel(excel_file, sheet_name=0) df.plot(kind="bar") plt.show() the result shows: enter image description here it use index number as item name, how can I change it to the name, which stored in column name? "
In [0]:
# Get predictions from our trained model. The test questions above are from:
# https://stackoverflow.com/questions/55517871/how-to-preprocess-strings-in-keras-models-lambda-layer
# https://stackoverflow.com/questions/55508547/plot-histogram-for-feature-of-array-with-known-and-limited-values
predictions = !gcloud ml-engine predict --model='your_model_name' --text-instances=predictions.txt --version=v1
print(predictions)
In [0]:
print(tag_encoder.classes_, '\n')
for sigmoid_arr in eval(predictions[0]):
    print(sigmoid_arr)
    for idx, probability in enumerate(sigmoid_arr):
        if probability > 0.7:
            print(tag_encoder.classes_[idx])
    print('\n')
In [0]:
!pip install shap
!pip install colored
In [0]:
import shap
attrib_data = body_train[:200]
explainer = shap.DeepExplainer(model, attrib_data)
num_explanations = 25
shap_vals = explainer.shap_values(body_test[:num_explanations])
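In [0]:
# A quick look at what DeepExplainer returned (illustrative): a list with one
# attribution array per tag class, each of shape (num_explanations, VOCAB_SIZE)
print(len(shap_vals), shap_vals[0].shape)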
In [0]:
words = processor._tokenizer.word_index
In [0]:
# The tokenizer's word_index is 1-based, so pad index 0 with an empty string
word_lookup = [''] + list(words.keys())
print(word_lookup[:100])
In [0]:
shap.summary_plot(shap_vals, feature_names=word_lookup, class_names=tag_encoder.classes_)
In [0]:
import colored
import re
def colorprint(question, pos, neg):
    # Split the question on whitespace and common punctuation
    q_arr = []
    q_filtered = filter(None, re.split("[, .()]+", question))
    for i in q_filtered:
        q_arr.append(i)
    color_str = []
    for idx, word in enumerate(q_arr):
        if word in pos:
            color_str.append(colored.fg("blue") + word)
        elif word in neg:
            color_str.append(colored.fg("light_red") + word)
        else:
            color_str.append(colored.fg('black') + word)
        # Wrap long questions every 15 words
        if idx % 15 == 0 and idx > 0:
            color_str.append('\n')
    print(' '.join(color_str) + colored.fg('black') + " ")
In [0]:
# Print highlighted signal words for a few questions
examples_to_print = [0,7,20,22,24]
for i in range(len(examples_to_print)):
    # Print the actual labels
    actual = test_tags[examples_to_print[i]]
    num_labels = int(np.sum(actual))
    actual_labels = np.argpartition(actual, -num_labels)[-num_labels:]
    print('Actual labels:')
    for label_idx in actual_labels:
        print(tag_encoder.classes_[label_idx])
    # Print the predicted labels
    print('\nPredicted labels:')
    pred_tag = model.predict(np.array([body_test[examples_to_print[i]]]))
    for idx, tagprob in enumerate(pred_tag[0]):
        if tagprob > 0.8:
            print(tag_encoder.classes_[idx])
    print('\n')
    # Get the highest and lowest signaling words for each confidently predicted tag
    for idx, tag in enumerate(pred_tag[0]):
        if tag > 0.7:
            attributions = shap_vals[idx][examples_to_print[i]]
            top_signal_words = np.argpartition(attributions, -5)[-5:]
            pos_words = []
            for word_idx in top_signal_words:
                pos_words.append(word_lookup[word_idx])
            negative_signal_words = np.argpartition(attributions, 5)[:5]
            neg_words = []
            for word_idx in negative_signal_words:
                neg_words.append(word_lookup[word_idx])
            colorprint(test_qs[examples_to_print[i]], pos_words, neg_words)
    print('\n')
In [0]: