In [2]:
import tensorflow as tf
import matplotlib.pyplot as plt
import csv
import numpy as np
import os
import string
import requests
import io
import nltk
from zipfile import ZipFile
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.python.framework import ops


import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2 only: work around the default ASCII codec

ops.reset_default_graph()

In [3]:
sess = tf.Session()

In [4]:
batch_size = 200
max_features = 1000

In [5]:
save_file_name = 'temp_spam_data.csv'
if os.path.isfile(save_file_name):
    # Re-use the previously saved copy of the data
    text_data = []
    with open(save_file_name, 'r') as temp_output_file:
        reader = csv.reader(temp_output_file)
        for row in reader:
            text_data.append(row)
else:
    # Download the SMS Spam Collection and unzip it in memory
    zip_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
    r = requests.get(zip_url)
    z = ZipFile(io.BytesIO(r.content))
    raw_file = z.read('SMSSpamCollection')

    # Decode, drop non-ASCII characters, and split into [label, text] rows
    text_data = raw_file.decode()
    text_data = text_data.encode('ascii', errors='ignore')
    text_data = text_data.decode().split('\n')
    text_data = [x.split('\t') for x in text_data if len(x) >= 1]

    # Cache the parsed rows locally for later runs
    with open(save_file_name, 'w') as temp_output_file:
        writer = csv.writer(temp_output_file)
        writer.writerows(text_data)
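
Each parsed row should be a [label, text] pair. A quick, optional sanity check (not part of the original notebook) is to print the row count, one row, and the number of spam messages:

In [ ]:
# Optional sanity check on the loaded rows
print(len(text_data))
print(text_data[0])
print(sum(1 for x in text_data if x and x[0] == 'spam'))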

In [6]:
# Keep only rows that have both a label and a message
text_data = [x for x in text_data if len(x) >= 2]
texts = [x[1] for x in text_data]
target = [x[0] for x in text_data]

# Relabel 'spam' as 1 and 'ham' as 0
target = [1. if x == 'spam' else 0. for x in target]

# Normalize the text: lowercase, strip punctuation and digits, collapse whitespace
texts = [x.lower() for x in texts]
texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]
texts = [''.join(c for c in x if c not in '0123456789') for x in texts]
texts = [' '.join(x.split()) for x in texts]
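
To see what the normalization does, the same steps can be applied to a single made-up message (illustration only; the sample string and expected output are assumptions, not data from the corpus):

In [ ]:
# Walk one hypothetical message through the same cleaning steps
sample = 'WINNER!! Call 09061701461 now.'
sample = sample.lower()
sample = ''.join(c for c in sample if c not in string.punctuation)
sample = ''.join(c for c in sample if c not in '0123456789')
sample = ' '.join(sample.split())
print(sample)  # expected: 'winner call now'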

In [7]:
def tokenizer(text):
    words = nltk.word_tokenize(text)
    return words

# nltk.word_tokenize relies on the Punkt tokenizer models; download them once,
# otherwise a LookupError is raised during fit_transform
nltk.download('punkt')

# Create TF-IDF of texts
tfidf = TfidfVectorizer(tokenizer=tokenizer, stop_words='english', max_features=max_features)
sparse_tfidf_texts = tfidf.fit_transform(texts)
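
Assuming the fit above succeeds, the document-term matrix and the learned vocabulary can be inspected; a minimal sketch (assumes an sklearn version that still provides get_feature_names):

In [ ]:
# Optional: shape of the TF-IDF matrix and a few vocabulary terms
print(sparse_tfidf_texts.shape)         # (number of messages, max_features)
print(tfidf.get_feature_names()[:10])   # first few terms in the vocabulary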




In [8]:
train_indices = np.random.choice(sparse_tfidf_texts.shape[0], int(round(0.8 * sparse_tfidf_texts.shape[0])), replace=False)

test_indices = np.array(list(set(range(sparse_tfidf_texts.shape[0])) - set(train_indices)))
texts_train = sparse_tfidf_texts[train_indices]
texts_test = sparse_tfidf_texts[test_indices]
target_train = np.array([x for ix, x in enumerate(target) if ix in train_indices])
target_test = np.array([x for ix, x in enumerate(target) if ix in test_indices])
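
A short, optional check that the 80/20 split behaves as expected (illustration only):

In [ ]:
# Optional: sizes and spam fraction of the train/test splits
print('train: {}, test: {}'.format(texts_train.shape[0], texts_test.shape[0]))
print('spam fraction train: {:.3f}, test: {:.3f}'.format(np.mean(target_train), np.mean(target_test)))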




In [9]:
# Logistic regression parameters: one weight per TF-IDF feature plus a bias
A = tf.Variable(tf.random_normal(shape=[max_features, 1]))
b = tf.Variable(tf.random_normal(shape=[1, 1]))

# Placeholders for the TF-IDF features and the 0/1 spam labels
x_data = tf.placeholder(shape=[None, max_features], dtype=tf.float32)
y_target = tf.placeholder(shape=[None, 1], dtype=tf.float32)

# Linear model output (logits)
model_output = tf.add(tf.matmul(x_data, A), b)

In [10]:
# Sigmoid cross-entropy loss on the raw logits
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=model_output, labels=y_target))

# Round the sigmoid output to get a 0/1 prediction, then measure accuracy
prediction = tf.round(tf.sigmoid(model_output))
prediction_correct = tf.cast(tf.equal(prediction, y_target), tf.float32)
accuracy = tf.reduce_mean(prediction_correct)
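
For reference, the loss op above uses the numerically stable form max(x, 0) - x*z + log(1 + exp(-|x|)) for logits x and labels z; a small numpy check on made-up values (illustration only):

In [ ]:
# Manual sigmoid cross-entropy for one hypothetical logit/label pair
x, z = 0.5, 1.0
manual_loss = max(x, 0) - x * z + np.log(1 + np.exp(-abs(x)))
print(manual_loss)  # equals -log(sigmoid(x)) when z == 1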

In [11]:
my_opt = tf.train.GradientDescentOptimizer(0.0025)
train_step = my_opt.minimize(loss)

init = tf.global_variables_initializer()
sess.run(init)

In [12]:
train_loss = []
test_loss = []
train_acc = []
test_acc = []
i_data = []
for i in range(10000):
    rand_index = np.random.choice(texts_train.shape[0], size=batch_size)
    rand_x = texts_train[rand_index].todense()
    rand_y = np.transpose([target_train[rand_index]])
    sess.run(train_step, feed_dict={x_data: rand_x, y_target: rand_y})
    
    
    if (i + 1) % 100 == 0:
        i_data.append(i + 1)
        train_loss_temp = sess.run(loss, feed_dict={x_data: rand_x, y_target: rand_y})
        train_loss.append(train_loss_temp)
        
        test_loss_temp = sess.run(loss, feed_dict={x_data: texts_test.todense(), y_target:np.transpose([target_test])})
        test_loss.append(test_loss_temp)
        
        train_acc_temp = sess.run(accuracy, feed_dict={x_data: rand_x, y_target: rand_y})
        train_acc.append(train_acc_temp)
        
        test_acc_temp = sess.run(accuracy, feed_dict={x_data: texts_test.todense(), y_target: np.transpose([target_test])})
        test_acc.append(test_acc_temp)
        
    if (i + 1) % 500 == 0:
        acc_and_loss = [i + 1, train_loss_temp, test_loss_temp, train_acc_temp, test_acc_temp]
        acc_and_loss = [np.round(x, 2) for x in acc_and_loss]
        print('Generation # {}. Train Loss (Test Loss): {:.2f} ({:.2f}). Train Acc (Test Acc): {:.2f} ({:.2f})'.format(*acc_and_loss))
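
After training, a final pass over the full train and test sets gives an overall accuracy figure; a minimal sketch reusing the ops defined above (not in the original notebook):

In [ ]:
# Optional: final accuracy on the full train and test sets
final_train_acc = sess.run(accuracy, feed_dict={x_data: texts_train.todense(), y_target: np.transpose([target_train])})
final_test_acc = sess.run(accuracy, feed_dict={x_data: texts_test.todense(), y_target: np.transpose([target_test])})
print('final train acc: {:.3f}, final test acc: {:.3f}'.format(final_train_acc, final_test_acc))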




In [19]:
# Plot loss over time
plt.plot(i_data, train_loss, 'k-', label='Train Loss')
plt.plot(i_data, test_loss, 'r--', label='Test Loss', linewidth=4)
plt.title('Cross Entropy Loss per Generation')
plt.xlabel('Generation')
plt.ylabel('Cross Entropy Loss')
plt.legend(loc='upper right')
plt.show()

# Plot train and test accuracy
plt.plot(i_data, train_acc, 'k-', label='Train Set Accuracy')
plt.plot(i_data, test_acc, 'r--', label='Test Set Accuracy', linewidth=4)
plt.title('Train and Test Accuracy')
plt.xlabel('Generation')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.show()



In [ ]: