In [1]:
import os
path = r'C:\users\johannes\ProjectAmazonTextAnalysis\johannes'
os.chdir(path)
import pickle

import pandas as pd
import numpy as np

import sklearn
print(sklearn.__version__)
from sklearn.model_selection import train_test_split

from collections import Counter
import gzip

from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk import ngrams
from nltk.corpus import stopwords



import time


0.18.1

In [2]:
# import spacy
# nlp = spacy.load('en')

In [3]:
def parse(path):
    """Yield one review dict per line of the gzipped dump."""
    g = gzip.open(path, 'rb')
    for l in g:
        # each line is a Python dict literal (the raw dump is not strict JSON), hence eval
        yield eval(l)

sample_size = 20000

def get_training_data(path):
    """
    Get all usable data
    :param path: path to compressed data
    :return: panda data frame
    """
    i = 0
    df = {}
    for d in parse(path):
        i += 1
        if i <= sample_size:
            df[i] = d
        else:
            break
        if i % 1000 == 0:
            print("Step:", i)
    return pd.DataFrame.from_dict(df, orient='index')


# def get_test_data(path):
#     """
#     Do not call this before the real test!!!!
#     """
#     pass
#     i = 0
#     df = {}
#     for d in parse(path):
#         i += 1
#         if i > 1400000:
#             df[i] = d
#     return pd.DataFrame.from_dict(df, orient='index')


start_time = time.time()
df = get_training_data('reviews_Electronics_5.json.gz')

print("Time :", time.time() - start_time)

df_1 = df


Step: 1000
Step: 2000
Step: 3000
Step: 4000
Step: 5000
Step: 6000
Step: 7000
Step: 8000
Step: 9000
Step: 10000
Step: 11000
Step: 12000
Step: 13000
Step: 14000
Step: 15000
Step: 16000
Step: 17000
Step: 18000
Step: 19000
Step: 20000
Time : 1.4663589000701904
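
As an aside, if the gzipped file happens to contain strict JSON per line (some releases of the Amazon review data do), a safer variant of parse can use json.loads instead of eval. A minimal sketch, assuming one JSON object per line:

import json

def parse_json(path):
    """Yield one review dict per line, assuming each line is valid JSON."""
    with gzip.open(path, 'rb') as g:
        for line in g:
            yield json.loads(line)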

In [4]:
%time
def fix_dataframe(df = df_1):
    # keep only the review text and the star rating, renamed to text / review_labels
    y = df['overall'].values
    X = df['reviewText']
    df = pd.DataFrame(np.column_stack((X, y)), columns = ['text', 'review_labels'])
    return df
df = fix_dataframe(df_1)


Wall time: 0 ns
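
A roughly equivalent way to build the same two-column frame directly with pandas (a sketch; dtypes and index handling differ slightly from the np.column_stack version above):

df = df_1[['reviewText', 'overall']].rename(columns={'reviewText': 'text', 'overall': 'review_labels'})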

In [5]:
%time
def split_data(df = df):
    train_df, test_df = train_test_split(df)
    # print(train_df.head())
    # return pd.DataFrame(train_df, columns=['text', 'labels']), pd.DataFrame(test_df, columns=['text', 'labels'])
    return train_df, test_df
train_df, test_df = split_data(df)
train_df.head()
print(train_df.shape)


Wall time: 0 ns
(15000, 2)
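
The 15000/5000 split comes from train_test_split's default test_size of 0.25. To make the split explicit and reproducible, the parameters could be passed directly (a sketch, not what was run here):

train_df, test_df = train_test_split(df, test_size=0.25, random_state=42)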

In [6]:
# Using the standard stopwords given by nltk. Can also do feature relevance according to word frequency limits.
# Could also test a limit for word appearance in a given percentage of texts.
start_time = time.time()
def find_words(df = train_df, 
               stopword = False, 
               word_frequency = [sample_size, np.log(sample_size)], 
               number_of_words = 3000):
    # stemmer = SnowballStemmer('english')
    start_time = time.time()
    texts = df['text'].values
    # dictionary = np.unique([word.lower() for text in texts for word in word_tokenize(text)])
    # word_count = Counter([word.lower() for text in texts for word in word_tokenize(text)])
    
    if stopword is False:
        word_count = Counter([word.lower() for text in texts 
                              for word in word_tokenize(text)])
        if word_frequency is not None:
            word_count = {word: count for word, count in word_count.items() 
                          if count < word_frequency[0] and count > word_frequency[1]}
    elif stopword is True:
        word_count = Counter([word.lower() 
                                for text in texts 
                                for word in word_tokenize(text) 
                                if word not in stopwords.words('english')])
    else:
        raise ValueError('stopword argument needs to be True/False')
    print('first_time:', time.time() - start_time)
    # note: the vocabulary is truncated in Counter insertion order, not by frequency
    dictionary = [word for word in word_count]
    word_count = sorted(word_count.items(), key=lambda x: -x[1])
    return word_count, dictionary[:number_of_words]
word_freq, dictionary = find_words()
# print(word_freq[:100])
print('Total time:', time.time() - start_time)


first_time: 12.586816787719727
Total time: 12.59432053565979
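
For comparison, scikit-learn's CountVectorizer can do this vocabulary selection (lowercasing, stop word removal, document-frequency limits, vocabulary size cap) in one step. A rough sketch of an equivalent setup, not used in this notebook; exact counts would differ because CountVectorizer uses its own tokenizer rather than nltk's word_tokenize:

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(lowercase=True, stop_words='english', max_features=3000)
vectorizer.fit(train_df['text'].values)
vocabulary = vectorizer.get_feature_names()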

In [7]:
def find_bigrams(words):
    return zip(words, words[1:])
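
A quick check of what find_bigrams produces on a token list:

list(find_bigrams(['i', 'love', 'this', 'camera']))
# [('i', 'love'), ('love', 'this'), ('this', 'camera')]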

In [9]:
start_time = time.time()

def get_bigrams(df=train_df, 
                lower_limit=np.log(sample_size), 
                upper_limit = sample_size, 
                number_of_bigrams = 1000):
    
    texts = df['text'].values
    # lower case text
    texts_lower = [[word.lower() for word in word_tokenize(text)] for text in texts]
    # bigrams from the lower case text
    bigrams = [gram for text in texts_lower for gram in find_bigrams(text)]
    # Count of bigrams sorted
    bigram_count = Counter(bigrams)
    
    # start_time = time.time()
    # bigrams = [bigram for bigram, count in bigram_count.items() if count > lower_limit]
    
    sorted_bigrams = sorted([(bigram, count)
                             for bigram, count in bigram_count.items()
                             if count > lower_limit and count < upper_limit],
                            key=lambda x: -x[1])
    bigrams = [bigram for bigram, count in sorted_bigrams]
    ######### sorted_bigrams = np.sort(np.array(bigram_count.items())
    # sorted_bigrams = sorted(bigram_count.items(), key = lambda x: -x[1])
    # sorted_bigrams = [i for i in sorted_bigrams if i[0] in bigrams]
    # print('sort bigram time:', time.time() - start_time)

    return bigrams[:number_of_bigrams], sorted_bigrams, texts_lower
    # Example frequent bigrams: (',', 'but'), ('do', "n't"), ('the', 'price'), ('.', 'if'), ('but', 'it'), ('did', "n't")
bigrams, bigram_count, texts_lower = get_bigrams()
# print(bigrams)
# print(bigram_count[:100])
print('Total time:', time.time() - start_time)


Total time: 13.651100873947144
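
nltk's ngrams (already imported above) does the same pairing as find_bigrams and generalizes to longer n-grams, so it could stand in for find_bigrams without changing the results:

list(ngrams(['i', 'love', 'this', 'camera'], 2))
# [('i', 'love'), ('love', 'this'), ('this', 'camera')]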

In [10]:
# start_time = time.time()
# 
# 
# def word_dataframe(texts = train_df.text.values, 
#                    words = dictionary):
#     word_occurances = []
#     start_time = time.time()
#     # texts_lower = [[word.lower() for word in text] for text in texts]
#     for text in texts:
#         text_occurences = np.zeros(len(words))
#         for word in word_tokenize(text):
#             word = word.lower()
#             if word in words:
#                 index = words.index(word)
#                 text_occurences[index] += 1
#         word_occurances.append(text_occurences)
#     print('loop time:', time.time() - start_time)
#     X_words = pd.DataFrame(np.array(word_occurances), columns = words)
#     return X_words
# 
# 
# _ = word_dataframe()
# 
# print('Total time:', time.time() - start_time)
# _.head()

In [11]:
# start_time = time.time()
def word_dataframe(texts = train_df.text.values, 
                   words = dictionary):
    # count how often each dictionary word appears in each review
    texts_lower = [[word.lower() for word in word_tokenize(text)] for text in texts]
    counts = [Counter(text) for text in texts_lower]
    word_occurrences = np.array([[counts[i][word] for word in words] for i in range(len(counts))])
    X_words = pd.DataFrame(word_occurrences, columns = words)
    return X_words
# _ = word_dataframe()
# print('total time:', time.time() - start_time)
# _.head()

In [12]:
# start_time = time.time()
# 
# def bigram_dataframe(texts = train_df.text.values, 
#                    bigrams = bigrams):
#     bigram_occurances = []
#     for text in texts:
#         text_occurences = np.zeros(len(bigrams))
#         text_words = [word.lower() for word in word_tokenize(text)]
#         bigrams_in_text = [gram for gram in find_bigrams(text_words)]
#         for gram in bigrams_in_text:
#             if gram in bigrams:
#                 index = bigrams.index(gram)
#                 text_occurences[index] += 1
#         bigram_occurances.append(text_occurences)
#     
#     cols = [str(gram) for gram in bigrams]
#     # print(cols)
#     X_bigrams = pd.DataFrame(np.array(bigram_occurances), columns = cols)
#     return X_bigrams
# _ = bigram_dataframe()
# print('Total time:', time.time() - start_time)
# _.head()

In [13]:
# start_time = time.time()
def bigram_dataframe(texts = train_df.text.values, 
                     bigrams = bigrams):
    # count how often each selected bigram appears in each review
    texts_lower = [[word.lower() for word in word_tokenize(text)] for text in texts]
    # print(texts_lower[0])
    bigrams_in_text = [[gram for gram in find_bigrams(text)] for text in texts_lower]
    bigram_counts = [Counter(gram) for gram in bigrams_in_text]
    bigram_occurrences = np.array([[bigram_counts[i][gram] for gram in bigrams] for i in range(len(bigram_counts))])

    cols = [str(gram) for gram in bigrams]
    X_bigrams = pd.DataFrame(bigram_occurrences, columns = cols)
    return X_bigrams
# _ = bigram_dataframe()
# print('total time:', time.time() - start_time)
# _.head()
# bigram_dataframe()
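
Both functions above build dense document-term count matrices from a fixed vocabulary. The same idea with CountVectorizer would produce a sparse matrix instead of a dense DataFrame; a sketch (hypothetical alternative, counts may differ slightly because of the different tokenizer):

from sklearn.feature_extraction.text import CountVectorizer

# fixed-vocabulary count matrix; rows are reviews, columns follow `dictionary`
unigram_vectorizer = CountVectorizer(lowercase=True, vocabulary=dictionary)
X_sparse = unigram_vectorizer.transform(train_df['text'].values)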

In [14]:
start_time = time.time()

# Dataframes of bigrams/unigrams in the train and test datasets.
train_bigrams = bigram_dataframe(texts = train_df.text.values)
test_bigrams = bigram_dataframe(texts = test_df.text.values)

train_unigrams = word_dataframe(texts = train_df.text.values)
test_unigrams = word_dataframe(texts = test_df.text.values)

print('Total time:', time.time() - start_time)


Total time: 78.97760486602783

In [15]:
print(train_unigrams.head())
print(train_bigrams.head())
print(20*'#')
print(train_unigrams.shape)
print(test_unigrams.shape)


   considerably  communicate  fixed  ideal  remembered  invested  distort  \
0             0            0      0      0           0         0        0   
1             0            0      0      0           0         0        0   
2             0            0      0      0           0         0        0   
3             0            0      0      0           0         0        0   
4             0            0      0      0           0         0        0   

   perspective  rom  v2    ...     conversion  tripplite  was  regarding  \
0            0    0   0    ...              0          0    0          0   
1            0    0   0    ...              0          0    0          0   
2            0    0   0    ...              0          0    0          0   
3            0    0   0    ...              0          0    0          0   
4            0    0   0    ...              0          0    0          0   

   proprietary  lens.i  recharges  pal  reception  lifespan  
0            0       0          0    0          0         0  
1            0       0          0    0          0         0  
2            0       0          0    0          0         0  
3            0       0          0    0          0         0  
4            0       0          0    0          0         0  

[5 rows x 3000 columns]
   ('.', 'i')  (',', 'and')  ('.', 'the')  (',', 'but')  ('of', 'the')  \
0           0             1             0             0              0   
1           0             1             0             0              0   
2           0             1             0             1              0   
3           0             0             0             0              0   
4           0             0             0             0              0   

   ('.', 'it')  (',', 'i')  ('i', 'have')  ('it', "'s")  ('on', 'the')  \
0            0           0              0             0              0   
1            0           0              0             0              0   
2            0           0              0             0              0   
3            0           0              0             0              0   
4            0           0              0             0              0   

       ...        ('my', 'head')  ('a', 'more')  ('for', 'over')  ('to', '.')  \
0      ...                     0              0                0            0   
1      ...                     0              0                0            0   
2      ...                     0              0                0            0   
3      ...                     0              0                0            0   
4      ...                     0              0                0            0   

   ('light', ',')  ('and', 'all')  ('love', 'this')  ('a', 'must')  \
0               0               0                 0              0   
1               0               0                 0              0   
2               0               0                 0              0   
3               0               0                 0              0   
4               0               0                 0              0   

   ('my', 'lens')  ('got', 'it')  
0               0              0  
1               0              0  
2               0              0  
3               0              0  
4               0              0  

[5 rows x 1000 columns]
####################
(15000, 3000)
(5000, 3000)

In [16]:
# Combine the bigram and unigram dataframes into one feature dataframe.
def merge_grams(unigrams, bigrams):
    combined = pd.concat([bigrams, unigrams], axis = 1)
    return combined

In [17]:
# Get the length (in tokens) of each review
train_lengths = [len(word_tokenize(text)) for text in train_df.text.values]
test_lengths = [len(word_tokenize(text)) for text in test_df.text.values]
print(np.array(train_lengths).shape)
print(np.array(test_lengths).shape)
# lengths_test = [len(text) for text in word_tokenize(test_df.text.values)]


(15000,)
(5000,)

In [18]:
X_train = merge_grams(train_bigrams, train_unigrams)
X_test = merge_grams(test_bigrams, test_unigrams)
y_train = train_df['review_labels']
y_test = test_df['review_labels']

In [19]:
X_train['review_length'] = train_lengths
X_test['review_length'] = test_lengths

In [20]:
X_train.to_pickle('X_train.pkl')
X_test.to_pickle('X_test.pkl')
y_train.to_pickle('y_train.pkl')
y_test.to_pickle('y_test.pkl')
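
The saved feature matrices and labels can be loaded back in a follow-up notebook with pandas' read_pickle, e.g.:

X_train = pd.read_pickle('X_train.pkl')
y_train = pd.read_pickle('y_train.pkl')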

In [ ]: