In [2]:
import os
path = r'C:\users\johannes\ProjectAmazonTextAnalysis\johannes'
os.chdir(path)
import pickle

import pandas as pd
import numpy as np

import sklearn
print(sklearn.__version__)
from sklearn.model_selection import train_test_split

from collections import Counter
import gzip

from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk import ngrams
from nltk.corpus import stopwords


import time


0.18.1

In [3]:
def parse(path):
    # The dump is "loose JSON" (Python-literal dicts), so eval() parses it;
    # a strict-JSON alternative is sketched below.
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

sample_size = 1400

def get_training_data(path):
    """
    Get the first sample_size reviews as training data.
    :param path: path to the compressed data
    :return: pandas DataFrame, one row per review
    """
    i = 0
    df = {}
    for d in parse(path):
        i += 1
        if i <= sample_size:
            df[i] = d
        else:
            break
        if i % 1000 == 0:
            print("Step:", i)
    return pd.DataFrame.from_dict(df, orient='index')


# def get_test_data(path):
#     """
#     Do not call this before the real test!!!!
#     """
#     pass
#     i = 0
#     df = {}
#     for d in parse(path):
#         i += 1
#         if i > 1400000:
#             df[i] = d
#     return pd.DataFrame.from_dict(df, orient='index')


start_time = time.time()
df = get_training_data('reviews_Electronics_5.json.gz')

print("Time :", time.time() - start_time)

df_1 = df


Step: 1000
Time : 0.10109710693359375
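
As noted in `parse()`, `eval` works because the dump is "loose JSON". If the strict-JSON variant of the file were used instead, a minimal sketch of a safer parser (assuming one valid JSON object per line) would be:

In [ ]:
import json

def parse_strict(path):
    # Sketch for the strict-JSON variant of the dump: avoids calling
    # eval() on file contents.
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)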

In [4]:
def fix_dataframe(df = df_1):
    """Keep only the review text and star rating, renamed to text/labels."""
    y = df['overall'].values
    X = df['reviewText'].values
    return pd.DataFrame(np.column_stack((X, y)), columns = ['text', 'labels'])
df = fix_dataframe(df_1)

In [5]:
def split_data(df = df):
    # Default train_test_split is a 75/25 split; pass random_state for a
    # reproducible split.
    train_df, test_df = train_test_split(df)
    return train_df, test_df
train_df, test_df = split_data(df)
train_df.head()


Out[5]:
text labels
863 Original Review: This cable does the job but b... 2
1069 How do audio cassettes load in your car's cass... 5
318 The Nook 7&#34; 8GB Wifi tablet was a Christma... 5
197 When we first bought our bedroom tv, we looked... 5
467 I got this as a gift about 5 months ago. It d... 1

In [6]:
# Uses the standard English stopword list from nltk. Feature selection can
# also be done with absolute word-frequency limits (the word_frequency
# argument below), or with a limit on the share of texts a word appears in
# (sketched after this cell).

def find_words(df = train_df, stopword = False, word_frequency = (1000, 10)):
    """Count lowercased tokens in the training texts.

    With stopword=False, keep words whose total count lies strictly between
    word_frequency[1] and word_frequency[0]; with stopword=True, drop nltk's
    English stopwords instead.
    """
    texts = df['text'].values

    if stopword is False:
        word_count = Counter([word.lower() for text in texts
                              for word in word_tokenize(text)])
        if word_frequency is not None:
            word_count = {word: count for word, count in word_count.items()
                          if word_frequency[1] < count < word_frequency[0]}
    elif stopword is True:
        stop = set(stopwords.words('english'))  # set for O(1) membership tests
        word_count = Counter([word.lower()
                              for text in texts
                              for word in word_tokenize(text)
                              if word not in stop])
    else:
        raise ValueError('stopword argument needs to be True/False')

    dictionary = list(word_count)
    word_count = sorted(word_count.items(), key = lambda x: -x[1])
    return word_count, dictionary
word_freq, dictionary = find_words()
print(word_freq[:100])
word_freq, dictionary = find_words()
print(word_freq[:100])


[("n't", 925), (')', 912), ('was', 910), ("'s", 884), ('as', 806), ('(', 789), ('can', 748), ('are', 692), ('or', 664), ('so', 660), ('be', 654), ('!', 583), ('from', 581), ('if', 576), ('do', 570), ('one', 566), ('&', 552), (';', 551), ('at', 516), ('an', 509), ('very', 479), ('all', 475), ('books', 472), ('use', 462), ('kindle', 457), ('like', 450), ('has', 442), ('just', 439), ('they', 433), ('more', 425), ('no', 414), ('me', 412), ('get', 409), ('your', 406), ('great', 393), ('would', 389), ('when', 389), ('had', 388), ('good', 385), ('screen', 383), ('than', 373), ('will', 372), ('read', 355), ('up', 343), ('about', 334), ('tv', 333), ('which', 326), ('does', 321), ('only', 318), ('there', 310), ('also', 304), ('tablet', 301), ('out', 299), ('...', 299), ('well', 292), ('n', 281), ('did', 273), ('b', 272), ('device', 271), ('other', 270), ('now', 270), ('some', 264), ('works', 262), ('what', 259), ("''", 257), ('much', 256), ("'m", 256), ('-', 255), (':', 254), ('am', 250), ('them', 249), ('book', 245), ('even', 244), ('bought', 243), ('radio', 241), ('these', 237), ('time', 237), ('buy', 233), ('because', 233), ('price', 232), ('any', 230), ('reading', 229), ('card', 228), ('by', 219), ('easy', 216), ('still', 215), ('could', 214), ('after', 214), ('really', 210), ('back', 208), ("'ve", 207), ('want', 206), ('$', 203), ('mount', 203), ('color', 199), ('used', 199), ('android', 195), ('battery', 194), ('better', 190), ('we', 190)]

In [9]:
def get_bigrams(df = train_df, lower_limit = 10):
    """Collect the bigrams over lowercased tokens that occur more than
    lower_limit times, plus their counts sorted descending."""
    texts = df['text'].values
    texts_lower = [[word.lower() for word in word_tokenize(text)]
                   for text in texts]

    bigram_count = Counter(gram for text in texts_lower
                           for gram in ngrams(text, n=2))
    bigrams = [bigram for bigram, count in bigram_count.items()
               if count > lower_limit]
    # Filtering on the count directly avoids a quadratic
    # `bigram in bigrams` membership test.
    sorted_bigrams = sorted([(bigram, count)
                             for bigram, count in bigram_count.items()
                             if count > lower_limit],
                            key = lambda x: -x[1])
    return bigrams, sorted_bigrams
bigrams, bigram_count = get_bigrams()
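
For intuition, `nltk.ngrams` just slides a fixed-width window over the token list:

In [ ]:
# Quick sanity check of what ngrams() yields for n=2.
list(ngrams(['this', 'cable', 'does', 'the', 'job'], n=2))
# [('this', 'cable'), ('cable', 'does'), ('does', 'the'), ('the', 'job')]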

In [16]:
def word_dataframe(texts = train_df.text.values, 
                   words = dictionary, 
                   labels = train_df['labels'].values):
    """Build a dense term-count matrix: one row per text, one column per
    dictionary word."""
    word_index = {word: i for i, word in enumerate(words)}  # O(1) lookups
    word_occurrences = []
    for text in texts:
        text_occurrences = np.zeros(len(words))
        for word in word_tokenize(text):
            word = word.lower()
            if word in word_index:
                text_occurrences[word_index[word]] += 1
        word_occurrences.append(text_occurrences)
    
    X_words = pd.DataFrame(np.array(word_occurrences), columns = words)
    y = labels
    return X_words, y

X_words, y = word_dataframe()
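
For comparison, sklearn's CountVectorizer builds the same kind of count matrix (sparse, and much faster). A sketch, pinned to the vocabulary from find_words() so the columns line up; counts may differ slightly at the margins because CountVectorizer lowercases the text before tokenizing:

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

# Sketch: equivalent unigram counts via sklearn, reusing the nltk
# tokenizer and the train-derived vocabulary (ngram_range=(1, 2)
# would cover bigram features as well).
vectorizer = CountVectorizer(tokenizer=word_tokenize,
                             lowercase=True,
                             vocabulary=dictionary)
X_words_sparse = vectorizer.fit_transform(train_df['text'].values)
print(X_words_sparse.shape)  # expected to match X_words.shape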

In [12]:
def bigram_dataframe(texts = train_df.text.values, 
                     bigrams = bigrams):
    """Build a dense bigram-count matrix: one row per text, one column per
    frequent bigram."""
    bigram_index = {gram: i for i, gram in enumerate(bigrams)}  # O(1) lookups
    bigram_occurrences = []
    for text in texts:
        text_occurrences = np.zeros(len(bigrams))
        text_words = [word.lower() for word in word_tokenize(text)]
        for gram in ngrams(text_words, n=2):
            if gram in bigram_index:
                text_occurrences[bigram_index[gram]] += 1
        bigram_occurrences.append(text_occurrences)
    
    cols = [str(gram) for gram in bigrams]
    X_bigrams = pd.DataFrame(np.array(bigram_occurrences), columns = cols)
    return X_bigrams

X_bigrams = bigram_dataframe()
X_bigrams.head()


Out[12]:
('would', 'recommend') ('and', 'get') ('one', 'of') ('think', 'the') ('price', '.') ('and', 'no') ('did', 'not') ('install', '.') ('this', 'item') ('the', 'ipad') ... ('is', 'on') ('and', 'so') ('nook', 'tablet') ('but', 'i') ('if', 'they') ('into', 'the') ('.', 'once') ('from', 'the') ('i', 'did') ('in', 'and')
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 1769 columns


In [14]:
print(train_df['labels'].shape)
print(X_bigrams.shape)


(1050,)
(1050, 1769)

In [22]:
print(X_words.shape)


(1050, 1313)

In [29]:
# Combining the bigrams and unigrams into one dataframe.
X_train = pd.concat([X_bigrams, X_words], axis = 1)
print(X_train.shape)
X_train.head()


(1050, 3082)
Out[29]:
('would', 'recommend') ('and', 'get') ('one', 'of') ('think', 'the') ('price', '.') ('and', 'no') ('did', 'not') ('install', '.') ('this', 'item') ('the', 'ipad') ... option sold ereaders plug covers iphone youtube money wire work
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 3082 columns
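
To score a model later, the held-out split has to be featurized with the vocabulary and bigram list learned from the training data. A sketch using the functions defined above (the pickle filename is hypothetical):

In [ ]:
# Sketch: apply the train-derived features to the held-out split,
# then persist everything with the already-imported pickle module.
X_test_words, y_test = word_dataframe(texts=test_df.text.values,
                                      words=dictionary,
                                      labels=test_df['labels'].values)
X_test_bigrams = bigram_dataframe(texts=test_df.text.values,
                                  bigrams=bigrams)
X_test = pd.concat([X_test_bigrams, X_test_words], axis=1)

with open('features_train_test.pkl', 'wb') as f:  # hypothetical filename
    pickle.dump((X_train, y, X_test, y_test), f)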


In [ ]: