In [2]:
import os
path = r'C:\users\johannes\ProjectAmazonTextAnalysis\johannes'
os.chdir(path)
import pickle

import pandas as pd
import numpy as np

import sklearn
print(sklearn.__version__)
from sklearn.model_selection import train_test_split

from collections import Counter
import gzip

from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk import ngrams
from nltk.corpus import stopwords


import time


0.18.1

In [3]:
def parse(path):
    # The dump is "loose JSON" (Python-literal dicts), so eval() parses it;
    # a strict-JSON alternative is sketched below.
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

sample_size = 1400

def get_training_data(path):
    """
    Get the first sample_size reviews as training data.
    :param path: path to the compressed data
    :return: pandas DataFrame, one row per review
    """
    i = 0
    df = {}
    for d in parse(path):
        i += 1
        if i <= sample_size:
            df[i] = d
        else:
            break
        if i % 1000 == 0:
            print("Step:", i)
    return pd.DataFrame.from_dict(df, orient='index')


# def get_test_data(path):
#     """
#     Do not call this before the real test!!!!
#     """
#     pass
#     i = 0
#     df = {}
#     for d in parse(path):
#         i += 1
#         if i > 1400000:
#             df[i] = d
#     return pd.DataFrame.from_dict(df, orient='index')


start_time = time.time()
df = get_training_data('reviews_Electronics_5.json.gz')

print("Time :", time.time() - start_time)

df_1 = df


Step: 1000
Time : 0.10109710693359375
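
As noted in `parse()`, `eval` works because the dump is "loose JSON". If the strict-JSON variant of the file were used instead, a minimal sketch of a safer parser (assuming one valid JSON object per line) would be:

In [ ]:
import json

def parse_strict(path):
    # Sketch for the strict-JSON variant of the dump: avoids calling
    # eval() on file contents.
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)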

In [4]:
def fix_dataframe(df = df_1):
    """Keep only the review text and star rating, renamed to text/labels."""
    y = df['overall'].values
    X = df['reviewText'].values
    return pd.DataFrame(np.column_stack((X, y)), columns = ['text', 'labels'])
df = fix_dataframe(df_1)

In [5]:
def split_data(df = df):
    # Default train_test_split is a 75/25 split; pass random_state for a
    # reproducible split.
    train_df, test_df = train_test_split(df)
    return train_df, test_df
train_df, test_df = split_data(df)
train_df.head()


Out[5]:
text labels
863 Original Review: This cable does the job but b... 2
1069 How do audio cassettes load in your car's cass... 5
318 The Nook 7&#34; 8GB Wifi tablet was a Christma... 5
197 When we first bought our bedroom tv, we looked... 5
467 I got this as a gift about 5 months ago. It d... 1

In [6]:
# Uses the standard English stopword list from nltk. Feature selection can
# also be done with absolute word-frequency limits (the word_frequency
# argument below), or with a limit on the share of texts a word appears in
# (sketched after this cell).

def find_words(df = train_df, stopword = False, word_frequency = (1000, 10)):
    """Count lowercased tokens in the training texts.

    With stopword=False, keep words whose total count lies strictly between
    word_frequency[1] and word_frequency[0]; with stopword=True, drop nltk's
    English stopwords instead.
    """
    texts = df['text'].values

    if stopword is False:
        word_count = Counter([word.lower() for text in texts
                              for word in word_tokenize(text)])
        if word_frequency is not None:
            word_count = {word: count for word, count in word_count.items()
                          if word_frequency[1] < count < word_frequency[0]}
    elif stopword is True:
        stop = set(stopwords.words('english'))  # set for O(1) membership tests
        word_count = Counter([word.lower()
                              for text in texts
                              for word in word_tokenize(text)
                              if word not in stop])
    else:
        raise ValueError('stopword argument needs to be True/False')

    dictionary = list(word_count)
    word_count = sorted(word_count.items(), key = lambda x: -x[1])
    return word_count, dictionary
word_freq, dictionary = find_words()
print(word_freq[:100])
word_freq, dictionary = find_words()
print(word_freq[:100])


[("n't", 925), (')', 912), ('was', 910), ("'s", 884), ('as', 806), ('(', 789), ('can', 748), ('are', 692), ('or', 664), ('so', 660), ('be', 654), ('!', 583), ('from', 581), ('if', 576), ('do', 570), ('one', 566), ('&', 552), (';', 551), ('at', 516), ('an', 509), ('very', 479), ('all', 475), ('books', 472), ('use', 462), ('kindle', 457), ('like', 450), ('has', 442), ('just', 439), ('they', 433), ('more', 425), ('no', 414), ('me', 412), ('get', 409), ('your', 406), ('great', 393), ('would', 389), ('when', 389), ('had', 388), ('good', 385), ('screen', 383), ('than', 373), ('will', 372), ('read', 355), ('up', 343), ('about', 334), ('tv', 333), ('which', 326), ('does', 321), ('only', 318), ('there', 310), ('also', 304), ('tablet', 301), ('out', 299), ('...', 299), ('well', 292), ('n', 281), ('did', 273), ('b', 272), ('device', 271), ('other', 270), ('now', 270), ('some', 264), ('works', 262), ('what', 259), ("''", 257), ('much', 256), ("'m", 256), ('-', 255), (':', 254), ('am', 250), ('them', 249), ('book', 245), ('even', 244), ('bought', 243), ('radio', 241), ('these', 237), ('time', 237), ('buy', 233), ('because', 233), ('price', 232), ('any', 230), ('reading', 229), ('card', 228), ('by', 219), ('easy', 216), ('still', 215), ('could', 214), ('after', 214), ('really', 210), ('back', 208), ("'ve", 207), ('want', 206), ('$', 203), ('mount', 203), ('color', 199), ('used', 199), ('android', 195), ('battery', 194), ('better', 190), ('we', 190)]

In [9]:
def get_bigrams(df = train_df, lower_limit = 10):
    """Collect the bigrams over lowercased tokens that occur more than
    lower_limit times, plus their counts sorted descending."""
    texts = df['text'].values
    texts_lower = [[word.lower() for word in word_tokenize(text)]
                   for text in texts]

    bigram_count = Counter(gram for text in texts_lower
                           for gram in ngrams(text, n=2))
    bigrams = [bigram for bigram, count in bigram_count.items()
               if count > lower_limit]
    # Filtering on the count directly avoids a quadratic
    # `bigram in bigrams` membership test.
    sorted_bigrams = sorted([(bigram, count)
                             for bigram, count in bigram_count.items()
                             if count > lower_limit],
                            key = lambda x: -x[1])
    return bigrams, sorted_bigrams
bigrams, bigram_count = get_bigrams()
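
For intuition, `nltk.ngrams` just slides a fixed-width window over the token list:

In [ ]:
# Quick sanity check of what ngrams() yields for n=2.
list(ngrams(['this', 'cable', 'does', 'the', 'job'], n=2))
# [('this', 'cable'), ('cable', 'does'), ('does', 'the'), ('the', 'job')]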

In [16]:
def word_dataframe(texts = train_df.text.values, 
                   words = dictionary, 
                   labels = train_df['labels'].values):
    """Build a dense term-count matrix: one row per text, one column per
    dictionary word."""
    word_index = {word: i for i, word in enumerate(words)}  # O(1) lookups
    word_occurrences = []
    for text in texts:
        text_occurrences = np.zeros(len(words))
        for word in word_tokenize(text):
            word = word.lower()
            if word in word_index:
                text_occurrences[word_index[word]] += 1
        word_occurrences.append(text_occurrences)
    
    X_words = pd.DataFrame(np.array(word_occurrences), columns = words)
    y = labels
    return X_words, y

X_words, y = word_dataframe()
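
For comparison, sklearn's CountVectorizer builds the same kind of count matrix (sparse, and much faster). A sketch, pinned to the vocabulary from find_words() so the columns line up; counts may differ slightly at the margins because CountVectorizer lowercases the text before tokenizing:

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer

# Sketch: equivalent unigram counts via sklearn, reusing the nltk
# tokenizer and the train-derived vocabulary (ngram_range=(1, 2)
# would cover bigram features as well).
vectorizer = CountVectorizer(tokenizer=word_tokenize,
                             lowercase=True,
                             vocabulary=dictionary)
X_words_sparse = vectorizer.fit_transform(train_df['text'].values)
print(X_words_sparse.shape)  # expected to match X_words.shape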

In [12]:
def bigram_dataframe(texts = train_df.text.values, 
                     bigrams = bigrams):
    """Build a dense bigram-count matrix: one row per text, one column per
    frequent bigram."""
    bigram_index = {gram: i for i, gram in enumerate(bigrams)}  # O(1) lookups
    bigram_occurrences = []
    for text in texts:
        text_occurrences = np.zeros(len(bigrams))
        text_words = [word.lower() for word in word_tokenize(text)]
        for gram in ngrams(text_words, n=2):
            if gram in bigram_index:
                text_occurrences[bigram_index[gram]] += 1
        bigram_occurrences.append(text_occurrences)
    
    cols = [str(gram) for gram in bigrams]
    X_bigrams = pd.DataFrame(np.array(bigram_occurrences), columns = cols)
    return X_bigrams

X_bigrams = bigram_dataframe()
X_bigrams.head()


Out[12]:
('would', 'recommend') ('and', 'get') ('one', 'of') ('think', 'the') ('price', '.') ('and', 'no') ('did', 'not') ('install', '.') ('this', 'item') ('the', 'ipad') ... ('is', 'on') ('and', 'so') ('nook', 'tablet') ('but', 'i') ('if', 'they') ('into', 'the') ('.', 'once') ('from', 'the') ('i', 'did') ('in', 'and')
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 1769 columns


In [14]:
print(train_df['labels'].shape)
print(X_bigrams.shape)


(1050,)
(1050, 1769)

In [22]:
print(X_words.shape)


(1050, 1313)

In [29]:
# Combining the bigrams and unigrams into one dataframe.
X_train = pd.concat([X_bigrams, X_words], axis = 1)
print(X_train.shape)
X_train.head()


(1050, 3082)
Out[29]:
('would', 'recommend') ('and', 'get') ('one', 'of') ('think', 'the') ('price', '.') ('and', 'no') ('did', 'not') ('install', '.') ('this', 'item') ('the', 'ipad') ... option sold ereaders plug covers iphone youtube money wire work
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 3082 columns
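
To score a model later, the held-out split has to be featurized with the vocabulary and bigram list learned from the training data. A sketch using the functions defined above (the pickle filename is hypothetical):

In [ ]:
# Sketch: apply the train-derived features to the held-out split,
# then persist everything with the already-imported pickle module.
X_test_words, y_test = word_dataframe(texts=test_df.text.values,
                                      words=dictionary,
                                      labels=test_df['labels'].values)
X_test_bigrams = bigram_dataframe(texts=test_df.text.values,
                                  bigrams=bigrams)
X_test = pd.concat([X_test_bigrams, X_test_words], axis=1)

with open('features_train_test.pkl', 'wb') as f:  # hypothetical filename
    pickle.dump((X_train, y, X_test, y_test), f)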


In [ ]: