In [1]:
import os
path = r'C:\users\johannes\ProjectAmazonTextAnalysis\johannes'
os.chdir(path)
import pickle
import pandas as pd
import numpy as np
import sklearn
print(sklearn.__version__)
from sklearn.model_selection import train_test_split
from collections import Counter
import gzip
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk import ngrams
from nltk.corpus import stopwords
import time
In [2]:
# import spacy
# nlp = spacy.load('en')
In [3]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)


sample_size = 20000


def get_training_data(path):
    """
    Get all usable data
    :param path: path to compressed data
    :return: pandas data frame
    """
    i = 0
    df = {}
    for d in parse(path):
        i += 1
        if i <= sample_size:
            df[i] = d
        else:
            break
        if (i + 1) % 1000 == 0:
            print("Step:", i + 1)
    return pd.DataFrame.from_dict(df, orient='index')
# def get_test_data(path):
#     """
#     Do not call this before the real test!!!!
#     """
#     pass
#     i = 0
#     df = {}
#     for d in parse(path):
#         i += 1
#         if i > 1400000:
#             df[i] = d
#     return pd.DataFrame.from_dict(df, orient='index')
start_time = time.time()
df = get_training_data('reviews_Electronics_5.json.gz')
print("Time :", time.time() - start_time)
df_1 = df
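# Note: eval() on every review line works for this dump but will execute arbitrary
# Python. A safer drop-in sketch (illustrative only, not used below) is
# ast.literal_eval, or json.loads if the file is strict JSON:
# import ast
# def parse_safe(path):
#     with gzip.open(path, 'rb') as g:
#         for line in g:
#             yield ast.literal_eval(line.decode('utf-8'))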
In [4]:
%%time


def fix_dataframe(df=df_1):
    y = df['overall'].values
    X = df['reviewText']
    df = pd.DataFrame(np.column_stack((X, y)), columns=['text', 'review_labels'])
    return df


df = fix_dataframe(df_1)
In [5]:
%%time


def split_data(df=df):
    train_df, test_df = train_test_split(df)
    # print(train_df.head())
    # return pd.DataFrame(train_df, columns=['text', 'labels']), pd.DataFrame(test_df, columns=['text', 'labels'])
    return train_df, test_df


train_df, test_df = split_data(df)
train_df.head()
print(train_df.shape)
In [6]:
# Using the standard stopwords given by nltk. Feature relevance can also be
# restricted by word-frequency limits, or by a document-frequency limit (a word
# must appear in at most a given share of the reviews); a sketch of the latter follows this cell.
start_time = time.time()
def find_words(df=train_df,
               stopword=False,
               word_frequency=[sample_size, np.log(sample_size)],
               number_of_words=3000):
    # stemmer = SnowballStemmer('english')
    start_time = time.time()
    texts = df['text'].values
    # dictionary = np.unique([word.lower() for text in texts for word in word_tokenize(text)])
    # word_count = Counter([word.lower() for text in texts for word in word_tokenize(text)])
    if stopword is False:
        word_count = Counter([word.lower() for text in texts
                              for word in word_tokenize(text)])
        if word_frequency is not None:
            word_count = {word: count for word, count in word_count.items()
                          if count < word_frequency[0] and count > word_frequency[1]}
    elif stopword is True:
        word_count = Counter([word.lower()
                              for text in texts
                              for word in word_tokenize(text)
                              if word not in stopwords.words('english')])
    else:
        raise ValueError('stopword argument needs to be True/False')
    print('first_time:', time.time() - start_time)
    # Sort by frequency first so the returned dictionary keeps the most common words.
    word_count = sorted(word_count.items(), key=lambda x: -x[1])
    dictionary = [word for word, count in word_count]
    return word_count, dictionary[:number_of_words]
word_freq, dictionary = find_words()
# print(word_freq[:100])
print('Total time:', time.time() - start_time)
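# Illustrative sketch of the document-frequency limit mentioned above (not used in
# this notebook; the names max_doc_frac and doc_freq are made up for the sketch):
# keep only words that occur in at most max_doc_frac of the reviews.
# def filter_by_doc_frequency(texts, words, max_doc_frac=0.5):
#     doc_freq = Counter(w for text in texts
#                        for w in set(word.lower() for word in word_tokenize(text)))
#     limit = max_doc_frac * len(texts)
#     return [word for word in words if doc_freq[word] <= limit]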
In [7]:
def find_bigrams(words):
    return zip(words, words[1:])
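# Quick illustration of what find_bigrams yields:
# list(find_bigrams(['great', 'battery', 'life']))  # -> [('great', 'battery'), ('battery', 'life')]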
In [9]:
start_time = time.time()
def get_bigrams(df=train_df,
                lower_limit=np.log(sample_size),
                upper_limit=sample_size,
                number_of_bigrams=1000):
    texts = df['text'].values
    # lower-case the texts
    texts_lower = [[word.lower() for word in word_tokenize(text)] for text in texts]
    # bigrams from the lower-cased texts
    bigrams = [gram for text in texts_lower for gram in find_bigrams(text)]
    # bigram counts
    bigram_count = Counter(bigrams)
    # start_time = time.time()
    # bigrams = [bigram for bigram, count in bigram_count.items() if count > lower_limit]
    sorted_bigrams = sorted([(bigram, count)
                             for bigram, count in bigram_count.items()
                             if count > lower_limit and count < upper_limit],
                            key=lambda x: -x[1])
    bigrams = [bigram for bigram, count in sorted_bigrams]
    # sorted_bigrams = np.sort(np.array(bigram_count.items()))
    # sorted_bigrams = sorted(bigram_count.items(), key = lambda x: -x[1])
    # sorted_bigrams = [i for i in sorted_bigrams if i[0] in bigrams]
    # print('sort bigram time:', time.time() - start_time)
    return bigrams[:number_of_bigrams], sorted_bigrams, texts_lower


# Example frequent bigrams: (',', 'but'), ('do', "n't"), ('the', 'price'), ('.', 'if'), ('but', 'it'), ('did', "n't")
bigrams, bigram_count, texts_lower = get_bigrams()
# print(bigrams)
# print(bigram_count[:100])
print('Total time:', time.time() - start_time)
In [10]:
# start_time = time.time()
#
#
# def word_dataframe(texts=train_df.text.values,
#                    words=dictionary):
#     word_occurrences = []
#     start_time = time.time()
#     # texts_lower = [[word.lower() for word in text] for text in texts]
#     for text in texts:
#         text_occurrences = np.zeros(len(words))
#         for word in word_tokenize(text):
#             word = word.lower()
#             if word in words:
#                 index = words.index(word)
#                 text_occurrences[index] += 1
#         word_occurrences.append(text_occurrences)
#     print('loop time:', time.time() - start_time)
#     X_words = pd.DataFrame(np.array(word_occurrences), columns=words)
#     return X_words
#
#
# _ = word_dataframe()
#
# print('Total time:', time.time() - start_time)
# _.head()
In [11]:
# start_time = time.time()
def word_dataframe(texts=train_df.text.values,
                   words=dictionary):
    texts_lower = [[word.lower() for word in word_tokenize(text)] for text in texts]
    counts = [Counter(text) for text in texts_lower]
    word_occurrences = np.array([[counts[i][word] for word in words] for i in range(len(counts))])
    X_words = pd.DataFrame(word_occurrences, columns=words)
    return X_words
# _ = word_dataframe()
# print('total time:', time.time() - start_time)
# _.head()
In [12]:
# start_time = time.time()
#
# def bigram_dataframe(texts=train_df.text.values,
#                      bigrams=bigrams):
#     bigram_occurrences = []
#     for text in texts:
#         text_occurrences = np.zeros(len(bigrams))
#         text_words = [word.lower() for word in word_tokenize(text)]
#         bigrams_in_text = [gram for gram in find_bigrams(text_words)]
#         for gram in bigrams_in_text:
#             if gram in bigrams:
#                 index = bigrams.index(gram)
#                 text_occurrences[index] += 1
#         bigram_occurrences.append(text_occurrences)
#
#     cols = [str(gram) for gram in bigrams]
#     # print(cols)
#     X_bigrams = pd.DataFrame(np.array(bigram_occurrences), columns=cols)
#     return X_bigrams
#
#
# _ = bigram_dataframe()
# print('Total time:', time.time() - start_time)
# _.head()
In [13]:
# start_time = time.time()
def bigram_dataframe(texts=train_df.text.values,
                     bigrams=bigrams):
    texts_lower = [[word.lower() for word in word_tokenize(text)] for text in texts]
    # print(texts_lower[0])
    bigrams_in_text = [[gram for gram in find_bigrams(text)] for text in texts_lower]
    bigram_counts = [Counter(grams) for grams in bigrams_in_text]
    bigram_occurrences = np.array([[bigram_counts[i][gram] for gram in bigrams] for i in range(len(bigram_counts))])
    cols = [str(gram) for gram in bigrams]
    X_bigrams = pd.DataFrame(bigram_occurrences, columns=cols)
    return X_bigrams
# _ = bigram_dataframe()
# print('total time:', time.time() - start_time)
# _.head()
# bigram_dataframe()
In [14]:
start_time = time.time()
# Dataframes of bigrams/unigrams in the train and test datasets.
train_bigrams = bigram_dataframe(texts = train_df.text.values)
test_bigrams = bigram_dataframe(texts = test_df.text.values)
train_unigrams = word_dataframe(texts = train_df.text.values)
test_unigrams = word_dataframe(texts = test_df.text.values)
print('Total time:', time.time() - start_time)
In [15]:
print(train_unigrams.head())
print(train_bigrams.head())
print(20*'#')
print(train_unigrams.shape)
print(test_unigrams.shape)
In [16]:
# Combine the bigram and unigram dataframes into one feature dataframe.
def merge_grams(unigrams, bigrams):
    combined = pd.concat([bigrams, unigrams], axis=1)
    return combined
In [17]:
# Get the length of each review
train_lengths = [len(word_tokenize(text)) for text in train_df.text.values]
test_lengths = [len(word_tokenize(text)) for text in test_df.text.values]
print(np.array(train_lengths).shape)
print(np.array(test_lengths).shape)
# lengths_test = [len(text) for text in word_tokenize(test_df.text.values)]
In [18]:
X_train = merge_grams(train_bigrams, train_unigrams)
X_test = merge_grams(test_bigrams, test_unigrams)
y_train = train_df['review_labels']
y_test = test_df['review_labels']
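# Note: the feature frames use a fresh 0..n-1 index, while y_train / y_test keep
# the shuffled index from train_test_split. Rows still match positionally; if the
# frames are later recombined with pandas, resetting the label index keeps them
# aligned (optional sketch):
# y_train = y_train.reset_index(drop=True)
# y_test = y_test.reset_index(drop=True)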
In [19]:
X_train['review_length'] = train_lengths
X_test['review_length'] = test_lengths
In [20]:
X_train.to_pickle('X_train.pkl')
X_test.to_pickle('X_test.pkl')
y_train.to_pickle('y_train.pkl')
y_test.to_pickle('y_test.pkl')
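# The saved features/labels can later be reloaded with, for example:
# X_train = pd.read_pickle('X_train.pkl')
# y_train = pd.read_pickle('y_train.pkl')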
In [ ]: