In [1]:
import numpy as np, pandas as pd
from nltk.tokenize import word_tokenize
from collections import Counter
import re
from itertools import chain
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# `set_context` only scales plot elements (fonts, line widths, ...). Recent
# seaborn releases drop non-context keys such as 'figure.figsize' from `rc`,
# so the default figure size is set on matplotlib's rcParams directly.
sns.set_context('poster')
plt.rcParams['figure.figsize'] = (10, 7)
In [2]:
# Input file for prepare_data, which reads it with sep='\t', i.e. as
# tab-separated text despite the .csv extension.
path_to_file = 'Amazon_Musical_Instruments.csv'
In [208]:
def prepare_data(path_to_file, tokenizer=None):
    """Load a tab-separated review file and encode each comment as integer ids.

    The file is read with ``sep='\\t'``; rows containing any missing value are
    dropped; column names are left-stripped and lower-cased; only the
    ``comment`` and ``rating`` columns are kept.  Comments are lower-cased,
    every run of characters other than ``a-z . , ; : ( ) ? !`` is collapsed to
    a single space, and a space is inserted after each period before
    tokenizing.

    Parameters
    ----------
    path_to_file : str or path-like
        Location of the tab-separated input file.
    tokenizer : callable, optional
        Function mapping one cleaned comment string to a list of tokens.
        Defaults to ``nltk.tokenize.word_tokenize`` (imported at file level).

    Returns
    -------
    dataset : tuple(list[list[int]], list)
        ``(encoded_comments, ratings)``, aligned row by row.
    counts : dict
        Token -> number of occurrences over all comments.
    dictionary : dict
        Token -> integer id.  Ids start at 2 and follow descending frequency;
        0 and 1 are never assigned (presumably reserved by the downstream
        model, e.g. padding / unknown -- TODO confirm).
    reverse_dictionary : dict
        Integer id -> token (inverse of ``dictionary``).

    Raises
    ------
    KeyError
        If the file has no ``comment`` or ``rating`` column after renaming.
    """
    if tokenizer is None:
        tokenizer = word_tokenize

    df = pd.read_csv(path_to_file, sep='\t')
    df = df.dropna()
    df = df.rename(columns={col: col.lstrip().lower() for col in df.columns})
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the original (SettingWithCopyWarning).
    df = df[['comment', 'rating']].copy()

    df['comment'] = (
        df['comment'].str.lower()
        # regex=True is required: since pandas 2.0 str.replace treats the
        # pattern as a literal by default, which made this step a no-op.
        .str.replace(r'[^a-z.,;:\(\)?!]+', ' ', regex=True)
        # Literal period, not the regex wildcard.
        .str.replace('.', '. ', regex=False)
    )
    print('Database loaded and formatted ; tokenizing comments...')

    df['comment'] = df['comment'].apply(tokenizer)

    # most_common() sorts by descending count; ties keep first-seen order.
    freqs = Counter(chain.from_iterable(df['comment'])).most_common()
    counts = dict(freqs)
    dictionary = {word: idx for idx, (word, _) in enumerate(freqs, start=2)}
    reverse_dictionary = {idx: word for word, idx in dictionary.items()}
    print('Comments tokenized and various utility structures created')

    df['data'] = df['comment'].apply(
        lambda tokens: [dictionary[tok] for tok in tokens]
    )
    dataset = (list(df['data']), list(df['rating']))
    print("Done")
    return dataset, counts, dictionary, reverse_dictionary
In [209]:
# Re-assigns the same path as the earlier cell, then loads the file and builds
# the encoded dataset plus the token-count and token<->id lookup dicts.
path_to_file = 'Amazon_Musical_Instruments.csv'
dataset, counts, dictionary, reverse_dictionary = prepare_data(path_to_file)