In [1]:
import numpy as np, pandas as pd
from nltk.tokenize import word_tokenize
from collections import Counter
import re
from itertools import chain
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Apply seaborn's 'poster' scaling, then set the default figure size on
# matplotlib directly. Recent seaborn releases filter the `rc` argument of
# set_context() down to context (scaling) keys, so a 'figure.figsize' entry
# passed there is silently dropped and the figures keep matplotlib's default.
sns.set_context('poster')
plt.rcParams['figure.figsize'] = (10, 7)

In [2]:
# Review file consumed by prepare_data(), which reads it as tab-separated text.
path_to_file = 'Amazon_Musical_Instruments.csv'

In [208]:
def prepare_data(path_to_file, tokenizer=None):
    """Load a tab-separated review file and encode each comment as word indices.

    The header is normalised (leading whitespace stripped, lower-cased) and must
    then contain a ``comment`` and a ``rating`` column. Rows with a missing
    value in *any* column of the file are dropped. Each comment is lower-cased,
    every run of characters other than ``a-z`` and ``.,;:()?!`` is collapsed to
    a single space, a space is inserted after every period, and the result is
    tokenized.

    Parameters
    ----------
    path_to_file : str or path-like
        Location of the tab-separated file, handed to ``pandas.read_csv``.
    tokenizer : callable, optional
        Function mapping a cleaned comment string to a list of tokens.
        Defaults to the ``word_tokenize`` imported at the top of the notebook.

    Returns
    -------
    dataset : tuple
        ``(encoded_comments, ratings)`` where ``encoded_comments`` is a list of
        lists of integer word indices and ``ratings`` is the list of values
        from the ``rating`` column, in the same row order.
    counts : dict
        Token -> number of occurrences, ordered from most to least frequent.
    dictionary : dict
        Token -> integer index. The most frequent token gets index 2 and the
        indices increase as frequency decreases.
    reverse_dictionary : dict
        Integer index -> token (inverse of ``dictionary``).
    """
    if tokenizer is None:
        tokenizer = word_tokenize

    df = pd.read_csv(path_to_file, sep='\t').dropna()
    df = df.rename(columns=lambda col: col.lstrip().lower())
    # Work on an independent copy so the column assignments below never write
    # through a view of the full frame (avoids SettingWithCopyWarning).
    df = df[['comment', 'rating']].copy()

    # Be explicit about regex vs. literal matching: the default of `regex`
    # differs between pandas versions, and the cleaning pattern is silently
    # treated as a literal string when the default is False.
    df['comment'] = (
        df['comment']
        .str.lower()
        .str.replace(r'[^a-z.,;:()?!]+', ' ', regex=True)
        .str.replace('.', '. ', regex=False)
    )
    print('Database loaded and formatted ; tokenizing comments...')

    df['comment'] = df['comment'].apply(tokenizer)
    # Stream tokens straight into the counter instead of materialising one
    # flat list of every token in the corpus.
    token_counter = Counter(chain.from_iterable(df['comment']))
    freqs = token_counter.most_common()
    counts = dict(freqs)

    # Indices start at 2; 0 and 1 are left unassigned here -- presumably
    # reserved by downstream code (e.g. padding / unknown). TODO confirm.
    dictionary = {word: idx for idx, (word, _) in enumerate(freqs, start=2)}
    reverse_dictionary = {idx: word for word, idx in dictionary.items()}
    print('Comments tokenized and various utility structures created')

    df['data'] = df['comment'].apply(
        lambda tokens: [dictionary[word] for word in tokens]
    )

    dataset = (list(df['data']), list(df['rating']))
    print("Done")
    return dataset, counts, dictionary, reverse_dictionary

In [209]:
# Same value as assigned to path_to_file in cell In [2].
path_to_file = 'Amazon_Musical_Instruments.csv'
# Encode the reviews and build the token-count and index lookup tables.
dataset, counts, dictionary, reverse_dictionary = prepare_data(path_to_file)


Database loaded and formatted ; tokenizing comments...
Comments tokenized and various utility structures created
Done