Per-Review Analysis


This notebook loads the per-review word-frequency CSVs, filters the master word list down to a usable vocabulary, and assembles a per-beer-id DataFrame grouped by style, in preparation for random-forest classification.


In [40]:
# IMPORTS
from __future__ import print_function
# NOTE(review): wildcard import pollutes the namespace; of the names it brings
# in, only read_csv is visibly used below.
from pandas import *
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): sklearn.cross_validation was deprecated in 0.18 and removed in
# 0.20; modern sklearn exposes cross_val_score via sklearn.model_selection.
from sklearn.cross_validation import cross_val_score
import sklearn
import csv
import os
import numpy as np
from collections import defaultdict

# Constants
REVIEWS_CSV = 'processed/word-freq-by-review/'  # root dir of per-review word-frequency CSVs (layout: <root>/<style>/<id>)
ALL_WORDS_FILE = 'processed/all_sorted'  # master word list, one word per line
DATA_DIR = 'processed/pandas/'  # presumably where pandas frames get persisted -- not used in this chunk, TODO confirm
ALL_WORDS = []  # populated below with words passing is_worth_keeping()
# Map an id to an array for row numbers
TRAINING_ROWS = {}  # id -> list of row numbers (initialized empty per id below)
TEST_ROWS = {}  # id -> list of row numbers (initialized empty per id below)
# Style: [beer ids]
STYLES = defaultdict(list)  # style name -> list of file ids found under that style dir
ALL_STYLES = []  # not populated in this chunk
# data...
DATA = {}  # id -> DataFrame loaded from that id's CSV


# Word Filtering
SMALL_A = 97  # ord('a')
# BUGFIX: was SMALL_A + 26 == 123 == ord('{'), so the range check below
# (ord(c) > SMALL_Z) silently accepted '{'. ord('z') is 97 + 25 == 122.
SMALL_Z = SMALL_A + 25

def is_worth_keeping(word):
    """
    Decide whether a word belongs in the vocabulary.

    A word is kept when, after lowercasing and stripping surrounding
    whitespace, it is at least 3 characters long and consists solely of
    ASCII letters a-z. Returns True to keep, False to discard.
    """
    word = word.lower()
    word = word.strip()
    if len(word) >= 3:
        for c in word:
            if ord(c) < SMALL_A or ord(c) > SMALL_Z:
                return False
        return True
    return False

def walkDir(base):
    """
    Collect the path of every file living in the subdirectories of ``base``.

    Paths are built as ``<dirpath>/<filename>`` relative to ``base``. The
    first tuple produced by ``os.walk`` -- the base directory itself -- is
    skipped, so files sitting directly in ``base`` are excluded, as is any
    path containing the substring '.txt'.
    """
    walker = iter(os.walk(base))
    next(walker, None)  # drop the entry for the base directory itself
    candidates = [
        dirpath + '/' + filename
        for dirpath, _, filenames in walker
        for filename in filenames
    ]
    return [p for p in candidates if '.txt' not in p]

In [ ]:


In [18]:
# Read In Files
# Build the usable vocabulary from the master word list.
with open(ALL_WORDS_FILE, 'r') as f:
    lines = f.readlines()
    print('Found:', len(lines), 'total words')
    # BUGFIX: is_worth_keeping() validates the stripped, lowercased form of the
    # word, but the original stored the raw readline() result -- including its
    # trailing '\n' -- so entries could never match clean word keys downstream.
    # Store the stripped word instead.
    ALL_WORDS = [word.strip() for word in lines if is_worth_keeping(word)]
print('Found:', len(ALL_WORDS), 'usable words')


Found: 259005 total words
Found: 232090 usable words

In [ ]:
# Load every per-review CSV under REVIEWS_CSV into DATA, indexed by file id.
# Path layout is <root>/<style>/<id>, so parts[2] is the style dir name and
# parts[3] the file id.
paths = walkDir(REVIEWS_CSV)
errors = []  # paths that failed to parse, for later inspection
for p in paths:
    parts = p.split('/')
    style = parts[2]
    beer_id = parts[3]  # renamed from `id`, which shadows the builtin
    # Setup
    STYLES[style] += [beer_id]  # contains an array of file ids
    TRAINING_ROWS[beer_id] = []
    TEST_ROWS[beer_id] = []
    if beer_id not in DATA:
        try:
            csv_data = read_csv(p, header=0)
            # Cleanup bad first columns: drop the unnamed index column the
            # export left at position 0. (DataFrame.icol was deprecated in
            # pandas 0.11 and removed in 0.21; .columns[0] is the supported
            # way to get the first column's label.)
            del csv_data[csv_data.columns[0]]
            DATA[beer_id] = csv_data
        except Exception as e:
            # Best-effort load: report the failure and keep going.
            print(e)
            errors.append(p)
            print(p)

In [ ]:
# Number of distinct file ids successfully loaded (idiomatic: len() works
# directly on the dict; no need to materialize .keys()).
len(DATA)

In [ ]:


In [ ]: