Per-Review Analysis


Note: all paths are relative to this file's location inside project/. The following folders are expected:

  • data/
  • processed/ , processed/word-freq-by-review/ , processed/pandas/

This notebook fits per-review regression models (logistic regression in the code below) to predict individual beers and styles.


In [1]:
# IMPORTS
from __future__ import print_function
from pandas import *
from sklearn import linear_model
from sklearn.cross_validation import cross_val_score
import sklearn
import csv
import os
import numpy as np
from collections import defaultdict

# Constants
REVIEWS_CSV = 'processed/word-freq-by-review/'
ALL_WORDS_FILE = 'processed/all_sorted'
DATA_DIR = 'processed/pandas/'

# Word Filtering
SMALL_A = ord('a')  # 97
SMALL_Z = ord('z')  # 122 — the original SMALL_A + 26 == 123 ('{'), an off-by-one

def is_worth_keeping(word):
    """Return True if `word` is a usable vocabulary entry.

    A word is kept when, after lowercasing and stripping surrounding
    whitespace, it is at least 3 characters long and consists only of the
    ASCII letters a-z.

    Bug fixed: the original upper bound (SMALL_A + 26 == 123) is the code
    point of '{', so words containing '{' incorrectly passed the filter.
    """
    word = word.lower().strip()
    if len(word) < 3:
        return False
    for c in word:
        # Reject anything outside the inclusive a..z range.
        if ord(c) < SMALL_A or ord(c) > SMALL_Z:
            return False
    return True

def walkDir(base):
    """Depth-first collect file paths under `base` (via os.walk), each
    returned as a '/'-joined path that includes `base` itself.

    Notes:
      - The first os.walk entry (base itself) is deliberately skipped, so
        files sitting directly in `base` are NOT returned — only files
        inside subdirectories.
      - Any path containing '.txt' anywhere is filtered out.
    """
    collected = []
    for dirpath, _subdirs, filenames in list(os.walk(base))[1:]:
        for filename in filenames:
            collected.append(dirpath + '/' + filename)
    return [p for p in collected if '.txt' not in p]

In [4]:
# Only refresh this cell when needed!!!
# --------------------------------------
# Re-running this cell wipes all state that later cells have populated.
ALL_WORDS = []  # usable vocabulary, filled from ALL_WORDS_FILE in a later cell
# Map an id to an array for row numbers
TRAINING_ROWS = {}  # beer id -> list of training row numbers
TEST_ROWS = {}  # beer id -> list of test row numbers
# Style: [beer ids]
STYLES = defaultdict(list)
# data...
DATA = {}  # beer id -> DataFrame parsed from that id's CSV (see loader cell)

In [9]:
# Every entry under data/ is a style, except macOS cruft (.DS_Store) and
# the aggregate all_beers.txt file.
ALL_STYLES = [entry for entry in os.listdir('data')
              if entry not in ('.DS_Store', 'all_beers.txt')]

print("%d styles" % len(ALL_STYLES))


104 styles

In [18]:
# Read In Files
# Load the sorted word list and keep only the words that pass the filter.
# Fix: readlines() leaves a trailing '\n' on every line, and the original
# comprehension stored the UNSTRIPPED lines into ALL_WORDS (the filter only
# strips its own local copy). Strip each kept word here.
with open(ALL_WORDS_FILE, 'r') as f:
    lines = f.readlines()
    print('Found:', len(lines), 'total words')
    ALL_WORDS = [word.strip() for word in lines if is_worth_keeping(word)]
print('Found:', len(ALL_WORDS), 'usable words')


Found: 259005 total words
Found: 232090 usable words

In [12]:
# testing ground! 

# NOTE(review): this cell fails in this environment's Python 2 pandas with
# "unsupported pickle protocol: 4" (traceback below) — the .pkl file was
# written by Python 3 using pickle protocol 4, which Python 2's pickle
# cannot read. Re-save the pickle with protocol <= 2, or run this notebook
# under Python 3.
ipa_data = 'processed/pandas/American_IPA.pkl'
data = read_pickle(ipa_data)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-12-d18eb8ddc69f> in <module>()
      2 
      3 ipa_data = 'processed/pandas/American_IPA.pkl'
----> 4 data = read_pickle(ipa_data)

/Library/Python/2.7/site-packages/pandas/io/pickle.pyc in read_pickle(path)
     58 
     59     try:
---> 60         return try_read(path)
     61     except:
     62         if PY3:

/Library/Python/2.7/site-packages/pandas/io/pickle.pyc in try_read(path, encoding)
     55             except:
     56                 with open(path, 'rb') as fh:
---> 57                     return pc.load(fh, encoding=encoding, compat=True)
     58 
     59     try:

/Library/Python/2.7/site-packages/pandas/compat/pickle_compat.pyc in load(fh, encoding, compat, is_verbose)
    114         up.is_verbose = is_verbose
    115 
--> 116         return up.load()
    117     except:
    118         raise

/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.pyc in load(self)
    856             while 1:
    857                 key = read(1)
--> 858                 dispatch[key](self)
    859         except _Stop, stopinst:
    860             return stopinst.value

/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/pickle.pyc in load_proto(self)
    884         proto = ord(self.read(1))
    885         if not 0 <= proto <= 2:
--> 886             raise ValueError, "unsupported pickle protocol: %d" % proto
    887     dispatch[PROTO] = load_proto
    888 

ValueError: unsupported pickle protocol: 4

In [11]:
# Build the classifier. Note: despite the earlier "Linear Regression"
# framing, this is LOGISTIC regression (a classifier) — appropriate since
# the targets (beer / style identity) are categorical, not continuous.
# The redundant `from sklearn import linear_model` was removed — it is
# already imported in the first cell.
logreg = linear_model.LogisticRegression(C=1e5)  # C=1e5 -> very weak L2 regularization

In [ ]:
# IGNORE THIS CODE FOR NOW...
# ----------------------------
# Load every per-review word-frequency CSV into DATA, and record which beer
# ids belong to each style. Paths look like
# processed/word-freq-by-review/<style>/<beer_id>, so parts[2]/parts[3]
# pick out the style and the beer id.

paths = walkDir(REVIEWS_CSV)
errors = []
for p in paths:
    parts = p.split('/')
    style = parts[2]
    beer_id = parts[3]  # renamed from `id` to avoid shadowing the builtin
    # Setup
    STYLES[style] += [beer_id]  # each style maps to a list of beer/file ids
    TRAINING_ROWS[beer_id] = []
    TEST_ROWS[beer_id] = []
    if beer_id not in DATA:
        try:
            csv_data = read_csv(p, header=0)
            # Drop the junk first column (unnamed index written by the CSV
            # exporter). `.icol(0)` is deprecated; `columns[0]` is the
            # supported equivalent.
            del csv_data[csv_data.columns[0]]
            DATA[beer_id] = csv_data
        except Exception as e:
            # Best-effort load: report and remember the bad path, keep going.
            print(e)
            errors.append(p)
            print(p)

In [ ]:


In [ ]: