Note: All paths are relative to the location of this file inside project/.
The following folders are expected to exist:
data/, processed/word-freq-by-review/, processed/pandas/
In [1]:
# IMPORTS
from __future__ import print_function
from pandas import *
from sklearn import linear_model
from sklearn.cross_validation import cross_val_score
import sklearn
import csv
import os
import numpy as np
from collections import defaultdict
# Constants -- paths are relative to this file's location (see note at top).
REVIEWS_CSV = 'processed/word-freq-by-review/'  # per-review word-frequency CSVs, grouped by style
ALL_WORDS_FILE = 'processed/all_sorted'  # candidate words, one per line (name suggests pre-sorted)
DATA_DIR = 'processed/pandas/'  # pickled pandas DataFrames, one per beer style
# Word Filtering
SMALL_A = ord('a')  # 97
# BUGFIX: was SMALL_A + 26 == 123 == ord('{'), so the range check below
# wrongly accepted '{'. The last lowercase letter is 'z' (122) = SMALL_A + 25.
SMALL_Z = ord('z')  # 122


def is_worth_keeping(word):
    """Decide whether a candidate vocabulary word is usable.

    The word is lowercased and stripped of surrounding whitespace first
    (so raw lines with trailing newlines are fine). It is kept only when
    it is at least 3 characters long and consists purely of ASCII
    letters a-z.

    Returns True to keep the word, False to discard it.
    """
    word = word.lower().strip()
    if len(word) < 3:
        return False
    # Every character must fall inside the inclusive a..z range.
    return all(SMALL_A <= ord(c) <= SMALL_Z for c in word)
def walkDir(base):
    """
    Generate a single list of all the files in a directory
    DFS, and include the full path relative to base.

    Behavior notes: the first os.walk entry (base's own direct files)
    is deliberately skipped, and any path containing the substring
    '.txt' anywhere in it is dropped.
    """
    subdir_entries = list(os.walk(base))[1:]
    collected = []
    for dirpath, _subdirs, filenames in subdir_entries:
        collected.extend(dirpath + '/' + name for name in filenames)
    return [p for p in collected if '.txt' not in p]
In [4]:
# Only refresh this cell when needed!!!
# --------------------------------------
# Module-level state shared by the cells below; re-running it wipes
# everything already loaded.
ALL_WORDS = []  # usable vocabulary, filled by the word-file cell
# Map an id to an array for row numbers
TRAINING_ROWS = {}  # file id -> row numbers used for training
TEST_ROWS = {}  # file id -> row numbers used for testing
# Style: [beer ids]
STYLES = defaultdict(list)
# data: file id -> DataFrame of per-review word frequencies
DATA = {}
In [9]:
# Every entry under data/ is a beer-style folder, except macOS junk and
# the combined all_beers.txt file.
ALL_STYLES = [entry for entry in os.listdir('data')
              if entry not in ('.DS_Store', 'all_beers.txt')]
print("%d styles" % len(ALL_STYLES))
In [18]:
# Read In Files: load the candidate word list and keep the usable entries.
# NOTE(review): the kept entries retain their trailing newlines --
# is_worth_keeping strips only its local copy; confirm downstream code
# expects that.
with open(ALL_WORDS_FILE, 'r') as f:
    lines = f.readlines()
print('Found:', len(lines), 'total words')
ALL_WORDS = list(filter(is_worth_keeping, lines))
print('Found:', len(ALL_WORDS), 'usable words')
In [12]:
# testing ground! Load one style's pickled word-frequency DataFrame.
ipa_data = DATA_DIR + 'American_IPA.pkl'
data = read_pickle(ipa_data)
In [11]:
# Create a logistic-regression classifier (LogisticRegression is a
# classifier, not linear regression). C=1e5 means very weak regularization.
# NOTE(review): linear_model is re-imported here although it is already
# imported at the top of the file -- harmless in a notebook cell.
from sklearn import linear_model
logreg = linear_model.LogisticRegression(C=1e5)
In [ ]:
# IGNORE THIS CODE FOR NOW...
# ----------------------------
# Index every per-review word-frequency CSV under REVIEWS_CSV:
#   - STYLES maps style name -> list of beer-file ids
#   - TRAINING_ROWS / TEST_ROWS get an empty row list per id
#   - DATA caches the parsed DataFrame per id, with its first column
#     dropped (it is a junk index column from the preprocessing step)
# Paths look like processed/word-freq-by-review/<style>/<id>, hence the
# split-by-'/' index positions below.
paths = walkDir(REVIEWS_CSV)
errors = []
for p in paths:
    parts = p.split('/')
    style = parts[2]
    beer_id = parts[3]  # renamed from 'id', which shadowed the builtin
    # Setup
    STYLES[style] += [beer_id]  # contains an array of file ids
    TRAINING_ROWS[beer_id] = []
    TEST_ROWS[beer_id] = []
    if beer_id not in DATA:
        try:
            csv_data = read_csv(p, header=0)
            # Cleanup bad first column. BUGFIX: DataFrame.icol() was
            # deprecated and removed from pandas; columns[0] is the
            # supported way to name the first column.
            del csv_data[csv_data.columns[0]]
            DATA[beer_id] = csv_data
        except Exception as e:
            # Best-effort load: record and report the bad file, keep going.
            print(e)
            errors.append(p)
            print(p)
In [ ]:
In [ ]: