In [40]:
# IMPORTS
from __future__ import print_function
from pandas import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
import sklearn
import csv
import os
import numpy as np
from collections import defaultdict
# Constants
# Root of the per-review word-frequency CSVs, laid out as
# processed/word-freq-by-review/<style>/<beer id>.
REVIEWS_CSV = 'processed/word-freq-by-review/'
# One word per line, pre-sorted; filtered into ALL_WORDS below.
ALL_WORDS_FILE = 'processed/all_sorted'
# Output directory for serialized pandas frames (written by later cells).
DATA_DIR = 'processed/pandas/'
# Usable vocabulary, populated after filtering ALL_WORDS_FILE.
ALL_WORDS = []
# Map an id to an array for row numbers
TRAINING_ROWS = {}
TEST_ROWS = {}
# Style: [beer ids]
STYLES = defaultdict(list)
# Flat list of styles — presumably filled by a later cell; empty here.
ALL_STYLES = []
# data...
# beer id -> DataFrame of word frequencies for that beer's reviews.
DATA = {}
# Word Filtering
# Word Filtering
# Inclusive lowercase-ASCII letter range: ord('a') == 97 .. ord('z') == 122.
SMALL_A = 97
# BUGFIX: was SMALL_A + 26 (== 123, ord('{')); with the inclusive
# comparison below that let '{' slip through the filter.
SMALL_Z = SMALL_A + 25


def is_worth_keeping(word):
    """
    Decide whether *word* belongs in the usable vocabulary.

    The word is lowercased and stripped of surrounding whitespace
    (including the trailing newline from readlines()), then kept only if
    it is at least 3 characters long and consists purely of ASCII
    letters a-z.

    :param word: candidate word (any case, may carry whitespace)
    :return: True if the normalised word is worth keeping
    """
    word = word.lower().strip()
    if len(word) < 3:
        return False
    # Every character must fall inside [a, z]; anything else disqualifies.
    return all(SMALL_A <= ord(c) <= SMALL_Z for c in word)
def walkDir(base):
    """
    Collect file paths under ``base``, depth-first.

    Files sitting directly in ``base`` itself are deliberately skipped
    (only subdirectory contents are returned), and any path containing
    the substring '.txt' is dropped.  Returned paths are prefixed with
    ``base`` and joined with '/'.

    :param base: directory to walk
    :return: list of '/'-joined file paths
    """
    # Drop the first os.walk() entry: that is base itself, whose
    # direct files are intentionally ignored.
    subdir_entries = list(os.walk(base))[1:]
    collected = []
    for dirpath, _dirnames, filenames in subdir_entries:
        collected.extend(dirpath + '/' + name for name in filenames)
    # Keep only paths with no '.txt' anywhere in them.
    return [path for path in collected if '.txt' not in path]
In [ ]:
In [18]:
# Read In Files
# Load the master word list and keep only the words that pass the
# is_worth_keeping filter; the result becomes the working vocabulary.
with open(ALL_WORDS_FILE, 'r') as f:
    lines = f.readlines()
print('Found:', len(lines), 'total words')
ALL_WORDS = list(filter(is_worth_keeping, lines))
print('Found:', len(ALL_WORDS), 'usable words')
In [ ]:
# Load every per-review CSV into DATA, keyed by beer id.
# Path layout is processed/word-freq-by-review/<style>/<id>, so after
# split('/'): parts[2] is the style and parts[3] is the beer id.
paths = walkDir(REVIEWS_CSV)
errors = []
for p in paths:
    parts = p.split('/')
    style = parts[2]
    id = parts[3]  # NOTE(review): shadows builtin id(); kept so later cells still see it
    # Setup
    STYLES[style] += [id]  # each style accumulates the ids of its files
    TRAINING_ROWS[id] = []
    TEST_ROWS[id] = []
    if id not in DATA:
        try:
            csv_data = read_csv(p, header=0)
            # Cleanup bad first columns...
            # BUGFIX: DataFrame.icol() was removed from pandas (0.21+),
            # so csv_data.icol(0).name raised and every load silently
            # fell into the except branch.  columns[0] is the
            # supported way to name the first column.
            del csv_data[csv_data.columns[0]]
            DATA[id] = csv_data
        except Exception as e:
            # Best-effort load: report the failure and keep going.
            print(e)
            errors.append(p)
            print(p)
In [ ]:
len(DATA.keys())
In [ ]:
In [ ]: