TensorFlow attempt on HeatReplay


In [1]:
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from tensorflow.contrib import learn
from tqdm import tqdm

from context import *
from util.dfmgmt import initSet, wrangle

In [2]:
# Build the initial data set with initSet() (imported from util.dfmgmt),
# then keep only the rows whose 'decade' value is not 2010.
df = initSet()
df = df[df['decade'] != 2010]

In [3]:
# Column lists handed to wrangle().
# NOTE(review): wrangle() lives in util.dfmgmt and is not visible here. The
# frame displayed below still shows 'year', 'decade' and 'charted', so
# removeList presumably only excludes those columns from `features` rather
# than from df itself -- TODO confirm against wrangle().
dropList = ['most_used_term']
removeList = ['decade', 'year', 'charted']
target = 'charted'  # name of the column to be predicted
# NOTE(review): the meaning of the trailing True flag is not visible in this notebook.
df, features = wrangle(df, dropList, removeList, True)

# Preview the first rows of the frame returned by wrangle().
df.head()


Out[3]:
year decade unique_words density unique_words_raw density_raw nouns verbs adjectives syllables most_used_freq explicit total_curses reading_score sentiment charted
0 1961 1960 36 72 65 158 34 30 10 70 7 0 0 2.367848 0.9901 0
1 1961 1960 45 91 74 197 45 37 19 81 6 0 0 2.771777 0.9712 1
2 1961 1960 54 103 88 223 45 48 17 98 10 0 0 3.885650 0.9974 1
3 1961 1960 42 148 66 263 81 61 36 76 24 0 0 2.889886 0.9993 1
4 1961 1960 28 131 60 354 56 77 5 57 38 0 0 2.940000 0.9812 1

In [4]:
# Pull the feature columns and the target column out of the frame as NumPy
# arrays. `.values` is used instead of DataFrame.as_matrix(), which is
# deprecated and was removed in pandas 1.0; `.values` works on old and new
# pandas alike.
X = df[features].values
y = df[target].values

In [5]:
# Single hold-out split (not k-fold cross validation): test_size=0.2 sets
# aside 20% of the rows for testing, and the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [6]:
def gridSearchTF(X_train, X_test, y_train, y_test, units, steps=200, batch_size=64):
    """Train one DNNClassifier configuration and return its test accuracy.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``clf.fit``.
    X_test, y_test : held-out features and labels used for scoring.
    units : str or iterable
        Hidden-layer sizes, either as a comma-separated string such as
        ``'48,63,95'`` or as an iterable of values convertible to ``int``.
    steps : int
        Number of training steps passed to ``clf.fit``.
    batch_size : int
        Mini-batch size passed to ``clf.fit``.

    Returns
    -------
    The value of ``metrics.accuracy_score`` on the test split.

    Raises
    ------
    ValueError
        If ``units`` contains no layer sizes or a size is not an integer.
    """
    # Accept the comma-separated form; tolerate stray spaces / empty fields.
    if hasattr(units, 'split'):
        units = [part.strip() for part in units.split(',') if part.strip()]

    # Layer sizes must be integers, not the raw strings produced by split().
    hidden_units = [int(u) for u in units]
    if not hidden_units:
        raise ValueError('units must contain at least one hidden layer size')

    # The number of classes is a property of the labels, not of the network
    # depth. Labels are assumed to be integers 0..k-1 (as the classifier
    # expects), so k = max label + 1, with a floor of 2 for binary targets.
    n_classes = max(2, int(max(y_train)) + 1)

    # Instantiate model
    clf = learn.DNNClassifier(hidden_units=hidden_units, n_classes=n_classes)

    # Train model
    clf.fit(X_train, y_train, steps=steps, batch_size=batch_size)

    # Score model; list() also covers versions whose predict() yields lazily.
    predictions = list(clf.predict(X_test))

    return metrics.accuracy_score(y_test, predictions)

In [7]:
from random import sample, shuffle

def randomList(nClasses):
    """Return ``nClasses`` random integers as a comma-separated string.

    Each integer is drawn uniformly from 30..100 inclusive. Despite the
    parameter name, callers in this notebook use the result as hidden-layer
    sizes, so ``nClasses`` is effectively the number of layers.

    Returns an empty string when ``nClasses`` is zero or negative.
    """
    # Local import: the cell-level import only brings in sample/shuffle.
    from random import randint

    # One uniform draw per entry (no need to sample ten values and keep one);
    # range() keeps this runnable on both Python 2 and Python 3.
    return ','.join(str(randint(30, 100)) for _ in range(nClasses))

def randomizer(nClasses, nSearches):
    """Build a list of ``nSearches`` random configurations.

    Every entry is the string returned by ``randomList(nClasses)``. Entries
    are generated independently, so the list may contain duplicates. The
    list is shuffled in place before being returned.
    """
    # range() instead of xrange() so the cell runs on Python 2 and Python 3.
    hiddenUnits = [randomList(nClasses) for _ in range(nSearches)]

    shuffle(hiddenUnits)

    return hiddenUnits

In [8]:
# Draw 10 random three-layer configurations. Each configuration string is a
# key whose value starts as None and is then replaced by the accuracy that
# gridSearchTF returns for it. Duplicate strings collapse into one key.
params = dict.fromkeys(randomizer(3, 10))

for units in tqdm(params):
    params[units] = gridSearchTF(
        X_train, X_test, y_train, y_test, units
    )


100%|██████████| 10/10 [01:38<00:00,  9.82s/it]

In [9]:
# Display the accuracy recorded for each hidden-layer configuration.
params


Out[9]:
{'48,63,95': 0.67796610169491522,
 '53,80,36': 0.68318122555410687,
 '54,77,97': 0.67535853976531945,
 '71,65,96': 0.6805736636245111,
 '75,49,37': 0.69361147327249018,
 '81,76,39': 0.67275097783572357,
 '85,65,66': 0.68839634941329853,
 '91,33,35': 0.67535853976531945,
 '92,63,30': 0.69621903520208606,
 '99,83,43': 0.68448500651890487}

In [ ]: