Tensorflow attempt on HeatReplay



In [1]:

    
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from tensorflow.contrib import learn
from tqdm import tqdm

from context import *
from util.dfmgmt import initSet, wrangle



In [2]:

    
df = initSet()
df = df[df['decade'] != 2010]



In [3]:

    
# Drop both year and decade
dropList = ['most_used_term']
removeList = ['decade', 'year', 'charted']
target = 'charted'  # main feature to be predicted
df, features = wrangle(df, dropList, removeList, True)

df.head()









    Out[3]:






  
    
      
      year
      decade
      unique_words
      density
      unique_words_raw
      density_raw
      nouns
      verbs
      adjectives
      syllables
      most_used_freq
      explicit
      total_curses
      reading_score
      sentiment
      charted
    
  
  
    
      0
      1961
      1960
      36
      72
      65
      158
      34
      30
      10
      70
      7
      0
      0
      2.367848
      0.9901
      0
    
    
      1
      1961
      1960
      45
      91
      74
      197
      45
      37
      19
      81
      6
      0
      0
      2.771777
      0.9712
      1
    
    
      2
      1961
      1960
      54
      103
      88
      223
      45
      48
      17
      98
      10
      0
      0
      3.885650
      0.9974
      1
    
    
      3
      1961
      1960
      42
      148
      66
      263
      81
      61
      36
      76
      24
      0
      0
      2.889886
      0.9993
      1
    
    
      4
      1961
      1960
      28
      131
      60
      354
      56
      77
      5
      57
      38
      0
      0
      2.940000
      0.9812
      1



In [4]:

    
X = df[features].as_matrix()
y = df[target].as_matrix()



In [5]:

    
# Do cross validation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)



In [6]:

    
def gridSearchTF(X_train, X_test, y_train, y_test, units, steps=200, batch_size=64):

    units = units.split(',')
    # Instantiate model
    clf = learn.DNNClassifier(hidden_units=units, n_classes=len(units))

    # Train model
    clf.fit(X_train, y_train, steps=steps, batch_size=batch_size)

    # Score model
    score = metrics.accuracy_score(y_test, clf.predict(X_test))

    return score



In [7]:

    
from random import sample, shuffle

def randomList(nClasses):
    
    new = ''
    
    for i in xrange(nClasses):        
        new += str(sample(xrange(30, 101), 10)[0]) + ','

    return new[:-1]

def randomizer(nClasses, nSearches):

    hiddenUnits = []

    for _ in xrange(nSearches):
        hiddenUnits.append(randomList(nClasses))
        
    shuffle(hiddenUnits)

    return hiddenUnits



In [8]:

    
params = {units: None for units in randomizer(3, 10)}

for units in tqdm(params):
    params[units] = gridSearchTF(X_train, X_test, y_train, y_test, units)









    



100%|██████████| 10/10 [01:38<00:00,  9.82s/it]



In [9]:

    
params









    Out[9]:





{'48,63,95': 0.67796610169491522,
 '53,80,36': 0.68318122555410687,
 '54,77,97': 0.67535853976531945,
 '71,65,96': 0.6805736636245111,
 '75,49,37': 0.69361147327249018,
 '81,76,39': 0.67275097783572357,
 '85,65,66': 0.68839634941329853,
 '91,33,35': 0.67535853976531945,
 '92,63,30': 0.69621903520208606,
 '99,83,43': 0.68448500651890487}



In [ ]:

	year	decade	unique_words	density	unique_words_raw	density_raw	nouns	verbs	adjectives	syllables	most_used_freq	reading_score	sentiment	charted
0	1961	1960	36	72	65	158	34	30	10	70	7	2.367848	0.9901	0
1	1961	1960	45	91	74	197	45	37	19	81	6	2.771777	0.9712	1
2	1961	1960	54	103	88	223	45	48	17	98	10	3.885650	0.9974	1
3	1961	1960	42	148	66	263	81	61	36	76	24	2.889886	0.9993	1
4	1961	1960	28	131	60	354	56	77	5	57	38	2.940000	0.9812	1