In [ ]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
import tensorflow as tf
import glob
import datetime
import itertools
from time import sleep
In [ ]:
np.random.seed(1)
In [ ]:
import os
import os.path
import gc
In [ ]:
import argparse
parser = argparse.ArgumentParser(description="Pass --train to choose between training and testing")
In [ ]:
parser.add_argument('-t', '--train', action='store', required=True,
                    help="Truthy ('true', 't', '1') trains and saves the model; anything else restores and tests.")
In [ ]:
my_args = vars(parser.parse_args())
trainFlag = my_args['train']
# str.lower() can never return 'True', and the integer 1 is unreachable for a
# string, so compare against lowercase strings only
trainFlag = trainFlag.lower() in ("true", "t", "1")
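In [ ]:
# Hedged usage sketch: how the flag is meant to be passed on the command line
# once this notebook is exported as a script (the script name is hypothetical):
#   python autoencoder.py --train true     # train and checkpoint the model
#   python autoencoder.py --train false    # restore the checkpoint and test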
In [ ]:
print datetime.datetime.now()
validFilePaths = []
for f in os.listdir("data/anomaly_data"):
    filePath = os.path.join("data/anomaly_data", f)
    if os.path.isdir(filePath):
        continue
    # skip files that are effectively empty
    if os.stat(filePath).st_size <= 3:
        continue
    validFilePaths.append(filePath)
# the factor subsamples the file list; 1 uses every valid file
numF = int(1 * len(validFilePaths))
print 'Using {0} files'.format(numF)
validFilePaths = np.random.choice(validFilePaths, numF, replace=False)
# generator: CSVs are read lazily as concat consumes them
df_list = (pandas.read_csv(f) for f in validFilePaths)
df = pandas.concat(df_list, ignore_index=True)
df = df[df['radiant_win'].notnull()]
In [ ]:
print df.shape
columns = df.columns
# example: every per-player hero_id column contains the substring 'hero_id'
df_catInteger_features_example = filter(lambda x: 'hero_id' in x, columns)
In [ ]:
from itertools import chain
# these base names are expanded below to every column name containing them
numericalFeatures = ['positive_votes', 'negative_votes', 'first_blood_time', 'radiant_win',
                     'duration', 'kills', 'deaths', 'assists', 'kpm', 'kda', 'hero_dmg',
                     'gpm', 'hero_heal', 'xpm', 'totalgold', 'totalxp', 'lasthits', 'denies',
                     'tower_kills', 'courier_kills', 'observer_uses', 'sentry_uses',
                     'ancient_kills', 'camps_stacked', 'abandons']  # 'apm' is deliberately excluded (problem feature)
categoricalIntegerFeatures = ['hero_id']  # ['barracks_status', 'tower_status', 'hero_id']
# 'item0', 'item1', 'item2', 'item3', 'item4', 'item5']
categoricalFullFeatures = ['patch']
# expand each base name to all columns containing it as a substring
numFeatures = [filter(lambda x: z in x, columns) for z in numericalFeatures]
categoricalIntegerFeatures = [filter(lambda x: z in x, columns) for z in categoricalIntegerFeatures]
catFull = [filter(lambda x: z in x, columns) for z in categoricalFullFeatures]
numFeatures = list(chain(*numFeatures))
categoricalIntegerFeatures = list(chain(*categoricalIntegerFeatures))
catFull = list(chain(*catFull))
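In [ ]:
# A minimal illustration of the substring expansion above, assuming the match
# dumps suffix per-player columns with a slot index (column names here are
# made up for the example):
example_columns = ['kills_0', 'kills_1', 'hero_id_0', 'duration']
print filter(lambda c: 'kills' in c, example_columns)  # ['kills_0', 'kills_1']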
In [ ]:
match_ids = df['match_id']
df_numerical = df[numFeatures].copy()
df_numerical.loc[:, 'radiant_win'] = df_numerical.loc[:, 'radiant_win'].apply(int)
# mean-center every column (past the first) and scale by its range;
# radiant_win is normalized too, then re-binarized below
df_numerical.iloc[:, 1:] = df_numerical.iloc[:, 1:].apply(
    lambda x: (x - np.nanmean(x)) / (np.nanmax(x) - np.nanmin(x)))
df_numerical = df_numerical.fillna(0)
# a normalized radiant_win is non-negative iff the raw label was 1
df_numerical['radiant_win'] = df_numerical['radiant_win'].apply(lambda x: 1 if x >= 0 else 0)
df = df_numerical
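In [ ]:
# The scaling above, written out as a standalone sketch for clarity: each
# column is mean-centered and divided by its range.
def scale_column(col):
    return (col - np.nanmean(col)) / (np.nanmax(col) - np.nanmin(col))
print scale_column(np.array([0.0, 5.0, 10.0]))  # [-0.5  0.   0.5]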
In [ ]:
# random ~75/25 train/test split; use a distinct name so the draw does not
# shadow the placeholder x defined below
split_draw = np.random.rand(df.shape[0])
mask = np.where(split_draw < 0.75)[0]
mask2 = np.where(split_draw >= 0.75)[0]
df_train = df.iloc[mask, :]
df_test = df.iloc[mask2, :]
match_ids_train = match_ids.iloc[mask]
match_ids_test = match_ids.iloc[mask2]
In [ ]:
NumFeatures = df.shape[1]
# hidden bottleneck at 75% of the input width, then full-width reconstruction
layer_size = [int(NumFeatures * 0.75), NumFeatures]
In [ ]:
print NumFeatures
In [ ]:
print df_train.shape
In [ ]:
x = tf.placeholder(tf.float32, [None, NumFeatures])
y = x  # autoencoder target: reconstruct the input
# encoder
weights_1 = tf.Variable(tf.random_normal([NumFeatures, layer_size[0]], stddev=1.0/NumFeatures/100), name='weights_1')
bias_1 = tf.Variable(tf.random_normal([layer_size[0]], stddev=1.0/NumFeatures/100), name='bias_1')
# decoder
weights_2 = tf.Variable(tf.random_normal([layer_size[0], layer_size[1]], stddev=1.0/NumFeatures/100), name='weights_2')
bias_2 = tf.Variable(tf.random_normal([layer_size[1]], stddev=1.0/NumFeatures/100), name='bias_2')
layer1 = tf.tanh(tf.matmul(x, weights_1) + bias_1)
output = tf.tanh(tf.matmul(layer1, weights_2) + bias_2)
# mean over the batch of the per-row sum of squared reconstruction errors
cost = tf.reduce_mean(tf.reduce_sum(tf.pow(y - output, 2), 1))
rank = tf.rank(cost)  # should be 0 (scalar)
learning_rate = 0.000001
beta1 = 0.5
beta2 = 0.5
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=beta1, beta2=beta2)
# clip gradients to a global norm of 5.0 before applying them
gradients, variables = zip(*optimizer.compute_gradients(cost))
gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
train_op = optimizer.apply_gradients(zip(gradients, variables))
variable_dict = {'weights_1': weights_1, 'weights_2': weights_2,
                 'bias_1': bias_1, 'bias_2': bias_2}
saver = tf.train.Saver(variable_dict)
init = tf.global_variables_initializer()
# checkpoint file prefix (not a directory)
ckpt_path = os.path.join(os.getcwd(), 'model-backups/model.ckpt')
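In [ ]:
# Worked shape example for the graph above (the feature count is
# hypothetical): with NumFeatures = 200, layer_size = [150, 200], so
#   x:      [None, 200]
#   layer1: [None, 150]   tanh(x . W1 + b1),      W1: 200x150
#   output: [None, 200]   tanh(layer1 . W2 + b2), W2: 150x200
# cost averages the per-row sum of squared errors over the batch, hence the
# rank-0 sanity check before training.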
In [ ]:
flatten = lambda l: [item for sublist in l for item in sublist]
import requests
import json
def canIAnalyzeThisMatch(currentMatchID):
    # OpenDota returns HTTP 200 only when the match is available; the id is
    # already part of the URL, so no query parameters are needed
    host = "https://api.opendota.com/api/matches/" + str(currentMatchID)
    resp = requests.get(host)
    return resp.status_code == 200
def test(sess, test_data):
    data = test_data.values.astype(np.float32)
    # per-row L1 reconstruction error, reusing the graph built above instead
    # of rebuilding the layers from scratch
    residuals_op = tf.reduce_sum(tf.abs(output - x), axis=1)
    output_results, residual_vals = sess.run([output, residuals_op],
                                             feed_dict={x: data})
    # indices of test rows, most anomalous (largest residual) first
    indices = np.argsort(residual_vals)[::-1]
    return data, output_results, indices, residual_vals
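In [ ]:
# Hedged usage sketch for the helpers above (the match id is made up, and the
# API call needs network access):
#   if canIAnalyzeThisMatch(1234567890):
#       data, reconstruction, ranked_idx, residuals = test(sess, df_test)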
In [ ]:
def train():
    numEpochs = 1000
    numBatches = 1000
    # each batch is roughly 1% of the training set
    batchSize = int(round(0.01 * df_train.shape[0]))
    for epochIter in xrange(numEpochs):
        print 'Epoch: {0}'.format(epochIter)
        gc.collect()
        # report the cost on a fresh random batch
        batch = df_train.sample(n=batchSize).values
        temp_out = sess.run(cost, feed_dict={x: batch})
        print temp_out
        # checkpoint every 50 epochs
        if (epochIter + 1) % 50 == 0:
            saver.save(sess, ckpt_path)
        for batchItr in xrange(numBatches):
            batch = df_train.sample(n=batchSize).values
            sess.run(train_op, feed_dict={x: batch})
with tf.Session() as sess:
    # sanity check: the cost must be a scalar
    if sess.run(rank) != 0:
        raise Exception("Wrong dimensions of cost")
    if trainFlag:
        sess.run(init)
        train()
    else:
        print 'Running test'
        saver.restore(sess, ckpt_path)
        # export the learned parameters
        np.savetxt("data/weights1.csv", weights_1.eval(), delimiter=",")
        np.savetxt("data/bias1.csv", bias_1.eval(), delimiter=",")
        np.savetxt("data/weights2.csv", weights_2.eval(), delimiter=",")
        np.savetxt("data/bias2.csv", bias_2.eval(), delimiter=",")
        # avoid rebinding the graph tensor named output
        anomalies, output_results, indices_test, residuals = test(sess, df_test)
        # sort rows so the most anomalous matches come first
        anomaliesSave = anomalies[indices_test, :]
        output_results = output_results[indices_test, :]
        # peek at the most anomalous row and its reconstruction
        print anomaliesSave[0, 0:10]
        print output_results[0, 0:10]
        np.savetxt("data/anomalies.csv", anomaliesSave, delimiter=",")
        np.savetxt("data/output.csv", output_results, delimiter=",")
        np.savetxt('data/indices.csv', indices_test, delimiter=',')
        # pair each test match id with its (unsorted) residual
        matchIDs = match_ids_test.values
        print len(matchIDs)
        goodMatches = [[int(m), r] for m, r in zip(matchIDs, residuals)]
        np.savetxt('data/goodAnomaliesResidual.csv', np.array(goodMatches), delimiter=',')
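In [ ]:
# Sketch of reading the ranked scores back after a test run (the column names
# here are assumptions; np.savetxt writes no header row):
scores = pandas.read_csv('data/goodAnomaliesResidual.csv',
                         names=['match_id', 'residual'])
print scores.sort_values('residual', ascending=False).head()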
In [ ]:
print 'Done'
print datetime.datetime.now()
In [ ]: