In [80]:
from utilities import *
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam
import os

def test_sentence(sentence):
    # Preprocess and vectorize the text the same way as the training data
    sentence = process_text(sentence)
    vec = vectorizer.transform([sentence]).toarray()
    # Predict class probabilities and print the most likely label first
    prediction = model.predict_proba(vec)[0]
    print "Label:", label_names[np.argmax(prediction)]
    for i, category in enumerate(label_names):
        print "{}: {:.2f}".format(category, prediction[i])

# Constants controlling the label set (see the output layer below)
no_dev = False   # if True, exclude the DEV label (9 output classes instead of 10)
binary = False   # if True, train a two-class model instead

In [81]:
print "Get and process data"
# Get raw text + labels
features, labels, label_names = get_data(whatIWant='meta', binary=binary, no_dev=no_dev)
features = np.asanyarray(features)

# x are the input matrices, y the vectors holding the labels; 70/30 train/test split
x_train, x_test, y_train, y_test = split_train_test(features, labels, ratio=0.7, shuffle=True)

print "Prepare one-hot-encoding"
# One-Hot-Encoding needed for Neural Net Output
y_train = one_hot_encoding(y_train)
y_test = one_hot_encoding(y_test)


Get and process data
Prepare one-hot-encoding
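
one_hot_encoding is presumably defined in utilities and not shown here; a minimal sketch of an equivalent transformation, assuming the labels are integer class indices (the real helper may differ):

from keras.utils import np_utils

def one_hot_encoding_sketch(int_labels):
    # int_labels: list/array of integer class indices, e.g. the labels from get_data
    return np_utils.to_categorical(int_labels)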

In [86]:
label_names


Out[86]:
[u'SKIPPED',
 u'DEV',
 u'UNSURE',
 u'DATA',
 u'WEB',
 u'HW',
 u'OTHER',
 u'UNLABELED',
 u'DOCS',
 u'EDU']

In [87]:
model = None
trained_model_filename = 'modelMeta'

if trained_model_filename not in os.listdir('.'):
    # Network architecture
    model = Sequential()
    input_size = x_train.shape[1]
    model.add(Dense(input_size, input_dim=input_size))
    model.add(Activation('relu'))
    model.add(Dense(input_size * 4))
    model.add(Activation('relu'))
    model.add(Dense(input_size * 4)) # Let's make it deeeep
    model.add(Activation('relu'))
    # Output Layer, one neuron per class
    if binary:
        model.add(Dense(2))
    else:
        if no_dev:
            model.add(Dense(9))  # 9 classes when the DEV label is excluded
        else:
            model.add(Dense(10))  # all 10 classes, including SKIPPED
    # Softmax normalizes the outputs so each neuron's value is the class probability
    model.add(Activation('softmax'))
    adam = Adam()
    model.compile(metrics=['accuracy'], optimizer=adam, loss='categorical_crossentropy')
    print "Built model from scratch"
else:
    model = keras.models.load_model(trained_model_filename)
    print "Model was loaded from file"


Built model from scratch
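
A quick sanity check of the layer shapes and parameter counts can be added here with Keras' built-in summary:

model.summary()  # prints each layer's output shape and parameter count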

In [88]:
print "Train model"
model.fit(x_train, y_train, nb_epoch=10, shuffle=True, verbose=True)


Train model
Epoch 1/10
301/301 [==============================] - 0s - loss: 2.4353 - acc: 0.0698         
Epoch 2/10
301/301 [==============================] - 0s - loss: 2.0676 - acc: 0.3987     
Epoch 3/10
301/301 [==============================] - 0s - loss: 1.8449 - acc: 0.5050     
Epoch 4/10
301/301 [==============================] - 0s - loss: 1.7185 - acc: 0.5050     
Epoch 5/10
301/301 [==============================] - 0s - loss: 1.6315 - acc: 0.5050     
Epoch 6/10
301/301 [==============================] - 0s - loss: 1.6017 - acc: 0.5050     
Epoch 7/10
301/301 [==============================] - 0s - loss: 1.5609 - acc: 0.5083     
Epoch 8/10
301/301 [==============================] - 0s - loss: 1.5383 - acc: 0.5083     
Epoch 9/10
301/301 [==============================] - 0s - loss: 1.5228 - acc: 0.5083     
Epoch 10/10
301/301 [==============================] - 0s - loss: 1.5100 - acc: 0.5083     
Out[88]:
<keras.callbacks.History at 0x11c645bd0>
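
Note that the load branch in In [87] only triggers once 'modelMeta' exists on disk; the trained model can be persisted right after fitting, e.g.:

model.save(trained_model_filename)  # writes architecture + weights for keras.models.load_model (requires h5py)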

In [89]:
# Get accuracy on test_set
print "Test on {} unknown samples".format(len(x_test))
acc = model.evaluate(x_test, y_test, verbose=0)
print "Loss, accuracy: ", acc


Test on 130 unknown samples
Loss, accuracy:  [1.0756243522350604, 0.7846153846153846]
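
To spot-check individual predictions against the ground truth, a small sketch using the held-out meta features (test_sentence above assumes raw text and a fitted vectorizer, so it may not apply directly to this meta model):

probs = model.predict_proba(x_test[:1])[0]   # probabilities for the first test sample
print "Predicted:", label_names[np.argmax(probs)]
print "Actual:   ", label_names[np.argmax(y_test[0])]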

In [17]:
key_max = {}
data = api_call()

# Track the maximum value seen for each key across all repository dicts
for entry in data:
    for key, value in entry.iteritems():
        if key not in key_max or value > key_max[key]:
            key_max[key] = value

# Print the per-key maxima, skipping the long free-text fields
for key in key_max:
    if key != 'readme' and key != 'description':
        print key, key_max[key]


files screenshot.png
hasDownloads 1
watches 9
folder_count 93
treeDepth 9
id 99
api_url https://api.github.com/repos/zzolo/dropbox-upstart
author zzolo
stars 964
forks 98
api_calls 99
folders tests
language_array VimL Ruby Shell
commit_count 98
commit_interval_avg 9
branch_count 9
contributors_count 9
class WEB
open_issues_count 9
name zfo-editor
language_main r
url https://github.com/zzolo/dropbox-upstart
avg_commit_length 99
hasWiki 1
treeArray  src srcmipruebagit
file_count 99
commit_interval_max 9
tagger stefan
isFork 1
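
Many of the maxima above (watches 9, treeDepth 9, id 99, commit_count 98) look like lexicographic string comparisons rather than numeric ones. A sketch that coerces values to numbers before comparing, assuming the API returns numeric fields as strings:

numeric_max = {}
for entry in data:
    for key, value in entry.iteritems():
        try:
            value = float(value)  # skip fields that are not numeric at all
        except (TypeError, ValueError):
            continue
        if key not in numeric_max or value > numeric_max[key]:
            numeric_max[key] = value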

In [40]:
max(labels)


Out[40]:
8
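
The training accuracy in In [88] plateaus around 0.51, which hints at one dominant class; a quick way to inspect the label distribution (a sketch, assuming labels is the integer label list returned by get_data):

from collections import Counter
for idx, count in Counter(labels).most_common():
    print label_names[idx], count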

In [ ]: