notebook.community

Edit and run



In [6]:

    
# Simple method to convert from liblinear format to fann format
def convert_liblinear_file(ll_filepath, fann_filepath):
    with open(ll_filepath, 'r') as ll_file:
        data = []
        for line in ll_file.readlines():
            fields = line.strip().split(" ")
            row = [int(fields[0])]
            for (i, field) in enumerate(fields[1:]):
                (colnum, value) = field.split(":")
                assert(int(colnum) == (i + 1))
                row.append(float(value))
            data.append(row)
    with open(fann_filepath, 'w') as fann_file:
        rows = len(data)
        cols = len(data[0]) -1
        fann_file.write("%s %s %s\n" % (rows, cols, 1))
        for row in data:
            is_cleaved = row[0]
            for feature in row[1:]:
                fann_file.write("%.12f " % (feature))
            fann_file.write("\n%d\n" % (is_cleaved))



In [7]:

    
from fann2 import libfann
import pandas as pd
import numpy as np



In [8]:

    
data_dir = "/workspace/chipper_data/chipper-0.3.0-data/"
train_data_path = data_dir + "train_data.fann"
test_data_path = data_dir + "test_data.fann"
neural_network_save_file = data_dir + "neural.net"



In [9]:

    
convert_liblinear_file(data_dir + "testing_data.ll", test_data_path)
convert_liblinear_file(data_dir + "training_data.ll", train_data_path)



In [10]:

    
from fann2 import libfann

train_data = libfann.training_data()
train_data.read_train_from_file(train_data_path)

num_input = train_data.num_input_train_data()
num_output = train_data.num_output_train_data()

alpha = 5.0 # Range of 2 to 10, below 5 might cause overfitting
num_hidden = train_data.length_train_data() / (alpha * (num_input  + num_output))

ann = libfann.neural_net()
ann.create_sparse_array(connection_rate=0.5, layers=(num_input, num_output))
ann.set_scaling_params(data=train_data, new_input_min=0, new_input_max=1, new_output_min=0, new_output_max=1)
ann.scale_train(data=train_data)
ann.set_learning_rate(0.7)
#ann.set_train_error_function(libfann.ERRORFUNC_LINEAR)
ann.set_activation_function_hidden(libfann.ELLIOT)
# Steepness= 0.5 ELLIOT AUC=0.88
ann.set_activation_steepness_output(0.5)
ann.set_activation_function_output(libfann.ELLIOT)


ann.train_on_data(data=train_data, desired_error=0.09, epochs_between_reports=5, max_epochs=500)
ann.save(neural_network_save_file)
ann.print_connections()
ann.print_parameters()



In [11]:

    
from fann2 import libfann

ann = libfann.neural_net()
ann.create_from_file(neural_network_save_file)

test_data = libfann.training_data()
test_data.read_train_from_file(test_data_path)
ann.scale_train(data=test_data)

ann.reset_MSE()
input=test_data.get_input()
output=test_data.get_output()

prediction_file = data_dir + "neural_predictions.csv"
with open(prediction_file, 'w') as predict_file:
    predict_file.write(",actual,predicted\n")
    for i in range(len(input)):
        predict = ann.test(input[i], output[i])
        predict_file.write("%d,%d,%f\n" % (i, int(output[i][0]), predict[0]))
print "MSE error on test data: %f" % ann.get_MSE()









    



MSE error on test data: 0.145478



In [13]:

    
import pandas as pd

df = pd.DataFrame.from_csv(prediction_file)
real = df.actual.astype("int")
predicted = df.predicted

from sklearn.metrics import classification_report, matthews_corrcoef, confusion_matrix

classification_vector = lambda cutoff: [1 if pred >= cutoff else 0 for pred in predicted]

def find_best_mcc():
    best_mcc = 0.0
    best_cutoff = 0.0
    for i in range(1, 100):
        pred_cutoff = i/100.0
        mcc = matthews_corrcoef(real, classification_vector(pred_cutoff))
        if (mcc > best_mcc):
            best_mcc = mcc
            best_cutoff = pred_cutoff
    return (best_cutoff, best_mcc)

(best_cutoff, best_mcc) = find_best_mcc()
print "** Cutoff= %.2f (MCC=%.3f) **" % (best_cutoff, best_mcc)
print classification_report(real, classification_vector(best_cutoff))

def print_metrics(name, actual, predicted):
    ((tn, fp), (fn, tp)) = confusion_matrix(actual, predicted)    
    sensitivity = 100.0 * tp / (tp + fn)
    specificity = 100.0 * tn / (tn + fp)
    precision = 100.0 * tp / (tp + fp)
    print "%s: sensitivity(recall)=%.1f, specificity=%.1f, precision=%.1f" % (name, sensitivity, specificity, precision)

print_metrics("NN", real, classification_vector(best_cutoff))

%matplotlib notebook
import matplotlib.pyplot as plt
from sklearn import svm, metrics

fpr, tpr, thresholds = metrics.roc_curve(real, predicted, pos_label=1)
roc_auc = metrics.roc_auc_score(real, predicted, average='macro', sample_weight=None)
plt.title('ROC Curve')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f'% (roc_auc))
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')









    



/Users/matt/anaconda2/lib/python2.7/site-packages/sklearn/metrics/classification.py:516: RuntimeWarning: invalid value encountered in double_scalars
  mcc = cov_ytyp / np.sqrt(var_yt * var_yp)






    



** Cutoff= 0.37 (MCC=0.612) **
             precision    recall  f1-score   support

          0       0.83      0.77      0.80       208
          1       0.79      0.84      0.81       208

avg / total       0.81      0.81      0.81       416

NN: sensitivity(recall)=83.7, specificity=77.4, precision=78.7






    














    











    Out[13]:





<matplotlib.text.Text at 0x12b881d50>



In [14]:

    
def fetch_neural_connections():
    with open(neural_network_save_file, 'r') as network_file:
        data = []
        for line in network_file.readlines():
            if line.startswith("connections"):
                (key, value) = line.split("=")
                for record in value.strip()[1:-1].split(") ("):
                    (neuron, weight) = record.split(", ")
                    data.append((int(neuron), float(weight)))
        df = pd.DataFrame(data)
        df.columns = ["connected_to_neuron", "weight"]
        return df
                   
df = fetch_neural_connections()
df["abs_weight"] = df.weight.abs()
df["position"] = df.connected_to_neuron.div(50).astype("int")
df["measure"] = df.connected_to_neuron.mod(50)
# 0 - 17 (18 values)
df["hydrophic"] = df.measure.le(17)
# 36 - 49 (15 values)
df["electronic"] = df.measure.ge(36)
# 18 - 35 (17 values)
df["steric"] = (df.hydrophic | df.electronic) == False

total_weight = df.abs_weight.sum()
df["perc_weight"] = df.abs_weight / total_weight * 100
df = df[df.perc_weight > 0.1]
df.sort_values(by=["perc_weight"], ascending=False)









    Out[14]:






  
    
      
      connected_to_neuron
      weight
      abs_weight
      position
      measure
      hydrophic
      electronic
      steric
      perc_weight
    
  
  
    
      454
      453
      2.098767
      2.098767
      9
      3
      True
      False
      False
      1.627395
    
    
      488
      487
      -2.083853
      2.083853
      9
      37
      False
      True
      False
      1.615831
    
    
      460
      459
      -1.651490
      1.651490
      9
      9
      True
      False
      False
      1.280574
    
    
      309
      308
      1.444614
      1.444614
      6
      8
      True
      False
      False
      1.120162
    
    
      449
      448
      -1.354448
      1.354448
      8
      48
      False
      True
      False
      1.050246
    
    
      149
      148
      1.289342
      1.289342
      2
      48
      False
      True
      False
      0.999763
    
    
      338
      337
      1.256546
      1.256546
      6
      37
      False
      True
      False
      0.974332
    
    
      364
      363
      -1.216270
      1.216270
      7
      13
      True
      False
      False
      0.943102
    
    
      500
      499
      1.169001
      1.169001
      9
      49
      False
      True
      False
      0.906450
    
    
      260
      259
      1.031897
      1.031897
      5
      9
      True
      False
      False
      0.800139
    
    
      304
      303
      -1.015071
      1.015071
      6
      3
      True
      False
      False
      0.787092
    
    
      237
      236
      -0.993563
      0.993563
      4
      36
      False
      True
      False
      0.770414
    
    
      414
      413
      -0.922397
      0.922397
      8
      13
      True
      False
      False
      0.715231
    
    
      614
      613
      0.909673
      0.909673
      12
      13
      True
      False
      False
      0.705366
    
    
      491
      490
      -0.848689
      0.848689
      9
      40
      False
      True
      False
      0.658078
    
    
      343
      342
      -0.827520
      0.827520
      6
      42
      False
      True
      False
      0.641663
    
    
      254
      253
      -0.815750
      0.815750
      5
      3
      True
      False
      False
      0.632537
    
    
      456
      455
      -0.787811
      0.787811
      9
      5
      True
      False
      False
      0.610873
    
    
      398
      397
      -0.777451
      0.777451
      7
      47
      False
      True
      False
      0.602840
    
    
      339
      338
      0.756024
      0.756024
      6
      38
      False
      True
      False
      0.586225
    
    
      249
      248
      0.747059
      0.747059
      4
      48
      False
      True
      False
      0.579273
    
    
      521
      520
      -0.727990
      0.727990
      10
      20
      False
      False
      True
      0.564488
    
    
      440
      439
      0.720371
      0.720371
      8
      39
      False
      True
      False
      0.558579
    
    
      694
      693
      0.703738
      0.703738
      13
      43
      False
      True
      False
      0.545682
    
    
      544
      543
      -0.687252
      0.687252
      10
      43
      False
      True
      False
      0.532899
    
    
      199
      198
      0.673047
      0.673047
      3
      48
      False
      True
      False
      0.521884
    
    
      350
      349
      0.668894
      0.668894
      6
      49
      False
      True
      False
      0.518664
    
    
      438
      437
      -0.667054
      0.667054
      8
      37
      False
      True
      False
      0.517237
    
    
      571
      570
      -0.664693
      0.664693
      11
      20
      False
      False
      True
      0.515406
    
    
      390
      389
      0.663404
      0.663404
      7
      39
      False
      True
      False
      0.514407
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      624
      623
      -0.146371
      0.146371
      12
      23
      False
      False
      True
      0.113497
    
    
      606
      605
      -0.146086
      0.146086
      12
      5
      True
      False
      False
      0.113276
    
    
      502
      501
      0.145884
      0.145884
      10
      1
      True
      False
      False
      0.113119
    
    
      505
      504
      0.144971
      0.144971
      10
      4
      True
      False
      False
      0.112411
    
    
      182
      181
      0.144398
      0.144398
      3
      31
      False
      False
      True
      0.111967
    
    
      950
      949
      0.143513
      0.143513
      18
      49
      False
      True
      False
      0.111280
    
    
      602
      601
      0.143477
      0.143477
      12
      1
      True
      False
      False
      0.111253
    
    
      356
      355
      0.143098
      0.143098
      7
      5
      True
      False
      False
      0.110959
    
    
      718
      717
      0.142541
      0.142541
      14
      17
      True
      False
      False
      0.110527
    
    
      626
      625
      -0.142198
      0.142198
      12
      25
      False
      False
      True
      0.110261
    
    
      468
      467
      -0.142100
      0.142100
      9
      17
      True
      False
      False
      0.110185
    
    
      620
      619
      -0.140747
      0.140747
      12
      19
      False
      False
      True
      0.109136
    
    
      487
      486
      0.140268
      0.140268
      9
      36
      False
      True
      False
      0.108765
    
    
      683
      682
      0.139436
      0.139436
      13
      32
      False
      False
      True
      0.108120
    
    
      725
      724
      -0.137877
      0.137877
      14
      24
      False
      False
      True
      0.106911
    
    
      115
      114
      -0.137534
      0.137534
      2
      14
      True
      False
      False
      0.106645
    
    
      503
      502
      -0.137099
      0.137099
      10
      2
      True
      False
      False
      0.106308
    
    
      984
      983
      -0.136961
      0.136961
      19
      33
      False
      False
      True
      0.106200
    
    
      284
      283
      0.136811
      0.136811
      5
      33
      False
      False
      True
      0.106084
    
    
      641
      640
      0.136426
      0.136426
      12
      40
      False
      True
      False
      0.105786
    
    
      472
      471
      0.135166
      0.135166
      9
      21
      False
      False
      True
      0.104808
    
    
      900
      899
      -0.134755
      0.134755
      17
      49
      False
      True
      False
      0.104490
    
    
      74
      73
      0.133886
      0.133886
      1
      23
      False
      False
      True
      0.103816
    
    
      321
      320
      0.132825
      0.132825
      6
      20
      False
      False
      True
      0.102993
    
    
      189
      188
      0.132576
      0.132576
      3
      38
      False
      True
      False
      0.102800
    
    
      21
      20
      -0.131978
      0.131978
      0
      20
      False
      False
      True
      0.102337
    
    
      296
      295
      -0.131114
      0.131114
      5
      45
      False
      True
      False
      0.101667
    
    
      737
      736
      -0.130869
      0.130869
      14
      36
      False
      True
      False
      0.101476
    
    
      342
      341
      0.130757
      0.130757
      6
      41
      False
      True
      False
      0.101390
    
    
      253
      252
      0.129204
      0.129204
      5
      2
      True
      False
      False
      0.100185
    
  

249 rows × 9 columns

	connected_to_neuron	weight	abs_weight	position	measure	hydrophic	electronic	steric	perc_weight
454	453	2.098767	2.098767	9	3	True	False	False	1.627395
488	487	-2.083853	2.083853	9	37	False	True	False	1.615831
460	459	-1.651490	1.651490	9	9	True	False	False	1.280574
309	308	1.444614	1.444614	6	8	True	False	False	1.120162
449	448	-1.354448	1.354448	8	48	False	True	False	1.050246
149	148	1.289342	1.289342	2	48	False	True	False	0.999763
338	337	1.256546	1.256546	6	37	False	True	False	0.974332
364	363	-1.216270	1.216270	7	13	True	False	False	0.943102
500	499	1.169001	1.169001	9	49	False	True	False	0.906450
260	259	1.031897	1.031897	5	9	True	False	False	0.800139
304	303	-1.015071	1.015071	6	3	True	False	False	0.787092
237	236	-0.993563	0.993563	4	36	False	True	False	0.770414
414	413	-0.922397	0.922397	8	13	True	False	False	0.715231
614	613	0.909673	0.909673	12	13	True	False	False	0.705366
491	490	-0.848689	0.848689	9	40	False	True	False	0.658078
343	342	-0.827520	0.827520	6	42	False	True	False	0.641663
254	253	-0.815750	0.815750	5	3	True	False	False	0.632537
456	455	-0.787811	0.787811	9	5	True	False	False	0.610873
398	397	-0.777451	0.777451	7	47	False	True	False	0.602840
339	338	0.756024	0.756024	6	38	False	True	False	0.586225
249	248	0.747059	0.747059	4	48	False	True	False	0.579273
521	520	-0.727990	0.727990	10	20	False	False	True	0.564488
440	439	0.720371	0.720371	8	39	False	True	False	0.558579
694	693	0.703738	0.703738	13	43	False	True	False	0.545682
544	543	-0.687252	0.687252	10	43	False	True	False	0.532899
199	198	0.673047	0.673047	3	48	False	True	False	0.521884
350	349	0.668894	0.668894	6	49	False	True	False	0.518664
438	437	-0.667054	0.667054	8	37	False	True	False	0.517237
571	570	-0.664693	0.664693	11	20	False	False	True	0.515406
390	389	0.663404	0.663404	7	39	False	True	False	0.514407
...	...	...	...	...	...	...	...	...	...
624	623	-0.146371	0.146371	12	23	False	False	True	0.113497
606	605	-0.146086	0.146086	12	5	True	False	False	0.113276
502	501	0.145884	0.145884	10	1	True	False	False	0.113119
505	504	0.144971	0.144971	10	4	True	False	False	0.112411
182	181	0.144398	0.144398	3	31	False	False	True	0.111967
950	949	0.143513	0.143513	18	49	False	True	False	0.111280
602	601	0.143477	0.143477	12	1	True	False	False	0.111253
356	355	0.143098	0.143098	7	5	True	False	False	0.110959
718	717	0.142541	0.142541	14	17	True	False	False	0.110527
626	625	-0.142198	0.142198	12	25	False	False	True	0.110261
468	467	-0.142100	0.142100	9	17	True	False	False	0.110185
620	619	-0.140747	0.140747	12	19	False	False	True	0.109136
487	486	0.140268	0.140268	9	36	False	True	False	0.108765
683	682	0.139436	0.139436	13	32	False	False	True	0.108120
725	724	-0.137877	0.137877	14	24	False	False	True	0.106911
115	114	-0.137534	0.137534	2	14	True	False	False	0.106645
503	502	-0.137099	0.137099	10	2	True	False	False	0.106308
984	983	-0.136961	0.136961	19	33	False	False	True	0.106200
284	283	0.136811	0.136811	5	33	False	False	True	0.106084
641	640	0.136426	0.136426	12	40	False	True	False	0.105786
472	471	0.135166	0.135166	9	21	False	False	True	0.104808
900	899	-0.134755	0.134755	17	49	False	True	False	0.104490
74	73	0.133886	0.133886	1	23	False	False	True	0.103816
321	320	0.132825	0.132825	6	20	False	False	True	0.102993
189	188	0.132576	0.132576	3	38	False	True	False	0.102800
21	20	-0.131978	0.131978	0	20	False	False	True	0.102337
296	295	-0.131114	0.131114	5	45	False	True	False	0.101667
737	736	-0.130869	0.130869	14	36	False	True	False	0.101476
342	341	0.130757	0.130757	6	41	False	True	False	0.101390
253	252	0.129204	0.129204	5	2	True	False	False	0.100185