In [1]:
import numpy as np
import pandas as pd
import os, sys
from sklearn import linear_model
from scipy import stats

In [2]:
# load the leaderboard (LB) and test-set CIDs

# load LB CIDs
with open('/media/gabor/H/python_from_C/final_ofaction_for_paper_2/data/CID_leaderboard.txt') as f:
    content = f.readlines()
lb_CIDs = [int(x) for x in content]

# load test CIDs
with open("/media/gabor/H/python_from_C/final_ofaction_for_paper_2/data/CID_testset.txt") as f: 
    content = f.readlines()
test_CIDs = [int(x) for x in content]

In [4]:
# load the Morgan similarity matrix to use as sample weights in training
morgan = pd.read_csv('/media/gabor/H/python_from_C/final_ofaction_for_paper_2/data/morgan_sim.csv', index_col=0)
weights = morgan[morgan.index.astype(str)]  # keep the columns that correspond to the row CIDs
# duplicate each row so the weight matrix lines up with the two rows (two dilutions) per CID in the feature matrix
weights = pd.concat((weights, weights)).sort_index()
print weights.shape
weights.head()


(952, 476)
Out[4]:
126 176 177 180 196 239 240 241 243 244 ... 5366244 5367698 5367706 5368076 5371102 6114390 6429333 6999977 10857465 16220109
0
126 1.000000 0.108108 0.171429 0.054054 0.066667 0.090909 0.509091 0.166667 0.315789 0.290909 ... 0.033613 0.183673 0.195652 0.051948 0.252632 0.237288 0.307692 0.066667 0.000000 0.050633
126 1.000000 0.108108 0.171429 0.054054 0.066667 0.090909 0.509091 0.166667 0.315789 0.290909 ... 0.033613 0.183673 0.195652 0.051948 0.252632 0.237288 0.307692 0.066667 0.000000 0.050633
176 0.108108 1.000000 0.285714 0.625000 0.256410 0.434783 0.058824 0.000000 0.277778 0.058824 ... 0.081633 0.103896 0.112676 0.142857 0.135135 0.082474 0.136364 0.256410 0.027397 0.172414
176 0.108108 1.000000 0.285714 0.625000 0.256410 0.434783 0.058824 0.000000 0.277778 0.058824 ... 0.081633 0.103896 0.112676 0.142857 0.135135 0.082474 0.136364 0.256410 0.027397 0.172414
177 0.171429 0.285714 1.000000 0.285714 0.054054 0.095238 0.187500 0.000000 0.058824 0.000000 ... 0.041667 0.080000 0.086957 0.111111 0.083333 0.084211 0.238095 0.108108 0.028169 0.142857

5 rows × 476 columns
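
The doubled similarity matrix serves as per-sample weights during training: molecules structurally similar to the CID being predicted get more influence on the fit. A minimal sketch of the idea with made-up numbers (X_toy, y_toy and sim_to_query are illustrative, not part of the pipeline):

In [ ]:
# Hypothetical illustration of similarity-weighted ridge regression,
# the scheme used in the main prediction loop below.
X_toy = np.array([[0.1], [0.2], [0.8], [0.9]])  # one feature, four training molecules
y_toy = np.array([10.0, 12.0, 80.0, 85.0])      # perceptual ratings
sim_to_query = np.array([0.9, 0.8, 0.1, 0.05])  # Morgan similarity of each row to the query CID

regr_toy = linear_model.Ridge(alpha=1)
regr_toy.fit(X_toy, y_toy, sample_weight=sim_to_query)  # similar molecules dominate the fit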


In [3]:
# load the feature matrix (Dragon descriptors + Morgan fingerprints)
features = pd.read_csv('features_dragon_morgan.csv', index_col=0)
features.head()


Out[3]:
CID complexity from pubmed MW AMW Sv Se Sp Si Mv Me ... 91541756_2 91552833_2 91563027_2 91595028_2 91614181_2 91617014_2 91617930_2 91618238_2 neglog10d Intensity
0 126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 ... 0.014024 0.000296 0.021098 0.000186 0.003159 0.002299 0.000138 0.011080 1 1
1 126 0.181128 0.270753 0.030587 0.262264 0.219126 0.253846 0.214989 0.216981 0.425532 ... 0.014024 0.000296 0.021098 0.000186 0.003159 0.002299 0.000138 0.011080 3 0
2 176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 ... 0.008391 0.000930 0.001442 0.000094 0.000607 0.001362 0.000229 0.004162 5 1
3 176 0.060311 0.109331 0.025411 0.096943 0.105579 0.090940 0.107335 0.125214 0.659574 ... 0.008391 0.000930 0.001442 0.000094 0.000607 0.001362 0.000229 0.004162 7 0
4 177 0.020039 0.067721 0.015501 0.075556 0.083688 0.078074 0.089782 0.106346 0.382979 ... 0.000961 0.000339 0.000657 0.000008 0.000098 0.000221 0.000037 0.001932 3 1

5 rows × 14616 columns


In [7]:
# map an index (0-20) to each of the 21 perceptual descriptors
descriptor = dict(enumerate([u'INTENSITY/STRENGTH', u'VALENCE/PLEASANTNESS', u'BAKERY',
                             u'SWEET', u'FRUIT', u'FISH', u'GARLIC', u'SPICES', u'COLD', u'SOUR', u'BURNT',
                             u'ACID', u'WARM', u'MUSKY', u'SWEATY', u'AMMONIA/URINOUS', u'DECAYED', u'WOOD',
                             u'GRASS', u'FLOWER', u'CHEMICAL']))
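
A quick check of the mapping (index 0 is the intensity target, which the loop below handles specially):

In [ ]:
descriptor[0], descriptor[20]  # (u'INTENSITY/STRENGTH', u'CHEMICAL')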

In [8]:
# load the targets
all_targets = pd.read_csv('target.csv', index_col=0)
all_targets.head()


Out[8]:
#oID individual INTENSITY/STRENGTH VALENCE/PLEASANTNESS BAKERY SWEET FRUIT FISH GARLIC SPICES ... ACID WARM MUSKY SWEATY AMMONIA/URINOUS DECAYED WOOD GRASS FLOWER CHEMICAL
0 126 25 49.551020 49.465116 0.674419 25.953488 6.581395 0.302326 1.720930 3.906977 ... 3.046512 0.790698 8.023256 1.604651 1.209302 5.069767 1.348837 1.441860 9.906977 14.813953
1 126 25 24.653061 49.465116 0.674419 25.953488 6.581395 0.302326 1.720930 3.906977 ... 3.046512 0.790698 8.023256 1.604651 1.209302 5.069767 1.348837 1.441860 9.906977 14.813953
2 176 25 11.551020 45.944444 3.666667 8.166667 1.777778 0.000000 10.388889 6.055556 ... 4.166667 6.111111 8.666667 2.166667 5.222222 4.388889 2.611111 2.166667 5.944444 4.222222
3 176 25 4.551020 45.944444 3.666667 8.166667 1.777778 0.000000 10.388889 6.055556 ... 4.166667 6.111111 8.666667 2.166667 5.222222 4.388889 2.611111 2.166667 5.944444 4.222222
4 177 25 33.265306 45.147059 9.411765 22.441176 1.676471 0.000000 0.705882 2.735294 ... 4.970588 4.470588 3.823529 2.176471 4.235294 3.558824 1.147059 4.470588 2.441176 18.794118

5 rows × 23 columns


In [9]:
# load an example score file (feature ranking for descriptor 0)
scores = pd.read_csv('LB_scores/scores_' + str(0) + '.csv', index_col=0)
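
The score files rank every feature by a relevance score stored in column '0'; the main loop keeps only the top feature_number rows of that ranking. A sketch of the selection step, with an arbitrary feature_number:

In [ ]:
# Illustration of the ranking-based feature selection used below:
# drop the leak columns, sort by score, keep the best few.
feature_number = 10  # illustrative value
ranked = scores.loc[[x for x in scores.index if x not in ['Intensity', 'neglog10d']]]
print list(ranked.sort_values(by='0', ascending=False)[:feature_number].index)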

In [9]:
# load the 10-fold CV train/test splits
trainsplits = pd.read_csv('/media/gabor/H/python_from_C/final_ofaction_for_paper_2/data/cv_splits_train_big.csv',header=None)
testsplits = pd.read_csv('/media/gabor/H/python_from_C/final_ofaction_for_paper_2/data/cv_splits_test_big.csv',header=None)
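
Each row of these files holds the CIDs of one cross-validation fold; the loop below uses row k of testsplits as the holdout set. A quick look at the fold structure:

In [ ]:
# rows 0-9 of testsplits are used as holdout folds in the loop below
print trainsplits.shape, testsplits.shape
print testsplits.ix[0, :].values[:5]  # first few holdout CIDs of fold 0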

In [20]:
# predict the holdout sets with different numbers of features
for k in range(10):
    print k
    # set CV split k as the holdout data
    lb_CIDs = testsplits.ix[k,:].values
    #for feature_number in [5,10,20,50,100,200,300,500,1000,1500,2000,3000,4000]:
    #for feature_number in [1,2,3,5,10,33,100,333,1000,3333,10000]:
    for feature_number in [1,2,3,4,5,10,33,100,333,1000,3333,10000]: #new run, with 1,2,3 features
        print(feature_number)
        sys.stdout.flush()

        targets = all_targets[~all_targets['#oID'].isin(test_CIDs)]  # remove test data
        features = features[~features.CID.isin(test_CIDs)] # remove test data 

        train_targets = targets[~targets['#oID'].isin(lb_CIDs)]  # exclude lb targets from training
        train_features = features[~features.CID.isin(lb_CIDs)] # exclude lb features from training
        test_features = features[features.CID.isin(lb_CIDs)] 

        # set the regressor
        regr = linear_model.Ridge(alpha=1, fit_intercept=True, normalize=False, copy_X=True, max_iter=None, tol=0.001, solver='auto')


        result = []
        for idx in range(21):

            #print(descriptor[idx])

            # load the scores for the descriptor
            scores = pd.read_csv('LB_scores_morgan' + str(k) + '/scores_' + str(idx) + '.csv',index_col=0)

            # exclude the data-leak columns (Intensity and dilution), then rank features by score
            scores = scores.loc[[x for x in scores.index if x not in ['Intensity', 'neglog10d']]].sort_values(by='0', ascending=False)

            X_all = train_features[scores[:feature_number].index]  # select the top-scoring features (already sorted above)
            X_all['CID'] = train_features.CID  # add the CIDs as a column
            

            for CID in lb_CIDs:

                Y_train = train_targets[['#oID',descriptor[idx]]]

                Y_train = Y_train[~Y_train[descriptor[idx]].isnull()]
                X = X_all[X_all.CID.isin(Y_train['#oID'])]
                weight = weights[weights.index.isin(Y_train['#oID'])][str(CID)]

                if idx == 0: # if predicting intensity, use the 1/1000 dilutions (neglog10 of 1/1000 is 3)
                    test_data = test_features[test_features.neglog10d == 3]
                    test_data = test_data[test_data.CID == CID]
                    test_data = test_data[scores[:feature_number].index]

                else: # otherwise use the high-dilution rows (note: in this target matrix they do not differ from the low-dilution ones)
                    test_data = test_features[test_features.Intensity == 1]
                    test_data = test_data[test_data.CID == CID]
                    test_data = test_data[scores[:feature_number].index]
                    
                    
                # if there are no test rows for this CID at the required dilution, skip the prediction

                if len(test_data) == 0:
                    print 'zero data', CID
                else:
                    regr.fit(X.drop('CID', 1), Y_train[descriptor[idx]], sample_weight=weight.values)

                    Y_test = regr.predict(test_data)
                    # heuristic sigma = Y*(100-Y)/2500: zero at the scale ends, maximal at Y = 50
                    std = -(Y_test**2)/2500.0 + Y_test/25.0
                    result.append([CID, descriptor[idx], Y_test, std])

        result = pd.DataFrame(result)
        result.columns = ['#oID', 'descriptor', 'value', 'sigma']

        result.value = result.value.astype(float)
        result.sigma = result.sigma.astype(float)

        # clip predictions to the valid [0, 100] rating range

        result.value[result.value < 0] = 0 
        result.value[result.value > 100] = 100

        result.sigma[result.sigma < 0] = 0


        #result_mean['sigma'] = -(result_mean.value**2)/2500.0+result_mean.value/25.0
        result.to_csv('results_morgan_noleak/' + str(k) + '/subchallenge2_' + str(feature_number) + '.txt', sep='\t', index=0)


0
4
/home/gabor/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:35: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
zero data 6501
zero data 8063
zero data 7991
zero data 556940
zero data 61199
zero data 176
zero data 263
zero data 61130
1
4
/home/gabor/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:74: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/gabor/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:75: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/gabor/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:77: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
zero data 18635
zero data 1549026
zero data 61945
zero data 23642
zero data 31252
zero data 7341
zero data 6920
zero data 7969
2
4
zero data 9609
zero data 7969
zero data 7894
zero data 61199
zero data 61130
zero data 7916
3
4
zero data 7916
zero data 10430
zero data 9016
zero data 12587
zero data 15717
zero data 18635
zero data 526618
zero data 7894
4
4
zero data 10430
zero data 10448
zero data 7991
zero data 61653
zero data 15380
zero data 101010
zero data 27458
zero data 10285
zero data 62725
zero data 61527
5
4
zero data 7150
zero data 556940
zero data 8118
zero data 6590
zero data 6429333
zero data 61641
zero data 7969
zero data 5362588
zero data 31252
zero data 61130
zero data 18635
zero data 61199
6
4
zero data 10797
zero data 10430
zero data 526618
zero data 7991
zero data 7894
zero data 263
zero data 5362588
zero data 18635
zero data 61130
zero data 9016
zero data 1032
7
4
zero data 61527
zero data 1549026
zero data 7144
zero data 263
zero data 10430
zero data 6501
zero data 526618
8
4
zero data 61199
zero data 8063
zero data 62725
zero data 1032
zero data 12587
zero data 61670
zero data 61130
9
4
zero data 7144
zero data 10448
zero data 1049
zero data 36822
zero data 23642
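
The heuristic sigma used above, -(y**2)/2500 + y/25, simplifies to y*(100-y)/2500: a parabola that vanishes at both ends of the 0-100 rating scale and peaks at mid-scale. A quick sanity check:

In [ ]:
# sigma = y*(100-y)/2500 is 0 at y = 0 and y = 100 and maximal (1.0) at y = 50
for y in [0.0, 25.0, 50.0, 75.0, 100.0]:
    print y, -(y**2)/2500.0 + y/25.0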

In [29]:
# inspect the rows available for a CID that produced a 'zero data' message
test_features[test_features.CID == CID]


Out[29]:
CID complexity from pubmed MW AMW Sv Se Sp Si Mv Me ... Hypertens-80_2 Hypertens-50_2 Hypnotic-80_2 Hypnotic-50_2 Neoplastic-80_2 Neoplastic-50_2 Infective-80_2 Infective-50_2 neglog10d Intensity
308 7916 0.183463 0.213539 0.018611 0.218204 0.217213 0.218838 0.221028 0.132933 0.375887 ... 0 0 0 0 0 0 0 0 5 1
309 7916 0.183463 0.213539 0.018611 0.218204 0.217213 0.218838 0.221028 0.132933 0.375887 ... 0 0 0 0 0 0 0 0 7 0

2 rows × 9741 columns
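
This inspection explains the 'zero data' messages: CID 7916 was only measured at neglog10d of 5 and 7, so the intensity branch's filter neglog10d == 3 returns an empty frame and the CID is skipped. The affected CIDs can be listed directly (reusing test_features from the last fold):

In [ ]:
# holdout CIDs with no 1/1000-dilution (neglog10d == 3) row;
# these are skipped when predicting intensity
has_dilution = test_features[test_features.neglog10d == 3].CID
print test_features[~test_features.CID.isin(has_dilution)].CID.unique()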


In [22]:
# all test rows at the 1/1000 dilution (the intensity branch's filter)
test_data = test_features[test_features.neglog10d == 3]
#test_data = test_data[test_data.CID == CID]
#test_data = test_data[scores.sort_values(by='0',ascending=0)[:feature_number].index]
test_data


Out[22]:
CID complexity from pubmed MW AMW Sv Se Sp Si Mv Me ... Hypertens-80_2 Hypertens-50_2 Hypnotic-80_2 Hypnotic-50_2 Neoplastic-80_2 Neoplastic-50_2 Infective-80_2 Infective-50_2 neglog10d Intensity
11 239 0.102724 0.184880 0.020081 0.173157 0.187816 0.169183 0.193197 0.108919 0.503546 ... 0 0 0 0 0 0 0 0 3 0
14 241 0.030156 0.156299 0.017265 0.189579 0.158847 0.199864 0.165365 0.186964 0.078014 ... 0 0 0 0 0 0 0 0 3 1
33 325 0.196498 0.343857 0.013183 0.393635 0.371028 0.411944 0.381254 0.138937 0.148936 ... 0 0 0 0 0 0 1 0 3 0
38 379 0.173735 0.328253 0.009412 0.370962 0.390989 0.389775 0.404832 0.095197 0.205674 ... 0 0 0 0 0 0 0 0 3 1
68 1001 0.126459 0.268335 0.013591 0.311468 0.288660 0.327932 0.300793 0.144940 0.113475 ... 0 0 0 0 0 0 0 0 3 1
92 2346 0.256809 0.341231 0.035778 0.315476 0.244267 0.331467 0.244026 0.239280 0.212766 ... 0 0 0 0 0 0 1 0 3 1
113 5541 0.445525 0.520675 0.025550 0.470627 0.464004 0.447941 0.454528 0.146655 0.567376 ... 0 0 0 0 1 0 1 0 3 0
130 6050 0.591440 0.739597 0.016644 0.744646 0.749431 0.746776 0.752025 0.121784 0.354610 ... 1 1 0 0 1 1 1 0 3 1
134 6057 0.342412 0.424399 0.025754 0.405540 0.370847 0.394752 0.367998 0.174099 0.432624 ... 0 0 1 0 1 0 1 0 3 1
145 6213 0.165953 0.197987 0.033990 0.148211 0.154410 0.159060 0.155754 0.127787 0.531915 ... 0 0 0 0 0 0 0 0 3 0
148 6276 0.038716 0.182435 0.004106 0.228344 0.257465 0.249039 0.273600 0.067753 0.134752 ... 0 0 0 0 0 0 0 0 3 1
186 6826 0.309339 0.382789 0.022758 0.384153 0.348939 0.381914 0.350431 0.169811 0.340426 ... 0 0 1 0 0 0 1 0 3 1
190 6943 0.192412 0.307370 0.014661 0.347990 0.323452 0.362148 0.331673 0.146655 0.163121 ... 0 0 0 0 0 0 0 0 3 1
194 6997 0.156809 0.270883 0.016620 0.302315 0.275875 0.312323 0.282078 0.156947 0.191489 ... 0 0 0 0 0 0 0 0 3 1
223 7151 0.243191 0.309815 0.029982 0.292803 0.253803 0.282293 0.251270 0.201544 0.453901 ... 0 0 0 0 0 0 0 0 3 0
245 7463 0.167704 0.302247 0.009779 0.372278 0.349121 0.399106 0.363687 0.133791 0.042553 ... 0 0 0 0 0 0 0 0 3 0
250 7519 0.107782 0.234396 0.019297 0.256640 0.228315 0.262527 0.232498 0.171527 0.219858 ... 0 0 0 0 0 0 0 0 3 1
300 7824 0.153696 0.291766 0.010334 0.325287 0.343429 0.339979 0.355251 0.096913 0.234043 ... 0 0 0 0 0 0 0 0 3 1
328 8030 0.044358 0.171981 0.040447 0.149527 0.112541 0.168392 0.114608 0.239280 0.163121 ... 0 0 0 0 0 0 0 0 3 1
342 8077 0.034047 0.271143 0.026513 0.232292 0.223514 0.279550 0.233166 0.127787 0.092199 ... 0 0 0 0 0 0 0 0 3 1
345 8078 0.030156 0.172059 0.002302 0.236869 0.252070 0.264450 0.270551 0.081475 0.007092 ... 0 0 0 0 0 0 0 0 3 0
348 8091 0.193969 0.364740 0.008677 0.416637 0.438566 0.439600 0.454427 0.093482 0.184397 ... 0 0 0 0 0 0 1 0 3 1
350 8093 0.149027 0.286643 0.006000 0.349575 0.369098 0.376937 0.387279 0.088336 0.113475 ... 0 0 0 0 0 0 0 0 3 1
381 8193 0.157977 0.437845 0.003135 0.548008 0.590452 0.597698 0.620677 0.075472 0.070922 ... 0 0 0 0 1 0 1 0 3 0
409 8697 0.193385 0.328253 0.009412 0.370962 0.390989 0.389775 0.404832 0.095197 0.205674 ... 0 0 0 0 0 0 0 0 3 0
420 8857 0.096304 0.182305 0.015501 0.188293 0.200716 0.190561 0.206510 0.106346 0.382979 ... 0 0 0 0 0 0 0 0 3 1
424 8892 0.134047 0.255279 0.011542 0.279612 0.295853 0.290154 0.305671 0.099485 0.269504 ... 0 0 0 0 0 0 0 0 3 1
431 8918 0.235409 0.437715 0.007583 0.507956 0.533703 0.539192 0.553588 0.091767 0.156028 ... 0 0 1 0 1 0 1 0 3 0
466 10882 0.153696 0.291766 0.010334 0.325287 0.343429 0.339979 0.355251 0.096913 0.234043 ... 0 0 0 0 0 0 0 0 3 1
526 12580 0.160895 0.307370 0.014661 0.347990 0.323452 0.362148 0.331673 0.146655 0.163121 ... 0 0 0 0 0 0 0 0 3 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
542 13216 0.258755 0.354364 0.007551 0.425162 0.433172 0.455011 0.451378 0.102058 0.106383 ... 0 0 0 0 0 0 1 0 3 1
548 14104 0.254864 0.380344 0.012016 0.439310 0.418589 0.461769 0.430834 0.132933 0.127660 ... 0 0 1 0 0 0 1 0 3 1
550 14228 0.301556 0.547176 0.006498 0.644981 0.676416 0.688638 0.702343 0.089194 0.127660 ... 0 0 0 0 0 0 0 0 3 1
562 14514 0.233463 0.276006 0.023738 0.278027 0.250206 0.275393 0.250065 0.176672 0.361702 ... 0 0 0 0 0 0 0 0 3 1
612 22386 0.052724 0.182435 0.004106 0.228344 0.257465 0.249039 0.273600 0.067753 0.134752 ... 0 0 0 0 0 0 0 0 3 1
617 23235 0.332685 0.489675 0.015159 0.536283 0.504553 0.552709 0.512500 0.143225 0.198582 ... 1 0 0 0 1 0 1 0 3 0
655 31249 0.262646 0.406221 0.018824 0.397972 0.403708 0.393960 0.404890 0.124357 0.418440 ... 0 0 0 0 0 0 0 0 3 0
696 61138 0.148054 0.213539 0.018611 0.218204 0.217213 0.218838 0.221028 0.132933 0.375887 ... 0 0 0 0 0 0 0 0 3 1
698 61151 0.303502 0.385338 0.025787 0.374970 0.336171 0.366305 0.331717 0.181818 0.411348 ... 0 0 1 0 0 0 1 0 3 1
702 61177 0.120428 0.291766 0.010334 0.325287 0.343429 0.339979 0.355251 0.096913 0.234043 ... 0 0 0 0 0 0 0 0 3 1
704 61185 0.280156 0.510689 0.006808 0.599306 0.628840 0.638813 0.652748 0.090051 0.134752 ... 0 0 0 0 0 0 1 0 3 1
706 61192 0.317121 0.434958 0.029876 0.389746 0.350210 0.400209 0.349270 0.177530 0.361702 ... 0 0 1 0 1 0 1 0 3 1
710 61204 0.299611 0.432461 0.009714 0.492223 0.502639 0.517673 0.518526 0.104631 0.177305 ... 0 0 1 0 1 0 1 0 3 1
724 61337 0.297665 0.348851 0.029493 0.329325 0.288594 0.316508 0.282137 0.198113 0.475177 ... 0 0 0 0 0 0 0 0 3 1
746 61918 0.285992 0.343753 0.022505 0.347661 0.314148 0.347670 0.319565 0.170669 0.304965 ... 0 0 0 0 0 0 0 0 3 1
760 62374 0.478599 0.348981 0.018146 0.369347 0.345343 0.374986 0.349226 0.151801 0.269504 ... 0 0 0 0 0 0 0 0 3 1
766 62444 0.118872 0.260585 0.020889 0.248086 0.244152 0.274092 0.251880 0.126072 0.212766 ... 0 0 0 0 0 0 0 0 3 1
770 62572 0.175681 0.296890 0.015501 0.300999 0.317760 0.303020 0.323224 0.106346 0.382979 ... 0 0 0 0 0 0 0 0 3 1
784 78925 0.031907 0.192864 0.022007 0.172499 0.180079 0.196019 0.187781 0.107204 0.248227 ... 0 0 0 0 0 0 0 0 3 1
793 89440 0.712062 0.625247 0.011004 0.727477 0.689432 0.767362 0.707788 0.132075 0.099291 ... 0 0 0 0 0 0 1 0 3 0
796 93375 0.307393 0.312624 0.011142 0.363723 0.354532 0.383667 0.366735 0.123499 0.141844 ... 0 0 0 0 0 0 0 0 3 1
811 159055 0.422179 0.349111 0.010155 0.409398 0.402092 0.433492 0.416316 0.119211 0.127660 ... 0 0 0 0 0 0 1 0 3 0
813 165675 0.233463 0.359617 0.005281 0.440925 0.464235 0.476558 0.486440 0.087479 0.092199 ... 0 0 0 0 0 0 1 0 3 0
815 170833 0.293774 0.354364 0.007551 0.425162 0.433172 0.455011 0.451378 0.102058 0.106383 ... 0 0 0 0 0 0 1 0 3 0
831 440967 0.344358 0.307500 0.006906 0.388011 0.380201 0.420625 0.398763 0.114065 0.028369 ... 0 0 0 0 0 0 0 0 3 0
838 519539 0.165370 0.320503 0.021779 0.317091 0.289914 0.346228 0.299631 0.152659 0.141844 ... 0 0 0 0 0 0 0 0 3 1
852 565690 0.476654 0.578409 0.007738 0.674892 0.692913 0.716916 0.716862 0.098628 0.127660 ... 0 0 0 0 1 0 1 0 3 1
862 637776 0.322957 0.416701 0.018016 0.444933 0.409416 0.453088 0.413325 0.156947 0.241135 ... 0 0 1 0 0 0 1 0 3 1
923 5363491 0.282101 0.317747 0.016146 0.339435 0.328846 0.346709 0.334708 0.135506 0.269504 ... 0 0 0 0 0 0 0 0 3 0
942 6114390 0.669261 0.687454 0.023224 0.707227 0.608928 0.706934 0.602645 0.192110 0.269504 ... 1 0 0 0 1 0 1 0 3 1

61 rows × 9741 columns


In [16]:
CID


Out[16]:
6501

In [ ]: