In [1]:
import os

# Thresholds for selecting extracted rules.
min_conf = 0.9   # minimum confidence (fraction of body matches that satisfy the head)
min_supp = 10    # minimum support (number of body instantiations)
min_cov = 0.     # no restriction on head coverage

# Output file for the extracted clauses; thresholds are encoded in the name.
clauses_dir = './clauses'
clauses_file = os.path.join(clauses_dir, 'clauses_conf%.3f_supp%d.pl' % (min_conf, min_supp))
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(clauses_dir, exist_ok=True)
print('clause file %s' % clauses_file)


clause file ./clauses/clauses_conf0.900_supp10.pl

load data


In [2]:
import numpy as np
import multiprocessing


# Read the (subject, relation, object) triples from the tab-separated file.
with open('kinships.tsv', 'r') as datafile:
    facts = np.array([row.strip().split() for row in datafile])
print(facts[:5,:])
entities = sorted(set(facts[:, 0]) | set(facts[:, 2]))
relations = sorted(set(facts[:, 1]))
Nfacts, Nentities, Nrelations = len(facts), len(entities), len(relations)
print('loaded kinships: %d facts, %d entities, %d relations'%(Nfacts, Nentities, Nrelations))
print('on average: %.1f facts/entity, %.1f facts/relation'%(Nfacts/Nentities, Nfacts/Nrelations))

# Stringified (subject, object) pairs, used only to count distinct pairs.
pairs = [str(tuple(so)) for so in facts[:, [0, 2]].tolist()]
print('observed %d different entity pairs (from %d possible ones)'%(len(set(pairs)), Nentities*(Nentities-1)))

# From here on, work with plain Python lists instead of numpy arrays.
facts = facts.tolist()
relations2facts = {}
for r in relations:
    relations2facts[r] = [fact for fact in facts if fact[1] == r]
print('found %d facts for relation %s'%(len(relations2facts[relations[0]]), relations[0]))

def fact2pair(fact):
    """(subject, relation, object) -> (subject, object)."""
    return (fact[0], fact[2])

def fact2relation(fact):
    """(subject, relation, object) -> relation."""
    return fact[1]

# For each relation, the set of (subject, object) pairs it holds for.
relation2pairs = {r: {fact2pair(fact) for fact in facts
                      if fact2relation(fact) == r}
                  for r in relations}
#relation2pairs['Term0']


[['Person0' 'term0' 'Person45']
 ['Person0' 'term0' 'Person96']
 ['Person1' 'term0' 'Person45']
 ['Person1' 'term0' 'Person96']
 ['Person2' 'term0' 'Person86']]
loaded kinships: 10686 facts, 104 entities, 25 relations
on average: 102.8 facts/entity, 427.4 facts/relation
observed 10686 different entity pairs (from 10712 possible ones)
found 228 facts for relation term0

find rules for pairs


In [3]:
#useful for exploring rules:
def calc_support(relation):
    """Set of (subject, object) pairs for which `relation` holds.

    Reads the module-level `relation2pairs` mapping (built in the load-data
    cell), whose values are already sets.
    """
    return relation2pairs[relation]

def invpair(pair):
    """Swap subject and object: (a, b) -> (b, a)."""
    return (pair[1], pair[0])

def quantify_simple_implication(relation_body, relation_head):
    """Score the two one-atom rules
        head(X0, X1) :- body(X0, X1)   (simple implication)
        head(X0, X1) :- body(X1, X0)   (inverse implication)

    Returns a dict mapping each rule string to its statistics:
    'confidence' (correct predictions / body support), 'support'
    (number of body pairs) and 'coverage' (correct predictions / head facts).
    A rule is kept only if it covers at least one head fact; the trivial
    self-implication body == head is skipped for the simple form.
    """
    rules = {}
    pairs_body = calc_support(relation_body)  # already sets, no conversion needed
    pairs_head = calc_support(relation_head)
    n_body, n_head = len(pairs_body), len(pairs_head)

    #simple implication: compute the overlap once instead of twice
    n_both = len(pairs_body.intersection(pairs_head))
    confidence = n_both / float(n_body)
    headcoverage = n_both / float(n_head)
    if headcoverage > 0. and relation_body != relation_head:
        rule = '%s(X0, X1) :- %s(X0, X1)'%(relation_head, relation_body)
        rules[rule] = {'confidence':confidence, 'support':n_body, 'coverage':headcoverage}

    #inverse implication (kept even for body == head: detects symmetric relations)
    pairs_head_inv = set(invpair(p) for p in pairs_head)
    n_both_inv = len(pairs_body.intersection(pairs_head_inv))
    confidence = n_both_inv / float(n_body)
    headcoverage = n_both_inv / float(n_head)
    if headcoverage > 0.:
        rule = '%s(X0, X1) :- %s(X1, X0)'%(relation_head, relation_body)
        rules[rule] = {'confidence':confidence, 'support':n_body, 'coverage':headcoverage}
    return rules

In [4]:
simple_rules = []

# Score every ordered (body, head) relation pair and keep the rules that
# pass all three thresholds.
for body in relations:
    for head in relations:
        candidates = quantify_simple_implication(body, head)
        for rule, stats in candidates.items():
            conf, supp, cov = stats['confidence'], stats['support'], stats['coverage']
            if conf >= min_conf and supp >= min_supp and cov >= min_cov:
                print(rule, ' (conf: %.3f, supp: %d, cov: %.3f)'%(conf, supp, cov))
                simple_rules.append(rule)

print('\nincludes symmetric relations')
print('most interesting are those involving non-symmetric relations')


term0(X0, X1) :- term0(X1, X0)  (conf: 0.904, supp: 228, cov: 0.904)
term18(X0, X1) :- term18(X1, X0)  (conf: 0.938, supp: 569, cov: 0.938)
term8(X0, X1) :- term19(X1, X0)  (conf: 0.923, supp: 13, cov: 0.015)
term1(X0, X1) :- term2(X1, X0)  (conf: 0.926, supp: 231, cov: 0.438)
term15(X0, X1) :- term5(X1, X0)  (conf: 0.949, supp: 508, cov: 0.511)
term15(X0, X1) :- term6(X1, X0)  (conf: 0.932, supp: 453, cov: 0.448)

includes symmetric relations
most interesting are those involving non-symmetric relations

find rules of the form

  • r(X,Z) :- p(X,Y),q(Y,Z)
  • r(Z,X) :- p(X,Y),q(Y,Z)

(and the other variable-order variants — 8 rule shapes in total)

In [ ]:
def quantify_conj_implication(body1, body2, head):
    """Score all 8 two-atom rules with head relation `head` and body
    relations `body1`, `body2` — one rule per way of sharing a variable
    between the two body atoms and orienting the head atom.

    Returns a dict mapping each rule string to its statistics:
    'confidence' (correct predictions / body instantiations), 'support'
    (number of body instantiations) and 'coverage' (correct predictions /
    number of head facts). A rule is included only if it predicts at least
    one head fact. Uses the module-level `calc_support`.
    """
    rules = {}

    p_body1 = calc_support(body1) #all (subject, object) pairs that form a fact with relation body1
    p_body2 = calc_support(body2)
    p_head = calc_support(head)
    N_head = len(p_head)

    # Index body2 pairs by first and second element, so that each join below
    # costs O(#matches) instead of a full scan of p_body2 per body1 pair
    # (the original was O(|p_body1| * |p_body2|)). Counting is
    # order-independent, so the totals are unchanged.
    body2_by_subj = {}
    body2_by_obj = {}
    for pair in p_body2:
        body2_by_subj.setdefault(pair[0], []).append(pair)
        body2_by_obj.setdefault(pair[1], []).append(pair)

    # N_<body shape> counts body instantiations (support);
    # N_<body shape>_<head shape> counts those also satisfying the head.
    N_pXY_qYZ = 0
    N_pXY_qYZ_rXZ = 0
    N_pXY_qYZ_rZX = 0
    N_pXY_qXZ = 0
    N_pXY_qXZ_rYZ = 0
    N_pXY_qXZ_rZY = 0

    N_pXY_qZX = 0
    N_pXY_qZX_rYZ = 0
    N_pXY_qZX_rZY = 0
    N_pXY_qZY = 0
    N_pXY_qZY_rXZ = 0
    N_pXY_qZY_rZX = 0

    for XY in p_body1:

        #rules: p(X,Y) AND q(Y,Z) => ...
        for YZ in body2_by_subj.get(XY[1], []):
            N_pXY_qYZ += 1
            if (XY[0], YZ[1]) in p_head:
                N_pXY_qYZ_rXZ += 1
            if (YZ[1], XY[0]) in p_head:
                N_pXY_qYZ_rZX += 1

        #rules: p(X,Y) AND q(X,Z) => ...
        for XZ in body2_by_subj.get(XY[0], []):
            N_pXY_qXZ += 1
            if (XY[1], XZ[1]) in p_head:
                N_pXY_qXZ_rYZ += 1
            if (XZ[1], XY[1]) in p_head:
                N_pXY_qXZ_rZY += 1

        #rules: p(X,Y) AND q(Z,X) => ...
        for ZX in body2_by_obj.get(XY[0], []):
            N_pXY_qZX += 1
            if (XY[1], ZX[0]) in p_head:
                N_pXY_qZX_rYZ += 1
            if (ZX[0], XY[1]) in p_head:
                N_pXY_qZX_rZY += 1

        #rules: p(X,Y) AND q(Z,Y) => ...
        for ZY in body2_by_obj.get(XY[1], []):
            N_pXY_qZY += 1
            if (XY[0], ZY[0]) in p_head:
                N_pXY_qZY_rXZ += 1
            if (ZY[0], XY[0]) in p_head:
                N_pXY_qZY_rZX += 1

    def _add_rule(rule, n_correct, n_support):
        # Record a rule only when it predicts at least one head fact;
        # n_correct > 0 also implies n_support > 0, so no division by zero.
        if n_correct > 0:
            rules[rule] = {'confidence': n_correct/float(n_support),
                           'support': n_support,
                           'coverage': n_correct/float(N_head)}

    _add_rule('%s(X0, X2) :- %s(X0, X1), %s(X1, X2)'%(head, body1, body2), N_pXY_qYZ_rXZ, N_pXY_qYZ)
    _add_rule('%s(X2, X0) :- %s(X0, X1), %s(X1, X2)'%(head, body1, body2), N_pXY_qYZ_rZX, N_pXY_qYZ)
    _add_rule('%s(X1, X2) :- %s(X0, X1), %s(X0, X2)'%(head, body1, body2), N_pXY_qXZ_rYZ, N_pXY_qXZ)
    _add_rule('%s(X2, X1) :- %s(X0, X1), %s(X0, X2)'%(head, body1, body2), N_pXY_qXZ_rZY, N_pXY_qXZ)
    _add_rule('%s(X1, X2) :- %s(X0, X1), %s(X2, X0)'%(head, body1, body2), N_pXY_qZX_rYZ, N_pXY_qZX)
    _add_rule('%s(X2, X1) :- %s(X0, X1), %s(X2, X0)'%(head, body1, body2), N_pXY_qZX_rZY, N_pXY_qZX)
    _add_rule('%s(X0, X2) :- %s(X0, X1), %s(X2, X1)'%(head, body1, body2), N_pXY_qZY_rXZ, N_pXY_qZY)
    _add_rule('%s(X2, X0) :- %s(X0, X1), %s(X2, X1)'%(head, body1, body2), N_pXY_qZY_rZX, N_pXY_qZY)

    return rules

In [ ]:
def process(relation_tuple):
    """Score all conjunctive rules for one (body1, body2, head) triple and
    return the rule strings passing the min_conf/min_supp/min_cov thresholds."""
    (p, q, r) = relation_tuple
    rules = quantify_conj_implication(p, q, r)
    selected_rules = []
    for rule in rules:
        conf, supp, cov = rules[rule]['confidence'], rules[rule]['support'], rules[rule]['coverage']
        if conf >= min_conf and supp >= min_supp and cov >= min_cov:
            print(rule, ' (conf: %.3f, supp: %d, cov: %.3f)'%(conf, supp, cov))
            selected_rules.append(rule)
    return selected_rules

# One job per ordered triple of relations (body1, body2, head).
trials = [(p, q, r) for p in relations for q in relations for r in relations]

# The context manager terminates and joins the worker processes when done
# (the original leaked the pool: no close()/join()).
with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
    conj_rules = pool.map(process, trials)
conj_rules = [r for rule_list in conj_rules for r in rule_list]

print('finished')


term1(X0, X2) :- term13(X0, X1), term24(X1, X2)  (conf: 0.923, supp: 13, cov: 0.025)
term2(X2, X0) :- term13(X0, X1), term24(X1, X2)  (conf: 0.923, supp: 13, cov: 0.052)
term15(X0, X2) :- term2(X0, X1), term22(X1, X2)  (conf: 0.922, supp: 746, cov: 0.730)

In [ ]:
# Persist all extracted rules, one Prolog clause per line.
extracted_rules = simple_rules + conj_rules
with open(clauses_file, 'w') as f_out:
    # reuse extracted_rules (the original recomputed simple_rules + conj_rules here)
    for rule in extracted_rules:
        f_out.write('%s\n' % rule)
# no explicit close: the with-statement already closed the file
# (the original called f_out.close() redundantly after the block)