In [2]:
! sudo pip -q install pandas
In [4]:
! cd ~/pynb/fb15k-akbc
! wget -q https://download.microsoft.com/download/8/7/0/8700516A-AB3D-4850-B4BB-805C515AECE1/FB15K-237.zip
! unzip FB15K-237.zip
In [3]:
import pandas as pd
In [5]:
# Directory that holds the input files read by the cells below.
# NOTE(review): presumably the folder FB15K-237.zip unpacks into - confirm.
BASE_DIR = './Release/'
# Tab-separated input files, loaded with pd.read_csv.
TRAIN_FILE = BASE_DIR + 'train.txt'
TEST_FILE = BASE_DIR + 'test.txt'
VALID_FILE = BASE_DIR + 'valid.txt'
TEXT_CVSC_FILE = BASE_DIR + 'text_cvsc.txt'
# Output files, written with to_csv into the current working directory.
TRAIN_CSV_FILE = 'fb15k_train.csv'
VALID_CSV_FILE = 'fb15k_valid.csv'
TEST_CSV_FILE = 'fb15k_test.csv'
CVSC_ENTITIES_CSV_FILE = 'fb15k_cvsc_entities.csv'
CVSC_TRAIN_CSV_FILE = 'fb15k_cvsc_train.csv'
CVSC_PAIRS_CSV_FILE = 'fb15k_cvsc_pairs.csv'
CVSC_RELATIONS_CSV_FILE = 'fb15k_cvsc_relations.csv'
# Shared lookup tables filled by add_id_columns() through index():
# 'subj:obj' pair string -> integer id, and 'rel' value -> integer id.
# Re-running this cell empties both tables, so ids start again from 0.
ENTITY_PAIRS = {}
RELATIONS = {}
In [6]:
def index(val, idx):
    """Return the integer id stored for ``val`` in ``idx``, assigning one if absent.

    An unseen ``val`` is registered with the current size of ``idx`` as its id,
    so a mapping that starts empty and is only filled through this function
    hands out 0, 1, 2, ... in order of first appearance.

    Args:
        val: hashable key to look up.
        idx: dict mapping keys to ids; mutated in place when ``val`` is new.

    Returns:
        The id associated with ``val``.
    """
    # len(idx) is evaluated before any insertion, so a new key gets the next free id;
    # for an existing key setdefault leaves the mapping untouched.
    return idx.setdefault(val, len(idx))
In [7]:
def add_id_columns(df):
    """Add 'pair', 'pid' and 'rid' columns to ``df`` in place.

    'pair' is the 'subj' and 'obj' columns joined with ':'. 'pid' and 'rid' are
    the ids that index() assigns to each pair and to each 'rel' value, using the
    module-level ENTITY_PAIRS and RELATIONS tables (both are updated as new
    values are seen).

    Args:
        df: DataFrame with 'subj', 'rel' and 'obj' columns.

    Returns:
        None; ``df`` is modified directly.
    """
    def pair_id(pair):
        return index(pair, ENTITY_PAIRS)

    def relation_id(rel):
        return index(rel, RELATIONS)

    df['pair'] = df['subj'] + ':' + df['obj']
    df['pid'] = df['pair'].apply(pair_id)
    df['rid'] = df['rel'].apply(relation_id)
In [8]:
# Load the training triples: tab-separated, columns named subj / rel / obj.
train_kb_triples = pd.read_csv(TRAIN_FILE, sep='\t', names=['subj', 'rel', 'obj'])
# Adds the 'pair', 'pid' and 'rid' columns in place.
add_id_columns(train_kb_triples)
# Single-argument print(...) runs on both Python 2 and Python 3 and emits the
# same text as the former Python 2-only print statement.
print('Train KB triples: %s' % len(train_kb_triples))
# The helper 'pair' column is left out of the saved file.
train_kb_triples.to_csv(
    TRAIN_CSV_FILE,
    sep='\t',
    header=True,
    columns=['subj', 'rel', 'obj', 'pid', 'rid']
)
print('Saved to %s' % TRAIN_CSV_FILE)
In [16]:
# Load the validation triples: tab-separated, columns named subj / rel / obj.
valid_kb_triples = pd.read_csv(VALID_FILE, sep='\t', names=['subj', 'rel', 'obj'])
# Adds the 'pair', 'pid' and 'rid' columns in place.
add_id_columns(valid_kb_triples)
# Single-argument print(...) runs on both Python 2 and Python 3 and emits the
# same text as the former Python 2-only print statement.
print('Validation KB triples: %s' % len(valid_kb_triples))
# The helper 'pair' column is left out of the saved file.
valid_kb_triples.to_csv(
    VALID_CSV_FILE,
    sep='\t',
    header=True,
    columns=['subj', 'rel', 'obj', 'pid', 'rid']
)
print('Saved to %s' % VALID_CSV_FILE)
In [10]:
# Load the test triples: tab-separated, columns named subj / rel / obj.
test_kb_triples = pd.read_csv(TEST_FILE, sep='\t', names=['subj', 'rel', 'obj'])
# Adds the 'pair', 'pid' and 'rid' columns in place.
add_id_columns(test_kb_triples)
# Single-argument print(...) runs on both Python 2 and Python 3 and emits the
# same text as the former Python 2-only print statement.
print('Test KB triples: %s' % len(test_kb_triples))
# The helper 'pair' column is left out of the saved file.
test_kb_triples.to_csv(
    TEST_CSV_FILE,
    sep='\t',
    header=True,
    columns=['subj', 'rel', 'obj', 'pid', 'rid']
)
print('Saved to %s' % TEST_CSV_FILE)
In [11]:
# Load the textual triples; this file carries a fourth column, named 'occ' here.
cvsc_text_triples = pd.read_csv(TEXT_CVSC_FILE, sep='\t', names=['subj', 'rel', 'obj', 'occ'])
# Adds the 'pair', 'pid' and 'rid' columns in place.
add_id_columns(cvsc_text_triples)
# Single-argument print(...) runs on both Python 2 and Python 3 and emits the
# same text as the former Python 2-only print statement.
print('Text triples (CVSC): %s' % len(cvsc_text_triples))
# Stack the KB training triples on top of the text triples. The KB frame has no
# 'occ' column; the outer join keeps that column and leaves it missing for KB rows.
# NOTE(review): each frame keeps its own row labels, so the index written to the
# CSV presumably contains repeated values - confirm nothing downstream relies on it.
cvsc_train_triples = pd.concat([train_kb_triples, cvsc_text_triples], join="outer")
print('Training triples (CVSC): %s' % len(cvsc_train_triples))
cvsc_train_triples.to_csv(
    CVSC_TRAIN_CSV_FILE,
    sep='\t',
    header=True,
    columns=['subj', 'rel', 'obj', 'pid', 'rid', 'occ']
)
print('Saved to %s' % CVSC_TRAIN_CSV_FILE)
In [12]:
# Collect every distinct entity that occurs as a subject OR as an object of a
# text triple. The previous subj.combine_first(obj) only filled missing 'subj'
# values from 'obj', so entities appearing solely as objects were left out.
# ignore_index=True gives the stacked series unique row labels.
cvsc_entities = pd.concat(
    [cvsc_text_triples['subj'], cvsc_text_triples['obj']],
    ignore_index=True
).drop_duplicates()
cvsc_entities.name = "entity"
# Single-argument print(...) runs on both Python 2 and Python 3 and emits the
# same text as the former Python 2-only print statement.
print('Entities: %s' % len(cvsc_entities))
cvsc_entities.to_csv(CVSC_ENTITIES_CSV_FILE, sep='\t', header=True)
print('Saved to %s' % CVSC_ENTITIES_CSV_FILE)
In [13]:
# One row per distinct (subj, obj, pid) combination in the combined training set.
cvsc_pairs = cvsc_train_triples[['subj', 'obj', 'pid']].drop_duplicates()
# Single-argument print(...) runs on both Python 2 and Python 3 and emits the
# same text as the former Python 2-only print statement.
print('Entity pairs (CVSC): %s' % len(cvsc_pairs))
cvsc_pairs.to_csv(
    CVSC_PAIRS_CSV_FILE,
    sep='\t',
    header=True,
    columns=['subj', 'obj', 'pid']
)
print('Saved to %s' % CVSC_PAIRS_CSV_FILE)
In [14]:
# One row per distinct (rel, rid) combination in the combined training set.
cvsc_relations = cvsc_train_triples[['rel', 'rid']].drop_duplicates()
# Single-argument print(...) runs on both Python 2 and Python 3 and emits the
# same text as the former Python 2-only print statement.
print('Relations (CVSC): %s' % len(cvsc_relations))
cvsc_relations.to_csv(
    CVSC_RELATIONS_CSV_FILE,
    sep='\t',
    header=True,
    columns=['rel', 'rid']
)
print('Saved to %s' % CVSC_RELATIONS_CSV_FILE)
In [15]:
# Report the largest pair id and relation id in the combined training set, plus one.
# Single-argument print(...) runs on both Python 2 and Python 3 and emits the
# same text as the former Python 2-only print statement.
print('Pairs: %s' % (cvsc_train_triples['pid'].max() + 1))
print('Relations: %s' % (cvsc_train_triples['rid'].max() + 1))
In [ ]: