In [ ]:
import py_entitymatching as em
import os
import pandas as pd
# Specify filepaths for tables A and B.
path_A = 'tableA.csv'
path_B = 'tableB.csv'
# Read table A; table A has 'id' as the key attribute.
A = em.read_csv_metadata(path_A, key='id')
# Read table B; table B has 'id' as the key attribute.
B = em.read_csv_metadata(path_B, key='id')
In [ ]:
# Impute missing values.
# Manually set metadata properties first, as the current
# py_entitymatching.impute_table() requires the 'fk_ltable', 'fk_rtable',
# 'ltable', 'rtable' properties to be present on the table.
def impute_catalog_table(table, impute_attrs):
    """Impute missing values of `impute_attrs` in `table` and return a new table.

    Sets the catalog metadata impute_table() insists on (pointing the table
    at itself), excludes every attribute not listed in `impute_attrs`, and
    fills missing values with the column's most frequent value.
    """
    em.set_property(table, 'fk_ltable', 'id')
    em.set_property(table, 'fk_rtable', 'id')
    em.set_property(table, 'ltable', table)
    em.set_property(table, 'rtable', table)
    exclude_attrs = list(set(table.columns.values) - set(impute_attrs))
    return em.impute_table(table, exclude_attrs=exclude_attrs,
                           missing_val='NaN', strategy='most_frequent',
                           axis=0, val_all_nans=0, verbose=True)

A1 = impute_catalog_table(
    A, ['year', 'min_num_players', 'max_num_players',
        'min_gameplay_time', 'max_gameplay_time', 'min_age'])
# Compare number of missing values before/after to check the results.
print(sum(A['min_num_players'].isnull()))
print(sum(A1['min_num_players'].isnull()))
# Do the same thing for B.
# TODO: add 'min_age'
B1 = impute_catalog_table(
    B, ['year', 'min_num_players', 'max_num_players',
        'min_gameplay_time', 'max_gameplay_time'])
print(sum(B['min_num_players'].isnull()))
print(sum(B1['min_num_players'].isnull()))
In [ ]:
# Load the pre-labeled sample and the full candidate set; both reference the
# imputed tables A1/B1 through the ltable_id / rtable_id foreign keys.
S = em.read_csv_metadata(
    'sample_labeled.csv',
    key='_id',
    ltable=A1, rtable=B1,
    fk_ltable='ltable_id', fk_rtable='rtable_id')
total_cand_set = em.read_csv_metadata(
    'candidate_set_C1.csv',
    key='_id',
    ltable=A1, rtable=B1,
    fk_ltable='ltable_id', fk_rtable='rtable_id')
In [ ]:
# Split S into a train set I and a test set J (fixed seed for reproducibility).
train_test = em.split_train_test(S, train_proportion=0.75, random_state=35)
I = train_test['train']
J = train_test['test']
In [ ]:
# Attribute correspondences between A1 and B1.
# NOTE(review): `corres` appears unused below — the feature-generation cell
# recomputes this as `attr_corres`; consider deleting this cell.
corres = em.get_attr_corres(A1, B1)
In [ ]:
# Generate a set of features for matching.
import py_entitymatching.feature.attributeutils as au
import py_entitymatching.feature.simfunctions as sim
import py_entitymatching.feature.tokenizers as tok

# Similarity and tokenizer functions used to build the feature table.
sim_funcs = sim.get_sim_funs_for_matching()
tok_funcs = tok.get_tokenizers_for_matching()

# Infer the attribute types of the input tables and their correspondence.
attr_types_ltable = au.get_attr_types(A1)
attr_types_rtable = au.get_attr_types(B1)
attr_corres = au.get_attr_corres(A1, B1)

# Show the inferred type of 'name', then override it on both sides so that
# string-similarity features are generated for the name attribute.
print(attr_types_ltable['name'])
print(attr_types_rtable['name'])
attr_types_ltable['name'] = 'str_bt_5w_10w'
attr_types_rtable['name'] = 'str_bt_5w_10w'

# Build the full feature table.
F = em.get_features(A1, B1, attr_types_ltable, attr_types_rtable,
                    attr_corres, tok_funcs, sim_funcs)
print(F['feature_name'])

# Restrict to Levenshtein-distance features on the numeric columns,
# plus name edit distance as an extra feature.
include_features_2 = [
    'min_num_players_min_num_players_lev_dist',
    'max_num_players_max_num_players_lev_dist',
    'min_gameplay_time_min_gameplay_time_lev_dist',
    'max_gameplay_time_max_gameplay_time_lev_dist',
    'name_name_lev_dist',
]
F_2 = F.loc[F['feature_name'].isin(include_features_2)]
In [ ]:
# Convert I and J into feature-vector tables using the selected features F_2;
# 'attrs_after' keeps the gold 'label' column for training/evaluation.
I_table = em.extract_feature_vecs(I, feature_table=F_2, attrs_after='label', show_progress=False)
J_table = em.extract_feature_vecs(J, feature_table=F_2, attrs_after='label', show_progress=False)
In [ ]:
# Featurize the full candidate set, train a logistic-regression matcher on
# the train split I, and predict matches over all candidates.
# NOTE(review): J_table is built above but never evaluated — consider
# reporting precision/recall on J before trusting these predictions.
total_cand_set_features = em.extract_feature_vecs(total_cand_set, feature_table=F_2, show_progress=False)
m = em.LogRegMatcher(name='LogReg', random_state=0)
# Train on the feature columns only; id/key columns are excluded.
m.fit(table=I_table, exclude_attrs=['_id', 'ltable_id', 'rtable_id','label'], target_attr='label')
total_cand_set_features['prediction'] = m.predict(
    table=total_cand_set_features,
    exclude_attrs=['_id', 'ltable_id', 'rtable_id'],
)
In [ ]:
# Join tables on matched tuples: keep only pairs predicted as matches, then
# left-join the imputed source tables back on via the foreign keys.
predicted_matches = total_cand_set_features[total_cand_set_features['prediction'] == 1]
match_tuples = predicted_matches[['ltable_id', 'rtable_id']]
# Duplicate the key columns under the foreign-key names so `on=` works.
A1['ltable_id'] = A1['id']
B1['rtable_id'] = B1['id']
joined_tables = (match_tuples
                 .merge(A1, how='left', on='ltable_id')
                 .merge(B1, how='left', on='rtable_id'))
In [ ]:
# Coalesce the _x/_y column pairs produced by the merge: prefer the A-side
# value and fall back to the B-side value when the A-side is missing.
# Vectorized via Series.combine_first instead of a row-wise apply(lambda),
# which was O(rows * cols) Python-level work.
for n in A1.columns:
    if n not in ['_id', 'ltable_id', 'rtable_id']:
        joined_tables[n] = joined_tables[n + '_x'].combine_first(joined_tables[n + '_y'])
        joined_tables = joined_tables.drop(columns=[n + '_x', n + '_y'])
joined_tables.to_csv('joined_table.csv')
In [ ]:
# Display the joined table (rich output of the cell's last expression).
joined_tables
In [14]:
import pandas as pd
import matplotlib as plt
from scipy.stats.stats import pearsonr
%matplotlib inline
joined_tables = pd.read_csv('new_joined_table.csv')
joined_tables.iloc[1:4].to_csv('4_tuples.csv')
In [8]:
# Pretty-print the joined table as an ASCII table.
# Fixed for Python 3: `from StringIO import StringIO` is the Python 2 module
# path, and `print pt` is Python 2 statement syntax (a SyntaxError in 3.x).
from io import StringIO  # NOTE(review): StringIO appears unused here; kept for parity.
import prettytable
# Use a context manager so the file handle is closed deterministically.
with open('new_joined_table.csv') as csv_file:
    pt = prettytable.from_csv(csv_file)
print(pt)
In [12]:
# Per-column null-count summary of the joined table.
# Fixed for Python 3: the original used Python 2 `print` statements, and the
# bare `joined_tables.columns` on the first line produced no output (only a
# cell's last expression is displayed) — print it explicitly instead.
print(joined_tables.columns)
print('Size: ' + str(len(joined_tables)))
for c in joined_tables.columns:
    print(c + ' : ' + str(joined_tables[c].isnull().sum()))
print(len(joined_tables.iloc[12]))
In [ ]:
# Rating vs year: per-year mean scatter, raw scatter, and Pearson correlation.
joined_tables.groupby('year').agg({'year': 'mean', 'rating': 'mean'}).plot.scatter(x='year', y='rating')
joined_tables.plot.scatter(x='year', y='rating')
# Filter rows where EITHER column is missing: pearsonr propagates NaN, so the
# original mask (rating only) could yield nan if 'year' has nulls. This also
# matches the joint-mask style used in the price-vs-rating cell below.
nonull = joined_tables['year'].notnull() & joined_tables['rating'].notnull()
pearsonr(
    joined_tables['year'][nonull],
    joined_tables['rating'][nonull]
)
In [ ]:
# Complexity weight vs year.
joined_tables.plot.scatter(x='year', y='complexity_weight')
# Mask both columns (not just complexity_weight) so NaNs in 'year' cannot
# make pearsonr return nan.
nonull = joined_tables['year'].notnull() & joined_tables['complexity_weight'].notnull()
pearsonr(joined_tables['year'][nonull], joined_tables['complexity_weight'][nonull])
In [ ]:
# Price mean vs year: per-year mean scatter, raw scatter, and correlation.
joined_tables.groupby('year').agg({'year': 'mean', 'mean_price': 'mean'}).plot.scatter(x='year', y='mean_price')
joined_tables.plot.scatter(x='year', y='mean_price')
# Mask both columns so NaNs in 'year' cannot make pearsonr return nan.
nonull = joined_tables['year'].notnull() & joined_tables['mean_price'].notnull()
pearsonr(joined_tables['year'][nonull], joined_tables['mean_price'][nonull])
In [ ]:
# Price mean vs rating: scatter plus Pearson correlation over rows where
# both columns are present.
joined_tables.plot.scatter(x='rating', y='mean_price')
both_present = joined_tables['rating'].notnull() & joined_tables['mean_price'].notnull()
pearsonr(joined_tables.loc[both_present, 'rating'],
         joined_tables.loc[both_present, 'mean_price'])
In [ ]:
# Num players vs complexity weight: scatter plus Pearson correlation over
# rows where both columns are present.
joined_tables.plot.scatter(x='complexity_weight', y='min_num_players')
both_present = joined_tables['complexity_weight'].notnull() & joined_tables['min_num_players'].notnull()
pearsonr(joined_tables.loc[both_present, 'complexity_weight'],
         joined_tables.loc[both_present, 'min_num_players'])
In [ ]:
# Complexity weight vs gameplay time.
# NOTE(review): the scatter plot uses 'max_gameplay_time' while the mask and
# correlation use 'min_gameplay_time' — confirm which column was intended;
# they were likely meant to agree.
joined_tables.plot.scatter(x='max_gameplay_time', y='complexity_weight')
nonull = joined_tables['complexity_weight'].notnull() & joined_tables['min_gameplay_time'].notnull()
pearsonr(joined_tables['complexity_weight'][nonull],joined_tables['min_gameplay_time'][nonull])
In [ ]: