In [ ]:
import py_entitymatching as em
import os
import pandas as pd
# Specify filepaths for tables A and B.
path_A = 'tableA.csv'
path_B = 'tableB.csv'
# Read table A; table A has 'id' as the key attribute.
A = em.read_csv_metadata(path_A, key='id')
# Read table B; table B has 'id' as the key attribute.
B = em.read_csv_metadata(path_B, key='id')
In [ ]:
# Impute missing values.
# Manually set metadata properties first, as the current
# py_entitymatching.impute_table() requires the 'fk_ltable', 'fk_rtable',
# 'ltable', 'rtable' properties to be present on the table.
def impute_catalog_table(table, impute_attrs):
    """Impute missing values of `impute_attrs` in `table` and return a new table.

    Sets the catalog metadata impute_table() insists on (pointing the table
    at itself), excludes every attribute not listed in `impute_attrs`, and
    fills missing values with the column's most frequent value.
    """
    em.set_property(table, 'fk_ltable', 'id')
    em.set_property(table, 'fk_rtable', 'id')
    em.set_property(table, 'ltable', table)
    em.set_property(table, 'rtable', table)
    exclude_attrs = list(set(table.columns.values) - set(impute_attrs))
    return em.impute_table(table, exclude_attrs=exclude_attrs,
                           missing_val='NaN', strategy='most_frequent',
                           axis=0, val_all_nans=0, verbose=True)

A1 = impute_catalog_table(
    A, ['year', 'min_num_players', 'max_num_players',
        'min_gameplay_time', 'max_gameplay_time', 'min_age'])
# Compare number of missing values before/after to check the results.
print(sum(A['min_num_players'].isnull()))
print(sum(A1['min_num_players'].isnull()))
# Do the same thing for B.
# TODO: add 'min_age'
B1 = impute_catalog_table(
    B, ['year', 'min_num_players', 'max_num_players',
        'min_gameplay_time', 'max_gameplay_time'])
print(sum(B['min_num_players'].isnull()))
print(sum(B1['min_num_players'].isnull()))
In [ ]:
# Load the pre-labeled sample and the full candidate set; both reference the
# imputed tables A1/B1 through the ltable_id / rtable_id foreign keys.
S = em.read_csv_metadata(
    'sample_labeled.csv',
    key='_id',
    ltable=A1, rtable=B1,
    fk_ltable='ltable_id', fk_rtable='rtable_id')
total_cand_set = em.read_csv_metadata(
    'candidate_set_C1.csv',
    key='_id',
    ltable=A1, rtable=B1,
    fk_ltable='ltable_id', fk_rtable='rtable_id')
In [ ]:
# Split S into a train set I and a test set J (fixed seed for reproducibility).
train_test = em.split_train_test(S, train_proportion=0.75, random_state=35)
I = train_test['train']
J = train_test['test']
In [ ]:
# Attribute correspondences between A1 and B1.
# NOTE(review): `corres` appears unused below — the feature-generation cell
# recomputes this as `attr_corres`; consider deleting this cell.
corres = em.get_attr_corres(A1, B1)
In [ ]:
# Generate a set of features for matching.
import py_entitymatching.feature.attributeutils as au
import py_entitymatching.feature.simfunctions as sim
import py_entitymatching.feature.tokenizers as tok

# Similarity and tokenizer functions used to build the feature table.
sim_funcs = sim.get_sim_funs_for_matching()
tok_funcs = tok.get_tokenizers_for_matching()

# Infer the attribute types of the input tables and their correspondence.
attr_types_ltable = au.get_attr_types(A1)
attr_types_rtable = au.get_attr_types(B1)
attr_corres = au.get_attr_corres(A1, B1)

# Show the inferred type of 'name', then override it on both sides so that
# string-similarity features are generated for the name attribute.
print(attr_types_ltable['name'])
print(attr_types_rtable['name'])
attr_types_ltable['name'] = 'str_bt_5w_10w'
attr_types_rtable['name'] = 'str_bt_5w_10w'

# Build the full feature table.
F = em.get_features(A1, B1, attr_types_ltable, attr_types_rtable,
                    attr_corres, tok_funcs, sim_funcs)
print(F['feature_name'])

# Restrict to Levenshtein-distance features on the numeric columns,
# plus name edit distance as an extra feature.
include_features_2 = [
    'min_num_players_min_num_players_lev_dist',
    'max_num_players_max_num_players_lev_dist',
    'min_gameplay_time_min_gameplay_time_lev_dist',
    'max_gameplay_time_max_gameplay_time_lev_dist',
    'name_name_lev_dist',
]
F_2 = F.loc[F['feature_name'].isin(include_features_2)]
In [ ]:
# Convert I and J into feature-vector tables using the selected features F_2;
# 'attrs_after' keeps the gold 'label' column for training/evaluation.
I_table = em.extract_feature_vecs(I, feature_table=F_2, attrs_after='label', show_progress=False)
J_table = em.extract_feature_vecs(J, feature_table=F_2, attrs_after='label', show_progress=False)
In [ ]:
# Featurize the full candidate set, train a logistic-regression matcher on
# the train split I, and predict matches over all candidates.
# NOTE(review): J_table is built above but never evaluated — consider
# reporting precision/recall on J before trusting these predictions.
total_cand_set_features = em.extract_feature_vecs(total_cand_set, feature_table=F_2, show_progress=False)
m = em.LogRegMatcher(name='LogReg', random_state=0)
# Train on the feature columns only; id/key columns are excluded.
m.fit(table=I_table, exclude_attrs=['_id', 'ltable_id', 'rtable_id','label'], target_attr='label')
total_cand_set_features['prediction'] = m.predict(
    table=total_cand_set_features,
    exclude_attrs=['_id', 'ltable_id', 'rtable_id'],
)
In [ ]:
# Join tables on matched tuples: keep only pairs predicted as matches, then
# left-join the imputed source tables back on via the foreign keys.
predicted_matches = total_cand_set_features[total_cand_set_features['prediction'] == 1]
match_tuples = predicted_matches[['ltable_id', 'rtable_id']]
# Duplicate the key columns under the foreign-key names so `on=` works.
A1['ltable_id'] = A1['id']
B1['rtable_id'] = B1['id']
joined_tables = (match_tuples
                 .merge(A1, how='left', on='ltable_id')
                 .merge(B1, how='left', on='rtable_id'))
In [ ]:
# Coalesce the _x/_y column pairs produced by the merge: prefer the A-side
# value and fall back to the B-side value when the A-side is missing.
# Vectorized via Series.combine_first instead of a row-wise apply(lambda),
# which was O(rows * cols) Python-level work.
for n in A1.columns:
    if n not in ['_id', 'ltable_id', 'rtable_id']:
        joined_tables[n] = joined_tables[n + '_x'].combine_first(joined_tables[n + '_y'])
        joined_tables = joined_tables.drop(columns=[n + '_x', n + '_y'])
joined_tables.to_csv('joined_table.csv')
In [ ]:
# Display the joined table (rich output of the cell's last expression).
joined_tables
In [14]:
import pandas as pd
import matplotlib as plt
from scipy.stats.stats import pearsonr
%matplotlib inline
joined_tables = pd.read_csv('new_joined_table.csv')
joined_tables.iloc[1:4].to_csv('4_tuples.csv')
In [8]:
# Pretty-print the joined table as an ASCII table.
# Fixed for Python 3: `from StringIO import StringIO` is the Python 2 module
# path, and `print pt` is Python 2 statement syntax (a SyntaxError in 3.x).
from io import StringIO  # NOTE(review): StringIO appears unused here; kept for parity.
import prettytable
# Use a context manager so the file handle is closed deterministically.
with open('new_joined_table.csv') as csv_file:
    pt = prettytable.from_csv(csv_file)
print(pt)
In [12]:
# Per-column null-count summary of the joined table.
# Fixed for Python 3: the original used Python 2 `print` statements, and the
# bare `joined_tables.columns` on the first line produced no output (only a
# cell's last expression is displayed) — print it explicitly instead.
print(joined_tables.columns)
print('Size: ' + str(len(joined_tables)))
for c in joined_tables.columns:
    print(c + ' : ' + str(joined_tables[c].isnull().sum()))
print(len(joined_tables.iloc[12]))
In [ ]:
# Rating vs year: per-year mean scatter, raw scatter, and Pearson correlation.
joined_tables.groupby('year').agg({'year': 'mean', 'rating': 'mean'}).plot.scatter(x='year', y='rating')
joined_tables.plot.scatter(x='year', y='rating')
# Filter rows where EITHER column is missing: pearsonr propagates NaN, so the
# original mask (rating only) could yield nan if 'year' has nulls. This also
# matches the joint-mask style used in the price-vs-rating cell below.
nonull = joined_tables['year'].notnull() & joined_tables['rating'].notnull()
pearsonr(
    joined_tables['year'][nonull],
    joined_tables['rating'][nonull]
)
In [ ]:
# Complexity weight vs year.
joined_tables.plot.scatter(x='year', y='complexity_weight')
# Mask both columns (not just complexity_weight) so NaNs in 'year' cannot
# make pearsonr return nan.
nonull = joined_tables['year'].notnull() & joined_tables['complexity_weight'].notnull()
pearsonr(joined_tables['year'][nonull], joined_tables['complexity_weight'][nonull])
In [ ]:
# Price mean vs year: per-year mean scatter, raw scatter, and correlation.
joined_tables.groupby('year').agg({'year': 'mean', 'mean_price': 'mean'}).plot.scatter(x='year', y='mean_price')
joined_tables.plot.scatter(x='year', y='mean_price')
# Mask both columns so NaNs in 'year' cannot make pearsonr return nan.
nonull = joined_tables['year'].notnull() & joined_tables['mean_price'].notnull()
pearsonr(joined_tables['year'][nonull], joined_tables['mean_price'][nonull])
In [ ]:
# Price mean vs rating: scatter plus Pearson correlation over rows where
# both columns are present.
joined_tables.plot.scatter(x='rating', y='mean_price')
both_present = joined_tables['rating'].notnull() & joined_tables['mean_price'].notnull()
pearsonr(joined_tables.loc[both_present, 'rating'],
         joined_tables.loc[both_present, 'mean_price'])
In [ ]:
# Num players vs complexity weight: scatter plus Pearson correlation over
# rows where both columns are present.
joined_tables.plot.scatter(x='complexity_weight', y='min_num_players')
both_present = joined_tables['complexity_weight'].notnull() & joined_tables['min_num_players'].notnull()
pearsonr(joined_tables.loc[both_present, 'complexity_weight'],
         joined_tables.loc[both_present, 'min_num_players'])
In [ ]:
# Complexity weight vs gameplay time.
# NOTE(review): the scatter plot uses 'max_gameplay_time' while the mask and
# correlation use 'min_gameplay_time' — confirm which column was intended;
# they were likely meant to agree.
joined_tables.plot.scatter(x='max_gameplay_time', y='complexity_weight')
nonull = joined_tables['complexity_weight'].notnull() & joined_tables['min_gameplay_time'].notnull()
pearsonr(joined_tables['complexity_weight'][nonull],joined_tables['min_gameplay_time'][nonull])
In [ ]: