In [1]:
import py_entitymatching as em
import os
In [2]:
# Project root, relative to the directory the kernel was started from.
path = 'Desktop/cs838_stage3/'

# Read both source tables, passing 'id' as the key column of each.
movies = em.read_csv_metadata(
    path + 'datasets/movies.csv',
    key='id')
tracks = em.read_csv_metadata(
    path + 'datasets/tracks.csv',
    key='id')
In [3]:
def _clean_titles(titles):
    """Return `titles` lower-cased with bracketed annotations removed.

    Text enclosed in (...), [...] and *...* is stripped, in that order.

    Args:
        titles: pandas Series holding the title strings.

    Returns:
        A new Series with the cleaned titles.
    """
    cleaned = titles.str.lower()
    # NOTE(review): `.*` is greedy, so a title carrying two annotations also
    # loses the text between them ("a (x) b (y)" -> "a "). The patterns are
    # kept unchanged so existing results do not shift; use `.*?` if only the
    # annotations themselves should go.
    for pattern in (r"\(.*\)", r"\[.*\]", r"\*.*\*"):
        # regex=True is spelled out because pandas >= 2.0 treats the pattern
        # as a literal string by default, which would silently skip the
        # clean-up. The keyword requires pandas >= 0.23.
        cleaned = cleaned.str.replace(pattern, "", regex=True)
    return cleaned


movies['title'] = _clean_titles(movies['title'])
tracks['title'] = _clean_titles(tracks['title'])

# Parenthesised form prints the same text on Python 2 and Python 3; the bare
# `print "..."` statement is a SyntaxError on Python 3.
print("finish pre-processing")

# Write the cleaned tables to disk; the trailing `;` keeps the notebook from
# echoing the last call's return value.
em.to_csv_metadata(movies, path+'datasets/processed_a.csv')
em.to_csv_metadata(tracks, path+'datasets/processed_b.csv');
In [4]:
# Read back the two files written by the pre-processing cell. No `key`
# argument is passed here, unlike the initial load of movies/tracks.
processed_A = em.read_csv_metadata(path + 'datasets/processed_a.csv')
processed_B = em.read_csv_metadata(path + 'datasets/processed_b.csv')
In [5]:
# Down-sample the two tables (size=2000, y_param=2). A fixed seed is passed so
# that Restart & Run All selects the same rows, and therefore writes the same
# files, on every run.
sample_movies, sample_tracks = em.down_sample(
    movies, tracks,
    size=2000, y_param=2,
    show_progress=False,
    seed=0)

# Set 'id' as the key of each sampled table.
em.set_key(sample_movies, 'id')
em.set_key(sample_tracks, 'id')

# Write the samples out, then rebind the names to the copies read back from
# disk, so later cells work on what is stored in the tmp_*_8.csv files.
em.to_csv_metadata(sample_movies, path+'datasets/tmp_movies_8.csv')
em.to_csv_metadata(sample_tracks, path+'datasets/tmp_tracks_8.csv')
sample_movies = em.read_csv_metadata(path+'datasets/tmp_movies_8.csv')
sample_tracks = em.read_csv_metadata(path+'datasets/tmp_tracks_8.csv')
In [6]:
# One blocker instance of each kind, created up front for the cells below.
ob, ab, rb = (
    em.OverlapBlocker(),
    em.AttrEquivalenceBlocker(),
    em.RuleBasedBlocker(),
)
In [7]:
# Attribute-equivalence blocking of the two samples on 'year'; 'title' and
# 'year' from both sides are requested as output attributes.
C1 = ab.block_tables(
    sample_movies, sample_tracks,
    'year', 'year',
    l_output_attrs=['title', 'year'],
    r_output_attrs=['title', 'year'])
In [8]:
# Overlap blocking of C1 on the 'title' attribute of both sides: word-level,
# stop words removed, overlap_size=1.
C2 = ob.block_candset(
    C1, 'title', 'title',
    word_level=True,
    rem_stop_words=True,
    overlap_size=1)
In [9]:
# Blocking features generated for the two sampled tables.
block_f = em.get_features_for_blocking(sample_movies, sample_tracks)

# Rule on the title/title cosine feature with a 0.6 threshold.
# NOTE(review): presumably pairs for which the rule holds are dropped from
# the candidate set -- confirm against the RuleBasedBlocker documentation.
# NOTE(review): `rb` is created in an earlier cell, so re-running this cell
# calls add_rule on the same blocker again.
title_rule = ['title_title_cos_dlm_dc0_dlm_dc0(ltuple, rtuple) < 0.6']
rb.add_rule(title_rule, block_f)

# The rule-based blocker is applied to C1 (not to C2).
C3 = rb.block_candset(C1, n_jobs=-1, show_progress=False)
In [10]:
# Union of the overlap-blocked (C2) and rule-blocked (C3) candidate sets.
blocker_outputs = [C2, C3]
D = em.combine_blocker_outputs_via_union(blocker_outputs)
In [11]:
# Persist the combined candidate set, then read it back with the two sampled
# tables supplied as its ltable/rtable.
em.to_csv_metadata(D, path + 'datasets/tbl_blocked.csv')
tbl_blocked = em.read_csv_metadata(
    path + 'datasets/tbl_blocked.csv',
    ltable=sample_movies,
    rtable=sample_tracks)

# Sample 400 candidate pairs and save them.
# NOTE(review): no seed is passed, so the sample presumably differs between
# runs -- confirm whether sample_table can be made deterministic.
S = em.sample_table(tbl_blocked, 400)
em.to_csv_metadata(S, path + 'datasets/sampled_8.csv')
Out[11]: