In [1]:
import py_entitymatching as em
import os

In [2]:
path = 'Desktop/cs838_stage3/'
movies = em.read_csv_metadata(path+'datasets/movies.csv', key='id')
tracks = em.read_csv_metadata(path+'datasets/tracks.csv', key='id')


No handlers could be found for logger "py_entitymatching.io.parsers"
/home/hfu/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py:2821: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.
  if self.run_code(code, result):

In [3]:
movies['title'] = movies['title'].str.lower()
tracks['title'] = tracks['title'].str.lower()

movies['title'] = movies['title'].str.replace(r"\(.*\)","")
movies['title'] = movies['title'].str.replace(r"\[.*\]","")
movies['title'] = movies['title'].str.replace(r"\*.*\*","")
tracks['title'] = tracks['title'].str.replace(r"\(.*\)","")
tracks['title'] = tracks['title'].str.replace(r"\[.*\]","")
tracks['title'] = tracks['title'].str.replace(r"\*.*\*","")
print "finish pre-processing"
em.to_csv_metadata(movies, path+'datasets/processed_a.csv');
em.to_csv_metadata(tracks, path+'datasets/processed_b.csv');


finish pre-processing

In [4]:
processed_A = em.read_csv_metadata(path+'datasets/processed_a.csv');
processed_B = em.read_csv_metadata(path+'datasets/processed_b.csv');

In [5]:
sample_movies, sample_tracks = em.down_sample(movies, tracks, size=2000, y_param=2, show_progress=False)
em.set_key(sample_movies, 'id')
em.set_key(sample_tracks, 'id')
em.to_csv_metadata(sample_movies, path+'datasets/tmp_movies_8.csv')
em.to_csv_metadata(sample_tracks, path+'datasets/tmp_tracks_8.csv')
sample_movies = em.read_csv_metadata(path+'datasets/tmp_movies_8.csv')
sample_tracks = em.read_csv_metadata(path+'datasets/tmp_tracks_8.csv')

In [6]:
ob = em.OverlapBlocker()
ab = em.AttrEquivalenceBlocker()
rb = em.RuleBasedBlocker()

In [7]:
C1 = ab.block_tables(sample_movies, sample_tracks, 'year', 'year', l_output_attrs=['title', 'year'], r_output_attrs=['title','year'])

In [8]:
C2 = ob.block_candset(C1, 'title', 'title', word_level=True, rem_stop_words=True, overlap_size=1)


0%                          100%
[##############################] | ETA: 00:00:02 | ETA: 00:00:02 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:01 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00 | ETA: 00:00:00
Total time elapsed: 00:00:02

In [9]:
block_f = em.get_features_for_blocking(sample_movies, sample_tracks);
rb.add_rule(['title_title_cos_dlm_dc0_dlm_dc0(ltuple, rtuple) < 0.6'] ,block_f)
C3 = rb.block_candset(C1, n_jobs=-1,show_progress = False)

In [10]:
D = em.combine_blocker_outputs_via_union([C2, C3])

In [11]:
em.to_csv_metadata(D, path+'datasets/tbl_blocked.csv');
tbl_blocked = em.read_csv_metadata(path+'datasets/tbl_blocked.csv',\
 ltable=sample_movies, rtable=sample_tracks)

S = em.sample_table(tbl_blocked, 400)
em.to_csv_metadata(S, path+'datasets/sampled_8.csv')


Out[11]:
True