Data preprocessing: convert idomaar to CSV

Convert the 30Music dataset from the idomaar format to CSV.
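
Each record in an idomaar file is one line with five TAB-separated fields: entity type, ID, timestamp, a JSON-like properties string, and a JSON-like linked-entities string. A minimal sketch with a hypothetical record (not from the dataset):


In [ ]:
# hypothetical idomaar line: Type, ID, Timestamp, Properties, LinkedEntities
line = 'album\t42\t-1\t{"MBID":null,"title":"Some Album"}\t{}'
line.split('\t')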


In [ ]:
import os, sys, csv, json  #, gc
import numpy as np
import pandas as pd

In [ ]:
COLUMNS = ['Type', 'ID', 'Timestamp', 'Properties', 'LinkedEntities']

In [ ]:
data_dir = 'data/30music'

Convert albums data to CSV


In [ ]:
falbums = os.path.join(data_dir, 'albums.idomaar')

In [ ]:
#with open(falbums) as tsvin:
#    tsvin = csv.reader(tsvin, delimiter='\t')
#    for row in tsvin:
#        if len(row[4]) > 2:
#            print(row)

In [ ]:
albums = pd.read_csv(falbums, delimiter='\t', header=None)

In [ ]:
albums.columns = COLUMNS

In [ ]:
albums.head()

In [ ]:
albums[albums['Timestamp'] != -1]

In [ ]:
albums[albums['LinkedEntities'] != '{}']

In [ ]:
albums.set_index('ID', inplace=True)

In [ ]:
albums.head()

Deal with illegally formatted JSON.


In [ ]:
%%script false
# count how many Properties strings fail to parse as JSON as-is
# (cell disabled via %%script false)
cnt = 0
for ix in albums.index:
    try:
        prop = json.loads(albums.loc[ix, 'Properties'])
        aa = (prop['MBID'], prop['title'])
    except:
        cnt += 1
        #print(ix)
print(cnt)

In [ ]:
def parse_properties(props, debug=False):
    props = props.replace(';', ',')
    try:
        prop = json.loads(props)
    except:
        # repair records whose title contains unescaped double quotes:
        # strip all quotes and backslashes, then re-quote the MBID and title values
        props = props.replace('"title":', '"TITLE":').replace('"', '').replace('\\', '')\
                     .replace('MBID:', '"MBID":"').replace(', TITLE:', '", "title":"').replace('}', '"}')
        if debug is True: print(props)
        prop = json.loads(props)
    return pd.Series({'MBID': prop['MBID'], 'Title': prop['title']})
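
To exercise the repair path, here is a hypothetical malformed record (not taken from the dataset) whose title contains unescaped inner quotes:


In [ ]:
# hypothetical malformed record: unescaped quotes inside the title value
parse_properties('{"MBID":"abc", "title":"The "Best" Album"}', debug=True)
# expected: MBID -> 'abc', Title -> 'The Best Album' (inner quotes are stripped)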

In [ ]:
props = albums.loc[708, 'Properties']
props

In [ ]:
#json.loads(props)  # causes exception

In [ ]:
parse_properties(props)

In [ ]:
prop_df = albums['Properties'].apply(lambda s: parse_properties(s, debug=False))

In [ ]:
prop_df.head()

In [ ]:
#albums = albums.merge(prop_df, left_index=True, right_index=True)

In [ ]:
#albums.drop(['Type', 'Timestamp', 'Properties', 'LinkedEntities'], axis=1, inplace=True)

In [ ]:
fcsv_falbums = os.path.join(data_dir, 'albums.csv')
prop_df.to_csv(fcsv_falbums, sep=';')

In [ ]:
pd.read_csv(fcsv_falbums, index_col='ID', sep=';').head()

Convert artists data to CSV


In [ ]:
fartists = os.path.join(data_dir, 'persons.idomaar')

In [ ]:
artists = pd.read_csv(fartists, delimiter='\t', header=None)

In [ ]:
artists.columns = COLUMNS
artists.head()

In [ ]:
artists.set_index('ID', inplace=True)

In [ ]:
artists[artists['Timestamp'] != -1]

In [ ]:
artists[artists['LinkedEntities'] != '{}']

In [ ]:
artists.drop(['Timestamp', 'LinkedEntities'], axis=1, inplace=True)

In [ ]:
print(artists.shape)
artists.head()

In [ ]:
def parse_artist_properties(props):
    props = props.replace(';', ',')
    try:
        prop = json.loads(props)
    except:
        # no repair rule known for this record: print it and abort
        print(props)
        sys.exit(0)
    return pd.Series({'MBID': prop['MBID'], 'Name': prop['name']})

In [ ]:
artists_prop = artists['Properties'].apply(lambda s: parse_artist_properties(s))

In [ ]:
#artists_prop = artists['Properties'].apply(lambda s: \
#                                           pd.Series({'MBID': json.loads(s)['MBID'], 'Name': json.loads(s)['name']}))

In [ ]:
#set(list(artists.index)) - set(list(artists_prop.index))

In [ ]:
print(artists_prop.shape)
artists_prop.head()

In [ ]:
#artists_prop['Type'] = 'person'
#artists_prop.head()

In [ ]:
#artists.sort_index(inplace=True)

In [ ]:
#artists = artists.merge(artists_prop, left_index=True, right_index=True)
#print(artists.shape)

Deal with duplicates.


In [ ]:
print(artists.shape[0])
print(artists.index.unique().shape[0])

In [ ]:
artists_prop.reset_index(inplace=True)

In [ ]:
artists_prop['Duplicated'] = artists_prop['ID'].duplicated(keep='first')

In [ ]:
artists_nodup = artists_prop[artists_prop['Duplicated'] == False].copy()
artists_dup = artists_prop[artists_prop['Duplicated'] == True].copy()

In [ ]:
artists_nodup.set_index('ID', inplace=True)

In [ ]:
print(artists_dup.shape[0])
artists_dup.head()

In [ ]:
# merge each duplicate row into artists_nodup: per field, keep the longest value seen
cnt = 0
print(artists_dup.shape[0])
for ix in artists_dup.index:
    artist_id = artists_dup.loc[ix, 'ID']
    mbid = artists_dup.loc[ix, 'MBID']; mbid = str(mbid) if mbid is not None else mbid
    name = artists_dup.loc[ix, 'Name']; name = str(name) if name is not None else name
    
    # update existing artist info
    mbid0 = artists_nodup.loc[artist_id, 'MBID']; mbid0 = str(mbid0) if mbid0 is not None else mbid0
    name0 = artists_nodup.loc[artist_id, 'Name']; name0 = str(name0) if name0 is not None else name0
    
    if mbid is not None:
        if mbid0 is None or len(mbid) > len(mbid0): 
            artists_nodup.loc[artist_id, 'MBID'] = mbid
    if name is not None:
        if name0 is None or len(name) > len(name0):
            artists_nodup.loc[artist_id, 'Name'] = name
            
    cnt += 1
    if cnt % 100 == 0:
        sys.stdout.write('\r%d' % (cnt))
        sys.stdout.flush()
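
The loop above updates artists_nodup one row at a time, which is slow. A vectorized sketch with the same keep-the-longest-value-per-field policy (longest and artists_alt are illustrative names, not used below):


In [ ]:
# per ID, keep the longest non-null value of each field
def longest(s):
    s = s.dropna().astype(str)
    return max(s, key=len) if len(s) else None

artists_alt = artists_prop.groupby('ID').agg({'MBID': longest, 'Name': longest})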

In [ ]:
artists_nodup.drop('Duplicated', axis=1, inplace=True)

In [ ]:
print(artists_nodup.shape[0])
print(artists_nodup.index.unique().shape[0])
artists_nodup.head()

In [ ]:
fcsv_artists = os.path.join(data_dir, 'persons.csv')
artists_nodup.to_csv(fcsv_artists, sep=';')

In [ ]:
pd.read_csv(fcsv_artists, index_col='ID', sep=';').head()

Convert users data to CSV


In [ ]:
fusers = os.path.join(data_dir, 'users.idomaar')

In [ ]:
users = pd.read_csv(fusers, header=None, delimiter='\t')

In [ ]:
users.columns = COLUMNS[:-1]
users.set_index('ID', inplace=True)
users.head()

In [ ]:
def parse_user_properties(props):
    props = props.replace(';', ',')
    try:
        prop = json.loads(props)
    except:
        # repair missing values: '""' and ':,' both denote an absent field
        props = props.replace('""', 'null').replace(':,', ':null,')
        try: prop = json.loads(props)
        except: print(props); sys.exit(0)
    return pd.Series({'Username': prop['lastfm_username'],
                      'Gender': str.upper(prop['gender']) if prop['gender'] is not None else None,
                      'Age': prop['age'],
                      'Country': prop['country'],
                      'Playcount': prop['playcount'],
                      '#Playlists': prop['playlists'],
                      'Subscribertype': prop['subscribertype']})
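
A hypothetical user record (not from the dataset) exercising both repair rules, an empty string value and a missing value:


In [ ]:
# hypothetical user record with an empty gender ('""') and a missing age (':,')
parse_user_properties('{"lastfm_username":"u1","gender":"","age":,'
                      '"country":null,"playcount":0,"playlists":0,"subscribertype":null}')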

In [ ]:
user_prop = users['Properties'].apply(lambda s: parse_user_properties(s))

In [ ]:
user_prop.shape

In [ ]:
users.drop(['Type', 'Properties'], axis=1, inplace=True)

In [ ]:
users = users.merge(user_prop, left_index=True, right_index=True)
print(users.shape)

In [ ]:
users.head()

In [ ]:
fcsv_users = os.path.join(data_dir, 'users.csv')
users.to_csv(fcsv_users, sep=';')

In [ ]:
pd.read_csv(fcsv_users, index_col='ID', sep=';').head()

Convert tags data to CSV


In [ ]:
ftags = os.path.join(data_dir, 'tags.idomaar')

In [ ]:
tags = pd.read_csv(ftags, header=None, delimiter='\t')

In [ ]:
tags.columns = COLUMNS
tags.set_index('ID', inplace=True)
tags.head()

In [ ]:
tags[tags['Timestamp'] != -1]

In [ ]:
tags[tags['LinkedEntities'] != '{}']

In [ ]:
tags.drop(['Type', 'Timestamp', 'LinkedEntities'], axis=1, inplace=True)

In [ ]:
print(tags.shape)
tags.head()

In [ ]:
tags.loc[58983, 'Properties']

In [ ]:
def parse_tag_properties(props):
    props = props.replace(';', ',')
    try:
        prop = json.loads(props)
    except:
        # repair records with unescaped quotes: strip quote/backslash noise,
        # then re-quote the value and url fields
        props = props.replace('u"', '').replace('\\', '').replace('\\n', '')\
                     .replace('"value":', 'VALUE:').replace('"url":', 'URL:').replace('"', '')\
                     .replace('VALUE:', '"value":"').replace(', URL:', '", "url":"').replace('}', '"}')
        try: prop = json.loads(props)
        except: print(props); sys.exit(0)
    return pd.Series({'Value': prop['value'].replace('\n', ''), 'URL': prop['url']})
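
A hypothetical tag record (not from the dataset) whose value contains unescaped quotes:


In [ ]:
# hypothetical malformed tag record
parse_tag_properties('{"value":"rock "n" roll", "url":"http://example.com/tag"}')
# expected: Value -> 'rock n roll', URL -> 'http://example.com/tag'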

In [ ]:
tags_prop = tags['Properties'].apply(lambda s: parse_tag_properties(s))

In [ ]:
print(tags_prop.shape)
tags_prop.head()

In [ ]:
tags_prop.loc[230795, 'Value']

In [ ]:
fcsv_tags = os.path.join(data_dir, 'tags.csv')
tags_prop.to_csv(fcsv_tags, sep=';')

In [ ]:
pd.read_csv(fcsv_tags, index_col='ID', sep=';').head()

Convert tracks data to CSV

NOTE: the tracks data contains duplicate lines (the same track ID appearing with possibly different information); these duplicates have to be merged.


In [ ]:
ftracks = os.path.join(data_dir, 'tracks.idomaar')

In [ ]:
tracks = pd.read_csv(ftracks, header=None, delimiter='\t')

In [ ]:
tracks.columns = COLUMNS
#tracks.set_index('ID', inplace=True)  # can't set the index yet: there are duplicate IDs
tracks.head()

In [ ]:
tracks[tracks['Timestamp'] != -1]

In [ ]:
tracks.drop(['Type', 'Timestamp'], axis=1, inplace=True)

Check for duplicates.


In [ ]:
tracks['ID'].unique().shape

In [ ]:
print(tracks.shape)

Deal with duplicates.


In [ ]:
tracks['Duplicated'] = tracks['ID'].duplicated(keep='first')
tracks.head()

In [ ]:
tracks[tracks['ID'] == 170]

In [ ]:
fnodup = os.path.join(data_dir, 'tracks.nodup')
fdup = os.path.join(data_dir, 'tracks.dup')

In [ ]:
tracks[tracks['Duplicated'] == False][COLUMNS].to_csv(fnodup, quoting=csv.QUOTE_NONE, sep='\t', header=False, index=False)

In [ ]:
tracks[tracks['Duplicated'] == True][COLUMNS].to_csv(fdup, quoting=csv.QUOTE_NONE, sep='\t', header=False, index=False)

Clear objects in memory.


In [ ]:
#%xdel tracks
#%xdel -n tracks
#gc.collect()
#gc.collect()
# memory usage is still huge after these operations

In [ ]:
#%reset
# this works!
# but any needed modules must be re-imported afterwards

In [ ]:
prop_columns = ['Duration', 'Playcount', 'MBID', 'Name']
entity_columns = ['ArtistsID', 'AlbumsID', 'TagsID']

In [ ]:
def parse_track_properties(props):
    try:
        props = props.replace(';', ',')
        prop = json.loads(props)
    except:
        # no repair rule known for this record: print it and abort
        print(props)
        sys.exit(0)
    return pd.Series({'Duration': prop['duration'], 'Playcount': prop['playcount'], \
                      'MBID': prop['MBID'], 'Name': prop['name']})

In [ ]:
#tracks.loc[0, 'Properties']

In [ ]:
#json.loads(tracks.loc[0, 'LinkedEntities'])

In [ ]:
def parse_track_entities(entities):
    try:
        entities = entities.replace(';', ',')
        entity = json.loads(entities)
    except:
        # no repair rule known for this record: print it and abort
        print(entities)
        sys.exit(0)
    return pd.Series({
        'ArtistsID': ','.join([str(x['id']) for x in entity['artists']]) if \
                               entity['artists'] is not None and len(entity['artists']) > 0 else None,
        'AlbumsID': ','.join([str(x['id']) for x in entity['albums']]) if \
                              entity['albums'] is not None and len(entity['albums']) > 0 else None,
        'TagsID': ','.join([str(x['id']) for x in entity['tags']]) if \
                            entity['tags'] is not None and len(entity['tags']) > 0 else None})
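
A hypothetical linked-entities record (not from the dataset), using ';' in place of ',' as handled above:


In [ ]:
# hypothetical entity record; 'tags' is null here
parse_track_entities('{"artists":[{"type":"artist","id":3}];"albums":[{"type":"album","id":10}];"tags":null}')
# expected: ArtistsID -> '3', AlbumsID -> '10', TagsID -> None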

In [ ]:
#tracks.loc[0, 'LinkedEntities']

In [ ]:
#parse_track_entities(tracks.loc[0, 'LinkedEntities'])

Parse the non-duplicated tracks and save them to a CSV file.


In [ ]:
tracks_nodup = tracks[tracks['Duplicated'] == False]
# build ';'-separated CSV lines directly instead of a second large DataFrame
lines = []
cnt = 0
print(tracks_nodup.shape[0])
for ix in tracks_nodup.index:
    track_id = tracks_nodup.loc[ix, 'ID']
    props = parse_track_properties(tracks_nodup.loc[ix, 'Properties'])
    entities = parse_track_entities(tracks_nodup.loc[ix, 'LinkedEntities'])

    # add new track record
    #tracks_df.ID = track_id
    #tracks_df.loc[track_id, prop_columns] = props
    #tracks_df.loc[track_id, entity_columns] = entities
    line = [str(track_id)]
    for prop in prop_columns: line.append(str(props[prop]) if props[prop] is not None else '')
    for entity in entity_columns: line.append(str(entities[entity]) if entities[entity] is not None else '')
    lines.append(';'.join(line))
    
    cnt += 1
    if cnt % 1000 == 0:
        sys.stdout.write('\r%d' % (cnt))
        sys.stdout.flush()

In [ ]:
fcsv_tracks = os.path.join(data_dir, 'tracks.csv')

In [ ]:
with open(fcsv_tracks, 'w') as fcsv:
    for line in lines: fcsv.write(line + '\n')

In [ ]:
tracks_df = pd.read_csv(fcsv_tracks, sep=';', keep_default_na=False, header=None)

Deal with duplicates.


In [ ]:
tracks_df.columns = ['ID'] + prop_columns + entity_columns
tracks_df.set_index('ID', inplace=True)

In [ ]:
print(tracks_df.shape[0])
print(tracks_df.index.unique().shape[0])
tracks_df.head()

In [ ]:
#tracks_df['Duration'] = tracks_df['Duration'].astype(float)
#tracks_df['Playcount'] = tracks_df['Playcount'].astype(float)

In [ ]:
# merge each duplicate row into tracks_df: keep the larger Duration/Playcount
# and the longest MBID/Name/linked-entity strings
cnt = 0
tracks_dup = tracks[tracks['Duplicated'] == True]
print(tracks_dup.shape[0])
for ix in tracks_dup.index:
    track_id = tracks_dup.loc[ix, 'ID']
    props = parse_track_properties(tracks_dup.loc[ix, 'Properties'])
    entities = parse_track_entities(tracks_dup.loc[ix, 'LinkedEntities'])

    # update existing track
    # ['Duration', 'Playcount', 'MBID', 'Name']
    duration = tracks_df.loc[track_id, 'Duration']
    playcount = tracks_df.loc[track_id, 'Playcount']
    if type(duration) == str: 
        try: duration = int(duration)
        except: duration = -1
    if type(playcount) == str:
        try: playcount = int(playcount)
        except: playcount = -1
    mbid = tracks_df.loc[track_id, 'MBID']; mbid = str(mbid) if mbid is not None else mbid
    name = tracks_df.loc[track_id, 'Name']; name = str(name) if name is not None else name
    if type(mbid) == float: print(track_id)
    if props['Duration'] is not None:
        if duration is None or props['Duration'] > duration: 
            tracks_df.loc[track_id, 'Duration'] = props['Duration']
    if props['Playcount'] is not None:
        if playcount is None or props['Playcount'] > playcount:
            tracks_df.loc[track_id, 'Playcount'] = props['Playcount']
    if props['MBID'] is not None:
        if mbid is None or len(props['MBID']) > len(mbid):
            tracks_df.loc[track_id, 'MBID'] = props['MBID']
    if props['Name'] is not None:
        if name is None or len(props['Name']) > len(name):
            tracks_df.loc[track_id, 'Name'] = props['Name']

    # ['ArtistsID', 'AlbumsID', 'TagsID']
    aid = tracks_df.loc[track_id, 'ArtistsID']; aid = str(aid) if aid is not None else aid
    bid = tracks_df.loc[track_id, 'AlbumsID']; bid = str(bid) if bid is not None else bid
    tid = tracks_df.loc[track_id, 'TagsID']; tid = str(tid) if tid is not None else tid
    if entities['ArtistsID'] is not None:
        if aid is None or len(str(entities['ArtistsID'])) > len(aid):
            tracks_df.loc[track_id, 'ArtistsID'] = entities['ArtistsID']
    if entities['AlbumsID'] is not None:
        if bid is None or len(str(entities['AlbumsID'])) > len(bid):
            tracks_df.loc[track_id, 'AlbumsID'] = entities['AlbumsID']
    if entities['TagsID'] is not None:
        if tid is None or len(str(entities['TagsID'])) > len(tid):
            tracks_df.loc[track_id, 'TagsID'] = entities['TagsID']
    cnt += 1
    if cnt % 100 == 0:
        sys.stdout.write('\r%d' % (cnt))
        sys.stdout.flush()

In [ ]:
print(tracks['ID'].unique().shape[0])
print(tracks_df.shape[0])
tracks_df.head()

In [ ]:
tracks_df.to_csv(fcsv_tracks, sep=';')

In [ ]:
pd.read_csv(fcsv_tracks, sep=';', keep_default_na=False).head()

Convert playlist data to CSV


In [ ]:
fplaylist = os.path.join(data_dir, 'playlist.idomaar')

In [ ]:
playlist = pd.read_csv(fplaylist, header=None, delimiter='\t')

In [ ]:
playlist.columns = COLUMNS
print(playlist.shape)
playlist.head()

In [ ]:
playlist.drop(['Type', 'ID'], axis=1, inplace=True)

In [ ]:
playlist.head()

In [ ]:
def parse_playlist_properties(props):
    props = props.replace(';', ',')
    try:
        prop = json.loads(props)
    except:
        # repair records with unescaped quotes in the title: strip all quotes,
        # then re-quote the Title value and restore the other keys
        props = props.replace('\\', '').replace('Title', 'TITLE').replace('numtracks', 'NUMTRACKS')\
                     .replace('"duration":', 'DURATION:').replace('"', '')\
                     .replace('ID:', '"ID":').replace('TITLE:', '"Title":"').replace(',NUMTRACKS:', '","numtracks":')\
                     .replace('DURATION:', '"duration":')
        try: prop = json.loads(props)
        except: print(props); sys.exit(0)
    return pd.Series({'ID': prop['ID'], 'Title': prop['Title'], '#Tracks': prop['numtracks'],
                      'Duration': prop['duration']})
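
A quick check on a hypothetical record, showing the ';' to ',' normalization:


In [ ]:
parse_playlist_properties('{"ID":12;"Title":"road trip";"numtracks":2;"duration":500}')
# expected: ID -> 12, Title -> 'road trip', #Tracks -> 2, Duration -> 500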

In [ ]:
#playlist.loc[0, 'Properties']

In [ ]:
#parse_playlist_properties(playlist.loc[0, 'Properties'])

In [ ]:
def parse_playlist_entities(entities):
    entities = entities.replace(';', ',')
    try:
        # some records encode an empty list as '[[]]'; normalize before parsing
        entity = json.loads(entities.replace('[[]]', '[]'))
    except:
        try: entity = json.loads(entities)
        except: print(entities); sys.exit(0)
    return pd.Series({
        'UserID': ','.join([str(x['id']) for x in entity['subjects']]) if len(entity['subjects']) > 0 else None,
        'TracksID': ','.join([str(x['id']) for x in entity['objects']]) if len(entity['objects']) > 0 else None})

In [ ]:
#playlist.loc[0, 'LinkedEntities']

In [ ]:
#parse_playlist_entities(playlist.loc[0, 'LinkedEntities'])#['TracksID']

In [ ]:
prop_columns = ['ID', 'Title', '#Tracks', 'Duration']
entity_columns = ['UserID', 'TracksID']

In [ ]:
for col in prop_columns: playlist[col] = None
for col in entity_columns: playlist[col] = None

In [ ]:
cnt = 0
for ix in playlist.index:
    playlist.loc[ix, prop_columns] = parse_playlist_properties(playlist.loc[ix, 'Properties'])
    playlist.loc[ix, entity_columns] = parse_playlist_entities(playlist.loc[ix, 'LinkedEntities'])
    cnt += 1
    if cnt % 100 == 0:
        sys.stdout.write('\r%d' % (cnt))
        sys.stdout.flush()

In [ ]:
playlist.drop(['Properties', 'LinkedEntities'], axis=1, inplace=True)

In [ ]:
print(playlist.shape)
playlist.head()

Set index.


In [ ]:
playlist[playlist['UserID'].str.contains(',')]

In [ ]:
playlist['ID'].unique().shape

In [ ]:
playlist.set_index('ID', inplace=True)

Remove empty playlists (those with 0 tracks).


In [ ]:
print(playlist[playlist['#Tracks'] == 0].shape)
playlist[playlist['#Tracks'] == 0].head()

In [ ]:
playlist = playlist[playlist['#Tracks'] > 0]
playlist.shape

In [ ]:
fcsv_playlist = os.path.join(data_dir, 'playlist.csv')
playlist.to_csv(fcsv_playlist, sep=';')

In [ ]:
pd.read_csv(fcsv_playlist, index_col='ID', sep=';').head()

Histogram of playlist length (i.e., the number of tracks/songs).


In [ ]:
%matplotlib inline
playlist['#Tracks'].hist()

Convert user preference data to CSV


In [ ]:
fpref = os.path.join(data_dir, 'love.idomaar')

In [ ]:
pref = pd.read_csv(fpref, header=None, delimiter='\t')

In [ ]:
print(pref.shape)
pref.head()

Deal with badly separated columns: the separator should be TAB, but here the last 3 columns (Timestamp, Properties, LinkedEntities) are fused into a single field.

Check Type values.


In [ ]:
pref[pref[0] != 'preference'].shape[0]

Check Timestamp and Properties values.


In [ ]:
pref[pref[2].str.startswith('-1 {"value":"love"}')].shape[0] == pref.shape[0]

In [ ]:
pref.columns = ['Type', 'ID', 'LinkedEntities']
pref.drop('Type', axis=1, inplace=True)
pref.head()

In [ ]:
pref.loc[0, 'LinkedEntities']

In [ ]:
def parse_pref_entity(entities):
    # strip the constant Timestamp and Properties prefix, keeping only the LinkedEntities JSON
    entities = entities.replace('-1 {"value":"love"}', '').replace(';', ',')
    try:
        entity = json.loads(entities)
    except:
        # no repair rule known for this record: print it and abort
        print(entities)
        sys.exit(0)
    return pd.Series({
        'UserID': ','.join([str(x['id']) for x in entity['subjects']]) if len(entity['subjects']) > 0 else None,
        'TrackID': ','.join([str(x['id']) for x in entity['objects']]) if len(entity['objects']) > 0 else None})
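
A hypothetical fused field (not from the dataset), matching the '-1 {"value":"love"}' prefix verified above:


In [ ]:
parse_pref_entity('-1 {"value":"love"} {"subjects":[{"type":"user","id":1}],"objects":[{"type":"track","id":2}]}')
# expected: UserID -> '1', TrackID -> '2'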

In [ ]:
#pref_entity = pref['LinkedEntities'].apply(lambda s: parse_pref_entity(s))  # uses too much memory

In [ ]:
pref_columns=['UserID', 'TrackID']

In [ ]:
#pref_entity = pd.DataFrame(columns=pref_columns)

In [ ]:
lines = []
ix = 0
print(pref.shape[0])
with open(fpref) as tsvin:
    tsvin = csv.reader(tsvin, delimiter='\t')
    for row in tsvin:
        assert(len(row) == 3)
        #pref_entity.loc[ix] = parse_pref_entity(row[2])
        prefs = parse_pref_entity(row[2])
        lines.append(';'.join([str(prefs[col]) for col in pref_columns]))
        
        ix += 1
        if ix % 1000 == 0:
            sys.stdout.write('\r%d' % (ix))
            sys.stdout.flush()

In [ ]:
fcsv_pref = os.path.join(data_dir, 'love.csv')
#pref_entity.to_csv(fcsv_pref, sep=';', index=False)

In [ ]:
with open(fcsv_pref, 'w') as fcsv:
    fcsv.write(';'.join(pref_columns) + '\n')
    for line in lines: fcsv.write(line + '\n')

In [ ]:
pd.read_csv(fcsv_pref, sep=';').head()