Convert the 30Music dataset from the idomaar format to CSV.
In [ ]:
import os, sys, csv, json#, gc
import numpy as np
import pandas as pd
In [ ]:
COLUMNS = ['Type', 'ID', 'Timestamp', 'Properties', 'LinkedEntities']
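Each line of an idomaar file carries an entity type, a numeric ID, a timestamp (-1 when unused), a JSON-like Properties field and a JSON-like LinkedEntities field, separated by TABs. A minimal sketch with an invented sample line, just to show how a raw record maps onto the columns above:
In [ ]:
# Synthetic idomaar record; the values are invented for illustration only.
sample = 'album\t31\t-1\t{"MBID":"00000000-0000-0000-0000-000000000000","title":"Some Album"}\t{}'
dict(zip(COLUMNS, sample.split('\t')))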
In [ ]:
data_dir = 'data/30music'
In [ ]:
falbums = os.path.join(data_dir, 'albums.idomaar')
In [ ]:
#with open(falbums) as tsvin:
#    tsvin = csv.reader(tsvin, delimiter='\t')
#    for row in tsvin:
#        if len(row[4]) > 2:
#            print(row)
In [ ]:
albums = pd.read_csv(falbums, delimiter='\t', header=None)
In [ ]:
albums.columns = COLUMNS
In [ ]:
albums.head()
In [ ]:
albums[albums['Timestamp'] != -1]
In [ ]:
albums[albums['LinkedEntities'] != '{}']
In [ ]:
albums.set_index('ID', inplace=True)
In [ ]:
albums.head()
Deal with illegally formatted JSON.
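The Properties strings that break json.loads appear to contain unescaped double quotes inside the title (the skipped cell below scans for them, and a real case, album 708, is inspected afterwards). A made-up string illustrating the kind of breakage:
In [ ]:
# Invented example of a Properties value with stray unescaped quotes; json.loads rejects it.
bad = '{"MBID":"", "title":"The "Best" Album"}'
try:
    json.loads(bad)
except ValueError as e:
    print('malformed:', e)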
In [ ]:
%%script false
cnt = 0
for ix in albums.index:
    try:
        prop = json.loads(albums.loc[ix, 'Properties'])
        aa = (prop['MBID'], prop['title'])
    except:
        cnt += 1
        #print(ix)
print(cnt)
In [ ]:
def parse_properties(props, debug=False):
    props = props.replace(';', ',')
    try:
        prop = json.loads(props)
    except:
        # deal with duplicated " in json string
        props = props.replace('"title":', '"TITLE":').replace('"', '').replace('\\', '')\
                     .replace('MBID:', '"MBID":"').replace(', TITLE:', '", "title":"').replace('}', '"}')
        if debug is True: print(props)
        prop = json.loads(props)
    return pd.Series({'MBID': prop['MBID'], 'Title': prop['title']})
In [ ]:
props = albums.loc[708, 'Properties']
props
In [ ]:
#json.loads(props) # causes exception
In [ ]:
parse_properties(props)
In [ ]:
prop_df = albums['Properties'].apply(lambda s: parse_properties(s, debug=False))
In [ ]:
prop_df.head()
In [ ]:
#albums = albums.merge(prop_df, left_index=True, right_index=True)
In [ ]:
#albums.drop(['Type', 'Timestamp', 'Properties', 'LinkedEntities'], axis=1, inplace=True)
In [ ]:
fcsv_falbums = os.path.join(data_dir, 'albums.csv')
prop_df.to_csv(fcsv_falbums, sep=';')
In [ ]:
pd.read_csv(fcsv_falbums, index_col='ID', sep=';').head()
In [ ]:
fartists = os.path.join(data_dir, 'persons.idomaar')
In [ ]:
artists = pd.read_csv(fartists, delimiter='\t', header=None)
In [ ]:
artists.columns = COLUMNS
artists.head()
In [ ]:
artists.set_index('ID', inplace=True)
In [ ]:
artists[artists['Timestamp'] != -1]
In [ ]:
artists[artists['LinkedEntities'] != '{}']
In [ ]:
artists.drop(['Timestamp', 'LinkedEntities'], axis=1, inplace=True)
In [ ]:
print(artists.shape)
artists.head()
In [ ]:
def parse_artist_properties(props):
    props = props.replace(';', ',')
    try:
        prop = json.loads(props)
    except:
        print(props)
        sys.exit(0)
    return pd.Series({'MBID': prop['MBID'], 'Name': prop['name']})
In [ ]:
artists_prop = artists['Properties'].apply(lambda s: parse_artist_properties(s))
In [ ]:
#artists_prop = artists['Properties'].apply(lambda s: \
#    pd.Series({'MBID': json.loads(s)['MBID'], 'Name': json.loads(s)['name']}))
In [ ]:
#set(list(artists.index)) - set(list(artists_prop.index))
In [ ]:
print(artists_prop.shape)
artists_prop.head()
In [ ]:
#artists_prop['Type'] = 'person'
#artists_prop.head()
In [ ]:
#artists.sort_index(inplace=True)
In [ ]:
#artists = artists.merge(artists_prop, left_index=True, right_index=True)
#print(artists.shape)
Deal with duplicates.
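An essentially equivalent, more concise alternative would be a groupby that keeps the longest non-null MBID/Name per artist ID, which is the same rule the loop below applies row by row. A sketch on made-up data (not used here):
In [ ]:
# Sketch of a vectorized alternative: for each ID keep the longest non-null value per column.
def longest(series):
    vals = series.dropna().astype(str)
    return max(vals, key=len) if len(vals) > 0 else None

# With the real data this would be artists_prop.groupby(level=0).agg({'MBID': longest, 'Name': longest});
# the toy frame below only illustrates the behaviour.
toy = pd.DataFrame({'MBID': [None, 'abc-123', 'abc'], 'Name': ['Foo', 'Foo (live)', None]},
                   index=pd.Index([1, 1, 1], name='ID'))
toy.groupby(level=0).agg({'MBID': longest, 'Name': longest})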
In [ ]:
print(artists.shape[0])
print(artists.index.unique().shape[0])
In [ ]:
artists_prop.reset_index(inplace=True)
In [ ]:
artists_prop['Duplicated'] = artists_prop['ID'].duplicated(keep='first')
In [ ]:
artists_nodup = artists_prop[artists_prop['Duplicated'] == False].copy()
artists_dup = artists_prop[artists_prop['Duplicated'] == True].copy()
In [ ]:
artists_nodup.set_index('ID', inplace=True)
In [ ]:
print(artists_dup.shape[0])
artists_dup.head()
In [ ]:
cnt = 0
print(artists_dup.shape[0])
for ix in artists_dup.index:
    artist_id = artists_dup.loc[ix, 'ID']
    mbid = artists_dup.loc[ix, 'MBID']; mbid = str(mbid) if mbid is not None else mbid
    name = artists_dup.loc[ix, 'Name']; name = str(name) if name is not None else name
    # update existing artist info
    mbid0 = artists_nodup.loc[artist_id, 'MBID']; mbid0 = str(mbid0) if mbid0 is not None else mbid0
    name0 = artists_nodup.loc[artist_id, 'Name']; name0 = str(name0) if name0 is not None else name0
    if mbid is not None:
        if mbid0 is None or len(mbid) > len(mbid0):
            artists_nodup.loc[artist_id, 'MBID'] = mbid
    if name is not None:
        if name0 is None or len(name) > len(name0):
            artists_nodup.loc[artist_id, 'Name'] = name
    cnt += 1
    if cnt % 100 == 0:
        sys.stdout.write('\r%d' % (cnt))
        sys.stdout.flush()
In [ ]:
artists_nodup.drop('Duplicated', axis=1, inplace=True)
In [ ]:
print(artists_nodup.shape[0])
print(artists_nodup.index.unique().shape[0])
artists_nodup.head()
In [ ]:
fcsv_artists = os.path.join(data_dir, 'persons.csv')
artists_nodup.to_csv(fcsv_artists, sep=';')
In [ ]:
pd.read_csv(fcsv_artists, index_col='ID', sep=';').head()
In [ ]:
fusers = os.path.join(data_dir, 'users.idomaar')
In [ ]:
users = pd.read_csv(fusers, header=None, delimiter='\t')
In [ ]:
users.columns = COLUMNS[:-1]
users.set_index('ID', inplace=True)
users.head()
In [ ]:
def parse_user_properties(props):
    props = props.replace(';', ',')
    try:
        prop = json.loads(props)
    except:
        props = props.replace('""', 'null').replace(':,', ':null,')
        try: prop = json.loads(props)
        except: print(props); sys.exit(0)
    return pd.Series({'Username': prop['lastfm_username'],
                      'Gender': str.upper(prop['gender']) if prop['gender'] is not None else None,
                      'Age': prop['age'],
                      'Country': prop['country'],
                      'Playcount': prop['playcount'],
                      '#Playlists': prop['playlists'],
                      'Subscribertype': prop['subscribertype']})
In [ ]:
user_prop = users['Properties'].apply(lambda s: parse_user_properties(s))
In [ ]:
user_prop.shape
In [ ]:
users.drop(['Type', 'Properties'], axis=1, inplace=True)
In [ ]:
users = users.merge(user_prop, left_index=True, right_index=True)
print(users.shape)
In [ ]:
users.head()
In [ ]:
fcsv_users = os.path.join(data_dir, 'users.csv')
users.to_csv(fcsv_users, sep=';')
In [ ]:
pd.read_csv(fcsv_users, index_col='ID', sep=';').head()
In [ ]:
ftags = os.path.join(data_dir, 'tags.idomaar')
In [ ]:
tags = pd.read_csv(ftags, header=None, delimiter='\t')
In [ ]:
tags.columns = COLUMNS
tags.set_index('ID', inplace=True)
tags.head()
In [ ]:
tags[tags['Timestamp'] != -1]
In [ ]:
tags[tags['LinkedEntities'] != '{}']
In [ ]:
tags.drop(['Type', 'Timestamp', 'LinkedEntities'], axis=1, inplace=True)
In [ ]:
print(tags.shape)
tags.head()
In [ ]:
tags.loc[58983, 'Properties']
In [ ]:
def parse_tag_properties(props):
    props = props.replace(';', ',')
    try:
        prop = json.loads(props)
    except:
        props = props.replace('u"', '').replace('\\', '').replace('\\n', '')\
                     .replace('"value":', 'VALUE:').replace('"url":', 'URL:').replace('"', '')\
                     .replace('VALUE:', '"value":"').replace(', URL:', '", "url":"').replace('}', '"}')
        try: prop = json.loads(props)
        except: print(props); sys.exit(0)
    return pd.Series({'Value': prop['value'].replace('\n', ''), 'URL': prop['url']})
In [ ]:
tags_prop = tags['Properties'].apply(lambda s: parse_tag_properties(s))
In [ ]:
print(tags_prop.shape)
tags_prop.head()
In [ ]:
tags_prop.loc[230795, 'Value']
In [ ]:
fcsv_tags = os.path.join(data_dir, 'tags.csv')
tags_prop.to_csv(fcsv_tags, sep=';')
In [ ]:
pd.read_csv(fcsv_tags, index_col='ID', sep=';').head()
NOTE: the tracks data contains duplicated lines (the same track ID with possibly different information); this needs to be handled.
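The handling below relies on pandas' duplicated(keep='first'), which flags every repeated ID except its first occurrence. A toy illustration on made-up rows:
In [ ]:
# Toy illustration only: the second occurrence of ID 170 is marked True, the rest False.
toy = pd.DataFrame({'ID': [170, 170, 171], 'Name': ['a', 'b', 'c']})
toy['ID'].duplicated(keep='first')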
In [ ]:
ftracks = os.path.join(data_dir, 'tracks.idomaar')
In [ ]:
tracks = pd.read_csv(ftracks, header=None, delimiter='\t')
In [ ]:
tracks.columns = COLUMNS
#tracks.set_index('ID', inplace=True)  # there are duplicates
tracks.head()
In [ ]:
tracks[tracks['Timestamp'] != -1]
In [ ]:
tracks.drop(['Type', 'Timestamp'], axis=1, inplace=True)
Check for duplicates.
In [ ]:
tracks['ID'].unique().shape
In [ ]:
print(tracks.shape)
Deal with duplicates.
In [ ]:
tracks['Duplicated'] = tracks['ID'].duplicated(keep='first')
tracks.head()
In [ ]:
tracks[tracks['ID'] == 170]
In [ ]:
fnodup = os.path.join(data_dir, 'tracks.nodup')
fdup = os.path.join(data_dir, 'tracks.dup')
In [ ]:
tracks[tracks['Duplicated'] == False][COLUMNS].to_csv(fnodup, quoting=csv.QUOTE_NONE, sep='\t', header=False, index=False)
In [ ]:
tracks[tracks['Duplicated'] == True][COLUMNS].to_csv(fdup, quoting=csv.QUOTE_NONE, sep='\t', header=False, index=False)
Clear large objects from memory.
In [ ]:
#%xdel tracks
#%xdel -n tracks
#gc.collect()
#gc.collect()
# memory usage is still huge after these operations
In [ ]:
#%reset
# this works!
# but any needed modules must be re-imported afterwards
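An alternative that sidesteps the memory problem entirely would be to stream the tracks file in chunks instead of loading it whole; a minimal sketch (not used in this notebook):
In [ ]:
# Sketch only: process tracks.idomaar chunk by chunk so no full-size DataFrame has to be freed later.
n_rows = 0
for chunk in pd.read_csv(ftracks, delimiter='\t', header=None, names=COLUMNS, chunksize=100000):
    n_rows += chunk.shape[0]  # replace with real per-chunk parsing
print(n_rows)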
In [ ]:
prop_columns = ['Duration', 'Playcount', 'MBID', 'Name']
entity_columns = ['ArtistsID', 'AlbumsID', 'TagsID']
In [ ]:
def parse_track_properties(props):
    try:
        props = props.replace(';', ',')
        prop = json.loads(props)
    except:
        print(props)
        sys.exit(0)
    return pd.Series({'Duration': prop['duration'], 'Playcount': prop['playcount'], \
                      'MBID': prop['MBID'], 'Name': prop['name']})
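A quick sanity check of the parser on an invented Properties string (the real rows are parsed in bulk further down):
In [ ]:
# Invented example only; the actual strings come from tracks['Properties'].
parse_track_properties('{"duration": 207; "playcount": 3; "MBID": ""; "name": "Some Track"}')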
In [ ]:
#tracks.loc[0, 'Properties']
In [ ]:
#json.loads(tracks.loc[0, 'LinkedEntities'])
In [ ]:
def parse_track_entities(entities):
    try:
        entities = entities.replace(';', ',')
        entity = json.loads(entities)
    except:
        print(entities)
        sys.exit(0)
    return pd.Series({
        'ArtistsID': ','.join([str(x['id']) for x in entity['artists']]) if \
                     entity['artists'] is not None and len(entity['artists']) > 0 else None,
        'AlbumsID': ','.join([str(x['id']) for x in entity['albums']]) if \
                    entity['albums'] is not None and len(entity['albums']) > 0 else None,
        'TagsID': ','.join([str(x['id']) for x in entity['tags']]) if \
                  entity['tags'] is not None and len(entity['tags']) > 0 else None})
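Similarly for the linked entities; an invented example showing how artist/album/tag IDs are flattened into comma-separated strings:
In [ ]:
# Invented example only; empty or missing lists become None.
parse_track_entities('{"artists": [{"type": "artist"; "id": 5}]; "albums": []; "tags": null}')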
In [ ]:
#tracks.loc[0, 'LinkedEntities']
In [ ]:
#parse_track_entities(tracks.loc[0, 'LinkedEntities'])
Parse non-duplicated tracks and save them to a CSV file.
In [ ]:
tracks_nodup = tracks[tracks['Duplicated'] == False]
lines = []
cnt = 0
print(tracks_nodup.shape[0])
for ix in tracks_nodup.index:
    track_id = tracks_nodup.loc[ix, 'ID']
    props = parse_track_properties(tracks_nodup.loc[ix, 'Properties'])
    entities = parse_track_entities(tracks_nodup.loc[ix, 'LinkedEntities'])
    # add new track record
    #tracks_df.ID = track_id
    #tracks_df.loc[track_id, prop_columns] = props
    #tracks_df.loc[track_id, entity_columns] = entities
    line = [str(track_id)]
    for prop in prop_columns: line.append(str(props[prop]) if props[prop] is not None else '')
    for entity in entity_columns: line.append(str(entities[entity]) if entities[entity] is not None else '')
    lines.append(';'.join(line))
    cnt += 1
    if cnt % 1000 == 0:
        sys.stdout.write('\r%d' % (cnt))
        sys.stdout.flush()
In [ ]:
fcsv_tracks = os.path.join(data_dir, 'tracks.csv')
In [ ]:
with open(fcsv_tracks, 'w') as fcsv:
    for line in lines: fcsv.write(line + '\n')
In [ ]:
tracks_df = pd.read_csv(fcsv_tracks, sep=';', keep_default_na=False, header=None)
Deal with duplicates.
In [ ]:
tracks_df.columns = ['ID'] + prop_columns + entity_columns
tracks_df.set_index('ID', inplace=True)
In [ ]:
print(tracks_df.shape[0])
print(tracks_df.index.unique().shape[0])
tracks_df.head()
In [ ]:
#tracks_df['Duration'] = tracks_df['Duration'].astype(float)
#tracks_df['Playcount'] = tracks_df['Playcount'].astype(float)
In [ ]:
cnt = 0
tracks_dup = tracks[tracks['Duplicated'] == True]
print(tracks_dup.shape[0])
for ix in tracks_dup.index:
    track_id = tracks_dup.loc[ix, 'ID']
    props = parse_track_properties(tracks_dup.loc[ix, 'Properties'])
    entities = parse_track_entities(tracks_dup.loc[ix, 'LinkedEntities'])
    # update existing track
    # ['Duration', 'Playcount', 'MBID', 'Name']
    duration = tracks_df.loc[track_id, 'Duration']
    playcount = tracks_df.loc[track_id, 'Playcount']
    if type(duration) == str:
        try: duration = int(duration)
        except: duration = -1
    if type(playcount) == str:
        try: playcount = int(playcount)
        except: playcount = -1
    mbid = tracks_df.loc[track_id, 'MBID']; mbid = str(mbid) if mbid is not None else mbid
    name = tracks_df.loc[track_id, 'Name']; name = str(name) if name is not None else name
    if type(mbid) == float: print(track_id)
    if props['Duration'] is not None:
        if duration is None or props['Duration'] > duration:
            tracks_df.loc[track_id, 'Duration'] = props['Duration']
    if props['Playcount'] is not None:
        if playcount is None or props['Playcount'] > playcount:
            tracks_df.loc[track_id, 'Playcount'] = props['Playcount']
    if props['MBID'] is not None:
        if mbid is None or len(props['MBID']) > len(mbid):
            tracks_df.loc[track_id, 'MBID'] = props['MBID']
    if props['Name'] is not None:
        if name is None or len(props['Name']) > len(name):
            tracks_df.loc[track_id, 'Name'] = props['Name']
    # ['ArtistsID', 'AlbumsID', 'TagsID']
    aid = tracks_df.loc[track_id, 'ArtistsID']; aid = str(aid) if aid is not None else aid
    bid = tracks_df.loc[track_id, 'AlbumsID']; bid = str(bid) if bid is not None else bid
    tid = tracks_df.loc[track_id, 'TagsID']; tid = str(tid) if tid is not None else tid
    if entities['ArtistsID'] is not None:
        if aid is None or len(str(entities['ArtistsID'])) > len(aid):
            tracks_df.loc[track_id, 'ArtistsID'] = entities['ArtistsID']
    if entities['AlbumsID'] is not None:
        if bid is None or len(str(entities['AlbumsID'])) > len(bid):
            tracks_df.loc[track_id, 'AlbumsID'] = entities['AlbumsID']
    if entities['TagsID'] is not None:
        if tid is None or len(str(entities['TagsID'])) > len(tid):
            tracks_df.loc[track_id, 'TagsID'] = entities['TagsID']
    cnt += 1
    if cnt % 100 == 0:
        sys.stdout.write('\r%d' % (cnt))
        sys.stdout.flush()
In [ ]:
print(tracks['ID'].unique().shape[0])
print(tracks_df.shape[0])
tracks_df.head()
In [ ]:
tracks_df.to_csv(fcsv_tracks, sep=';')
In [ ]:
pd.read_csv(fcsv_tracks, sep=';', keep_default_na=False).head()
In [ ]:
fplaylist = os.path.join(data_dir, 'playlist.idomaar')
In [ ]:
playlist = pd.read_csv(fplaylist, header=None, delimiter='\t')
In [ ]:
playlist.columns = COLUMNS
print(playlist.shape)
playlist.head()
In [ ]:
playlist.drop(['Type', 'ID'], axis=1, inplace=True)
In [ ]:
playlist.head()
In [ ]:
def parse_playlist_properties(props):
    props = props.replace(';', ',')
    try:
        prop = json.loads(props)
    except:
        props = props.replace('\\', '').replace('Title', 'TITLE').replace('numtracks', 'NUMTRACKS')\
                     .replace('"duration":', 'DURATION:').replace('"', '')\
                     .replace('ID:', '"ID":').replace('TITLE:', '"Title":"').replace(',NUMTRACKS:', '","numtracks":')\
                     .replace('DURATION:', '"duration":')
        try: prop = json.loads(props)
        except: print(props); sys.exit(0)
    return pd.Series({'ID': prop['ID'], 'Title': prop['Title'], '#Tracks': prop['numtracks'],
                      'Duration': prop['duration']})
In [ ]:
#playlist.loc[0, 'Properties']
In [ ]:
#parse_playlist_properties(playlist.loc[0, 'Properties'])
In [ ]:
def parse_playlist_entities(entities):
    entities = entities.replace(';', ',')
    try:
        entity = json.loads(entities.replace('[[]]', '[]'))
    except:
        try: entity = json.loads(entities)
        except: print(entities); sys.exit(0)
    return pd.Series({
        'UserID': ','.join([str(x['id']) for x in entity['subjects']]) if len(entity['subjects']) > 0 else None,
        'TracksID': ','.join([str(x['id']) for x in entity['objects']]) if len(entity['objects']) > 0 else None})
In [ ]:
#playlist.loc[0, 'LinkedEntities']
In [ ]:
#parse_playlist_entities(playlist.loc[0, 'LinkedEntities'])#['TracksID']
In [ ]:
prop_columns = ['ID', 'Title', '#Tracks', 'Duration']
entity_columns = ['UserID', 'TracksID']
In [ ]:
for col in prop_columns: playlist[col] = None
for col in entity_columns: playlist[col] = None
In [ ]:
cnt = 0
for ix in playlist.index:
    playlist.loc[ix, prop_columns] = parse_playlist_properties(playlist.loc[ix, 'Properties'])
    playlist.loc[ix, entity_columns] = parse_playlist_entities(playlist.loc[ix, 'LinkedEntities'])
    cnt += 1
    if cnt % 100 == 0:
        sys.stdout.write('\r%d' % (cnt))
        sys.stdout.flush()
In [ ]:
playlist.drop(['Properties', 'LinkedEntities'], axis=1, inplace=True)
In [ ]:
print(playlist.shape)
playlist.head()
Set index.
In [ ]:
playlist[playlist['UserID'].str.contains(',')]
In [ ]:
playlist['ID'].unique().shape
In [ ]:
playlist.set_index('ID', inplace=True)
Remove empty playlists (those with 0 tracks).
In [ ]:
print(playlist[playlist['#Tracks'] == 0].shape)
playlist[playlist['#Tracks'] == 0].head()
In [ ]:
playlist = playlist[playlist['#Tracks'] > 0]
playlist.shape
In [ ]:
fcsv_playlist = os.path.join(data_dir, 'playlist.csv')
playlist.to_csv(fcsv_playlist, sep=';')
In [ ]:
pd.read_csv(fcsv_playlist, index_col='ID', sep=';').head()
Histogram of playlist lengths (i.e., the number of tracks per playlist).
In [ ]:
%matplotlib inline
playlist['#Tracks'].hist()
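If the length distribution turns out to be heavily skewed, a log-scaled count axis may be easier to read; a small follow-up sketch (assumes the cell above has been run):
In [ ]:
# Same histogram with a logarithmic count axis.
ax = playlist['#Tracks'].hist(bins=50)
ax.set_yscale('log')
ax.set_xlabel('#Tracks')
ax.set_ylabel('#Playlists (log scale)')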
In [ ]:
fpref = os.path.join(data_dir, 'love.idomaar')
In [ ]:
pref = pd.read_csv(fpref, header=None, delimiter='\t')
In [ ]:
print(pref.shape)
pref.head()
Deal with badly separated columns (the separator should be TAB, but the last 3 columns are not properly separated here).
Check Type values.
In [ ]:
pref[pref[0] != 'preference'].shape[0]
Check Timestamp and Properties values.
In [ ]:
pref[pref[2].str.startswith('-1 {"value":"love"}')].shape[0] == pref.shape[0]
In [ ]:
pref.columns = ['Type', 'ID', 'LinkedEntities']
pref.drop('Type', axis=1, inplace=True)
pref.head()
In [ ]:
pref.loc[0, 'LinkedEntities']
In [ ]:
def parse_pref_entity(entities):
    entities = entities.replace('-1 {"value":"love"}', '').replace(';', ',')
    try:
        entity = json.loads(entities)
    except:
        try: entity = json.loads(entities)
        except: print(entities); sys.exit(0)
    return pd.Series({
        'UserID': ','.join([str(x['id']) for x in entity['subjects']]) if len(entity['subjects']) > 0 else None,
        'TrackID': ','.join([str(x['id']) for x in entity['objects']]) if len(entity['objects']) > 0 else None})
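A quick check of the parser on an invented record (the real rows are streamed through it in the loop below):
In [ ]:
# Invented example only; the '-1 {"value":"love"}' prefix is stripped before parsing.
parse_pref_entity('-1 {"value":"love"} {"subjects": [{"type": "user"; "id": 42}]; "objects": [{"type": "track"; "id": 7}]}')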
In [ ]:
#pref_entity = pref['LinkedEntities'].apply(lambda s: parse_pref_entity(s)) # use too much memory
In [ ]:
pref_columns=['UserID', 'TrackID']
In [ ]:
#pref_entity = pd.DataFrame(columns=pref_columns)
In [ ]:
lines = []
ix = 0
print(pref.shape[0])
with open(fpref) as tsvin:
    tsvin = csv.reader(tsvin, delimiter='\t')
    for row in tsvin:
        assert(len(row) == 3)
        #pref_entity.loc[ix] = parse_pref_entity(row[2])
        prefs = parse_pref_entity(row[2])
        lines.append(';'.join([str(prefs[col]) for col in pref_columns]))
        ix += 1
        if ix % 1000 == 0:
            sys.stdout.write('\r%d' % (ix))
            sys.stdout.flush()
In [ ]:
fcsv_pref = os.path.join(data_dir, 'love.csv')
#pref_entity.to_csv(fcsv_pref, sep=';', index=False)
In [ ]:
with open(fcsv_pref, 'w') as fcsv:
    fcsv.write(';'.join(pref_columns) + '\n')
    for line in lines: fcsv.write(line + '\n')
In [ ]:
pd.read_csv(fcsv_pref, sep=';').head()