In [1]:
import pandas as pd
import numpy as np
import itertools

In [2]:
!ls


August 2015_2.pdf   Exercise_3.ipynb    artistToGaguiel.csv

In [3]:
#Load file into rows of User,Artist,Freq
rows = []
with open("artistToGaguiel.csv") as fh:
    for line in fh:
        user,artists,freqs = line.rstrip("\n").split("\t")
        user = user.strip('"')
        artists = artists.strip('"').split(";")
        freqs = freqs.strip('"').split(";")
        for art,freq in zip(artists,freqs):
            rows.append( (user,art,freq) )
            
#Transform data into a dataframe
df = pd.DataFrame(rows,columns=["user","artist","freq"])
#Verify data was loaded properly
df.head()


Out[3]:
user artist freq
0 00000c289a1829a808ac09c00daf10bc3c4e223b betty blowtorch 2137
1 00000c289a1829a808ac09c00daf10bc3c4e223b die Ärzte 1099
2 00000c289a1829a808ac09c00daf10bc3c4e223b melissa etheridge 897
3 00000c289a1829a808ac09c00daf10bc3c4e223b elvenking 717
4 00000c289a1829a808ac09c00daf10bc3c4e223b juliette & the licks 706

Question 1: top 10 users listening to the beatles


In [4]:
df[df["artist"] == "the beatles"].sort("freq",ascending=False).head(10)


Out[4]:
user artist freq
505035 077621e8f30383c871cb7ff31c99e20478e63fac the beatles 998
682765 0a18b5594d8aa92e6927ce005945eca7db8deb7f the beatles 997
261098 03e747bacc61d5dcce576a43533eb7a7138c8f0f the beatles 996
847495 0c91bd8ce6160cc49661f4582ac465d36ff9034d the beatles 995
280284 0434d80d4541e0c82a9c441c9b3c23b105d8f79e the beatles 992
607344 08fe489cc740366bbc693b888ac4ea1ddbaf6382 the beatles 991
966585 0e4c00f3b963cc1b37745dbf5a1dde553c7235b5 the beatles 99
66891 00f47faf418dfaade4bb0abb4fc86e2a827cd76b the beatles 99
261948 03eb8a9e3b1100945e38d611bd76113dd1e9642e the beatles 99
76458 0119d17a6fed9a662bea8f0f38a279729fc0a5b3 the beatles 99

In [5]:
df[["artist","freq"]].groupby("artist",sort=False).count().sort("freq",ascending=False).head(1)


Out[5]:
freq
artist
radiohead 4348

Question 3: Two users with the most bands


In [6]:
df[["user","artist"]].groupby("user").count().sort("artist",ascending=False).head(2)


Out[6]:
artist
user
0a7c493718902fe4590cad2f76db9abb6dce6fae 131
0701715a7c6c6bc35036ad5fa7d89ddfe6691c95 114