I'll be using pandas to clean and structure the data and bokeh for visualization.
In [1]:
import pandas as pd
from bokeh.charts import Donut, HeatMap, Histogram, Line, Scatter, show, output_notebook, output_file
from bokeh.plotting import figure
In [2]:
output_notebook()
Get the album list from the xls file into a pandas dataframe
In [3]:
album_list = pd.read_excel('albumlist.xls')
In [4]:
album_list.dtypes
Out[4]:
In [5]:
album_list.head()
Out[5]:
The Genre and Subgenre categories have multiple comma separated values. I'm going to keep just the first value and drop the others for the category.
In [13]:
#efficent method to do the same thing commented below
for lab, row in album_list.iterrows():
album_list.loc[lab, "Genre_Refined"] = row["Genre"].split(',')[0]
album_list.loc[lab, "Subgenre_Refined"] = row["Subgenre"].split(',')[0]
In [14]:
#add Genre_Refined column by selecting just the first value
#album_list['Genre_Refined'] = album_list['Genre']
#for i in range(len(album_list)):
#album_list['Genre_Refined'][i] = album_list['Genre'][i].split(',')[0]
In [15]:
#add Subgenre_Refined column by selecting just the first value
#album_list['Subgenre_Refined'] = album_list['Subgenre']
#for i in range(len(album_list)):
#album_list['Subgenre_Refined'][i] = album_list['Subgenre'][i].split(',')[0]
In [16]:
album_list.head()
Out[16]:
In [17]:
album_list.dtypes
Out[17]:
In [18]:
#get a count of how many times each artist made it into the list
artists_count = album_list.groupby(['Artist'], as_index=False).count()
In [19]:
#find the top 10 artists
top_artists = artists_count.sort_values(by='Number', ascending=False).head(10)
top_artists = top_artists.reset_index().drop(['index', 'Album','Year','Genre','Subgenre', 'Genre_Refined', 'Subgenre_Refined'], axis=1)
top_artists.head(10)
Out[19]:
In [20]:
#get the artists and corresponding counts into two lists for plotting
top_artists_list = top_artists.Artist.values.tolist()
top_artists_count = top_artists.Number.astype(float).values.tolist()
In [21]:
#visualize the data using bokeh
#output_file("top_artists.html", title="top artists")
p = figure(x_range=top_artists_list, plot_height = 500, plot_width = 500)
#set x-axis properties
p.xgrid.visible = False
p.xaxis.major_label_orientation = 3.14/4
p.xaxis.axis_label = 'Artist Name'
#set y-axis properties
p.ygrid.visible = False
p.yaxis.axis_label = 'Album Count'
#draw circles
p.circle(y=top_artists_count, x=top_artists_list, size=15, fill_color="black")
show(p)
In [22]:
#get count of albums in each year
yearwise_albums = album_list.groupby(['Year'], as_index=False).count()
yearwise_albums = yearwise_albums.sort_values(by='Year').reset_index().drop(['index', 'Album', 'Artist','Genre','Subgenre','Subgenre_Refined','Genre_Refined'], axis=1)
yearwise_albums.head()
Out[22]:
In [23]:
#visulaizing the data using bokeh line graphs
#output_file("yearwise_albums.html", title="yearwise_albums")
line = Line(data=yearwise_albums, x='Year', y='Number')
line.yaxis.axis_label = 'Number of Albums'
show(line)
In [24]:
#pivot the data and get a subset of the pivoted data where each subgenre has a count of more than 5
pivoted = pd.pivot_table(album_list, index=['Genre_Refined', 'Subgenre_Refined'], values=['Number'], aggfunc='count')
pivoted_subset = pivoted[pivoted['Number'] > 5]
pivoted_subset = pivoted_subset.reset_index()
pivoted_subset
Out[24]:
In [25]:
#visualizing the data using the bokeh donut chart
#output_file("donut.html", title="donut")
from bokeh.palettes import Purples9 as palette1
palette1 = palette1[::-1]
d = Donut(pivoted_subset, label=['Genre_Refined', 'Subgenre_Refined'], values='Number',
text_font_size='15pt', plot_height=1000, plot_width=1000, palette=palette1)
show(d)
In [26]:
#getting yearwise data for each genre
yearwise_data = album_list.groupby(['Year', 'Genre_Refined'], as_index=False).count()
yearwise_data = yearwise_data.sort_values(by='Year').reset_index().drop(['index', 'Album', 'Artist','Genre','Subgenre','Subgenre_Refined'], axis=1)
yearwise_data.head(25)
Out[26]:
In [27]:
#visualizing the data using a bokeh heatmap
#output_file("yearwise_genre.html", title="yearwise_subgenre")
from bokeh.palettes import Reds9 as palette2
palette2 = palette2[::-1]
hm_year = HeatMap(yearwise_data, x='Year', y='Genre_Refined', values='Number', stat=None,
width=750, plot_height=500, palette=palette2)
#y-axis properties
hm_year.yaxis.axis_label = 'Genre'
hm_year.yaxis.major_label_orientation = 'horizontal'
show(hm_year)
In [28]:
#count subgenres yearwise and subset it for rock music
yearwise_subgenres = album_list.groupby(['Year', 'Genre_Refined', 'Subgenre_Refined'], as_index=False).count()
rock_subgenres_yearwise = yearwise_subgenres[yearwise_subgenres['Genre_Refined'] == 'Rock'].reset_index().drop(['index', 'Album', 'Artist','Genre','Subgenre'], axis=1)
rock_subgenres_yearwise.head()
Out[28]:
In [29]:
#visualizing the data using bokeh scatterplot
#output_file("rock_subgenres_yearwise.html", title="rock_subgenres_yearwise")
hm_rock_subgenres = Scatter(rock_subgenres_yearwise, x='Year', y='Subgenre_Refined', width=800, plot_height=800)
#x-axis properties
hm_rock_subgenres.xgrid.visible = False
#y-axis properties
hm_rock_subgenres.yaxis.major_label_orientation = 'horizontal'
hm_rock_subgenres.yaxis.axis_label = 'Subgenres of Rock'
hm_rock_subgenres.ygrid.visible = False
show(hm_rock_subgenres)
In [30]:
#top 10 albums
top_albums = album_list.head(10)
#Get artists and albums into a new data frame
top_albums_a = top_albums['Artist']
top_albums_b = top_albums['Album']
top_albums_final = pd.concat([top_albums_a, top_albums_b], axis=1)
#groupby and summarize
top_albums_chart = top_albums_final.groupby(['Artist', 'Album']).count()
top_albums_chart
Out[30]:
In [ ]: