By David Taylor, http://www.prooffreader.com
Note that all timings listed here are on my cheapo Windows laptop, so chances are you'll do at least as well.
In [1]:
%%time
import pandas as pd
import re
import os
import urllib
import rarfile # not part of standard distro
import glob
import numpy as np
import matplotlib.pyplot as plt
from difflib import SequenceMatcher
from collections import Counter
%matplotlib inline
In [2]:
%%time
# download and extract charts.rar
# note that this archive is periodically updated
# you can skip this cell if you manually download http://bullfrogspond.com/charts.rar
# and put it in the same directory as this notebook
# if UnRAR.exe is not in your PATH, point rarfile at it (change to your own path):
rarfile.UNRAR_TOOL = r"C:\Program Files\WinRAR\UnRAR.exe"
urllib.urlretrieve('http://bullfrogspond.com/charts.rar', 'charts.rar')
with rarfile.RarFile('charts.rar') as rf:
    for member in rf.infolist():
        rf.extract(member)
In [3]:
%%time
# create dataframe from .xls file and manipulate it
# use most recent .xls file in case more than one is in directory, i.e.
# you've downloaded and extracted charts.rar on different dates, after
# it's been updated
globlist = glob.glob('*.xls')
globlist.sort()
filename = globlist[-1]
# read excel file into pandas dataframe. it's a huge file, but only four columns are required.
df_tracks = pd.read_excel(filename, sheetname="\"Pop Annual\"", parse_cols='A,B,K,Q')
print "ORIGINAL DATAFRAME:"
print df_tracks.head()
In [4]:
%%time
# The Yearly Rank column has some alphabetic data, e.g. 95a, 95b
# This is sometimes multiple releases from the same artist, which we
# wish to keep, and sometimes Parts 1 and 2 of the same track,
# which we don't.
# Some Yearly Ranks are n/a, which we will change to 999 to avoid NaNs
# (No year has over 998 entries)
# BTW, we use 'ranked' instead of 'rank' as the column name because
# 'rank' is already a DataFrame method in pandas
# Add a column ranked as float, with 0.1 added for a, 0.2 added for b, etc.
# while we're at it, change all column names to lower case with underscores
df_tracks.columns = ['year', 'yearly_rank', 'artist', 'track']
df_tracks['ranked'] = 0.0
def calc_rankfloat(row):
    rank = row.yearly_rank
    if type(rank) != int:
        try:
            try:
                suffix = re.search('([^0-9])', rank).group(1) # extract alphabetic
                assert len(suffix) == 1 # just in case
                rank = float(rank[:-1])
                rank += (ord(suffix) - 96) * 0.1
            except AttributeError:
                # sometimes Yearly Rank looks like an int, but doesn't pass
                # the type test
                rank = float(rank.strip())
        except ValueError:
            rank = 999 # for n/as
    return float(rank)
df_tracks['ranked'] = df_tracks.apply(calc_rankfloat, axis=1)
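# quick sanity check of calc_rankfloat on hypothetical ranks (illustrative,
# not from the chart data): '95a' -> 95.1, '95b' -> 95.2, 'n/a' -> 999.0,
# and a plain int passes straight through
demo = pd.DataFrame({'yearly_rank': ['95a', '95b', 'n/a', 12]})
print demo.apply(calc_rankfloat, axis=1).tolist() # [95.1, 95.2, 999.0, 12.0]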
# calculate the difference in consecutive ranks so we can evaluate cases
# where the difference is less than 0.5, i.e. 82a, 82b which became 82.1, 82.2, etc.
df_tracks.sort(['year', 'ranked'], ascending=True, inplace=True)
df_tracks.reset_index(inplace=True, drop=True)
df_tracks['diff_rank'] = 0.0
for i in range(len(df_tracks)):
    if i == 0:
        df_tracks.diff_rank.iloc[i] = 1
    elif df_tracks.year.iloc[i] != df_tracks.year.iloc[i-1]:
        df_tracks.diff_rank.iloc[i] = 1
    else:
        df_tracks.diff_rank.iloc[i] = df_tracks.ranked.iloc[i] - df_tracks.ranked.iloc[i-1]
# go through the dataframe and find consecutive entries where the difference
# in rank is less than 0.5, i.e. 82a, 82b which became 82.1, 82.2, etc.
# Perform actions according to the following scenarios:
# 1: Artist same, track names similar, tracks contain 'Part 1' and 'Part 2':
#    keep the first entry, with 'Part 1' removed from its title
# 2: Artist same, track names similar:
#    keep the first entry
# Note that 'similar' means SequenceMatcher's ratio is > 0.5
# Note that entries are tagged for deletion by changing the year to 0;
# at the end, all rows with year == 0 are deleted
for i in range(len(df_tracks)):
    if df_tracks.diff_rank.iloc[i] < 0.5 and df_tracks.ranked.iloc[i] != 0:
        diff_rank = df_tracks.diff_rank.iloc[i]
        year = df_tracks.year.iloc[i]
        artist_prev = df_tracks.artist.iloc[i-1]
        artist = df_tracks.artist.iloc[i]
        ranked_prev = df_tracks.ranked.iloc[i-1]
        ranked = df_tracks.ranked.iloc[i]
        track_prev = df_tracks.track.iloc[i-1]
        track = df_tracks.track.iloc[i]
        seq_match = SequenceMatcher(None, track_prev, track).ratio()
        # scenario 1
        if (re.search('[Pp]art 1', track_prev) and
                re.search('[Pp]art 2', track) and
                seq_match > 0.5):
            df_tracks.track.iloc[i-1] = re.sub('[Pp]art 1', '', track_prev)
            df_tracks.year.iloc[i] = 0
        # scenario 2
        elif seq_match > 0.5:
            df_tracks.year.iloc[i] = 0
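# illustrative: the SequenceMatcher ratio for a hypothetical Part 1/Part 2
# pair is well above the 0.5 similarity threshold used above
print SequenceMatcher(None, 'Happy Days Part 1', 'Happy Days Part 2').ratio() # ~0.94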
df_tracks = df_tracks[df_tracks.year != 0] # remove those flagged for removal
# remove duplicate song titles in one year -- before the 1960s, it was
# very common for multiple artists to appear in the Billboard chart with
# the same song at about the same time; this skews the results towards
# these songs. After removal, the highest-ranking version will be kept.
print "Before duplicates removed:"
print df_tracks[(df_tracks.track == 'Mona Lisa') & (df_tracks.year == 1950)]
print ""
df_tracks.drop_duplicates(['track', 'year'], inplace=True)
print "After duplicates removed:"
print df_tracks[(df_tracks.track == 'Mona Lisa') & (df_tracks.year == 1950)]
df_tracks.to_pickle('df_tracks_v1.pickle')
df_tracks.head()
Starting from df_tracks, we will create:

- a wordlist column in df_tracks, containing the cleaned, de-duplicated words of each song title
- df_year, with the count and percentage frequency of each word per year
- df_decade, with the count, percentage frequency and Dunning log-likelihood of each word per decade

Note that the following changes to song titles are performed:

- punctuation (other than apostrophes) is removed
- 'Part 1', 'Part 2', etc. are removed
- titles are lowercased, and duplicate words within a single title are dropped

Note that a very few records with NaN values are removed (fewer than 1 per 10 000 song titles).
In [6]:
# Make some lists and dicts and functions we will use
# in case you start here
df_tracks = pd.read_pickle('df_tracks_v1.pickle')
# derive the decade from the year (e.g. 1957 -> 1950); the rest of the
# notebook groups words by this column
df_tracks['decade'] = (df_tracks.year // 10) * 10
# lists of years and decades in df_tracks
decades = list(df_tracks.decade.unique())
decades.sort()
years = list(df_tracks.year.unique())
years.sort()
# dict comprehensions to create dicts of
# lists of words keyed by decade or by year
# (the lists are initialized empty)
decades_words = {decade: [] for decade in decades}
years_words = {year: [] for year in years}
# Define our log-likelihood function
def loglike(n1, t1, n2, t2):
    """Calculates Dunning log likelihood of an observation of
    frequency n1 in a corpus of size t1, compared to a frequency n2
    in a corpus of size t2. If result is positive, it is more
    likely to occur in corpus 1, otherwise in corpus 2."""
    from numpy import log
    e1 = t1 * 1.0 * (n1 + n2) / (t1 + t2) # expected values
    e2 = t2 * 1.0 * (n1 + n2) / (t1 + t2)
    LL = 2 * ((n1 * log(n1 / e1)) + n2 * (log(n2 / e2)))
    if n2 * 1.0 / t2 > n1 * 1.0 / t1:
        LL = -LL
    return LL
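# illustrative check with made-up counts: a word occurring 30 times in a
# 1,000-word corpus vs. 10 times in a 2,000-word corpus is overrepresented
# in corpus 1, so the result should be positive
print loglike(30, 1000, 10, 2000) # approx. +29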
len_before = len(df_tracks)
df_tracks = df_tracks.dropna()
print "{} NaN-containing tracks dropped; {} remain".format(len_before - len(df_tracks), len(df_tracks))
In [7]:
%%time
# make lists of words per song, per year and per decade
df_tracks['wordlist'] = ''
for idx, row in df_tracks.iterrows():
    track = unicode(row.track)
    track = re.sub('[^A-Za-z0-9 \']', '', track) # remove punctuation except apostrophes
    track = re.sub('[Pp]art [0-9]', '', track)
    track = track.lower()
    words = list(set(track.split())) # removes duplicate words within one song title
    for word in words:
        decades_words[row.decade].append(word)
        years_words[row.year].append(word)
    df_tracks.wordlist[idx] = ' '.join(words)
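# illustrative: how a hypothetical title is cleaned by the steps above
title = re.sub('[Pp]art [0-9]', '', re.sub("[^A-Za-z0-9 ']", '', u'Hound Dog, Part 2!')).lower()
print sorted(set(title.split())) # ['dog', 'hound']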
# create dict of total word counts per decade and per word
decades_count = {decade: len(decades_words[decade]) for decade in decades}
decades_count_max = max(decades_count.values())
years_count = {year: len(years_words[year]) for year in years}
In [8]:
%%time
# create df_year and df_decade dataframes
# 'counted' is raw count (called 'counted' to avoid namespace
# conflict with 'count' method)
dfy_words = []
dfy_years = []
dfy_counts = []
for year in years:
    for word in set(years_words[year]):
        dfy_years.append(year)
        dfy_words.append(word)
        dfy_counts.append(years_words[year].count(word))
df_year = pd.DataFrame({'word': dfy_words, 'year': dfy_years, 'counted': dfy_counts})
def calc_yr_pct(row):
    return row.counted * 100.0 / years_count[row.year]
df_year['pct'] = df_year.apply(calc_yr_pct, axis=1)
dfd_words = []
dfd_decades = []
dfd_counts = []
for decade in decades:
    for word in set(decades_words[decade]):
        dfd_decades.append(decade)
        dfd_words.append(word)
        dfd_counts.append(decades_words[decade].count(word))
df_decade = pd.DataFrame({'word': dfd_words, 'decade': dfd_decades, 'counted': dfd_counts})
def calc_dec_pct(row):
    return row.counted * 100.0 / decades_count[row.decade]
df_decade['pct'] = df_decade.apply(calc_dec_pct, axis=1)
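# aside: the list.count calls above rescan each year's word list once per
# unique word; a sketch of the same per-year tally using collections.Counter
# (already imported) makes a single pass -- equivalent counts, assuming
# years_words as built above
dfy_rows = []
for year in years:
    for word, n in Counter(years_words[year]).iteritems():
        dfy_rows.append({'year': year, 'word': word, 'counted': n})
df_year_alt = pd.DataFrame(dfy_rows)
assert len(df_year_alt) == len(df_year)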
In [9]:
%%time
# add calculated log-likelihood column
decades_pct = {decade: df_decade[df_decade.decade == decade].pct.sum() for decade in decades}
# create dict of total counts and total pct per word
word_counts = {}
for word in df_decade.word.unique():
    word_counts[word] = df_decade[df_decade.word == word].counted.sum()
word_counts_total = sum(decades_count.values())
assert word_counts_total == df_decade.counted.sum()
word_pcts = {}
for word in df_decade.word.unique():
    word_pcts[word] = df_decade[df_decade.word == word].pct.sum()
word_pcts_total = df_decade.pct.sum()
def calc_ll(row):
    return loglike(row.counted,
                   decades_count[row.decade],
                   word_counts[row.word],
                   word_counts_total)
df_decade['loglike'] = df_decade.apply(calc_ll, axis=1)
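# aside: the per-word loops above rescan df_decade once per unique word;
# a groupby sketch builds the same totals in one pass (assumes the df_decade
# layout created above)
word_counts_alt = df_decade.groupby('word').counted.sum().to_dict()
assert word_counts_alt == word_counts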
In [10]:
#pickle all dataframes
df_tracks.to_pickle('df_tracks_v2.pickle')
df_decade.to_pickle('df_decade.pickle')
df_year.to_pickle('df_year.pickle')
In [10]:
# read from pickle in case you start here:
df_tracks = pd.read_pickle('df_tracks_v2.pickle')
df_tracks = df_tracks[['year', 'decade', 'artist', 'track', 'ranked', 'wordlist']]
df_decade = pd.read_pickle('df_decade.pickle')
df_year = pd.read_pickle('df_year.pickle')
In [16]:
df_tracks.tail()
Out[16]:
In [15]:
df_decade.tail()
Out[15]:
In [14]:
df_year.tail()
Out[14]:
In [17]:
df_decade.sort('loglike', ascending=False, inplace=True)
#determine how many rows are needed until each decade is represented
#at least once
from collections import Counter
c = Counter()
decades = list(df_decade.decade.unique())
remaining_decades = list(df_decade.decade.unique())
num_rows = 0
while len(remaining_decades) > 0:
    decade = df_decade.decade.iloc[num_rows]
    c[decade] += 1
    if decade in remaining_decades:
        remaining_decades.remove(decade)
    num_rows += 1
print '{} rows required for each decade to be represented.'.format(num_rows)
print c
In [ ]:
# with this approach, 32 of the 64 words would come from before 1930.
# instead, let's take the top five words for each decade.
In [19]:
import csv
with open('billboard_output.csv', 'wb+') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter='\t',
                           quotechar='"', quoting=csv.QUOTE_MINIMAL)
    decades = range(1890, 2020, 10)
    for decade in decades:
        dftemp = df_decade[df_decade.decade == decade].sort('loglike', ascending=False)
        for i in range(5):
            output = []
            word = dftemp.word.iloc[i]
            keyness = int(dftemp.loglike.iloc[i])
            # regex matches the word only at word boundaries within the
            # space-joined wordlist string
            regex = '(^{0} |^{0}$| {0}$| {0} )'.format(word)
            dftemp2 = df_tracks[(df_tracks.decade == decade) &
                                (df_tracks.wordlist.str.contains(regex))]
            dftemp2.sort(['ranked', 'year'], ascending=True, inplace=True)
            artist = dftemp2.artist.iloc[0]
            track = dftemp2.track.iloc[0]
            year = dftemp2.year.iloc[0]
            print decade, word, keyness, artist, track, year
            output.append(decade)
            output.append(word)
            output.append(keyness)
            output.append(artist)
            output.append(track)
            output.append(year)
            # per-year percentage columns for this word, 1890-2014
            for year in range(1890, 2015):
                dftemp3 = df_year[(df_year.word == word) & (df_year.year == year)]
                if len(dftemp3) > 0:
                    output.append(dftemp3.pct.iloc[0])
                else:
                    output.append(0)
            csvwriter.writerow(output)