In [ ]:
%pylab inline
%pdb
import os
import pickle
from unidecode import unidecode
from sklearn.feature_extraction import DictVectorizer
import yaml
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from organiser import tm_utils
# Global plot styling shared by every figure in this notebook.
# NOTE(review): 'seaborn-deep' is only shipped by some matplotlib versions —
# confirm the style name exists in the target environment.
plt.style.use('seaborn-deep')
# rcParams is not imported explicitly; presumably it is injected by the
# %pylab magic at the top of this cell.
rcParams['figure.figsize'] = 25, 19
rcParams['font.size'] = 24
In [ ]:
from tango_graphs import compare_recdates, plot_similarity, plot_mds, orchestra_plot, singer_plot
from tango_stats import norm, get_mds
In [ ]:
def get_authors(text):
    """Split a raw composer/lyricist tag into unique author names.

    The tag is expected to look like ``'Comp: <A, B> || Lyr: <C>'``. The
    ``Comp``/``Lyr`` markers and angle brackets are stripped and the rest is
    split on ``', '``.

    Parameters
    ----------
    text : str, float or None
        Raw tag. Anything that is not a string (e.g. a float NaN from a
        missing cell, or ``None``) and the placeholder ``'?'`` are treated
        as "unknown".

    Returns
    -------
    numpy.ndarray or None
        Sorted, de-duplicated names, or ``None`` when the author is unknown.
    """
    # Missing cells arrive as float NaN; only real strings can be parsed.
    if not isinstance(text, str) or text == '?':
        return None
    text = text.replace('Comp: <', '')
    text = text.replace('> || Lyr: <', ', ')
    text = text.replace('>', '')
    # np.unique both de-duplicates and sorts the names.
    return np.unique(text.split(', '))
In [ ]:
def composer_table(df):
    """Return a table with one row per individual author of each recording.

    Rows without a ``'Composer'`` value are dropped. The remaining
    ``'Composer'`` tags are parsed with ``get_authors`` and every parsed name
    gets its own row; all other columns are repeated. The input frame is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``'Composer'`` column holding raw author tags.

    Returns
    -------
    pandas.DataFrame
        ``df`` without missing composers, with ``'Composer'`` replaced by a
        single author name per row.
    """
    known = df.dropna(subset=['Composer'])
    # One column per author, then stack() folds them into a long Series and
    # drops the padding NaNs. No extra positional argument to apply(): it
    # would be interpreted as the deprecated ``convert_dtype`` flag.
    authors = known['Composer'].map(get_authors).apply(pd.Series).stack()
    authors.index = authors.index.droplevel(-1)  # to line up with df's index
    authors.name = 'Composer'  # needs a name to join
    return known.drop('Composer', axis=1).join(authors)
In [ ]:
def text_vector(vectorizer, table, orch, field):
    """Average the vectorised ``field`` values of one orchestra.

    Rows of ``table`` whose ``'Director'`` equals ``orch`` are selected, each
    truthy ``field`` value is encoded as ``{field: value}`` through
    ``vectorizer.transform``, and the encoded rows are averaged column-wise.

    NOTE(review): if no row matches, the row count used for the mean is
    zero — confirm callers only pass orchestras present in ``table``.
    """
    selected = table.loc[table['Director'] == orch, field]
    records = [{field: value} for value in selected if value]
    encoded = vectorizer.transform(records)
    n_rows = encoded.shape[0]
    return encoded.sum(axis=0) / n_rows
In [ ]:
# Locate the sibling 'organiser' directory. The quoted "__file__" is a plain
# path component, so realpath() resolves it against the current working
# directory and split()[0] yields that directory.
organiser_dir = os.path.join(os.path.split(os.path.realpath("__file__"))[0], '..', 'organiser')
# Path of the pickled pandas library; raises KeyError when the variable is unset.
pandas_dataframe = os.environ["TM_PANDAS_LIBRARY"]
# safe_load: yaml.load() without an explicit Loader is rejected by current
# PyYAML releases, and safe_load only builds plain Python objects.
# The context manager closes the file handle.
with open(os.path.join(organiser_dir, 'tango_directors.yml')) as directors_file:
    orchestras = yaml.safe_load(directors_file)
In [ ]:
# Set up pandas database
# NOTE: pickle can execute arbitrary code while loading; only point
# TM_PANDAS_LIBRARY at a trusted file.
with open(pandas_dataframe, 'rb') as library_file:
    df = pickle.load(library_file)
# Keep only rows whose Grouping is '+'. Copy so the edits below work on an
# independent frame rather than on a slice of the loaded one.
df = df[df['Grouping'] == '+'].copy()
df = df.rename(columns={'Total Time': 'Time'})
for col in ('Year', 'BPM', 'Rating', 'Time'):
    # Replace the whole column so it actually takes the numeric dtype
    # (an in-place .loc[:, col] assignment may keep the old dtype).
    df[col] = pd.to_numeric(df[col])
In [ ]:
# Include specific years
# range() excludes its stop value, so this keeps 1926 through 1957.
# The trailing comment keeps an alternative, wider window (stop 1976).
year_range = range(1926, 1958) #range(1926, 1976)
df = df[df['Year'].isin(year_range)]
In [ ]:
# Build df_sing: one row per individual singer, restricted to recordings that
# have both a singer and a release date.
df_sing = df.copy()
# 'A & B' is split into one column per singer; stack() folds the columns into
# a long Series and drops the padding NaNs. No extra positional argument to
# apply(): it would be interpreted as the deprecated ``convert_dtype`` flag.
s = df['Singers'].str.split(' & ').apply(pd.Series).stack()
s.index = s.index.droplevel(-1)  # to line up with df's index
s.name = 'Singers'
del df_sing['Singers']
df_sing = df_sing.join(s)
df_sing = df_sing[df_sing['Singers'].notnull() & df_sing['Release Date'].notnull()]
In [ ]:
# Copy of the per-singer table with 'Release Date' parsed into datetimes.
df_rec = df_sing.copy()
df_rec['Release Date'] = pd.to_datetime(df_rec['Release Date'])
In [ ]:
# Per-genre subsets of the filtered library. 'Milonga [Candombe]' rows are
# counted as milongas.
df_tan = df[df['Genre'] == 'Tango']
df_val = df[df['Genre'] == 'Vals']
df_mil = df[df['Genre'].isin(('Milonga', 'Milonga [Candombe]'))]
In [ ]:
# Table with one row per individual author (see composer_table).
df_proc = composer_table(df)
In [ ]:
# Number of recordings per main orchestra, in the order of orchestras['Main'].
n_orch = len(orchestras['Main'])
# Counting with the vectorised Series.sum() avoids summing a boolean Series
# element by element; dtype=float matches the array the plots received before.
orch_sizes = np.array(
    [(df['Director'] == orch).sum() for orch in orchestras['Main']],
    dtype=float,
)
In [ ]:
def jaccard_similarity(query, document):
    """Return the Jaccard index of two collections.

    Both inputs are reduced to sets; the result is the size of their
    intersection divided by the size of their union.

    Parameters
    ----------
    query, document : iterable of hashable
        The two collections to compare. Duplicates are ignored.

    Returns
    -------
    float
        A value in ``[0.0, 1.0]``. Two empty collections are identical, so
        they yield ``1.0`` by convention instead of dividing by zero.
    """
    query_set = set(query)
    document_set = set(document)
    union = query_set | document_set
    if not union:
        # 0/0 is undefined; treat two empty sets as identical.
        return 1.0
    return len(query_set & document_set) / float(len(union))
In [ ]:
# Example: overlap of cleaned titles between two orchestras. Director names
# are transliterated to ASCII once, then matched against each name.
orchA, orchB = 'Troilo, Anibal', 'Calo, Miguel'
directors_ascii = df['Director'].map(unidecode)
setA = df.loc[directors_ascii.str.contains(orchA), 'Title Clean']
setB = df.loc[directors_ascii.str.contains(orchB), 'Title Clean']
jaccard_similarity(setA, setB)
In [ ]:
# Refresh the per-orchestra recording counts, then build one averaged title
# vector per main orchestra and stack them into a matrix (one row each).
main_orchestras = orchestras['Main']
for i, orch in enumerate(main_orchestras):
    orch_sizes[i] = sum(df['Director'] == orch)
vectorizer = DictVectorizer()
titles = [{'Title': val} for val in df_proc['Title'] if val]
X = vectorizer.fit_transform(titles)
vectors = [
    text_vector(vectorizer, df_proc, orch, 'Title')
    for orch in main_orchestras
]
matrix = np.vstack(vectors)
In [ ]:
from scipy.spatial.distance import pdist, squareform
# Pairwise cosine similarity between the rows of matrix (1 - cosine distance),
# expanded from condensed form to a square matrix.
# NOTE(review): squareform() fills the diagonal with 0, so self-similarity is
# 0 here and `1 - orch_sim` below has 1 on its diagonal — confirm that this is
# what plot_similarity and get_mds expect.
orch_sim = 1-pdist(matrix, 'cosine')
orch_sim = squareform(orch_sim)
In [ ]:
# Embed the orchestras from their pairwise dissimilarities.
pos = get_mds(1-orch_sim)
In [ ]:
# Similarity matrix of the orchestras based on shared titles.
fig = plot_similarity(orch_sim, labels=orchestras['Main'])
# No file extension is given, so matplotlib uses its default output format.
plt.savefig('../plots/OrchSim_songs')
In [ ]:
# MDS layout of the same similarities; orch_sizes holds the recording counts.
fig = plot_mds(pos, orch_sim, orchestras['Main'], orch_sizes)
plt.savefig('../plots/OrchMDS_songs')
In [ ]:
# Compare recording dates for every unordered pair of main orchestras.
for i, orchA in enumerate(orchestras['Main']):
    for j, orchB in enumerate(orchestras['Main']):
        if j <= i:
            # Skip self-pairs and pairs already handled in the other order.
            continue
        compare_recdates(df_rec, orchA, orchB)
In [ ]:
# Rebuild the title vectors: fit the vectorizer on every title, then stack one
# averaged vector per main orchestra.
vectorizer = DictVectorizer()
titles = [{'Title': val} for val in df_proc['Title'] if val]
X = vectorizer.fit_transform(titles)
vectors = [
    text_vector(vectorizer, df_proc, orch, 'Title')
    for orch in orchestras['Main']
]
matrix = np.vstack(vectors)
In [ ]:
# Author vectors: fit the vectorizer on every author name, then stack one
# averaged vector per main orchestra.
# NOTE(review): `if val` keeps float NaN (NaN is truthy) — confirm df_proc has
# no missing 'Composer' values at this point.
vectorizer = DictVectorizer()
composers = [{'Composer': val} for val in df_proc['Composer'] if val]
X = vectorizer.fit_transform(composers)
n_orch = len(orchestras['Main'])
vectors = [
    text_vector(vectorizer, df_proc, orch, 'Composer')
    for orch in orchestras['Main']
]
matrix = np.vstack(vectors)
In [ ]:
# Heat map of the orchestra-by-feature matrix: one row per main orchestra,
# one column per vectorizer feature.
fig = plt.figure()
ax = fig.add_subplot(111)
# 'nearest' keeps cells unsmoothed; aspect='auto' lets the image fill the axes.
plt.imshow(matrix, interpolation='nearest', aspect='auto')
# Label each row with its orchestra name.
plt.yticks(np.arange(0, n_orch), orchestras['Main'])
ax.grid(False)
plt.colorbar()
In [ ]:
# Print the n_comp highest-weighted features (authors) of every main
# orchestra, highest first, as "Name ( 1 --> A 2 --> B 3 --> C )".
n_comp = 3
for i, orchestra in enumerate(orchestras['Main']):
    # asarray + ravel yields a plain 1-D row whether matrix is an ndarray or
    # an np.matrix.
    vector = np.asarray(matrix[i, :]).ravel()
    # argpartition does not order the entries it selects, so the printed ranks
    # would be arbitrary; a full argsort, reversed, makes rank 1 the largest.
    ind = np.argsort(vector)[-n_comp:][::-1]
    print(orchestra + ' (', end='')
    for rank, feature_idx in enumerate(ind, start=1):
        print(' ' + str(rank) + ' --> ' + str(vectorizer.feature_names_[feature_idx]), end='')
    print(' )')
In [ ]:
from scipy.spatial.distance import pdist, squareform
# Pairwise cosine similarity between the rows of matrix (1 - cosine distance),
# expanded from condensed form to a square matrix.
# NOTE(review): squareform() fills the diagonal with 0, so self-similarity is
# 0 here and `1 - orch_sim` below has 1 on its diagonal — confirm that this is
# what plot_similarity and get_mds expect.
orch_sim = 1-pdist(matrix, 'cosine')
orch_sim = squareform(orch_sim)
In [ ]:
# Embed the orchestras from their pairwise dissimilarities.
pos = get_mds(1-orch_sim)
In [ ]:
# Similarity matrix of the orchestras based on shared authors.
fig = plot_similarity(orch_sim, orchestras['Main'])
# No file extension is given, so matplotlib uses its default output format.
plt.savefig('../plots/OrchSim_authors')
In [ ]:
# MDS layout of the same similarities; orch_sizes holds the recording counts.
fig = plot_mds(pos, orch_sim, orchestras['Main'], orch_sizes)
plt.savefig('../plots/OrchMDS_authors')
In [ ]:
# Recordings per year for each genre, one histogram row per genre.
plt.figure()
# Histogram bins are half-open except the last, which is closed on both
# sides. Using the years themselves as edges would merge the final two years
# into one bin, so add one extra edge past the last year.
year_edges = np.arange(year_range[0], year_range[-1] + 2)
genre_frames = (('Tango', df_tan), ('Vals', df_val), ('Milonga', df_mil))
for row, (genre_title, genre_df) in enumerate(genre_frames, start=1):
    ax = plt.subplot(3, 1, row)
    genre_df.hist(column='Year', bins=year_edges, ax=ax)
    ax.set_title(genre_title)
plt.savefig('../plots/Song_Hist')
In [ ]:
# Tango BPM distribution per year; rows without a positive BPM are excluded.
# x=/y= replaces the `groupby=` keyword, which current seaborn no longer accepts.
tan_bpm = df_tan[df_tan['BPM'] > 0]
fig = sns.violinplot(x=tan_bpm.Year, y=tan_bpm.BPM)
In [ ]:
# Vals BPM distribution per year; rows without a positive BPM are excluded.
# x=/y= replaces the `groupby=` keyword, which current seaborn no longer accepts.
val_bpm = df_val[df_val['BPM'] > 0]
sns.violinplot(x=val_bpm.Year, y=val_bpm.BPM)
In [ ]:
# Milonga BPM distribution per year; rows without a positive BPM are excluded.
# x=/y= replaces the `groupby=` keyword, which current seaborn no longer accepts.
mil_bpm = df_mil[df_mil['BPM'] > 0]
sns.violinplot(x=mil_bpm.Year, y=mil_bpm.BPM)
In [ ]:
# Tango rating distribution per year; rows without a positive rating are excluded.
# x=/y= replaces the `groupby=` keyword, which current seaborn no longer accepts.
tan_rated = df_tan[df_tan['Rating'] > 0]
sns.violinplot(x=tan_rated.Year, y=tan_rated.Rating)
In [ ]:
# Vals rating distribution per year; rows without a positive rating are excluded.
# x=/y= replaces the `groupby=` keyword, which current seaborn no longer accepts.
val_rated = df_val[df_val['Rating'] > 0]
sns.violinplot(x=val_rated.Year, y=val_rated.Rating)
In [ ]:
# Milonga rating distribution per year; rows without a positive rating are excluded.
# x=/y= replaces the `groupby=` keyword, which current seaborn no longer accepts.
mil_rated = df_mil[df_mil['Rating'] > 0]
sns.violinplot(x=mil_rated.Year, y=mil_rated.Rating)
In [ ]:
def plot_time(df, labelsize=32):
    """Draw per-year violin plots of track duration for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``'Year'`` and ``'Time'`` columns. The frame passed in is
        the one plotted (previously the global ``df_tan`` was always used).
    labelsize : int, optional
        Font size of the major tick labels on both axes.

    Returns
    -------
    None
        The plot is drawn on a newly created figure.
    """
    plt.figure()
    ax = plt.subplot(111)
    # x=/y= replaces the `groupby=` keyword, which current seaborn no longer
    # accepts: one violin per year, durations on the y axis.
    sns.violinplot(x=df.Year, y=df.Time, ax=ax)
    plt.xticks(rotation='vertical')
    ax.tick_params(axis='both', which='major', labelsize=labelsize)
In [ ]:
# Track-duration violins per year for the tango subset.
plot_time(df_tan)
In [ ]:
# Vals track-duration distribution per year.
# x=/y= replaces the `groupby=` keyword, which current seaborn no longer accepts.
sns.violinplot(x=df_val.Year, y=df_val.Time)
In [ ]:
# Milonga track-duration distribution per year.
# x=/y= replaces the `groupby=` keyword, which current seaborn no longer accepts.
sns.violinplot(x=df_mil.Year, y=df_mil.Time)
In [ ]:
# Tango: scatter of duration against BPM with a least-squares straight line.
# NOTE(review): unlike the BPM violin plots, rows with BPM <= 0 are not
# filtered out here, and NaN values would make polyfit return NaN — confirm
# the data is clean at this point.
df_tan.plot.scatter(x='Time', y='BPM', alpha=0.1)
# Degree-1 fit: z[0] is the slope, z[1] the intercept.
z = np.polyfit(df_tan['Time'], df_tan['BPM'], 1)
p = np.poly1d(z)
# Overlay the fitted line as a red dashed curve.
pylab.plot(df_tan['Time'], p(df_tan['Time']), "r--")
print("y={:0.6f}x+({:0.6f})".format(z[0], z[1]))
In [ ]:
# Vals: same scatter and straight-line fit (same caveats as for tango).
df_val.plot.scatter(x='Time', y='BPM', alpha=0.3)
z = np.polyfit(df_val['Time'], df_val['BPM'], 1)
p = np.poly1d(z)
pylab.plot(df_val['Time'], p(df_val['Time']), "r--")
print("y={:0.6f}x+({:0.6f})".format(z[0], z[1]))
In [ ]:
# Milonga: same scatter and straight-line fit (same caveats as for tango).
df_mil.plot.scatter(x='Time', y='BPM', alpha=0.3)
z = np.polyfit(df_mil['Time'], df_mil['BPM'], 1)
p = np.poly1d(z)
pylab.plot(df_mil['Time'], p(df_mil['Time']), "r--")
print("y={:0.6f}x+({:0.6f})".format(z[0], z[1]))
In [ ]:
# Recordings that have a release date.
# NOTE(review): df_date is not used by the cells below, which pass df to
# orchestra_plot — confirm whether df_date was meant to be passed instead.
df_date = df[df['Release Date'].notnull()]
In [ ]:
# Plot the first main orchestra as a quick check.
orch = orchestras['Main'][0]
orchestra_plot(df, orch)
In [ ]:
# Plot every main orchestra; a failure for one orchestra must not stop the rest.
for orch in orchestras['Main']:
    try:
        orchestra_plot(df, orch)
    except Exception as err:
        # Catching Exception (not a bare except) keeps kernel interrupts
        # working, and the message shows which orchestra failed and why.
        print('orchestra_plot failed for {}: {!r}'.format(orch, err))
In [ ]:
# Plot a single singer from the per-singer table as a quick check.
singer = 'Ernesto Fama'
singer_plot(df_sing, singer)
In [ ]:
# Plot every distinct singer; a failure for one singer must not stop the rest.
# np.unique is called explicitly instead of relying on the name injected by %pylab.
singers = np.unique(df_sing['Singers'])
for singer in singers:
    try:
        singer_plot(df_sing, singer)
    except Exception as err:
        # Catching Exception (not a bare except) keeps kernel interrupts
        # working, and the message shows which singer failed and why.
        print('singer_plot failed for {}: {!r}'.format(singer, err))
In [ ]: