In [ ]:
%pylab inline
%pdb
import os
import pickle
from unidecode import unidecode
from sklearn.feature_extraction import DictVectorizer
import yaml
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from organiser import tm_utils
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# later removed the old names, so use whichever spelling this installation
# provides (falling back to the original name keeps the original error).
plt.style.use(next(
    (name for name in ('seaborn-deep', 'seaborn-v0_8-deep')
     if name in plt.style.available),
    'seaborn-deep',
))
# Address rcParams through plt instead of the bare name injected by %pylab.
plt.rcParams['figure.figsize'] = 25, 19
plt.rcParams['font.size'] = 24

In [ ]:
from tango_graphs import compare_recdates, plot_similarity, plot_mds, orchestra_plot, singer_plot
from tango_stats import norm, get_mds

In [ ]:
def get_authors(text):
    """Extract the distinct composer/lyricist names from a credits string.

    The replacements below target strings shaped like
    ``'Comp: <A, B> || Lyr: <C>'``, which is reduced to ``'A, B, C'`` and
    then split on ``', '``.

    Parameters
    ----------
    text : str or float
        Raw credits value. Any float (e.g. the NaN pandas uses for missing
        cells, including numpy float scalars) and the placeholder ``'?'``
        are treated as "no author information".

    Returns
    -------
    numpy.ndarray or None
        Sorted array of unique names, or ``None`` when there is no author
        information.
    """
    # isinstance also covers float subclasses such as np.float64 NaN, which
    # an exact type comparison would let through to str.replace.
    if isinstance(text, float) or text == '?':
        return None
    for marker, replacement in (('Comp: <', ''), ('> || Lyr: <', ', '), ('>', '')):
        text = text.replace(marker, replacement)
    # Call np.unique explicitly rather than the bare `unique` that only
    # exists because of the %pylab star-import.
    return np.unique(text.split(', '))

In [ ]:
def composer_table(df):
    """Return a copy of ``df`` with one row per (song, author) pair.

    Rows whose 'Composer' cell is missing are dropped first. Each remaining
    cell is parsed with ``get_authors`` and the resulting names are spread
    over repeated rows that share the original index label. Songs for which
    ``get_authors`` returns ``None`` are kept with a missing 'Composer'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Composer' column. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` without its original 'Composer' column, left-joined with the
        per-author 'Composer' series.
    """
    df = df.dropna(subset=['Composer'])
    # Series.apply has no axis parameter: the positional `1` that used to be
    # passed here was bound to `convert_dtype` (deprecated in recent pandas),
    # so it is simply dropped.
    series = df['Composer'].map(get_authors).apply(pd.Series).stack()
    series.index = series.index.droplevel(-1)  # to line up with df's index
    series.name = 'Composer'  # needs a name to join
    df = df.drop('Composer', axis=1)
    return df.join(series)

In [ ]:
def text_vector(vectorizer, table, orch, field):
    """Average the vectorised ``field`` values of one orchestra's rows.

    Parameters
    ----------
    vectorizer : object
        Fitted vectoriser exposing ``transform(list_of_dicts)``; the result
        must support ``.sum(axis=0)`` and ``.shape``.
    table : pandas.DataFrame
        Must contain a 'Director' column and the ``field`` column.
    orch : str
        Value of 'Director' selecting the rows to use.
    field : str
        Column whose values are vectorised; also used as the dict key.

    Returns
    -------
    The column-wise sum of the transformed rows divided by the number of
    rows that were transformed.
    """
    values = table.loc[table['Director'] == orch, field]
    # NaN is truthy, so a bare `if val` lets missing cells through to the
    # vectoriser; filter them explicitly together with empty values.
    records = [{field: val} for val in values if pd.notnull(val) and val]
    vector = vectorizer.transform(records)
    # NOTE(review): if no row survives the filter, shape[0] is 0 and this
    # divides by zero -- confirm every orchestra passed in has data.
    return vector.sum(axis=0) / vector.shape[0]

Clean songs


In [ ]:
# realpath("__file__") resolves the *literal* string "__file__" against the
# current working directory, so organiser_dir is <cwd>/../organiser.
organiser_dir = os.path.join(os.path.split(os.path.realpath("__file__"))[0], '..', 'organiser')
# Location of the pickled library; raises KeyError when the variable is unset.
pandas_dataframe = os.environ["TM_PANDAS_LIBRARY"]
# yaml.load without an explicit Loader is rejected by PyYAML >= 6 and unsafe
# before that; safe_load parses plain data. The context manager closes the
# file handle, which the bare open() call never did.
with open(os.path.join(organiser_dir, 'tango_directors.yml')) as directors_file:
    orchestras = yaml.safe_load(directors_file)

In [ ]:
# Set up pandas database
# NOTE: unpickling can execute arbitrary code -- only point
# TM_PANDAS_LIBRARY at a file you produced yourself.
with open(pandas_dataframe, 'rb') as library_file:
    df = pickle.load(library_file)
# Keep the rows whose Grouping is '+'. Copy so the column edits below act on
# an independent frame instead of a slice of the unpickled one.
df = df[df['Grouping'] == '+'].copy()
df = df.rename(columns={'Total Time': 'Time'})
for col in ('Year', 'BPM', 'Rating', 'Time'):
    # Plain column assignment replaces the column together with its dtype,
    # whereas `df.loc[:, col] = ...` may write into the existing column and
    # keep the old dtype. to_numeric is applied to the whole column at once
    # rather than value by value.
    df[col] = pd.to_numeric(df[col])

In [ ]:
# Restrict the library to the years of interest.
# (A wider window that was tried before: range(1926, 1976).)
year_range = range(1926, 1958)
df = df.loc[df['Year'].isin(year_range)]

In [ ]:
# Build df_sing: one row per (song, singer), for songs that have both a
# singer and a release date.
df_sing = df.copy()
# 'A & B' -> ['A', 'B'] -> one column per singer -> stacked into rows.
# Series.apply has no axis parameter; the positional `1` that used to be
# passed was bound to `convert_dtype` (deprecated in recent pandas), so it
# is dropped.
s = df['Singers'].str.split(' & ').apply(pd.Series).stack()
s.index = s.index.droplevel(-1)  # to line up with df's index
s.name = 'Singers'
df_sing = df_sing.drop('Singers', axis=1).join(s)
df_sing = df_sing[df_sing['Singers'].notnull()]
df_sing = df_sing[df_sing['Release Date'].notnull()]

In [ ]:
# Copy of df_sing whose 'Release Date' column is parsed into datetimes.
df_rec = df_sing.assign(**{'Release Date': pd.to_datetime(df_sing['Release Date'])})

In [ ]:
# Per-genre views of the library; both milonga spellings go to df_mil.
df_tan = df.loc[df['Genre'] == 'Tango']
df_val = df.loc[df['Genre'] == 'Vals']
df_mil = df.loc[df['Genre'].isin(['Milonga', 'Milonga [Candombe]'])]

In [ ]:
# Expand the library so that every (song, author) pair gets its own row;
# songs without a 'Composer' value are dropped by composer_table.
df_proc = composer_table(df)

In [ ]:
# Number of songs in df for each main orchestra, in orchestras['Main'] order.
n_orch = len(orchestras['Main'])
orch_sizes = np.ones(n_orch)

for i, orch in enumerate(orchestras['Main']):
    orch_sizes[i] = (df['Director'] == orch).sum()

Jaccard


In [ ]:
def jaccard_similarity(query, document):
    """Return the Jaccard similarity |A & B| / |A | B| of two collections.

    Parameters
    ----------
    query, document : iterable of hashable
        Duplicates are ignored; each input is reduced to a set.

    Returns
    -------
    float
        A value in [0, 1]. Two empty inputs have no elements in common, so
        the result is defined as 0.0 instead of raising ZeroDivisionError.
    """
    query_set, document_set = set(query), set(document)
    union = query_set | document_set
    if not union:
        return 0.0
    return float(len(query_set & document_set)) / len(union)

In [ ]:
# Compare the cleaned-title repertoires of two orchestras. Director names
# are ASCII-folded with unidecode before the substring match.
orchA = 'Troilo, Anibal'
orchB = 'Calo, Miguel'

setA = df.loc[df['Director'].map(unidecode).str.contains(orchA), 'Title Clean']
setB = df.loc[df['Director'].map(unidecode).str.contains(orchB), 'Title Clean']

jaccard_similarity(setA, setB)

Orchestra by song


In [ ]:
# Refresh the per-orchestra song counts.
for i, orch in enumerate(orchestras['Main']):
    orch_sizes[i] = (df['Director'] == orch).sum()

# Fit a vocabulary over every truthy song title.
vectorizer = DictVectorizer()
titles = [{'Title': title} for title in df_proc['Title'] if title]
X = vectorizer.fit_transform(titles)

# One averaged title vector per main orchestra, stacked row-wise.
vectors = [
    text_vector(vectorizer, df_proc, orch, 'Title')
    for orch in orchestras['Main']
]
matrix = np.vstack(vectors)

In [ ]:
from scipy.spatial.distance import pdist, squareform
# Pairwise cosine similarity between the orchestra rows, expanded from the
# condensed form into a square matrix.
# NOTE(review): squareform fills the diagonal with 0, so each orchestra's
# similarity to itself reads as 0 rather than 1 -- confirm this is intended.
orch_sim = squareform(1 - pdist(matrix, metric='cosine'))

In [ ]:
# Turn similarity into dissimilarity (1 - similarity) before handing it to
# get_mds. Presumably `pos` holds one low-dimensional coordinate per
# orchestra -- TODO confirm against tango_stats.get_mds.
pos = get_mds(1-orch_sim)

In [ ]:
# Draw the title-based orchestra similarity matrix, then save the current
# figure (assumes plot_similarity leaves its figure current -- TODO confirm).
fig = plot_similarity(orch_sim, labels=orchestras['Main'])
plt.savefig('../plots/OrchSim_songs')

In [ ]:
# Draw the MDS layout for the title-based similarities, passing the song
# counts in orch_sizes along (presumably used as marker sizes -- TODO
# confirm in tango_graphs.plot_mds), then save the current figure.
fig = plot_mds(pos, orch_sim, orchestras['Main'], orch_sizes)
plt.savefig('../plots/OrchMDS_songs')

Release dates (orchestra vs. orchestra)


In [ ]:
# Compare recording dates for every unordered pair of main orchestras:
# only pairs with j > i are visited, so each pair is handled once.
for i, orchA in enumerate(orchestras['Main']):
    for j, orchB in enumerate(orchestras['Main']):
        if j <= i:
            continue
        compare_recdates(df_rec, orchA, orchB)

Orchestra by composer/lyricist


In [ ]:
# NOTE(review): this cell repeats the title-vector computation from the
# "Orchestra by song" section, and the following cell reassigns
# `vectorizer`, `X`, `vectors` and `matrix` before any of them is read.
# It looks like a copy-paste leftover -- confirm it can be removed.
vectorizer = DictVectorizer()
titles = [{'Title': val} for val in df_proc['Title'] if val]
X = vectorizer.fit_transform(titles)

vectors = []

for i, orch in enumerate(orchestras['Main']):
    vectors.append(text_vector(vectorizer, df_proc, orch, 'Title'))
matrix = np.vstack(vectors)

In [ ]:
# Fit a vocabulary over every known author.
vectorizer = DictVectorizer()
# composer_table leaves NaN in 'Composer' for songs whose credits could not
# be parsed. NaN is truthy, so it has to be excluded explicitly or it is
# fed to the vectoriser as a numeric NaN feature.
composers = [
    {'Composer': val}
    for val in df_proc['Composer']
    if pd.notnull(val) and val
]
X = vectorizer.fit_transform(composers)

n_orch = len(orchestras['Main'])

# One averaged author vector per main orchestra, stacked row-wise.
vectors = []

for i, orch in enumerate(orchestras['Main']):
    vectors.append(text_vector(vectorizer, df_proc, orch, 'Composer'))
matrix = np.vstack(vectors)

In [ ]:
# Heat map of the orchestra-by-author matrix, one labelled row per orchestra.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
plt.imshow(matrix, aspect='auto', interpolation='nearest')
ax.set_yticks(np.arange(n_orch))
ax.set_yticklabels(orchestras['Main'])
ax.grid(False)
plt.colorbar()

In [ ]:
# Print the n_comp highest-weighted authors of every main orchestra, with
# rank 1 being the highest weight.
n_comp = 3
for i, orchestra in enumerate(orchestras['Main']):
    # Flatten to a plain 1-D array so the indexing below works whether
    # `matrix` is an ndarray or an np.matrix.
    weights = np.asarray(matrix[i, :]).ravel()
    # argpartition only guarantees *which* entries are the top n, not their
    # order, so the printed ranks were arbitrary; a descending argsort makes
    # the rank numbers match the weights.
    top = np.argsort(weights)[::-1][:n_comp]
    ranked = ''.join(
        ' ' + str(rank) + ' --> ' + str(vectorizer.feature_names_[idx])
        for rank, idx in enumerate(top, start=1)
    )
    print(orchestra + ' (' + ranked + ' )')

In [ ]:
from scipy.spatial.distance import pdist, squareform
# Pairwise cosine similarity between the orchestra rows, expanded from the
# condensed form into a square matrix.
# NOTE(review): squareform fills the diagonal with 0, so each orchestra's
# similarity to itself reads as 0 rather than 1 -- confirm this is intended.
orch_sim = squareform(1 - pdist(matrix, metric='cosine'))

In [ ]:
# Turn similarity into dissimilarity (1 - similarity) before handing it to
# get_mds. Presumably `pos` holds one low-dimensional coordinate per
# orchestra -- TODO confirm against tango_stats.get_mds.
pos = get_mds(1-orch_sim)

In [ ]:
# Draw the author-based orchestra similarity matrix, then save the current
# figure (assumes plot_similarity leaves its figure current -- TODO confirm).
fig = plot_similarity(orch_sim, orchestras['Main'])
plt.savefig('../plots/OrchSim_authors')

In [ ]:
# Draw the MDS layout for the author-based similarities, passing the song
# counts in orch_sizes along (presumably used as marker sizes -- TODO
# confirm in tango_graphs.plot_mds), then save the current figure.
fig = plot_mds(pos, orch_sim, orchestras['Main'], orch_sizes)
plt.savefig('../plots/OrchMDS_authors')

Number of songs by year (tango, vals, milonga)


In [ ]:
# Three stacked histograms of songs per year, one panel per genre.
plt.figure()
genre_panels = (('Tango', df_tan), ('Vals', df_val), ('Milonga', df_mil))
for panel, (genre, genre_df) in enumerate(genre_panels, start=1):
    ax = plt.subplot(3, 1, panel)
    genre_df.hist(column='Year', bins=year_range, ax=ax)
    ax.set_title(genre)
plt.savefig('../plots/Song_Hist')

BPM by year


In [ ]:
# BPM distribution per year for tangos, ignoring rows with BPM <= 0.
fig = sns.violinplot(
    df_tan.loc[df_tan['BPM'] > 0, 'BPM'],
    groupby=df_tan.loc[df_tan['BPM'] > 0, 'Year'],
)

In [ ]:
# BPM distribution per year for valses, ignoring rows with BPM <= 0.
sns.violinplot(
    df_val.loc[df_val['BPM'] > 0, 'BPM'],
    groupby=df_val.loc[df_val['BPM'] > 0, 'Year'],
)

In [ ]:
# BPM distribution per year for milongas, ignoring rows with BPM <= 0.
sns.violinplot(
    df_mil.loc[df_mil['BPM'] > 0, 'BPM'],
    groupby=df_mil.loc[df_mil['BPM'] > 0, 'Year'],
)

Rating by year


In [ ]:
# Rating distribution per year for tangos, ignoring rows with Rating <= 0.
sns.violinplot(
    df_tan.loc[df_tan['Rating'] > 0, 'Rating'],
    groupby=df_tan.loc[df_tan['Rating'] > 0, 'Year'],
)

In [ ]:
# Rating distribution per year for valses, ignoring rows with Rating <= 0.
sns.violinplot(
    df_val.loc[df_val['Rating'] > 0, 'Rating'],
    groupby=df_val.loc[df_val['Rating'] > 0, 'Year'],
)

In [ ]:
# Rating distribution per year for milongas, ignoring rows with Rating <= 0.
sns.violinplot(
    df_mil.loc[df_mil['Rating'] > 0, 'Rating'],
    groupby=df_mil.loc[df_mil['Rating'] > 0, 'Year'],
)

Length by year


In [ ]:
def plot_time(df, labelsize=32):
    """Draw per-year violin plots of track length for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Time' and 'Year' columns. The frame that is passed in is
        the one plotted, rather than the global ``df_tan`` regardless of the
        argument.
    labelsize : int, optional
        Font size applied to the major tick labels of both axes.

    Returns
    -------
    None
        The plot is drawn on a newly created figure.
    """
    plt.figure()
    ax = plt.subplot(111)
    # NOTE(review): `groupby=` looks like the pre-0.6 seaborn API; newer
    # releases expect x=/y=/data= -- confirm against the pinned version.
    sns.violinplot(df.Time, groupby=df.Year)
    plt.xticks(rotation='vertical')
    ax.tick_params(axis='both', which='major', labelsize=labelsize)

In [ ]:
# Track-length violins per year for the tango subset.
plot_time(df_tan)

In [ ]:
# Track-length distribution per year for valses.
sns.violinplot(df_val['Time'], groupby=df_val['Year'])

In [ ]:
# Track-length distribution per year for milongas.
sns.violinplot(df_mil['Time'], groupby=df_mil['Year'])

Correlation between BPM and duration


In [ ]:
# Scatter of BPM against duration for tangos, with a least-squares line.
df_tan.plot.scatter(x='Time', y='BPM', alpha=0.1)
z = np.polyfit(df_tan['Time'], df_tan['BPM'], deg=1)  # [slope, intercept]
p = np.poly1d(z)
plt.plot(df_tan['Time'], p(df_tan['Time']), "r--")
print("y={:0.6f}x+({:0.6f})".format(*z))

In [ ]:
# Scatter of BPM against duration for valses, with a least-squares line.
df_val.plot.scatter(x='Time', y='BPM', alpha=0.3)
z = np.polyfit(df_val['Time'], df_val['BPM'], deg=1)  # [slope, intercept]
p = np.poly1d(z)
plt.plot(df_val['Time'], p(df_val['Time']), "r--")
print("y={:0.6f}x+({:0.6f})".format(*z))

In [ ]:
# Scatter of BPM against duration for milongas, with a least-squares line.
df_mil.plot.scatter(x='Time', y='BPM', alpha=0.3)
z = np.polyfit(df_mil['Time'], df_mil['BPM'], deg=1)  # [slope, intercept]
p = np.poly1d(z)
plt.plot(df_mil['Time'], p(df_mil['Time']), "r--")
print("y={:0.6f}x+({:0.6f})".format(*z))

Singers by year


In [ ]:
# Songs that have a release date.
df_date = df.loc[df['Release Date'].notnull()]

In [ ]:
# Plot the first main orchestra as a quick check.
# NOTE(review): the full `df` is passed here, not the `df_date` subset
# computed in the previous cell -- confirm which one orchestra_plot expects.
orch = orchestras['Main'][0]
orchestra_plot(df, orch)

In [ ]:
# Plot every main orchestra. Still best-effort (one failing orchestra does
# not stop the rest), but the failure is reported instead of silently
# discarded. Catching Exception rather than using a bare `except` also
# lets KeyboardInterrupt stop the loop.
for orch in orchestras['Main']:
    try:
        orchestra_plot(df, orch)
    except Exception as err:
        print('Skipping orchestra {!r}: {!r}'.format(orch, err))

In [ ]:
# Plot a single singer from the per-singer table as a quick check.
singer = 'Ernesto Fama'
singer_plot(df_sing, singer)

In [ ]:
# Sorted distinct singer names. np.unique is called explicitly instead of
# the bare `unique` that only exists because of the %pylab star-import.
singers = np.unique(df_sing['Singers'])

# Plot every singer. Still best-effort (one failing singer does not stop
# the rest), but the failure is reported instead of silently discarded.
# Catching Exception rather than using a bare `except` also lets
# KeyboardInterrupt stop the loop.
for singer in singers:
    try:
        singer_plot(df_sing, singer)
    except Exception as err:
        print('Skipping singer {!r}: {!r}'.format(singer, err))

In [ ]: