Pulling in chart appearances from mongo

In [1]:
from pymongo import MongoClient
from sets import Set
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

client = MongoClient('10.123.14.47')
db = client.charts

#bb200
charts = db.crawled_charts
records = charts.find({"chart_name_id":125})
client.close()
artists = []
artist_names = []
days = []
for rec in records:
    if rec.has_key("artist_id"):
        artists.append(rec['artist_id'])
        artist_names.append(rec['artist_name'])
        days.append(rec['day'])
client.close()
bbappearances = pd.DataFrame({'id':artists, 'name':artist_names, 'appearance_day':days})
bb_range = bbappearances.groupby(['id','name'])['appearance_day'].agg({'min' : np.min, 'max' : np.max})
Grouping by bands that appeared before and after 1/1/14

In [2]:
# 16071 is 1/1/2014, 16436 is 1/1/2015
bb_range['appeared_before_2014'] = bb_range['min']<16071
bb_range['appeared_in_2014'] = (bb_range['max']>=16071) & (bb_range['max']<16436)
bb_range.reset_index(inplace=True)
bb_range.head()


Out[2]:
id name max min appeared_before_2014 appeared_in_2014
0 3 Motion City Soundtrack 15534 15521 True False
1 7 The Cab 16207 15227 True True
2 11 Cobra Starship 15241 15234 True False
3 12 Forever The Sickest Kids 15899 15899 True False
4 13 All Time Low 15997 15150 True False
Pulling in the metric data from impala

In [3]:
from nbsutils import imp
query_b200 = """
select entity_id, value as val_310 from idx_entity idx
where metric_id=310
and unix_seconds=1388534400 
and count_type='d'
order by value desc
limit 3000
"""

query_other = """
select entity_id, avg(value) as val_{0} from idx_entity idx
where metric_id={0}
and unix_seconds<1388534400 
and unix_seconds>=1380585600  
and count_type='d'
group by entity_id
order by avg(value) desc
limit 3000
"""

db = imp.Connection()
data = db.fetchAll(query_b200)  # bb200 values on 1/1/2014
db.close()

In [4]:
data_all = data.copy()
db = imp.Connection()
for mid in [11,28,42,44,256,41]:
    data_other = db.fetchAll(query_other.format(mid))
    data_all = pd.merge(data_all,data_other,on='entity_id',how='outer')  # merging in other values from 1/1/2014
db.close()
merging metric data with chart appearance data

In [7]:
bbrange_slim = bb_range[['id','appeared_before_2014','appeared_in_2014','min']]
bbrange_slim.columns = ['entity_id', 'appeared_before_2014', 'appeared_in_2014','min']

# merging metric data with chart appearance data
data2 = pd.merge(data_all.copy(), bbrange_slim, on='entity_id', how='left')  
data2.loc[:,'appeared_before_2014'] = data2['appeared_before_2014'].fillna(False)  # fill in missing values
data2.loc[:,'appeared_in_2014'] = data2['appeared_in_2014'].fillna(False)
data_never_charted = data2.loc[np.logical_not(data2['appeared_before_2014']),:]
see how we did!

In [6]:
for mid in [310,11,28,42,44,256,41]:
    hits_in_1000 = np.sum(data_never_charted.sort('val_{}'.format(mid), ascending=False)['appeared_in_2014'].head(1000))
    hits_in_100 = np.sum(data_never_charted.sort('val_{}'.format(mid), ascending=False)['appeared_in_2014'].head(100))
    hits_in_50 = np.sum(data_never_charted.sort('val_{}'.format(mid), ascending=False)['appeared_in_2014'].head(50))
    print "{}: {} hits in 100, {} hits in 1000, {} hits in 50".format(mid,hits_in_100,hits_in_1000,hits_in_50)


310: 20 hits in 100, 108 hits in 1000, 15 hits in 50
11: 3 hits in 100, 30 hits in 1000, 3 hits in 50
28: 2 hits in 100, 20 hits in 1000, 1 hits in 50
42: 15 hits in 100, 66 hits in 1000, 10 hits in 50
44: 2 hits in 100, 19 hits in 1000, 0 hits in 50
256: 7 hits in 100, 55 hits in 1000, 3 hits in 50
41: 1 hits in 100, 11 hits in 1000, 0 hits in 50
distribution of how many days after 1/1/14 artists chart

In [38]:
from matplotlib import pyplot as plt
days_to_chart = (data_never_charted.sort('val_310'.format(mid), ascending=False).head(300).loc[data_never_charted['appeared_in_2014'],'min']-16071).tolist()
plt.hist(days_to_chart,bins=15)
plt.show()



In [41]:
print "Of the top 500 artists, average days to hit chart"
for mid in [310,11,28,42,44,256,41]:
    days_to_chart = (data_never_charted.sort('val_{}'.format(mid), ascending=False).head(500).loc[data_never_charted['appeared_in_2014'],'min']-16071).tolist()
    print "{}: average days = {}".format(mid,np.mean(days_to_chart))


Of the top 500 artists, average days to hit chart
310: average days = 149.818181818
11: average days = 163.066666667
28: average days = 180.1
42: average days = 152.081081081
44: average days = 146.5
256: average days = 187.470588235
41: average days = 179.75

In [ ]: