Pulling in chart appearances from mongo



In [1]:

    
from pymongo import MongoClient
from sets import Set
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

client = MongoClient('10.123.14.47')
db = client.charts

#bb200
charts = db.crawled_charts
records = charts.find({"chart_name_id":125})
client.close()
artists = []
artist_names = []
days = []
for rec in records:
    if rec.has_key("artist_id"):
        artists.append(rec['artist_id'])
        artist_names.append(rec['artist_name'])
        days.append(rec['day'])
client.close()
bbappearances = pd.DataFrame({'id':artists, 'name':artist_names, 'appearance_day':days})
bb_range = bbappearances.groupby(['id','name'])['appearance_day'].agg({'min' : np.min, 'max' : np.max})

Grouping by bands that appeared before and after 1/1/14



In [2]:

    
# 16071 is 1/1/2014, 16436 is 1/1/2015
bb_range['appeared_before_2014'] = bb_range['min']<16071
bb_range['appeared_in_2014'] = (bb_range['max']>=16071) & (bb_range['max']<16436)
bb_range.reset_index(inplace=True)
bb_range.head()









    Out[2]:






  
    
      
      id
      name
      max
      min
      appeared_before_2014
      appeared_in_2014
    
  
  
    
      0
        3
         Motion City Soundtrack
       15534
       15521
       True
       False
    
    
      1
        7
                        The Cab
       16207
       15227
       True
        True
    
    
      2
       11
                 Cobra Starship
       15241
       15234
       True
       False
    
    
      3
       12
       Forever The Sickest Kids
       15899
       15899
       True
       False
    
    
      4
       13
                   All Time Low
       15997
       15150
       True
       False

Pulling in the metric data from impala



In [3]:

    
from nbsutils import imp
query_b200 = """
select entity_id, value as val_310 from idx_entity idx
where metric_id=310
and unix_seconds=1388534400 
and count_type='d'
order by value desc
limit 3000
"""

query_other = """
select entity_id, avg(value) as val_{0} from idx_entity idx
where metric_id={0}
and unix_seconds<1388534400 
and unix_seconds>=1380585600  
and count_type='d'
group by entity_id
order by avg(value) desc
limit 3000
"""

db = imp.Connection()
data = db.fetchAll(query_b200)  # bb200 values on 1/1/2014
db.close()



In [4]:

    
data_all = data.copy()
db = imp.Connection()
for mid in [11,28,42,44,256,41]:
    data_other = db.fetchAll(query_other.format(mid))
    data_all = pd.merge(data_all,data_other,on='entity_id',how='outer')  # merging in other values from 1/1/2014
db.close()

merging metric data with chart appearance data



In [7]:

    
bbrange_slim = bb_range[['id','appeared_before_2014','appeared_in_2014','min']]
bbrange_slim.columns = ['entity_id', 'appeared_before_2014', 'appeared_in_2014','min']

# merging metric data with chart appearance data
data2 = pd.merge(data_all.copy(), bbrange_slim, on='entity_id', how='left')  
data2.loc[:,'appeared_before_2014'] = data2['appeared_before_2014'].fillna(False)  # fill in missing values
data2.loc[:,'appeared_in_2014'] = data2['appeared_in_2014'].fillna(False)
data_never_charted = data2.loc[np.logical_not(data2['appeared_before_2014']),:]

see how we did!



In [6]:

    
for mid in [310,11,28,42,44,256,41]:
    hits_in_1000 = np.sum(data_never_charted.sort('val_{}'.format(mid), ascending=False)['appeared_in_2014'].head(1000))
    hits_in_100 = np.sum(data_never_charted.sort('val_{}'.format(mid), ascending=False)['appeared_in_2014'].head(100))
    hits_in_50 = np.sum(data_never_charted.sort('val_{}'.format(mid), ascending=False)['appeared_in_2014'].head(50))
    print "{}: {} hits in 100, {} hits in 1000, {} hits in 50".format(mid,hits_in_100,hits_in_1000,hits_in_50)









    



310: 20 hits in 100, 108 hits in 1000, 15 hits in 50
11: 3 hits in 100, 30 hits in 1000, 3 hits in 50
28: 2 hits in 100, 20 hits in 1000, 1 hits in 50
42: 15 hits in 100, 66 hits in 1000, 10 hits in 50
44: 2 hits in 100, 19 hits in 1000, 0 hits in 50
256: 7 hits in 100, 55 hits in 1000, 3 hits in 50
41: 1 hits in 100, 11 hits in 1000, 0 hits in 50

distribution of how many days after 1/1/14 artists chart



In [38]:

    
from matplotlib import pyplot as plt
days_to_chart = (data_never_charted.sort('val_310'.format(mid), ascending=False).head(300).loc[data_never_charted['appeared_in_2014'],'min']-16071).tolist()
plt.hist(days_to_chart,bins=15)
plt.show()



In [41]:

    
print "Of the top 500 artists, average days to hit chart"
for mid in [310,11,28,42,44,256,41]:
    days_to_chart = (data_never_charted.sort('val_{}'.format(mid), ascending=False).head(500).loc[data_never_charted['appeared_in_2014'],'min']-16071).tolist()
    print "{}: average days = {}".format(mid,np.mean(days_to_chart))









    



Of the top 500 artists, average days to hit chart
310: average days = 149.818181818
11: average days = 163.066666667
28: average days = 180.1
42: average days = 152.081081081
44: average days = 146.5
256: average days = 187.470588235
41: average days = 179.75



In [ ]:

	id	name	max	min	appeared_before_2014	appeared_in_2014
0	3	Motion City Soundtrack	15534	15521	True	False
1	7	The Cab	16207	15227	True	True
2	11	Cobra Starship	15241	15234	True	False
3	12	Forever The Sickest Kids	15899	15899	True	False
4	13	All Time Low	15997	15150	True	False