Click here for the Jupyter homepage
In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from IPython.core.display import display
In [3]:
# Load the listening log. The file has no header row, so the column names are
# supplied here and the 'timestamp' column is parsed into datetimes.
data = pd.read_table(
    'mylistening.txt',
    header=None,
    names=['user_id', 'item_id', 'artist_id', 'timestamp'],
    parse_dates=['timestamp'],
)
In [4]:
# import MySQLdb
# db=MySQLdb.connect(host='rdc04.uits.iu.edu',port=XXXX,user=XXXX,passwd=XXX,db='analysis_lastfm')
# cursor = db.cursor()
# cursor.execute("SELECT column_a, column_b, column_c FROM some_table")
# result = cursor.fetchall()
In [5]:
# preview the data: first five rows
data.head(n=5)
Out[5]:
In [6]:
# last five rows
data.tail(n=5)
Out[6]:
In [7]:
# simple stats: count / mean / std / min / quartiles / max
data.describe(percentiles=[0.25, 0.5, 0.75])
Out[7]:
In [8]:
# sorting: order the listens chronologically (oldest first).
# DataFrame.sort() was deprecated in pandas 0.17 and later removed;
# sort_values() is the replacement and returns a new, sorted frame.
data = data.sort_values(by='timestamp', ascending=True)
data.head()
Out[8]:
In [9]:
# "vanilla" python
x = list(range(10))  # list() keeps this a real list on Python 3 too
print(x)
print(x*3)  # list * int repeats the list; it does NOT multiply each element
try:
    print(x+10)  # list + int is undefined for plain Python lists
except TypeError as err:
    # The failure is the point of the demo: report it instead of letting the
    # exception abort a "Restart & Run All".
    print("TypeError: %s" % err)
In [10]:
# Element-wise arithmetic on a plain list needs an explicit comprehension.
new1 = [value * 3 for value in x]
print(new1)
new2 = [value + 10 for value in x]
new2
Out[10]:
In [11]:
# The same arithmetic on a numpy array is applied element by element.
x_arr = np.array(x)
display(x_arr, x_arr * 3, x_arr + 10)
In [12]:
# Element-wise ufuncs ...
print(np.sin(x_arr))
print(np.exp(x_arr))
# ... and whole-array reductions. The single-argument print(...) form behaves
# the same under Python 2 and Python 3.
print("Mean of array -> %s" % x_arr.mean())
print("Index of highest value in array -> %s" % x_arr.argmax())
print("Sum of array -> %s" % x_arr.sum())
print("Product of array -> %s" % x_arr.prod())
In [13]:
# axis=0: reduce down the rows, giving one mean per column.
# numeric_only=True makes the restriction to *numeric* columns explicit
# instead of relying on pandas silently skipping the other columns.
data.mean(axis=0, numeric_only=True)
Out[13]:
In [14]:
# axis=1: reduce across the columns, giving one sum per row.
# numeric_only=True keeps the non-numeric columns (e.g. the parsed timestamp)
# out of the sum explicitly rather than relying on pandas dropping them.
print(data.sum(axis=1, numeric_only=True).head(15))
In [15]:
# get a column: select 'artist_id' for every row and show the first ten values
data.loc[:, 'artist_id'].head(n=10)
Out[15]:
In [16]:
# get rows matching a condition: boolean mask keeps rows whose artist_id is 405
data.loc[data['artist_id'].eq(405)].head(n=10)
Out[16]:
In [17]:
# SQL-style joins and other operations
# NOTE(review): 'tmp' is presumably an artist_id -> artist lookup table (it is
# merged on 'artist_id' below) -- confirm, and give the file a descriptive name.
artists = pd.read_table(filepath_or_buffer='tmp')
artists.head(n=5)
Out[17]:
In [18]:
# Left join: keep every listen and attach the matching columns from `artists`.
# NOTE: this rebinds `data`, so re-running the cell merges a second time.
data = pd.merge(data, artists, on='artist_id', how='left')
data.head(n=5)
Out[18]:
In [19]:
# Drop column: axis=1 targets columns rather than rows
data = data.drop(['user_id'], axis=1)
In [20]:
# Group and aggregate! Non-null count of every column, per artist.
data.groupby(by='artist').count().tail(n=10)
Out[20]:
In [21]:
# Number of listens per artist, most-played first; show the top ten.
data['artist'].value_counts().head(n=10)
Out[21]:
In [22]:
# Grouping on two keys yields a two-level (artist, item_id) index.
data.groupby(by=['artist', 'item_id']).count().tail(n=10)
Out[22]:
In [23]:
def group_by_first_letter(artist_name):
    """Return the grouping key for an artist: the first character of its name.

    Intended as a ``groupby`` key function applied to index labels.

    Parameters
    ----------
    artist_name : str
        Artist name. Missing values (e.g. NaN left behind by a left merge)
        are tolerated.

    Returns
    -------
    The first character of ``artist_name``; ``''`` when the name is empty;
    ``None`` when the value is not subscriptable (missing name).
    """
    try:
        return artist_name[0]  # first letter of names
    except IndexError:
        # Empty name: there is no first letter.
        return ''
    except TypeError:
        # NaN / None are not subscriptable.
        return None
# Passing a function to groupby applies it to each index label (here: the
# artist name) to obtain that row's group key.
(
    data.set_index('artist')
    .groupby(by=group_by_first_letter)
    .count()
    .head(n=10)
)
Out[23]:
In [24]:
# Index by timestamp so the frame can be resampled into calendar bins.
time_indexed = data.set_index('timestamp')
# resample(..., how='count') was deprecated in pandas 0.18 and later removed;
# the aggregation is now a method call on the resampler.
time_indexed.resample('M').count()['artist'].plot()
Out[24]:
In [25]:
# Side-by-side panels: listens per year (left) and per week (right).
# ('fix' in the original was a typo for 'fig'.)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# resample(..., how='count') was removed from pandas; aggregate on the resampler.
time_indexed.resample('A').count()['artist'].plot(ax=axes[0])
time_indexed.resample('W').count()['artist'].plot(ax=axes[1])
Out[25]:
In [26]:
# NOTE: despite the name, this counts listens per (artist, exact timestamp);
# nothing is binned by month at this point.
monthly_listens_by_artist = data.groupby(by=['artist', 'timestamp'])['item_id'].count()
monthly_listens_by_artist.tail(n=25)
Out[26]:
In [27]:
# Pivot the innermost index level into columns.
# NOTE: this rebinds the name, so the cell is not safe to run twice in a row.
monthly_listens_by_artist = monthly_listens_by_artist.unstack(level=-1)
monthly_listens_by_artist.tail(n=10)
Out[27]:
In [28]:
# Sum across the columns to get one total per row.
monthly_listens_by_artist.sum(axis='columns').tail(n=10)
Out[28]:
In [29]:
# value_counts() sorts by frequency, so the first five entries are the five
# most-played artists.
top_artists = data['artist'].value_counts().iloc[:5]
top_artists
Out[29]:
In [30]:
# Keep only the rows for the top artists, in ranking order.
monthly_listens_by_artist.reindex(index=top_artists.index)
Out[30]:
In [31]:
# Missing (artist, timestamp) combinations mean "no listen": replace NaN with 0.
(
    monthly_listens_by_artist
    .reindex(index=top_artists.index)
    .fillna(value=0)
)
Out[31]:
In [32]:
# Transpose so that the former columns become the row index.
(
    monthly_listens_by_artist
    .reindex(index=top_artists.index)
    .fillna(value=0)
    .transpose()
    .head(n=10)
)
Out[32]:
In [33]:
# Monthly listen totals for the top artists: one row per month, one column
# per artist.
# resample(..., how='sum') was deprecated in pandas 0.18 and later removed;
# the aggregation is now a method call on the resampler.
to_plot = (
    monthly_listens_by_artist
    .reindex(top_artists.index)
    .fillna(0)
    .T
    .resample('M')
    .sum()
)
to_plot.head()
Out[33]:
In [34]:
# One line per column of `to_plot`.
to_plot.plot(kind='line')
Out[34]:
In [35]:
# Running total down the rows, one line per column.
to_plot.cumsum(axis=0).plot(kind='line')
Out[35]:
In [36]:
# The whole pipeline from raw listens to the cumulative plot as one chained
# expression, one step per line.
# resample(..., how='sum') was removed from pandas; aggregate on the resampler.
(
    data.groupby(['artist', 'timestamp']).count()['item_id']  # listens per artist and timestamp
    .unstack()                       # innermost index level -> columns
    .reindex(top_artists.index)      # keep the top artists only
    .fillna(0)                       # no listen -> 0
    .T                               # former columns become the row index
    .resample('M')
    .sum()                           # monthly totals
    .cumsum()                        # running totals
    .plot()
)
Out[36]:
In [37]:
import scipy.stats as stats
import statsmodels
In [38]:
# Attach synthetic columns for the statistics demo.
# NOTE(review): no random seed is set, so these values differ on every run.
# np.random.random_integers is deprecated; randint's upper bound is exclusive,
# so (0, 20) draws from the same inclusive range 0..19.
data['subject'] = np.random.randint(0, 20, len(data))
data['measurement'] = np.random.random(len(data))
# Even subject ids go to condition 'a', odd ones to 'b' (vectorised instead of
# a Python-level apply over every row).
data['condition'] = np.where(data['subject'] % 2 == 0, 'a', 'b')
data.head()
Out[38]:
In [39]:
# A simple group comparison: independent-samples t-test of 'measurement'
# between condition 'a' and condition 'b'.
grpa = data.loc[data['condition'] == 'a', 'measurement']
grpb = data.loc[data['condition'] == 'b', 'measurement']
print('Group A')
print(grpa.describe())
print('Group B')
print(grpb.describe())
stats.ttest_ind(grpa, grpb)
Out[39]:
In [40]:
# Draw one of three labels uniformly at random for every row.
data['some_IV'] = np.random.choice(['x', 'y', 'z'], size=len(data))
data.head(n=5)
Out[40]:
In [41]:
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
# Fit an OLS model with both categorical factors and their interaction term,
# then print the ANOVA table for the fitted model.
model = ols(
    'measurement ~ C(condition) + C(some_IV) + C(condition):C(some_IV)',
    data=data,
).fit()
print(anova_lm(model))
In [42]:
from matplotlib import pyplot as plt
%matplotlib inline
# Small demo frame: three columns of ten uniform random numbers in [0, 1).
example = pd.DataFrame({name: np.random.random(10) for name in ('a', 'b', 'c')})
example.head()
Out[42]:
In [43]:
# pyplot state-machine interface: draw column 'a' on the current axes.
plt.plot(example.loc[:, 'a'])
Out[43]:
In [44]:
# Object-oriented interface: create the axes explicitly, then draw each column
# on the same axes.
fig, ax = plt.subplots(nrows=1, ncols=1)
for column in ('a', 'b', 'c'):
    ax.plot(example[column])
In [45]:
# Same plot, but each line carries a label so that a legend can be drawn.
fig, ax = plt.subplots(nrows=1, ncols=1)
for column in ('a', 'b', 'c'):
    ax.plot(example[column], label='column ' + column)
ax.legend()
Out[45]:
In [46]:
# pandas shortcut: one labelled line per column.
example.plot(kind='line')
Out[46]:
In [47]:
# Gallery of pandas plot kinds on a 3x3 grid, one kind per panel.
# ('fix' in the original was a typo for 'fig'.)
fig, axes = plt.subplots(3, 3, figsize=(16, 12))
ax_iter = axes.flat
# The builtin next() works on Python 2.6+ and Python 3; the .next() method
# exists only on Python 2 iterators.
example.plot(kind='bar', ax=next(ax_iter))
example.plot(kind='barh', ax=next(ax_iter))
example.plot(kind='hist', ax=next(ax_iter))
example.plot(kind='box', ax=next(ax_iter))
example.plot(kind='density', ax=next(ax_iter))
example.plot(kind='area', ax=next(ax_iter))
example.plot(kind='scatter', x='a', y='b', ax=next(ax_iter))
example.plot(kind='pie', y='a', ax=next(ax_iter))
example.plot(kind='hexbin', x='a', y='b', gridsize=20, ax=next(ax_iter))
Out[47]:
In [48]:
# Rebind `example` to the three columns used by the grouped-plot cells.
example = data.loc[:, ['condition', 'some_IV', 'measurement']]
example.head(n=5)
Out[48]:
In [49]:
# Mean per (condition, some_IV) cell.
grouped = example.groupby(by=['condition', 'some_IV'])
avg = grouped.mean()
avg
Out[49]:
In [50]:
# unstack() moves the innermost index level (some_IV) into the columns.
# First plot: the full unstacked frame. Second plot: only the 'measurement'
# block of it.
avg.unstack(level=-1).plot(kind='bar')
avg.unstack(level=-1)['measurement'].plot(kind='bar')
Out[50]:
In [51]:
# Standard error of the mean for each (condition, some_IV) cell.
# GroupBy.sem() computes std / sqrt(n) with the sample standard deviation
# (ddof=1), the conventional SEM estimator. The original np.std(x)/sqrt(len(x))
# used the population std (ddof=0) and relied on the string columns of each
# group being dropped silently.
# Selecting [['measurement']] keeps the result a DataFrame with a
# 'measurement' column, as the plotting cells expect.
SE = grouped[['measurement']].sem()
SE
Out[51]:
In [52]:
# Grouped bars with error bars taken from SE.
# NOTE(review): the factor 10 presumably just exaggerates the error bars so
# they are visible in the demo -- confirm before reusing for real results.
avg.unstack()['measurement'].plot(
    kind='bar',
    yerr=10 * SE.unstack()['measurement'],
)
Out[52]:
In [53]:
# The same grouped bar chart with explicit styling of every element.
fig, ax = plt.subplots(1, 1, figsize=(8, 6))
avg.unstack()['measurement'].plot(
    kind='bar',
    yerr=10 * SE.unstack()['measurement'],
    ax=ax,
    legend=None,
    color=['cyan', 'blue', 'purple'],
    alpha=0.6,
)
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.05, 1))
ax.set_ylabel('This is my dependent variable', fontdict={'fontname': 'Comic Sans MS', 'fontsize': 24})
ax.set_title('Check out my plot!', fontdict={'fontname': 'Arial', 'fontsize': 32})
plt.xticks([0, 1], ['Group A', 'Group B'], rotation=45)
ax.set_xlabel('Condition', fontsize=18)
ax.grid()
# Hand-placed marker; the coordinates are in data units.
ax.annotate("*", (0.155, 0.53), fontsize=16)
# Hide every frame line except the bottom one.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
In [57]:
# Month-by-month and cumulative share of listening for the top 3 artists.
try:  # Python 3
    from urllib.parse import unquote_plus
except ImportError:  # Python 2
    from urllib import unquote_plus

# Monthly listen counts for the three most-played artists.
# resample(..., how=...) was removed from pandas; aggregate on the resampler.
plot_data = (
    data.groupby(['artist', 'timestamp']).count()['item_id']
    .unstack()
    .reindex(top_artists.index[:3])
    .fillna(0)
    .T
    .resample('M')
    .sum()
)
# Monthly listen counts over all artists: the denominator for the proportions.
total_monthly_playcounts = time_indexed.resample('M').count()['item_id']

fig, axes = plt.subplots(1, 2, figsize=(18, 6))
ax_iter = axes.flat

# Left panel: share of each month's listens. The builtin next() works on both
# Python 2 and Python 3 iterators.
ax = next(ax_iter)
plot_data.divide(total_monthly_playcounts, axis=0).plot(ax=ax, grid=True, legend=False, lw=2, colormap='rainbow')
ax.set_ylabel('Monthly proportion of listening')

# Right panel: share of all listens accumulated up to each month.
ax = next(ax_iter)
plot_data.cumsum().divide(total_monthly_playcounts.cumsum(), axis=0).plot(ax=ax, grid=True, legend=False, lw=2, colormap='rainbow')
ax.set_ylabel('Cumulative monthly proportion of listening')

# Legend labels are URL-quoted names: unquote them and title-case the result.
leg = ax.legend()
text = leg.get_texts()
for t in text:
    label = t.get_text()
    if not isinstance(label, str):
        # Python 2 unicode: go through ASCII bytes so that %XX escapes are
        # decoded as UTF-8 below.
        label = label.encode('ascii')
    label = unquote_plus(label)
    if isinstance(label, bytes):
        # Python 2 returns bytes here; Python 3 already returned decoded text.
        label = label.decode('utf8')
    t.set_text(label.title())

for ax in axes:
    # set_axis_bgcolor was removed from matplotlib; prefer set_facecolor and
    # fall back to the old name on old installations.
    set_background = getattr(ax, 'set_facecolor', None) or ax.set_axis_bgcolor
    set_background('#DADADA')
    #ax.set_ylim(0,0.22)
fig.suptitle('Month-by-month and cumulative listening, top 3 artists',fontsize=20)
Out[57]: