This is a first attempt to look at some of the data resulting from the chefkoch link data (without getting fancy about graph databases).
Note these analyses currently use a subset of approx. 30k recipes (of a total of 800k).
In [14]:
# Libraries and graphics settings
import os
import re

import pandas as pd
import statsmodels as sm
import statsmodels.formula.api as smf
# The wildcard import stays between the `sm` and `mpl` imports, as before, so
# any names it exports shadow (or are shadowed by) exactly the same bindings.
from ggplot import *
import matplotlib.pyplot as mpl
from pylab import rcParams

theme_bw()
mpl.style.use('ggplot')
%matplotlib inline
rcParams['figure.figsize'] = 10, 5
In [2]:
# Read data
# The CSV location is machine-specific. It can be overridden without editing
# the notebook by setting the CHEFKOCH_LINK_DATA environment variable; the
# original absolute path remains the default.
DATA_PATH = os.environ.get(
    "CHEFKOCH_LINK_DATA",
    "/Users/Leon/Documents/02_Research_Learning/Research/Recipes/03_Data/link_data.csv",
)
# The first CSV column is used as the row index.
df = pd.read_csv(DATA_PATH, index_col=0)
In [3]:
# Fix data formats
df['activationdate'] = pd.to_datetime(df['activationdate'])
df['difficulty'] = df['difficulty'].astype('category')

# Preparation time: strip the " min." suffix from the text and convert the
# remainder to a float (values are presumably of the form "30 min.").
# `str` is used rather than the 'string' alias, whose meaning depends on the
# numpy/pandas version; with `str`, missing values become the text "nan",
# which float conversion turns back into NaN.
df['preptime'] = df['preptime'].astype(str)
df['prep_mins'] = df['preptime'].map(lambda text: text.replace(" min.", "")).astype('float64')

# Text columns: cast to Python str. pandas stores str values with dtype
# `object`, so df.dtypes still reports `object` for them -- that is expected.
df['subtitle'] = df['subtitle'].astype(str)
df['title'] = df['title'].astype(str)

# Calendar helpers for plotting. `yearmonth` is an integer YYYYMM key
# (e.g. 200805 for May 2008); the factor must be 100, not 1000, for that layout.
df['yearmonth'] = df['activationdate'].dt.year * 100 + df['activationdate'].dt.month
df['year'] = df['activationdate'].dt.year

# Add one to vote count for plotting with log axis (zero is undefined on a log scale)
df['votes_n_plus1'] = df['votes_n'] + 1
In [4]:
# Dimensions of the prepared frame as a (rows, columns) tuple
df.shape
Out[4]:
In [12]:
# Preview the first five rows (five is the default for head())
df.head()
Out[12]:
In [5]:
# Column labels available after the format fixes
df.columns
Out[5]:
In [6]:
# Summary statistics for the columns
df.describe()
Out[6]:
In [7]:
# Per category: number of rows with a non-missing vote count
df.groupby('category')[['votes_n']].count()
Out[7]:
In [10]:
# Two-column working frame (year, average rating); later cells reuse `small`
small = df[['year', 'votes_avg']]

# count() tallies the non-missing votes_avg values within each year
per_year = small.groupby('year').count()
per_year.plot(
    kind='bar',
    title="Number of recipes added each year",
    color='darkblue',
    legend=None,
)
Out[10]:
In the subsample of the data parsed so far, the number of recipes added per year grew strongly from 2003 to 2008 and then declined, apart from a notable jump in 2015.
In [11]:
# Running total: per-year counts accumulated in ascending year order
yearly_counts = small.sort_values(by='year').groupby('year').count()
cumsum = yearly_counts.cumsum()
cumsum.plot(
    kind='bar',
    title='Total number of recipes on the platform',
    color='darkblue',
    legend=None,
)
Out[11]:
In [125]:
# Histogram of average recipe ratings (default binning)
hist_style = dict(color='darkblue', alpha=0.7, legend=None)
df.plot(kind='hist', y='votes_avg', title='Distribution of recipe scores', **hist_style)
Out[125]:
Ratings for recipes are disproportionately clustered around 3.
In [55]:
# Box plot of the non-grouping column of `small` (votes_avg), one box per year
small.boxplot(by='year')
Out[55]:
A few things are immediately visible from the boxplot:
In [13]:
# Average rating of each recipe against its activation date; the low alpha
# keeps densely populated regions readable
df.plot(
    x='activationdate',
    y='votes_avg',
    style=".",
    color='darkblue',
    alpha=0.1,
    legend=None
)
Out[13]:
In [ ]:
# Ordinary least squares fit of average rating on activation date.
# The formula interface is used via its own submodule (`smf`), because a bare
# `import statsmodels as sm` does not guarantee `sm.formula.api` is loaded.
# NOTE(review): `activationdate` is a datetime column at this point -- confirm
# the formula treats it as a numeric trend and not as a categorical term.
# lowess = sm.nonparametric.lowess(df[['votes_avg']], df[['activationdate']], frac=.3)
mod = smf.ols(formula='votes_avg ~ activationdate', data=df)
res = mod.fit()
# Function-call form of print works on both Python 2 and Python 3.
print(res.summary())
In [115]:
# Vote count (shifted by one so zero-vote recipes fit on the log x-axis)
# against average rating
ax = df.plot(
    kind='scatter',
    x='votes_n_plus1',
    y='votes_avg',
    logx=True,
    title='Count and average of votes for recipes',
    color='darkblue',
    s=20,
    alpha=0.5
)
# Pin the y-axis to the range 1-5
ax.set_ylim([1, 5])
Out[115]:
Notes on the plot above:
In [129]:
# Histogram of the number of votes per recipe, using 100 bins and a
# log-scaled y-axis
df.plot(
    y='votes_n',
    kind='hist',
    bins=100,
    logy=True,
    color='darkblue',
    alpha=0.7,
    legend=None
)
Out[129]:
In [144]:
# Stacked bars: per year, the number of rows with a vote count, split by difficulty
by_year_difficulty = df.groupby(['year', 'difficulty'])[['votes_n']].count()
by_year_difficulty.unstack().plot(kind='bar', stacked=True)
Out[144]:
In [ ]: