Google Trends is pretty awesome, except that the site itself won't let you do much more than overlay plots. Here we'll play with search-term data downloaded from Google and draw our own conclusions.
We'll use numpy and matplotlib to explore the data. Remember you can import both at once using:
In [2]:
%pylab inline
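%pylab inline pulls numpy and matplotlib's plotting functions into the top-level namespace, which is why the cells below can call plot, find, and friends without a prefix. If you prefer explicit imports, the rough equivalent is the following (with these you'd write np.flatnonzero in place of pylab's find):
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt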
In [3]:
# we can import the CSV data as a numpy rec array
from matplotlib.pylab import csv2rec
trends = csv2rec('trends.csv')
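A caveat: csv2rec has been removed from recent matplotlib releases. If it's missing from your install, numpy's genfromtxt can stand in; a sketch, assuming trends.csv has a header row and ISO-formatted dates (e.g. 2004-01-04) in the first column:
import numpy as np
from datetime import datetime

# assumed layout: header row, dates like 2004-01-04 in column 0;
# .view(np.recarray) restores the attribute access used below
to_date = lambda s: datetime.strptime(s, '%Y-%m-%d').date()
trends = np.genfromtxt('trends.csv', delimiter=',', names=True,
                       dtype=None, converters={0: to_date}).view(np.recarray)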
In [4]:
plot(trends.week_start, trends.spring_break, label='spring break')
plot(trends.week_start, trends.textbooks, label='textbooks')
plot(trends.week_start, trends.norad, label='norad')
plot(trends.week_start, trends.skiing, label='skiing')
legend()
Out[4]: [line plot of weekly search interest for the four terms]
In [5]:
# build integer arrays of the year and ISO week number for each sample
dates = trends.week_start
yrs = zeros(len(dates), dtype=int)
wks = zeros(len(dates), dtype=int)
for i in range(len(dates)):
    yrs[i] = dates[i].year
    wks[i] = dates[i].isocalendar()[1]
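The same two arrays can be built without the explicit loop; an equivalent one-liner per array:
yrs = array([d.year for d in dates])
wks = array([d.isocalendar()[1] for d in dates])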
In [30]:
# for each year, list the week number(s) at which searches peaked
trend = trends.global_warming
for yr in range(2004, 2016):
    idx = find(yrs == yr)
    print yr, wks[idx[find(trend[idx] == max(trend[idx]))]]
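If you only care about the first week of each year's maximum, argmax is a compact alternative (a sketch; unlike find, it silently drops ties):
for yr in range(2004, 2016):
    idx = find(yrs == yr)
    print yr, wks[idx[argmax(trend[idx])]]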
In [44]:
# study the scatter about the median value of each series
def std_median(datums):
    # RMS deviation about the median (a standard-deviation analog)
    return sqrt(mean((datums - median(datums))**2))

print "spring break: ", std_median(trends.spring_break)
print "textbooks: ", std_median(trends.textbooks)
print "skiing: ", std_median(trends.skiing)
print "norad: ", std_median(trends.norad)
print "global warming: ", std_median(trends.global_warming)
numpy has tools for cross-correlations. First, the autocorrelation of spring_break with itself:
result = np.correlate(trends.spring_break,trends.spring_break,mode='full')
plot(arange(result.size) - result.size/2,result)
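With seasonal data like this, the raw autocorrelation is dominated by the large positive mean of the series. Subtracting the mean first should make the annual structure stand out; a sketch (the side peaks near ±52 weeks are what we'd expect, not a stated result):
a = trends.spring_break - mean(trends.spring_break)
auto = np.correlate(a, a, mode='full')
lag = arange(auto.size) - auto.size/2
plot(lag, auto)   # annual cycle should show up as side peaks near lags of +/-52 weeks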
In [16]:
result = np.correlate(trends.norad, trends.spring_break, mode='full')
gap = arange(result.size) - result.size/2
plot(gap, result)
print gap[find(result == max(result))]
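A note on reading the lag: numpy defines the correlation as c[k] = sum over n of a[n+k]*v[n], so a peak at positive gap means the first series trails the second. A quick synthetic sanity check (hypothetical, just to confirm the convention):
# shift spring_break 10 weeks later, then correlate against the original
shifted = roll(trends.spring_break, 10)
chk = np.correlate(shifted, trends.spring_break, mode='full')
lag = arange(chk.size) - chk.size/2
print lag[find(chk == max(chk))]   # expect the peak at (or very near) +10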
In [20]:
result = np.correlate(trends.textbooks,trends.spring_break, mode='full')
gap = arange(result.size) - result.size/2
plot(gap,result)
print gap[find(result==max(result))]
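To put these peaks on an interpretable -1 to 1 scale, mean-subtract both series and divide by the product of their norms; this is the standard normalized cross-correlation, sketched here as an extension of the cell above:
a = trends.textbooks - mean(trends.textbooks)
b = trends.spring_break - mean(trends.spring_break)
result = np.correlate(a, b, mode='full') / sqrt(sum(a**2) * sum(b**2))
gap = arange(result.size) - result.size/2
plot(gap, result)
print gap[find(result == max(result))]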