In [19]:
from datetime import datetime
import calendar
import numpy as np
from bs4 import BeautifulSoup 
import requests
import csv
import pandas as pd

In [20]:
def generateURL():
    crossfitBaseUrl = 'http://www.crossfit.com/mt-archive2/YYYY_MM.html'
    #yearRange = ["2005","2007","2008","2009","2010","2011","2012","2013"]
    yearRange = range(2002, 2014)
    monthRange = ['01','02','03','04','05','06','07','08','09','10','11','12']
    #monthRange = ['01']
    crossfitUrls = [];
    for year in yearRange:
        for month in monthRange:
            crossfitYearUrl = crossfitBaseUrl.replace('YYYY', str(year))
            crossfitUrl = crossfitYearUrl.replace('MM', str(month))
            crossfitUrls.append(crossfitUrl);
    return crossfitUrls

In [21]:
urls = generateURL();

with open("workouts.tsv", "w+") as f:
        fieldnames = ("Date", "Workout")
        output = csv.writer(f, delimiter="\t")
        output.writerow(fieldnames)
        for url in urls:
            r = requests.get(url)
            soup = BeautifulSoup(r.text)
            #print soup
            blogBodies = soup.findAll("div", {"class" : "blogbody"})
            for line  in blogBodies:
                #print('\n~break~\n');
                workoutName = line.find("h3", {"class" : "title"})
                #print workoutName
                if workoutName is not None:
                    if(len(workoutName.text.split()) < 2):
                        # Some entries only have the day and not the actual date
                        #workoutDate = np.nan;
                        break;
                    else:
                        #Add 20 to the year part, so Pandas can easily extract the date format
                        workoutDate = "20"+workoutName.text.split()[1];
                para =line.findAll('p')
                workout = "";
                for p in para:
                    # Sometime their workouts have links and they need to be filtered for
                    anchorExists = p.find("a")
                    if (p.text=='Enlarge image') |('Post' in p.text)|('comments' in p.text)|(p.text == "") :
                        break;
                    else:
                        workout = workout+p.text.encode('utf-8') 
                output.writerow([workoutDate,workout.strip().replace('\n',' ')])

In [22]:
cols =['Date','Workout']
df = pd.read_csv('workouts.tsv', sep='\t', converters={'Date': str})  
df = df.dropna()
print df


        Date                                            Workout
0   20020131  Row 2000 meters Rest for 2000 meter time. Row ...
1   20020131  For time: 50 Push-ups 50 Pull-ups 75 Sit-ups 7...
2   20020130  Row 2000 meters Rest for 2000 meter time. Row ...
3   20020129  Complete as many rounds in 20 minutes as you c...
4   20020128  Bike ten minutes, hard. Rest five minutes. Fro...
5   20020127                                           Rest day
6   20020126                                           Rest day
7   20020125  Five rounds for time of: 15 Back/Hip Extension...
8   20020124  Three rounds of: Jump Rope "double-unders"; 2 ...
9   20020123  Row 1 minute at sub 1:30/500 meter pace,  Row ...
10  20020122  20 Back squat Rest 5 minutes Run 800 meters Re...
11  20020121  24 inch Box jump, 25 reps 25 Pull-ups Rest 22 ...
12  20020120                                           Rest day
13  20020119                                           Rest day
14  20020118  Complets as many rounds in 20 minutes as you c...
15  20020117  Complete as many rounds in 20 minutes as you c...
16  20020116  Complete as many rounds in 20 minutes as you c...
17  20020115  Complete as many rounds in 20 minutes as you c...
18  20020114  Complete as many rounds in 20 minutes as you c...
19  20020113                                           Rest day
20  20020112                                           Rest day
21  20020111  Three rounds for time of: 45 pound Front squat...
22  20020110  Complete as many rounds in 20 minutes as you c...
23  20020109  1000 meter Row Snatch 10-10-10-10-10 reps 1000...
24  20020108  Have a partner gently assist you through twent...
25  20020107  Deadlift 5-3-1-3-5 reps Run 1 mile Deadlift 5-...
26  20020106                                           Rest day
27  20020105                                           Rest day
28  20020104  Complete 3-5 rounds of: 800 meter Run 15 Power...
30  20020228  Move through this circuit slowly, but steadily...
31  20020227  For time: 1 mile Bike 75 pound Thrusters, 20 r...
32  20020226                                           Rest day
33  20020225  Seven rounds of: 3 Deadlift 3 Bench press ____...
34  20020224  Three to five rounds of:  65 pound Powerclean,...
35  20020223  Can you perform this sequence every minute on ...
36  20020222                                           Rest day
37  20020221  Deadlift 1-1-1-1-1-1-1-1-1-1, repNotes: 1. Sta...
38  20020220  1 mile Bike Rest as needed. 20 inch Box jump, ...
39  20020219  20 Pull ups 5 Dips 15 Pull ups 10 Dips 5 Pull ...
40  20020218                                           Rest day
41  20020217  Back sqaut 3-3-3-3-3-3-3-3-3-3, repsPace as ne...
42  20020216  Five rounds, 21-18-15-12 and 9 reps, for time ...
43  20020215  800 meter Run Rest 5 minutes 500 meter Row Res...
44  20020214                                           Rest day
45  20020213  Back squat 5-3-1-3-5, reps Bench press 5-3-1-3...
46  20020212  Five rounds each for time of: 400 meter Run 20...
47  20020211                                           Rest day
48  20020210  Muscle ups 3-3-3-3-3-3-3-3-3-3, repsNotes: 1. ...
49  20020209  Back squat 5-5-5-3-3-3-1-1-1 repsNotes:  1. Lo...
50  20020208  In any order and within thirty minutes:  Row a...
51  20020207  Seven rounds for time of: 5 Pull ups 7 Dips (u...
52  20020206  Seven rounds for time of: 5 Pull ups 7 Dips (u...
53  20020205  Jump rope for sixty seconds,  rest for sixty s...
54  20020204  Deadlift 5-4-4-3-3-3-2-2-2-2-1-1-1-1-1 reps __...
55  20020203                                           Rest day
56  20020202                                           Rest day
57  20020201  Powerclean 10-10-10-10-10-10-10 repsNotes: 1. ...
58  20020331  Three rounds of: 1 mile Run 21 Bench press; bo...
59  20020330  Rest day"There is no single sport or activity ...
60  20020329  Three rounds of: 1 mile Bike 20 Cleans; altern...
         ...                                                ...

[4187 rows x 2 columns]

In [17]:


In [ ]: