In [99]:
%matplotlib inline
import matplotlib.pyplot as plt
import requests
import BeautifulSoup as soup
import time
import numpy as np
import pandas as pd
import bleach

In [18]:
# Cache of downloaded result pages: maps request URL -> raw HTML text of the response.
# NOTE: re-running this cell discards everything downloaded so far.
dldict = dict()

In [19]:
# Download the Google Scholar result pages for the quoted query, caching the raw
# HTML of every successful response in `dldict` (keyed by URL). Pages already in
# the cache are skipped, so re-running this cell only fetches what is missing.
# The loop stops at the first non-200 response or after `max_pages` pages.
start = 0          # result offset sent as the `start` query parameter
num_page = 20      # results requested per page (the `num` query parameter)
max_pages = 100    # upper bound on the number of pages to request
sc = 200           # status code of the most recent request
while sc == 200 and start < num_page * max_pages:
    url = ('https://scholar.google.com/scholar?start=' + str(start)
           + '&q=%22Fast+Algorithms+for+Mining+Association+Rules%22&hl=en&num=' + str(num_page)
           + '&as_sdt=0,34')
    if url not in dldict:
        # Single-argument print: same output under Python 2 and Python 3.
        print('Retrieving page %d' % (start // num_page))
        # A timeout keeps a stalled connection from hanging the notebook forever.
        page = requests.get(url, timeout=30)
        time.sleep(1)  # throttle: pause one second after every request
        sc = page.status_code
        if sc == 200:
            dldict[url] = page.text
    start += num_page


Retrieving page 0
Retrieving page 1
Retrieving page 2
Retrieving page 3
Retrieving page 4
Retrieving page 5
Retrieving page 6
Retrieving page 7
Retrieving page 8
Retrieving page 9
Retrieving page 10
Retrieving page 11
Retrieving page 12
Retrieving page 13
Retrieving page 14
Retrieving page 15
Retrieving page 16
Retrieving page 17
Retrieving page 18
Retrieving page 19
Retrieving page 20
Retrieving page 21
Retrieving page 22
Retrieving page 23
Retrieving page 24
Retrieving page 25
Retrieving page 26
Retrieving page 27
Retrieving page 28
Retrieving page 29
Retrieving page 30
Retrieving page 31
Retrieving page 32
Retrieving page 33
Retrieving page 34
Retrieving page 35
Retrieving page 36
Retrieving page 37
Retrieving page 38
Retrieving page 39
Retrieving page 40
Retrieving page 41
Retrieving page 42
Retrieving page 43
Retrieving page 44
Retrieving page 45
Retrieving page 46
Retrieving page 47
Retrieving page 48
Retrieving page 49
Retrieving page 50
Retrieving page 51
Retrieving page 52
Retrieving page 53
Retrieving page 54
Retrieving page 55
Retrieving page 56
Retrieving page 57
Retrieving page 58
Retrieving page 59
Retrieving page 60
Retrieving page 61
Retrieving page 62
Retrieving page 63
Retrieving page 64
Retrieving page 65
Retrieving page 66
Retrieving page 67
Retrieving page 68
Retrieving page 69
Retrieving page 70
Retrieving page 71
Retrieving page 72
Retrieving page 73
Retrieving page 74
Retrieving page 75
Retrieving page 76
Retrieving page 77
Retrieving page 78
Retrieving page 79
Retrieving page 80
Retrieving page 81
Retrieving page 82
Retrieving page 83
Retrieving page 84
Retrieving page 85
Retrieving page 86
Retrieving page 87
Retrieving page 88
Retrieving page 89
Retrieving page 90
Retrieving page 91
Retrieving page 92
Retrieving page 93
Retrieving page 94
Retrieving page 95
Retrieving page 96
Retrieving page 97
Retrieving page 98
Retrieving page 99

In [90]:
# Candidate years, as strings, searched for in each paper's detail line: '1994' .. '2015'.
# Built as a real list (not a lazy `map`) so it can be scanned once per paper.
template_years = [str(candidate) for candidate in range(1994, 2016)]

In [109]:
# Parse the cached result pages in `dldict`, page by page, and collect one entry
# per paper in four parallel lists: titles, abstracts, details, years.
# The URLs are rebuilt exactly as in the download cell; parsing stops at the first
# URL that is not in the cache.
start = 0
num_page = 20
running = True
titles = []
abstracts = []
details = []
years = []
while running:
    url = ('https://scholar.google.com/scholar?start=' + str(start)
           + '&q=%22Fast+Algorithms+for+Mining+Association+Rules%22&hl=en&num=' + str(num_page)
           + '&as_sdt=0,34')
    # Explicit membership test: a KeyError raised while parsing must not be
    # mistaken for "no more pages".
    if url in dldict:
        page = dldict[url]
        s = soup.BeautifulSoup(page)
        # Each result is a <div class="gs_ri"> block.
        papers = s.findAll('div', {'class': 'gs_ri'})
        for paper in papers:
            title = paper.find('a').text  # text of the first link in the result block
            atxt = paper.find('div', {'class': 'gs_rs'})
            # The snippet div can be absent; use an empty abstract in that case.
            abstract = bleach.clean(atxt.text) if atxt is not None else ''
            detail = paper.find('div', {'class': 'gs_a'}).text
            # Reset for every paper: -1 marks "no known year found", so a paper
            # without a recognisable year does not inherit the previous paper's.
            year = -1
            for y in template_years:
                # First candidate year that occurs anywhere in the detail line wins.
                if detail.find(y) != -1:
                    year = y
                    break
            titles.append(title)
            abstracts.append(abstract)
            details.append(detail)
            years.append(year)
        # Single-argument print: same output under Python 2 and Python 3.
        print('For page %d found %d papers' % (start // num_page, len(papers)))
    else:
        running = False
    start += num_page


For page 0 found 20 papers
For page 1 found 20 papers
For page 2 found 20 papers
For page 3 found 20 papers
For page 4 found 20 papers
For page 5 found 20 papers
For page 6 found 20 papers
For page 7 found 20 papers
For page 8 found 20 papers
For page 9 found 20 papers
For page 10 found 20 papers
For page 11 found 20 papers
For page 12 found 20 papers
For page 13 found 20 papers
For page 14 found 20 papers
For page 15 found 20 papers
For page 16 found 20 papers
For page 17 found 20 papers
For page 18 found 20 papers
For page 19 found 20 papers
For page 20 found 20 papers
For page 21 found 20 papers
For page 22 found 20 papers
For page 23 found 20 papers
For page 24 found 20 papers
For page 25 found 20 papers
For page 26 found 20 papers
For page 27 found 20 papers
For page 28 found 20 papers
For page 29 found 20 papers
For page 30 found 20 papers
For page 31 found 20 papers
For page 32 found 20 papers
For page 33 found 20 papers
For page 34 found 20 papers
For page 35 found 20 papers
For page 36 found 20 papers
For page 37 found 20 papers
For page 38 found 20 papers
For page 39 found 20 papers
For page 40 found 20 papers
For page 41 found 20 papers
For page 42 found 20 papers
For page 43 found 20 papers
For page 44 found 20 papers
For page 45 found 20 papers
For page 46 found 20 papers
For page 47 found 20 papers
For page 48 found 20 papers
For page 49 found 0 papers
For page 50 found 0 papers
For page 51 found 0 papers
For page 52 found 0 papers
For page 53 found 0 papers
For page 54 found 0 papers
For page 55 found 0 papers
For page 56 found 0 papers
For page 57 found 0 papers
For page 58 found 0 papers
For page 59 found 0 papers
For page 60 found 0 papers
For page 61 found 0 papers
For page 62 found 0 papers
For page 63 found 0 papers
For page 64 found 0 papers
For page 65 found 0 papers
For page 66 found 0 papers
For page 67 found 0 papers
For page 68 found 0 papers
For page 69 found 0 papers
For page 70 found 0 papers
For page 71 found 0 papers
For page 72 found 0 papers
For page 73 found 0 papers
For page 74 found 0 papers
For page 75 found 0 papers
For page 76 found 0 papers
For page 77 found 0 papers
For page 78 found 0 papers
For page 79 found 0 papers
For page 80 found 0 papers
For page 81 found 0 papers
For page 82 found 0 papers
For page 83 found 0 papers
For page 84 found 0 papers
For page 85 found 0 papers
For page 86 found 0 papers
For page 87 found 0 papers
For page 88 found 0 papers
For page 89 found 0 papers
For page 90 found 0 papers
For page 91 found 0 papers
For page 92 found 0 papers
For page 93 found 0 papers
For page 94 found 0 papers
For page 95 found 0 papers
For page 96 found 0 papers
For page 97 found 0 papers
For page 98 found 0 papers
For page 99 found 0 papers

In [92]:
# Assemble the parallel lists built by the parsing cell into one table, one row per paper.
df = pd.DataFrame(dict(
    title=titles,
    abstract=abstracts,
    details=details,
    year=years,
))

In [103]:
# Number of non-null titles per distinct value of the 'year' column (indexed by year).
citations = df['title'].groupby(df['year']).count()

In [108]:
# Line plot of `citations` (count per year) against its index.
fig, ax = plt.subplots(figsize=(15, 5))
# Fixed typo: the series is named `citations`; `citiations` raised a NameError.
ax.plot(citations.index, citations)
ax.set_xlabel('year', fontsize=22)
ax.set_ylabel('number of citations', fontsize=22)
plt.show()



In [81]:
# Debugging aid (disabled): uncomment the lines below to render the HTML held in `page` inline.

#import IPython
#IPython.display.HTML(page)