In [1]:
from bs4 import BeautifulSoup

import requests
import re

In [2]:
# Fetch the first page of CiteSeerX search results for the query "attitude".
# The timeout keeps the cell from hanging indefinitely if the server stalls,
# and raise_for_status() prevents an HTTP error page from being parsed as if
# it were a result list.
r = requests.get(
    "http://citeseerx.ist.psu.edu/search?q=attitude&submit=Search&sort=rlv&t=doc",
    timeout=30,
)
r.raise_for_status()

data = r.text

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
soup = BeautifulSoup(data, "html.parser")

In [3]:
# Collect the result-title links: <a> tags whose class attribute is exactly
# "remove doc_details". find_all is the current name of the deprecated
# camelCase alias findAll.
papers_on_page = soup.find_all('a', class_='remove doc_details')

In [4]:
# Show the href attribute of every collected result link, one per line.
for paper_on_page in papers_on_page:
    href = paper_on_page.get('href')
    print(href)


/viewdoc/summary;jsessionid=7C12F6C8F5BB7349EF7660E9F78747C5?doi=10.1.1.31.1768&rank=1
/viewdoc/summary;jsessionid=7C12F6C8F5BB7349EF7660E9F78747C5?doi=10.1.1.317.9673&rank=2
/viewdoc/summary;jsessionid=7C12F6C8F5BB7349EF7660E9F78747C5?doi=10.1.1.320.8769&rank=3
/viewdoc/summary;jsessionid=7C12F6C8F5BB7349EF7660E9F78747C5?doi=10.1.1.169.4836&rank=4
/viewdoc/summary;jsessionid=7C12F6C8F5BB7349EF7660E9F78747C5?doi=10.1.1.37.7970&rank=5
/viewdoc/summary;jsessionid=7C12F6C8F5BB7349EF7660E9F78747C5?doi=10.1.1.197.1486&rank=6
/viewdoc/summary;jsessionid=7C12F6C8F5BB7349EF7660E9F78747C5?doi=10.1.1.119.2204&rank=7
/viewdoc/summary;jsessionid=7C12F6C8F5BB7349EF7660E9F78747C5?doi=10.1.1.217.2021&rank=8
/viewdoc/summary;jsessionid=7C12F6C8F5BB7349EF7660E9F78747C5?doi=10.1.1.177.779&rank=9
/viewdoc/summary;jsessionid=7C12F6C8F5BB7349EF7660E9F78747C5?doi=10.1.1.248.4138&rank=10

In [5]:
# Example result link as it appears on the search page. It carries a
# ";jsessionid=..." path parameter that is removed below.
string = '/viewdoc/summary;jsessionid=4C1CD7E8F0D4A4E4BABAE601DE8D326F?doi=10.1.1.317.9673&rank=1'

# Raw string: in a normal literal '\?' is an invalid escape sequence and
# triggers a warning. [^?]* stops at the first '?', and count=1 limits the
# substitution to that single path parameter, so the query string is never
# swallowed even if it contains ';' or another '?'.
paper_suffix = re.sub(r';[^?]*\?', '?', string, count=1)

In [6]:
# Build the absolute address by prefixing the site root to the cleaned
# relative link; the bare name on the last line displays the result.
paper_url = ''.join(['http://citeseerx.ist.psu.edu', paper_suffix])
paper_url


Out[6]:
'http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.317.9673&rank=1'

Get citations from the seed paper


In [7]:
# Download the summary page of the seed paper. paper_url (built in the
# previous cell) is the same address that used to be hard-coded here, so the
# URL is no longer duplicated.
# The timeout avoids hanging on a stalled server; raise_for_status() stops an
# HTTP error page from being parsed as a paper summary.
r = requests.get(paper_url, timeout=30)
r.raise_for_status()

data = r.text

# Explicit parser so the parse does not depend on which optional parsers are
# installed (and to silence BeautifulSoup's "no parser specified" warning).
# Note: r, data and soup are rebound here and now refer to the paper page,
# not the search-result page.
soup = BeautifulSoup(data, "html.parser")

In [8]:
# Look up the <div id="docAuthors"> element(s) on the paper page; the bare
# expression displays the matches. find_all replaces the deprecated camelCase
# alias findAll.
soup.find_all('div', id='docAuthors')


Out[8]:
[<div id="docAuthors">
         by 
           
             
               
               
                 Icek Ajzen
               
               
             
           
         </div>]