Movies have form www.imdb.com/title/tt2406566/ #atomic blonde

To get full credits cast, writers, etc:

look for

inside it will have alternating a tags linking back to the actor names and pages


To download full source of website: view-source:http://www.imdb.com/title/tt2406566/fullcredits


In [ ]:


In [2]:
import requests
import urllib.parse as parse
from time import sleep

In [4]:
from requests import Request, Session

In [3]:
title_nums = ['tt0085244'] #The Big Chill

In [5]:
title_nums2 = ["tt0295700"] #Wrong Turn

In [7]:
def make_urls():
    base_url = "http://www.imdb.com/title/"
    urls = []
    
    for title in title_nums:
        urls.append(base_url + title + '/')
    return urls

In [6]:
def my_count():
    n = 1000
    while True:
        yield n
        n += 1
    
numbers = my_count()

In [42]:
def start(my_session = None):
    
    urls = make_urls()
    print('Urls', urls)
    
    for url in urls:
        try:
            r = my_session.get(url)
            print("request headers",r.request.headers)
            print("response headers",r.headers)
        except Exception as e:
            print("accessing url", url)    
            print('Exception encountered at position 1:', e)
            
    return r, urls

In [8]:
def write_result(response, **kwargs):

    print('writing file from..',response.url)  
    filename =  "test1.html"
    with open(filename, 'wb') as f:
        f.write(response.content)
        f.write(response.url.encode('utf-8'))
        print('saved file %s' % filename)

In [23]:
import asyncio
import aiofiles
import aiohttp

base_url = 'http://stats.nba.com/stats'
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5)'
}

async def get_players(player_args):
    endpoint = '/commonallplayers'
    params = {'leagueid': '00', 'season': '2016-17', 'isonlycurrentseason': '1'}
    
    url = base_url + endpoint
    
    print('Getting all players...')
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=None, params=params, timeout=20) as resp:
            data = await resp.json()
    player_args.extend(
        [(item[0], item[2]) for item in data['resultSets'][0]['rowSet']])

async def get_player(player_id, player_name):
    endpoint = '/commonplayerinfo'
    params = {'playerid': player_id}
    url = base_url + endpoint
    print('Getting player', player_name)
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=HEADERS, params=params) as resp:
            print(resp)
            data = await resp.text()
    async with aiofiles.open(
            '{}'.format({player_name.replace(" ", "_")}) + '.json', 'w') as file:
        await file.write(data)

loop = asyncio.get_event_loop()
player_args = []
loop.run_until_complete(get_players(player_args))
loop.run_until_complete(
    asyncio.gather(
        *(get_player(*args) for args in player_args)
    )
)


Getting all players...
---------------------------------------------------------------------------
TimeoutError                              Traceback (most recent call last)
<ipython-input-23-4d321bf2baad> in <module>()
     36 loop = asyncio.get_event_loop()
     37 player_args = []
---> 38 loop.run_until_complete(get_players(player_args))
     39 loop.run_until_complete(
     40     asyncio.gather(

/anaconda/lib/python3.6/asyncio/base_events.py in run_until_complete(self, future)
    464             raise RuntimeError('Event loop stopped before Future completed.')
    465 
--> 466         return future.result()
    467 
    468     def stop(self):

<ipython-input-23-4d321bf2baad> in get_players(player_args)
     16     print('Getting all players...')
     17     async with aiohttp.ClientSession() as session:
---> 18         async with session.get(url, headers=None, params=params, timeout=20) as resp:
     19             data = await resp.json()
     20     player_args.extend(

/anaconda/lib/python3.6/site-packages/aiohttp/client.py in __aenter__(self)
    634         @asyncio.coroutine
    635         def __aenter__(self):
--> 636             self._resp = yield from self._coro
    637             return self._resp
    638 

/anaconda/lib/python3.6/site-packages/aiohttp/client.py in _request(self, method, url, params, data, json, headers, skip_auto_headers, auth, allow_redirects, max_redirects, encoding, compress, chunked, expect100, read_until_eof, proxy, proxy_auth, timeout)
    238                         resp = req.send(conn)
    239                         try:
--> 240                             yield from resp.start(conn, read_until_eof)
    241                         except:
    242                             resp.close()

/anaconda/lib/python3.6/site-packages/aiohttp/client_reqrep.py in start(self, connection, read_until_eof)
    563                 if self._continue is not None and not self._continue.done():
    564                     self._continue.set_result(True)
--> 565                     self._continue = None
    566 
    567         # payload eof handler

/anaconda/lib/python3.6/site-packages/aiohttp/helpers.py in __exit__(self, exc_type, exc_val, exc_tb)
    704 
    705         if exc_type is asyncio.CancelledError and self._cancelled:
--> 706             raise asyncio.TimeoutError from None
    707 
    708     def timeout(self):

TimeoutError: 

In [19]:
!ls


1000.html                             The-Numbers-Data-Exploration.ipynb
1001.html                             The-Numbers-HTML-Parser.ipynb
IMDB-Notes.ipynb                      The-Numbers-Parser-v2.ipynb
OMDB-Data-Download.ipynb              aiohttp-and-asyncio.ipynb
The-Numbers-Box-Office-Download.ipynb backup

In [15]:
!conda install -y aiofiles


Fetching package metadata ...........
Solving package specifications: .

Package plan for installation in environment /anaconda:

The following NEW packages will be INSTALLED:

    aiofiles: 0.3.1-py36_0

aiofiles-0.3.1 100% |################################| Time: 0:00:00 688.83 kB/s

In [16]:
!conda install -y aiohttp


Fetching package metadata ...........
Solving package specifications: .

Package plan for installation in environment /anaconda:

The following NEW packages will be INSTALLED:

    aiohttp:       2.1.0-py36_0 
    async-timeout: 1.2.1-py36_0 
    multidict:     2.1.6-py36_0 
    yarl:          0.10.3-py36_0

async-timeout- 100% |################################| Time: 0:00:00   6.58 MB/s
multidict-2.1. 100% |################################| Time: 0:00:00   2.14 MB/s
yarl-0.10.3-py 100% |################################| Time: 0:00:00  13.71 MB/s
aiohttp-2.1.0- 100% |################################| Time: 0:00:00   7.90 MB/s

In [4]:
import requests


def print_url(r, *args, **kwargs):
    print(r.url)


hooks = dict(response=print_url)
r = requests.get('http://httpbin.org', hooks=dict(response=print_url))
print(r.status_code)


http://httpbin.org/
200

In [2]:
hooks=dict(reponse=print_url)

In [3]:
hooks


Out[3]:
{'reponse': <function __main__.print_url>}

In [ ]: