notebook.community

Edit and run



In [1]:

    
import re
import sys
import time

sys.path.append('../src')

import pandas as pd

from my_aws import S3



In [2]:

    
# S3 helper from the project's my_aws module; the same instance is used
# further down to upload the enriched data.
omdb_s3 = S3()

# Object key and bucket handed to get_data here and to put_data when saving.
key = 'OMDB_API.csv'
bucket = 'movie-torrents'
# NOTE(review): get_data presumably returns a pandas DataFrame (later cells
# index the result by column name) — confirm against my_aws.S3.
omdb_data = omdb_s3.get_data(key, bucket)



In [3]:

    
# Pull the calendar year out of each 'Released' date into a 'Year' column.
release_years = pd.DatetimeIndex(omdb_data['Released']).year
omdb_data['Year'] = release_years

# Discard rows that ended up without a year, then store the remaining years
# as digit strings (e.g. 2010.0 -> '2010').
omdb_data = omdb_data.dropna(subset=['Year'])
omdb_data['Year'] = omdb_data['Year'].apply(lambda yr: str(int(yr)))



In [4]:

    
# Build the list of (imdb_id, title, year) tuples used for the torrent searches.
movie_tup = list(
    zip(omdb_data['imdbID'], omdb_data['Title'], omdb_data['Year'])
)



In [5]:

    
def kat_crawl(imdb_num, timeout=30):
    """Query the kat.cr movie search for an IMDb id and extract the result count.

    Args:
        imdb_num: IMDb identifier string. Its first two characters are dropped
            before it is placed in the search URL (presumably the 'tt'
            prefix — confirm against the data).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the first ``<span>`` inside the first ``<h2>`` inside the
        first ``<div>`` of the page, with the leading
        ``<span>  results N-M from `` part and the closing ``</span>`` tag
        removed. Returns ``'Fail'`` when the request raises, the HTTP status
        is not 200, or that element is not present.
    """
    address = 'https://kat.cr/usearch/category:movies%20imdb:{0}/'.format(imdb_num[2:])

    # A connection error or hung server must not abort a long polling run;
    # report it with the same 'Fail' marker used for bad status codes.
    try:
        web_req = requests.get(address, timeout=timeout)
    except requests.RequestException:
        return 'Fail'
    if web_req.status_code != 200:
        return 'Fail'

    soup = BeautifulSoup(web_req.text, 'lxml')

    # Walk div -> h2 -> span one step at a time: each lookup yields None when
    # the tag is absent, and chaining straight through would raise
    # AttributeError instead of reaching the 'Fail' return.
    first_div = soup.div
    first_h2 = first_div.h2 if first_div is not None else None
    html_title = first_h2.span if first_h2 is not None else None
    if html_title is None:
        return 'Fail'

    # NOTE(review): '[0-9*]' is a character class that also accepts a literal
    # '*'; the pattern is kept exactly as it was — confirm the intended form.
    title_strip = re.sub(r'(<span>  results )([0-9*]\D[0-9*]*)( from )', '', str(html_title))
    torrent_count = re.sub(r'(</span>)', '', title_strip)

    return torrent_count



In [6]:

    
def pirate_crawl(imdb_num, timeout=30):
    """Query thepiratebay.org search for an IMDb id and extract the result count.

    Args:
        imdb_num: IMDb identifier string, placed in the search URL unchanged.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The run of characters that follows ``'approx '`` in the first ``<h2>``
        of the page body, up to the first space, ``>``, or any of the letters
        ``f``, ``o``, ``u``, ``n``, ``d``. Returns ``'Fail'`` when the request
        raises, the HTTP status is not 200, the heading is missing, or the
        heading does not contain such a run.
    """
    address = 'https://thepiratebay.org/search/{0}/'.format(imdb_num)

    # A connection error or hung server must not abort a long polling run;
    # report it with the same 'Fail' marker used for bad status codes.
    try:
        web_req = requests.get(address, timeout=timeout)
    except requests.RequestException:
        return 'Fail'
    if web_req.status_code != 200:
        return 'Fail'

    soup = BeautifulSoup(web_req.text, 'lxml')

    # soup.body is None for a document without a body; guard before
    # descending so a malformed page yields 'Fail' rather than an exception.
    page_body = soup.body
    html_title = page_body.h2 if page_body is not None else None
    if html_title is None:
        return 'Fail'

    # re.search returns None when the heading lacks the 'approx <count>' text
    # (e.g. a differently worded no-results page).
    title_strip = re.search(r'(?<=approx )([^ found>]+)', str(html_title))
    if title_strip is None:
        return 'Fail'

    return title_strip.group(0)



In [7]:

    
def torrentz_crawl(title, year, timeout=30):
    """Query the torrentz.eu search for a title and year and extract the result count.

    Args:
        title: Movie title used as the first part of the search text.
        year: Release year appended to the search text after a space.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The run of characters that follows ``none">`` in the page's first
        ``<h2>``, up to the first space, ``>``, or any of the letters in
        "torrents". Returns ``'Fail'`` when the request raises, the HTTP
        status is not 200, the heading is missing, or the heading does not
        contain such a run.
    """
    address = 'http://www.torrentz.eu/search'
    # Let requests build the query string so characters such as '&', '#' or
    # '+' inside a title are percent-encoded instead of breaking the query.
    # The space between title and year is sent as '+', as before.
    query = {'f': '{0} {1}'.format(title, year)}

    # A connection error or hung server must not abort a long polling run;
    # report it with the same 'Fail' marker used for bad status codes.
    try:
        web_req = requests.get(address, params=query, timeout=timeout)
    except requests.RequestException:
        return 'Fail'
    if web_req.status_code != 200:
        return 'Fail'

    soup = BeautifulSoup(web_req.text, 'lxml')
    html_title = soup.h2
    if html_title is None:
        return 'Fail'

    # re.search returns None when the heading has no 'none">' marker followed
    # by a count; treat that as a failed lookup rather than crashing.
    title_strip = re.search(r'(?<=none">)([^ torrents>]+)', str(html_title))
    if title_strip is None:
        return 'Fail'

    return title_strip.group(0)



In [8]:

    
def torrentz_ver_crawl(title, year, timeout=30):
    """Query the torrentz.eu 'verified' listing for a title and year and extract the result count.

    Args:
        title: Movie title used as the first part of the search text.
        year: Release year appended to the search text after a space.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The run of characters that follows ``none">`` in the page's first
        ``<h2>``, up to the first space, ``>``, or any of the letters in
        "torrents". Returns ``'Fail'`` when the request raises, the HTTP
        status is not 200, the heading is missing, or the heading does not
        contain such a run.
    """
    address = 'http://www.torrentz.eu/verified'
    # Let requests build the query string so characters such as '&', '#' or
    # '+' inside a title are percent-encoded instead of breaking the query.
    # The space between title and year is sent as '+', as before.
    query = {'f': '{0} {1}'.format(title, year)}

    # A connection error or hung server must not abort a long polling run;
    # report it with the same 'Fail' marker used for bad status codes.
    try:
        web_req = requests.get(address, params=query, timeout=timeout)
    except requests.RequestException:
        return 'Fail'
    if web_req.status_code != 200:
        return 'Fail'

    soup = BeautifulSoup(web_req.text, 'lxml')
    html_title = soup.h2
    if html_title is None:
        return 'Fail'

    # re.search returns None when the heading has no 'none">' marker followed
    # by a count; treat that as a failed lookup rather than crashing.
    title_strip = re.search(r'(?<=none">)([^ torrents>]+)', str(html_title))
    if title_strip is None:
        return 'Fail'

    return title_strip.group(0)



In [9]:

    
def poll_torrent_counts(omdb_data):
    """Collect torrent counts for every movie in *omdb_data* and upload the result.

    For each row's (imdbID, Title, Year) the four crawlers are called and
    their return values are written to the 'Kat_Count', 'Pirate_Count',
    'Torrentz_Count' and 'Torrentz_Ver_Count' columns of every row with that
    imdbID. The frame is modified in place and then saved through the
    module-level ``omdb_s3`` client using the module-level ``key`` and
    ``bucket``.

    Args:
        omdb_data: DataFrame providing 'imdbID', 'Title' and 'Year' columns.
    """
    # Take the search inputs from the frame that is actually being updated
    # (not from a separately built global list), and materialise them up
    # front because the loop below adds columns to the same frame.
    movies = list(
        zip(omdb_data['imdbID'], omdb_data['Title'], omdb_data['Year'])
    )

    for imdb_id, title, year in movies:
        # One-second pause per movie, presumably to avoid hammering the sites.
        time.sleep(1)

        # The imdbID column is never modified in the loop, so a single mask
        # serves all four assignments.
        row_mask = omdb_data['imdbID'] == imdb_id

        omdb_data.loc[row_mask, 'Kat_Count'] = kat_crawl(imdb_id)
        omdb_data.loc[row_mask, 'Pirate_Count'] = pirate_crawl(imdb_id)
        omdb_data.loc[row_mask, 'Torrentz_Count'] = torrentz_crawl(title, year)
        omdb_data.loc[row_mask, 'Torrentz_Ver_Count'] = torrentz_ver_crawl(title, year)

    omdb_s3.put_data(omdb_data, key, bucket)



In [ ]: