In [1]:
import sys
sys.path.append('../src')
from my_aws import S3
import pandas as pd
In [2]:
# Location of the OMDB export in S3.
bucket = 'movie-torrents'
key = 'OMDB_API.csv'

# Fetch the export through the project's S3 helper; the result is treated as
# a pandas DataFrame by the cells that follow.
omdb_s3 = S3()
omdb_data = omdb_s3.get_data(key, bucket)
In [3]:
# Derive a "Year" column from the "Released" date, drop rows for which no
# year could be extracted, then store the year as a string such as "1999".
release_years = pd.DatetimeIndex(omdb_data['Released']).year
omdb_data['Year'] = release_years
omdb_data = omdb_data.dropna(subset=['Year'])
omdb_data['Year'] = omdb_data['Year'].apply(lambda yr: str(int(yr)))
In [4]:
# Build one (imdb_id, title, year) tuple per row to drive the torrent searches.
movie_tup = list(zip(omdb_data['imdbID'],
                     omdb_data['Title'],
                     omdb_data['Year']))
In [5]:
def kat_crawl(imdb_num, timeout=10):
    """Scrape the kat.cr movie search for an IMDb id and return the torrent total.

    Args:
        imdb_num: IMDb identifier string. Its first two characters are
            dropped before it is placed in the search URL (presumably the
            "tt" prefix -- confirm against the data).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The total that follows "from" in the results heading, as a string,
        or 'Fail' when the request errors out, the status is not 200, the
        expected div > h2 > span heading is missing, or no total is found.
    """
    address = 'https://kat.cr/usearch/category:movies%20imdb:{0}/'.format(imdb_num[2:])
    try:
        web_req = requests.get(address, timeout=timeout)
    except requests.RequestException:
        # A network error is reported like any other failed lookup so that a
        # single bad request does not abort a long crawl.
        return 'Fail'
    if web_req.status_code != 200:
        return 'Fail'

    soup = BeautifulSoup(web_req.text, 'lxml')

    # Walk div -> h2 -> span one step at a time; chained attribute access
    # would raise AttributeError as soon as one of the tags is absent.
    html_title = soup
    for tag_name in ('div', 'h2', 'span'):
        html_title = html_title.find(tag_name)
        if html_title is None:
            return 'Fail'

    # The heading text is expected to read "results <a>-<b> from <total>".
    count_match = re.search(r'from\s+([\d,]+)', html_title.get_text())
    if count_match is None:
        return 'Fail'
    return count_match.group(1)
In [6]:
def pirate_crawl(imdb_num, timeout=10):
    """Scrape the thepiratebay.org search for an IMDb id and return the hit count.

    Args:
        imdb_num: IMDb identifier string, used verbatim as the search term.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The number that follows "approx " in the page's first body heading,
        as a string, or 'Fail' when the request errors out, the status is
        not 200, the heading is missing, or no number is found in it.
    """
    address = 'https://thepiratebay.org/search/{0}/'.format(imdb_num)
    try:
        web_req = requests.get(address, timeout=timeout)
    except requests.RequestException:
        # A network error is reported like any other failed lookup so that a
        # single bad request does not abort a long crawl.
        return 'Fail'
    if web_req.status_code != 200:
        return 'Fail'

    soup = BeautifulSoup(web_req.text, 'lxml')

    # Walk body -> h2 step by step; chained attribute access would raise
    # AttributeError when the page has no <body> or no <h2>.
    html_title = soup
    for tag_name in ('body', 'h2'):
        html_title = html_title.find(tag_name)
        if html_title is None:
            return 'Fail'

    # Capture only the digits (and thousands separators) after "approx ".
    count_match = re.search(r'(?<=approx )[\d,]+', str(html_title))
    if count_match is None:
        return 'Fail'
    return count_match.group(0)
In [7]:
def torrentz_crawl(title, year, timeout=10):
    """Scrape the torrentz.eu search for a title and year and return the hit count.

    Args:
        title: Movie title to search for.
        year: Release year, appended to the title in the search query.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The number found right after 'none">' in the page's first <h2>, as
        a string, or 'Fail' when the request errors out, the status is not
        200, the heading is missing, or no number is found in it.
    """
    # Let requests build the query string: the space between title and year
    # becomes "+" as before, and characters such as "&" or "#" inside the
    # title are escaped instead of corrupting the URL.
    query = {'f': '{0} {1}'.format(title, year)}
    try:
        web_req = requests.get('http://www.torrentz.eu/search',
                               params=query, timeout=timeout)
    except requests.RequestException:
        # A network error is reported like any other failed lookup so that a
        # single bad request does not abort a long crawl.
        return 'Fail'
    if web_req.status_code != 200:
        return 'Fail'

    soup = BeautifulSoup(web_req.text, 'lxml')
    html_title = soup.find('h2')
    if html_title is None:
        return 'Fail'

    # Capture only the digits (and thousands separators) after 'none">'.
    count_match = re.search(r'(?<=none">)[\d,]+', str(html_title))
    if count_match is None:
        return 'Fail'
    return count_match.group(0)
In [8]:
def torrentz_ver_crawl(title, year, timeout=10):
    """Scrape the torrentz.eu "verified" search and return the hit count.

    Args:
        title: Movie title to search for.
        year: Release year, appended to the title in the search query.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The number found right after 'none">' in the page's first <h2>, as
        a string, or 'Fail' when the request errors out, the status is not
        200, the heading is missing, or no number is found in it.
    """
    # Let requests build the query string: the space between title and year
    # becomes "+" as before, and characters such as "&" or "#" inside the
    # title are escaped instead of corrupting the URL.
    query = {'f': '{0} {1}'.format(title, year)}
    try:
        web_req = requests.get('http://www.torrentz.eu/verified',
                               params=query, timeout=timeout)
    except requests.RequestException:
        # A network error is reported like any other failed lookup so that a
        # single bad request does not abort a long crawl.
        return 'Fail'
    if web_req.status_code != 200:
        return 'Fail'

    soup = BeautifulSoup(web_req.text, 'lxml')
    html_title = soup.find('h2')
    if html_title is None:
        return 'Fail'

    # Capture only the digits (and thousands separators) after 'none">'.
    count_match = re.search(r'(?<=none">)[\d,]+', str(html_title))
    if count_match is None:
        return 'Fail'
    return count_match.group(0)
In [9]:
def poll_torrent_counts(omdb_data):
    """Look up torrent counts for every movie in ``omdb_data`` and upload the result.

    For each row's (imdbID, Title, Year) the four crawlers are queried and
    their results written to the 'Kat_Count', 'Pirate_Count',
    'Torrentz_Count' and 'Torrentz_Ver_Count' columns of the matching rows.
    The frame is modified in place and then uploaded with the module-level
    ``omdb_s3``, ``key`` and ``bucket``.

    Args:
        omdb_data: DataFrame with 'imdbID', 'Title' and 'Year' columns.

    Returns:
        None.
    """
    # Take the search keys from the frame being updated (not from a separate
    # module-level list) so the rows crawled always match the rows written.
    # The list is materialised up front because columns are added below.
    movies = list(zip(omdb_data['imdbID'],
                      omdb_data['Title'],
                      omdb_data['Year']))

    for imdb_id, title, year in movies:
        time.sleep(1)  # pause between movies to avoid hammering the sites
        # The imdbID column never changes, so one mask serves all four writes.
        row_mask = omdb_data['imdbID'] == imdb_id
        omdb_data.loc[row_mask, 'Kat_Count'] = kat_crawl(imdb_id)
        omdb_data.loc[row_mask, 'Pirate_Count'] = pirate_crawl(imdb_id)
        omdb_data.loc[row_mask, 'Torrentz_Count'] = torrentz_crawl(title, year)
        omdb_data.loc[row_mask, 'Torrentz_Ver_Count'] = torrentz_ver_crawl(title, year)

    omdb_s3.put_data(omdb_data, key, bucket)
In [ ]: