In [44]:
# OMDB API key.  Read from the environment so the notebook can be committed
# safely; falls back to "" (the old hardcoded placeholder) when the variable
# is unset.  Never commit a real key in source or in rendered output.
OMDB_SECRET = os.environ.get("OMDB_API_KEY", "")
In [3]:
import os

from pandas import DataFrame, read_csv
import pandas as pd
In [4]:
# Load the scraped (Title, Release_Year) table produced by the scrapy project.
# NOTE(review): hardcoded absolute path — breaks on any other machine; prefer
# a DATA_DIR constant in a config cell.
title_plus_year= read_csv(r'/Users/seanreed1/AnacondaProjects/scrapy-projects/movie-project/box-office-data/data/title_plus_year.csv')
In [13]:
# Sanity check of the loaded table; later cells expect Title and Release_Year
# columns (see the generator expression below).
title_plus_year.head()
Out[13]:
In [23]:
# Lazily yield one (Title, Release_Year) pair per dataframe row.
# Being a generator, it is exhausted after one full pass.
title_plus_year_gen = (
    (row.Title, row.Release_Year)
    for row in title_plus_year.itertuples(index=False)
)
''' OMDB usage => http://www.omdbapi.com/?apikey=[yourkey]&
OMDB requests use t for a title search => http://www.omdbapi.com/?t=dark+knight
BEWARE: the first match is returned; movie titles are tricky and repetitive.
http://www.omdbapi.com/?t=dark+knight&r=xml returns
<?xml version="1.0" encoding="UTF-8"?>
but http://www.omdbapi.com/?t=dark+knight&y=2012&r=xml returns
<?xml version="1.0" encoding="UTF-8"?>
'''
In [ ]:
# title_plus_year_gen is a generator expression. call next to get the next item from the dataframe
In [5]:
In [32]:
# Example payload mapping (requests-documentation style); not used below.
payload = dict(key1='value1', key2='value2')
In [27]:
import urllib.parse as parse #url parser and constructor model
#https://docs.python.org/3.6/library/urllib.parse.html?highlight=urlparse#urllib.parse.urlparse
In [41]:
# Split a full OMDB request URL into its five components.
split_url = parse.urlsplit("http://www.omdbapi.com/?t=heart+condition&plot=full&r=xml&apikey=c2")
print(split_url)
# Components are available as named attributes ...
print(split_url.scheme, split_url.netloc, split_url.query)
# ... and by position (0=scheme, 1=netloc, 3=query; 2 would be the path).
print(split_url[0], split_url[1], split_url[3])
In [44]:
# Rebuild a URL from a 5-tuple (scheme, netloc, path, query, fragment) —
# the inverse of urlsplit above.
url_parts = ('http', 'www.omdbapi.com', '/', 't=heart+condition&plot=full&r=xml&apikey=c2', '')
item2 = parse.urlunsplit(url_parts)
print(item2)
In [ ]:
# urlencode turns a mapping into a query string (spaces become '+').
payload2 = dict(t="The Dark Knight", y='2012')
item3 = parse.urlencode(payload2)
print(item3)
In [45]:
#construct payload dictionary directly from the dataframe
#payload=parse.urlencode(payload_dict)
#create a tuple of form url_tuple =('http','www.omdbapi.com','/','t=heart+condition&plot=full&r=xml&apikey=c2','')
#parse.urlunsplit(url_tuple)
In [5]:
# Re-load the CSV and rebuild the (Title, Release_Year) generator.  This
# duplicates earlier cells; re-running it yields a fresh, unexhausted
# generator before a scrape.
# NOTE(review): hardcoded absolute path — breaks on any other machine.
title_plus_year= read_csv(r'/Users/seanreed1/AnacondaProjects/scrapy-projects/movie-project/box-office-data/data/title_plus_year.csv')
title_plus_year_gen = ((row.Title, row.Release_Year) for row in title_plus_year.itertuples(index=False)) #generator expression
In [ ]:
# ---- OMDB scraper ------------------------------------------------------
# Builds one API URL per (Title, Release_Year) row and downloads each XML
# response to a numbered file via a requests response hook.  The kickoff
# call at the bottom stays commented out so Restart-&-Run-All never
# re-fires thousands of HTTP requests; the helper functions are now real
# definitions (the first draft kept them inside a triple-quoted string,
# with the indentation lost).
from itertools import count
from time import sleep

# Monotonic file numbers; stdlib count replaces the hand-rolled my_count
# generator of the first draft.
numbers = count(1)


def make_urls(df=None, apikey=None):
    """Return a list of OMDB request URLs, one per (Title, Release_Year) row.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Table with Title and Release_Year columns.  Defaults to the
        module-level ``title_plus_year`` (the first draft's behavior).
    apikey : str, optional
        OMDB API key.  Defaults to the module-level ``OMDB_SECRET``.
    """
    data = title_plus_year if df is None else df
    key = OMDB_SECRET if apikey is None else apikey
    base_url = "www.omdbapi.com"
    # Build pairs fresh each call, rather than consuming the shared (and
    # possibly already-exhausted) title_plus_year_gen of the first draft.
    pairs = ((row.Title, row.Release_Year)
             for row in data.itertuples(index=False))
    urls = []
    for title, year in pairs:
        # Encode the query string straight from the dataframe values.
        payload = parse.urlencode({'t': title, 'y': year, 'plot': 'full',
                                   'r': 'xml', 'apikey': key})
        urls.append(parse.urlunsplit(('http', base_url, '/', payload, '')))
    return urls


def write(response, **kwargs):
    """requests response hook: save request URL + raw XML body to <n>.xml."""
    filename = str(next(numbers)) + '.xml'
    with open(filename, 'wb') as f:
        f.write(response.url.encode('utf-8'))
        f.write(response.content)
    print('saved file %s' % filename)


def start():
    """GET every URL (0.5 s apart), saving each response via write().

    Individual request failures are reported and skipped instead of being
    silently swallowed (the first draft used a bare ``except: pass`` and an
    undefined ``ReadTimeoutError`` name).
    """
    import requests  # local import: notebook stays importable without requests

    urls = make_urls()
    print("making {} get requests".format(len(urls)))
    for url in urls:
        print(url)
        try:
            requests.get(url, timeout=70, hooks=dict(response=write))
            sleep(0.5)  # throttle: be polite to the API
        except requests.RequestException as e:
            print('request failed for %s: %s' % (url, e))
    return urls


# Uncomment to run the full scrape (network + disk side effects):
# urls = start()
In [42]:
# NOTE(review): `urls` is defined only inside the commented-out scraper cell
# above — on a fresh kernel this cell raises NameError.  Rebuild the list
# (e.g. via the scraper's make_urls()) before running this cell.
df_url = DataFrame({"urls":urls})
In [43]:
# Persist the generated URL list for later runs.
# NOTE(review): hardcoded absolute path — prefer a configurable DATA_DIR.
df_url.to_csv(r'/Users/seanreed1/AnacondaProjects/scrapy-projects/movie-project/box-office-data/data/omdb_urls.csv')
In [ ]: