In [44]:
OMDB_SECRET = "" #keep it secret. Do not commit with this filled out

In [3]:
from pandas import DataFrame, read_csv
import pandas as pd

In [4]:
title_plus_year = read_csv(r'/Users/seanreed1/AnacondaProjects/scrapy-projects/movie-project/box-office-data/data/title_plus_year.csv')

In [13]:
title_plus_year.head()


Out[13]:
                                  Title  Release_Year
0  Star Wars Ep. VII: The Force Awakens          2015
1                                Avatar          2009
2                               Titanic          1997
3                        Jurassic World          2015
4                          The Avengers          2012

In [23]:
title_plus_year_gen = ((row.Title, row.Release_Year) for row in title_plus_year.itertuples(index=False)) #generator expression

''' OMDB usage => http://www.omdbapi.com/?apikey=[yourkey]&

OMDB requests use t for a title search => http://www.omdbapi.com/?t=dark+knight

BEWARE: only the first match is returned, and movie titles are tricky and repetitive.
http://www.omdbapi.com/?t=dark+knight&r=xml returns

<?xml version="1.0" encoding="UTF-8"?>

</root>

but http://www.omdbapi.com/?t=dark+knight&y=2012&r=xml returns

<?xml version="1.0" encoding="UTF-8"?>

</root>

so pass the release year with y to disambiguate. '''
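For reference, requests can build these query strings itself via its params argument, so the same disambiguated lookup can be made like this (a minimal sketch; it assumes OMDB_SECRET from the first cell has been filled in):

In [ ]:
import requests

# requests urlencodes the params dict for us; y disambiguates the year
resp = requests.get("http://www.omdbapi.com/",
                    params={'t': 'dark knight', 'y': 2012, 'r': 'xml',
                            'apikey': OMDB_SECRET})
print(resp.url)          # the fully constructed URL
print(resp.status_code)  # 200 on success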

Goals: extract the imdbID as a string, genre as a list, plot as a string, rated as a string,
and director, writer, and actors as comma-separated lists.
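A sketch of that extraction step, assuming the XML response carries the fields as attributes on a <movie> element under <root>; the attribute names here mirror OMDB's JSON field names and are an assumption, since the response bodies above were omitted:

In [ ]:
import xml.etree.ElementTree as ET

def parse_omdb_xml(xml_text):
    # ASSUMPTION: fields live as attributes on a <movie> element under <root>
    movie = ET.fromstring(xml_text).find('movie')
    if movie is None:
        return None  # no match / error response
    return {
        'imdbID':   movie.get('imdbID', ''),                                 # string
        'genre':    [g.strip() for g in movie.get('genre', '').split(',')],  # list
        'plot':     movie.get('plot', ''),                                   # string
        'rated':    movie.get('rated', ''),                                  # string
        'director': movie.get('director', ''),                               # comma-separated
        'writer':   movie.get('writer', ''),                                 # comma-separated
        'actors':   movie.get('actors', ''),                                 # comma-separated
    }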


In [ ]:
# title_plus_year_gen is a generator expression: call next() on it to get the next (Title, Release_Year) pair from the DataFrame
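For example (assuming the generator is fresh, the first item matches the head() output above; note that each next() call consumes an item):

In [ ]:
next(title_plus_year_gen)  # -> ('Star Wars Ep. VII: The Force Awakens', 2015)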

In [32]:
payload = {'key1': 'value1', 'key2': 'value2'}  # placeholder params dict (requests-docs style)

In [27]:
import urllib.parse as parse  # URL parsing and construction module
#https://docs.python.org/3.6/library/urllib.parse.html?highlight=urlparse#urllib.parse.urlparse

In [41]:
item = parse.urlsplit("http://www.omdbapi.com/?t=heart+condition&plot=full&r=xml&apikey=c2")
print(item)
print(item.scheme, item.netloc, item.query)  # named-attribute access
print(item[0], item[1], item[3])             # same fields by index (3 = query)


SplitResult(scheme='http', netloc='www.omdbapi.com', path='/', query='t=heart+condition&plot=full&r=xml&apikey=c2', fragment='')
http www.omdbapi.com t=heart+condition&plot=full&r=xml&apikey=c2
http www.omdbapi.com t=heart+condition&plot=full&r=xml&apikey=c2

In [44]:
item2 = parse.urlunsplit(('http','www.omdbapi.com','/','t=heart+condition&plot=full&r=xml&apikey=c2',''))
print(item2)


http://www.omdbapi.com/?t=heart+condition&plot=full&r=xml&apikey=c2

In [ ]:
payload2={'t':"The Dark Knight", 'y':'2012'}
item3 = parse.urlencode(payload2)
print(item3)
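# expected output (urlencode quotes spaces as '+'): t=The+Dark+Knight&y=2012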

In [45]:
# Plan: construct the payload dictionary directly from the dataframe,
# urlencode it: payload = parse.urlencode(payload_dict)
# then build a tuple of the form url_tuple = ('http', 'www.omdbapi.com', '/', payload, '')
# and assemble the URL with parse.urlunsplit(url_tuple)
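Putting the two steps together for a single (title, year) pair (a sketch; the apikey stays blank as in the first cell):

In [ ]:
# one row through the pipeline: dict -> query string -> full URL
payload = parse.urlencode({'t': 'The Dark Knight', 'y': 2012,
                           'plot': 'full', 'r': 'xml', 'apikey': OMDB_SECRET})
print(parse.urlunsplit(('http', 'www.omdbapi.com', '/', payload, '')))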

In [5]:
# re-read the data and re-create the generator (generators are single-use)
title_plus_year = read_csv(r'/Users/seanreed1/AnacondaProjects/scrapy-projects/movie-project/box-office-data/data/title_plus_year.csv')
title_plus_year_gen = ((row.Title, row.Release_Year) for row in title_plus_year.itertuples(index=False))  # generator expression

In [ ]:
'''
#use requests
import requests
OMDB_SECRET = ""
from itertools import count
import urllib.parse as parse
from time import sleep
title_plus_year_gen = ((row.Title, row.Release_Year) for row in title_plus_year.itertuples(index=False))


def make_urls():
    base_url = "www.omdbapi.com"
    urls = []

    for title, year in title_plus_year_gen:
        # construct the payload dictionary directly from the dataframe row
        payload = parse.urlencode({'t': title, 'y': year, 'plot': 'full', 'r': 'xml', 'apikey': OMDB_SECRET})
        urls.append(parse.urlunsplit(('http', base_url, '/', payload, '')))
    return urls

def start():
    urls = make_urls()
    print("making {} get requests".format(len(urls)))
    for url in urls:
        print(url)
        try:
            # the response hook fires write() on each completed request
            requests.get(url, timeout=70, hooks=dict(response=write))
            sleep(0.5)  # throttle so we do not hammer the API
        except Exception:
            pass

    return urls


def write(response, **kwargs):
    # save each response as a numbered .xml file, prefixed with the request URL
    filename = str(next(numbers)) + '.xml'
    with open(filename, 'wb') as f:
        f.write(response.url.encode('utf-8'))
        f.write(response.content)
    print('saved file %s' % filename)

numbers = count(1)  # itertools.count replaces a hand-rolled counter generator

try:
    urls = start()
except requests.exceptions.ReadTimeout:
    pass
except Exception as e:
    print(e)

'''
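Note that the block above is quoted out, so running the cell defines nothing; the next two cells assume urls exists. One way to materialize the URL list without firing any requests (a sketch repeating the make_urls logic inline):

In [ ]:
# build the URL list only (no GET requests), so the cells below have `urls`
import urllib.parse as parse
urls = [parse.urlunsplit(('http', 'www.omdbapi.com', '/',
                          parse.urlencode({'t': t, 'y': y, 'plot': 'full',
                                           'r': 'xml', 'apikey': OMDB_SECRET}), ''))
        for t, y in title_plus_year.itertuples(index=False)]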

In [42]:
df_url = DataFrame({"urls": urls})  # collect the constructed URLs for saving

In [43]:
df_url.to_csv(r'/Users/seanreed1/AnacondaProjects/scrapy-projects/movie-project/box-office-data/data/omdb_urls.csv')
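To read the saved list back later, skip the unnamed index column that to_csv wrote (a usage note):

In [ ]:
df_url = read_csv(r'/Users/seanreed1/AnacondaProjects/scrapy-projects/movie-project/box-office-data/data/omdb_urls.csv', index_col=0)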

In [ ]: