In [44]:
# OMDB API key.  Read from the environment so the notebook can be committed
# safely; falls back to "" (the old hardcoded placeholder) when the variable
# is unset.  Never commit a real key in source or in rendered output.
OMDB_SECRET = os.environ.get("OMDB_API_KEY", "")
In [3]:
import os

from pandas import DataFrame, read_csv
import pandas as pd
In [4]:
# Load the scraped (Title, Release_Year) table produced by the scrapy project.
# NOTE(review): hardcoded absolute path — breaks on any other machine; prefer
# a DATA_DIR constant in a config cell.
title_plus_year= read_csv(r'/Users/seanreed1/AnacondaProjects/scrapy-projects/movie-project/box-office-data/data/title_plus_year.csv')
In [13]:
# Sanity check of the loaded table; later cells expect Title and Release_Year
# columns (see the generator expression below).
title_plus_year.head()
Out[13]:
In [23]:
# Lazily yield one (Title, Release_Year) pair per dataframe row.
# Being a generator, it is exhausted after one full pass.
title_plus_year_gen = (
    (row.Title, row.Release_Year)
    for row in title_plus_year.itertuples(index=False)
)
''' OMDB usage => http://www.omdbapi.com/?apikey=[yourkey]&
OMDB requests use t for a title search => http://www.omdbapi.com/?t=dark+knight
BEWARE: the first match is returned; movie titles are tricky and repetitive.
http://www.omdbapi.com/?t=dark+knight&r=xml returns
<?xml version="1.0" encoding="UTF-8"?>
but http://www.omdbapi.com/?t=dark+knight&y=2012&r=xml returns
<?xml version="1.0" encoding="UTF-8"?>
'''
In [ ]:
# title_plus_year_gen is a generator expression. call next to get the next item from the dataframe
In [5]:
In [32]:
# Example payload mapping (requests-documentation style); not used below.
payload = dict(key1='value1', key2='value2')
In [27]:
import urllib.parse as parse #url parser and constructor model
#https://docs.python.org/3.6/library/urllib.parse.html?highlight=urlparse#urllib.parse.urlparse
In [41]:
# Split a full OMDB request URL into its five components.
split_url = parse.urlsplit("http://www.omdbapi.com/?t=heart+condition&plot=full&r=xml&apikey=c2")
print(split_url)
# Components are available as named attributes ...
print(split_url.scheme, split_url.netloc, split_url.query)
# ... and by position (0=scheme, 1=netloc, 3=query; 2 would be the path).
print(split_url[0], split_url[1], split_url[3])
In [44]:
# Rebuild a URL from a 5-tuple (scheme, netloc, path, query, fragment) —
# the inverse of urlsplit above.
url_parts = ('http', 'www.omdbapi.com', '/', 't=heart+condition&plot=full&r=xml&apikey=c2', '')
item2 = parse.urlunsplit(url_parts)
print(item2)
In [ ]:
# urlencode turns a mapping into a query string (spaces become '+').
payload2 = dict(t="The Dark Knight", y='2012')
item3 = parse.urlencode(payload2)
print(item3)
In [45]:
#construct payload dictionary directly from the dataframe
#payload=parse.urlencode(payload_dict)
#create a tuple of form url_tuple =('http','www.omdbapi.com','/','t=heart+condition&plot=full&r=xml&apikey=c2','')
#parse.urlunsplit(url_tuple)
In [5]:
# Re-load the CSV and rebuild the (Title, Release_Year) generator.  This
# duplicates earlier cells; re-running it yields a fresh, unexhausted
# generator before a scrape.
# NOTE(review): hardcoded absolute path — breaks on any other machine.
title_plus_year= read_csv(r'/Users/seanreed1/AnacondaProjects/scrapy-projects/movie-project/box-office-data/data/title_plus_year.csv')
title_plus_year_gen = ((row.Title, row.Release_Year) for row in title_plus_year.itertuples(index=False)) #generator expression
In [ ]:
# ---- OMDB scraper ------------------------------------------------------
# Builds one API URL per (Title, Release_Year) row and downloads each XML
# response to a numbered file via a requests response hook.  The kickoff
# call at the bottom stays commented out so Restart-&-Run-All never
# re-fires thousands of HTTP requests; the helper functions are now real
# definitions (the first draft kept them inside a triple-quoted string,
# with the indentation lost).
from itertools import count
from time import sleep

# Monotonic file numbers; stdlib count replaces the hand-rolled my_count
# generator of the first draft.
numbers = count(1)


def make_urls(df=None, apikey=None):
    """Return a list of OMDB request URLs, one per (Title, Release_Year) row.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Table with Title and Release_Year columns.  Defaults to the
        module-level ``title_plus_year`` (the first draft's behavior).
    apikey : str, optional
        OMDB API key.  Defaults to the module-level ``OMDB_SECRET``.
    """
    data = title_plus_year if df is None else df
    key = OMDB_SECRET if apikey is None else apikey
    base_url = "www.omdbapi.com"
    # Build pairs fresh each call, rather than consuming the shared (and
    # possibly already-exhausted) title_plus_year_gen of the first draft.
    pairs = ((row.Title, row.Release_Year)
             for row in data.itertuples(index=False))
    urls = []
    for title, year in pairs:
        # Encode the query string straight from the dataframe values.
        payload = parse.urlencode({'t': title, 'y': year, 'plot': 'full',
                                   'r': 'xml', 'apikey': key})
        urls.append(parse.urlunsplit(('http', base_url, '/', payload, '')))
    return urls


def write(response, **kwargs):
    """requests response hook: save request URL + raw XML body to <n>.xml."""
    filename = str(next(numbers)) + '.xml'
    with open(filename, 'wb') as f:
        f.write(response.url.encode('utf-8'))
        f.write(response.content)
    print('saved file %s' % filename)


def start():
    """GET every URL (0.5 s apart), saving each response via write().

    Individual request failures are reported and skipped instead of being
    silently swallowed (the first draft used a bare ``except: pass`` and an
    undefined ``ReadTimeoutError`` name).
    """
    import requests  # local import: notebook stays importable without requests

    urls = make_urls()
    print("making {} get requests".format(len(urls)))
    for url in urls:
        print(url)
        try:
            requests.get(url, timeout=70, hooks=dict(response=write))
            sleep(0.5)  # throttle: be polite to the API
        except requests.RequestException as e:
            print('request failed for %s: %s' % (url, e))
    return urls


# Uncomment to run the full scrape (network + disk side effects):
# urls = start()
In [42]:
# NOTE(review): `urls` is defined only inside the commented-out scraper cell
# above — on a fresh kernel this cell raises NameError.  Rebuild the list
# (e.g. via the scraper's make_urls()) before running this cell.
df_url = DataFrame({"urls":urls})
In [43]:
# Persist the generated URL list for later runs.
# NOTE(review): hardcoded absolute path — prefer a configurable DATA_DIR.
df_url.to_csv(r'/Users/seanreed1/AnacondaProjects/scrapy-projects/movie-project/box-office-data/data/omdb_urls.csv')
In [ ]: