In [126]:

    
import pandas as pd

academy_awards = pd.read_csv("academy_awards.csv", encoding="ISO-8859-1")

def preview(df):
    print("Dimensions: {0} rows x {1} columns".format(df.shape[0], df.shape[1]))
    return df.head()

preview(academy_awards)









    



Dimensions: 10137 rows x 11 columns






    Out[126]:






  
    
      
      Year
      Category
      Nominee
      Additional Info
      Won?
      Unnamed: 5
      Unnamed: 6
      Unnamed: 7
      Unnamed: 8
      Unnamed: 9
      Unnamed: 10
    
  
  
    
      0
      2010 (83rd)
      Actor -- Leading Role
      Javier Bardem
      Biutiful {'Uxbal'}
      NO
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      1
      2010 (83rd)
      Actor -- Leading Role
      Jeff Bridges
      True Grit {'Rooster Cogburn'}
      NO
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      2
      2010 (83rd)
      Actor -- Leading Role
      Jesse Eisenberg
      The Social Network {'Mark Zuckerberg'}
      NO
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      3
      2010 (83rd)
      Actor -- Leading Role
      Colin Firth
      The King's Speech {'King George VI'}
      YES
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      4
      2010 (83rd)
      Actor -- Leading Role
      James Franco
      127 Hours {'Aron Ralston'}
      NO
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN

Filtering the data



In [127]:

    
# Check if unnamed columns contain useful data
for column in range(5, 11):
    unnamed = "Unnamed: " + str(column)
    print(academy_awards[unnamed].value_counts())









    



*                                                                                                               7
 error-prone measurements on sets. [Digital Imaging Technology]"                                                1
 resilience                                                                                                     1
 discoverer of stars                                                                                            1
 D.B. "Don" Keele and Mark E. Engebretson has resulted in the over 20-year dominance of constant-directivity    1
Name: Unnamed: 5, dtype: int64
*                                                                   9
 sympathetic                                                        1
 flexibility and water resistance                                   1
 direct radiator bass style cinema loudspeaker systems. [Sound]"    1
Name: Unnamed: 6, dtype: int64
*                                                     1
 kindly                                               1
 while requiring no dangerous solvents. [Systems]"    1
Name: Unnamed: 7, dtype: int64
*                                                 1
 understanding comedy genius - Mack Sennett.""    1
Name: Unnamed: 8, dtype: int64
*    1
Name: Unnamed: 9, dtype: int64
*    1
Name: Unnamed: 10, dtype: int64



In [128]:

    
# Cleanse the year column
print(academy_awards["Year"].value_counts().head())
academy_awards["Year"] = academy_awards["Year"].str[0:4].astype(int)









    



1941 (14th)    192
1942 (15th)    190
1943 (16th)    188
1944 (17th)    181
1945 (18th)    163
Name: Year, dtype: int64



In [129]:

    
# Filter data after 2000 and drop unnamed columns
later_than_2000 = academy_awards[academy_awards["Year"] >= 2000].ix[:,0:5]
preview(later_than_2000)









    



Dimensions: 1423 rows x 5 columns






    Out[129]:






  
    
      
      Year
      Category
      Nominee
      Additional Info
      Won?
    
  
  
    
      0
      2010
      Actor -- Leading Role
      Javier Bardem
      Biutiful {'Uxbal'}
      NO
    
    
      1
      2010
      Actor -- Leading Role
      Jeff Bridges
      True Grit {'Rooster Cogburn'}
      NO
    
    
      2
      2010
      Actor -- Leading Role
      Jesse Eisenberg
      The Social Network {'Mark Zuckerberg'}
      NO
    
    
      3
      2010
      Actor -- Leading Role
      Colin Firth
      The King's Speech {'King George VI'}
      YES
    
    
      4
      2010
      Actor -- Leading Role
      James Franco
      127 Hours {'Aron Ralston'}
      NO



In [130]:

    
# Filter data of particular award categories
award_categories = ["Actor -- Leading Role", "Actor -- Supporting Role", "Actress -- Leading Role", "Actress -- Supporting Role"]
nominations = later_than_2000[later_than_2000["Category"].isin(award_categories)]
preview(nominations)









    



Dimensions: 220 rows x 5 columns






    Out[130]:






  
    
      
      Year
      Category
      Nominee
      Additional Info
      Won?
    
  
  
    
      0
      2010
      Actor -- Leading Role
      Javier Bardem
      Biutiful {'Uxbal'}
      NO
    
    
      1
      2010
      Actor -- Leading Role
      Jeff Bridges
      True Grit {'Rooster Cogburn'}
      NO
    
    
      2
      2010
      Actor -- Leading Role
      Jesse Eisenberg
      The Social Network {'Mark Zuckerberg'}
      NO
    
    
      3
      2010
      Actor -- Leading Role
      Colin Firth
      The King's Speech {'King George VI'}
      YES
    
    
      4
      2010
      Actor -- Leading Role
      James Franco
      127 Hours {'Aron Ralston'}
      NO

Cleaning up the Won? and Unnamed columns



In [131]:

    
yes_no = {
    "YES": 1, 
    "NO": 0
}

print(nominations["Won?"].value_counts())
nominations["Won?"] = nominations["Won?"].map(yes_no)
print(nominations["Won?"].value_counts())

preview(nominations)









    



NO     176
YES     44
Name: Won?, dtype: int64
0    176
1     44
Name: Won?, dtype: int64
Dimensions: 220 rows x 5 columns






    



C:\Users\IBM_ADMIN\Anaconda2\lib\site-packages\ipykernel\__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy






    Out[131]:






  
    
      
      Year
      Category
      Nominee
      Additional Info
      Won?
    
  
  
    
      0
      2010
      Actor -- Leading Role
      Javier Bardem
      Biutiful {'Uxbal'}
      0
    
    
      1
      2010
      Actor -- Leading Role
      Jeff Bridges
      True Grit {'Rooster Cogburn'}
      0
    
    
      2
      2010
      Actor -- Leading Role
      Jesse Eisenberg
      The Social Network {'Mark Zuckerberg'}
      0
    
    
      3
      2010
      Actor -- Leading Role
      Colin Firth
      The King's Speech {'King George VI'}
      1
    
    
      4
      2010
      Actor -- Leading Role
      James Franco
      127 Hours {'Aron Ralston'}
      0

Cleaning up the Additional Info column



In [132]:

    
# Break the additional info column into movie and character
nominations[["Movie", "Character"]] = nominations["Additional Info"].str.split(" {", expand=True)
nominations["Character"] = nominations["Character"].str.lstrip("'").str.rstrip("'}")
nominations = nominations.drop("Additional Info", axis=1)
preview(nominations)









    



Dimensions: 220 rows x 6 columns






    



C:\Users\IBM_ADMIN\Anaconda2\lib\site-packages\ipykernel\__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()






    Out[132]:






  
    
      
      Year
      Category
      Nominee
      Won?
      Movie
      Character
    
  
  
    
      0
      2010
      Actor -- Leading Role
      Javier Bardem
      0
      Biutiful
      Uxbal
    
    
      1
      2010
      Actor -- Leading Role
      Jeff Bridges
      0
      True Grit
      Rooster Cogburn
    
    
      2
      2010
      Actor -- Leading Role
      Jesse Eisenberg
      0
      The Social Network
      Mark Zuckerberg
    
    
      3
      2010
      Actor -- Leading Role
      Colin Firth
      1
      The King's Speech
      King George VI
    
    
      4
      2010
      Actor -- Leading Role
      James Franco
      0
      127 Hours
      Aron Ralston

Exporting to SQLite



In [133]:

    
import sqlite3

conn = sqlite3.connect("nominations.db")
nominations.to_sql("nominations", conn, index=False)

Verifying in SQL



In [138]:

    
query = "pragma table_info(nominations);"
results = conn.execute(query).fetchall()

for row in results:
    print(row)









    



(0, u'Year', u'INTEGER', 0, None, 0)
(1, u'Category', u'TEXT', 0, None, 0)
(2, u'Nominee', u'TEXT', 0, None, 0)
(3, u'Won?', u'INTEGER', 0, None, 0)
(4, u'Movie', u'TEXT', 0, None, 0)
(5, u'Character', u'TEXT', 0, None, 0)



In [137]:

    
query = "SELECT * FROM nominations LIMIT 10;"
for row in conn.execute(query).fetchall():
    print(row)

conn.close()









    



(2010, u'Actor -- Leading Role', u'Javier Bardem', 0, u'Biutiful', u'Uxbal')
(2010, u'Actor -- Leading Role', u'Jeff Bridges', 0, u'True Grit', u'Rooster Cogburn')
(2010, u'Actor -- Leading Role', u'Jesse Eisenberg', 0, u'The Social Network', u'Mark Zuckerberg')
(2010, u'Actor -- Leading Role', u'Colin Firth', 1, u"The King's Speech", u'King George VI')
(2010, u'Actor -- Leading Role', u'James Franco', 0, u'127 Hours', u'Aron Ralston')
(2010, u'Actor -- Supporting Role', u'Christian Bale', 1, u'The Fighter', u'Dicky Eklund')
(2010, u'Actor -- Supporting Role', u'John Hawkes', 0, u"Winter's Bone", u'Teardrop')
(2010, u'Actor -- Supporting Role', u'Jeremy Renner', 0, u'The Town', u'James Coughlin')
(2010, u'Actor -- Supporting Role', u'Mark Ruffalo', 0, u'The Kids Are All Right', u'Paul')
(2010, u'Actor -- Supporting Role', u'Geoffrey Rush', 0, u"The King's Speech", u'Lionel Logue')



In [ ]:

	Year	Category	Nominee	Additional Info	Won?	Unnamed: 5	Unnamed: 6	Unnamed: 7	Unnamed: 8	Unnamed: 9	Unnamed: 10
0	2010 (83rd)	Actor -- Leading Role	Javier Bardem	Biutiful {'Uxbal'}	NO	NaN	NaN	NaN	NaN	NaN	NaN
1	2010 (83rd)	Actor -- Leading Role	Jeff Bridges	True Grit {'Rooster Cogburn'}	NO	NaN	NaN	NaN	NaN	NaN	NaN
2	2010 (83rd)	Actor -- Leading Role	Jesse Eisenberg	The Social Network {'Mark Zuckerberg'}	NO	NaN	NaN	NaN	NaN	NaN	NaN
3	2010 (83rd)	Actor -- Leading Role	Colin Firth	The King's Speech {'King George VI'}	YES	NaN	NaN	NaN	NaN	NaN	NaN
4	2010 (83rd)	Actor -- Leading Role	James Franco	127 Hours {'Aron Ralston'}	NO	NaN	NaN	NaN	NaN	NaN	NaN

	Year	Category	Nominee	Additional Info	Won?
0	2010	Actor -- Leading Role	Javier Bardem	Biutiful {'Uxbal'}	NO
1	2010	Actor -- Leading Role	Jeff Bridges	True Grit {'Rooster Cogburn'}	NO
2	2010	Actor -- Leading Role	Jesse Eisenberg	The Social Network {'Mark Zuckerberg'}	NO
3	2010	Actor -- Leading Role	Colin Firth	The King's Speech {'King George VI'}	YES
4	2010	Actor -- Leading Role	James Franco	127 Hours {'Aron Ralston'}	NO