In [126]:
import pandas as pd

academy_awards = pd.read_csv("academy_awards.csv", encoding="ISO-8859-1")

def preview(df):
    print("Dimensions: {0} rows x {1} columns".format(df.shape[0], df.shape[1]))
    return df.head()

preview(academy_awards)


Dimensions: 10137 rows x 11 columns
Out[126]:
Year Category Nominee Additional Info Won? Unnamed: 5 Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 Unnamed: 10
0 2010 (83rd) Actor -- Leading Role Javier Bardem Biutiful {'Uxbal'} NO NaN NaN NaN NaN NaN NaN
1 2010 (83rd) Actor -- Leading Role Jeff Bridges True Grit {'Rooster Cogburn'} NO NaN NaN NaN NaN NaN NaN
2 2010 (83rd) Actor -- Leading Role Jesse Eisenberg The Social Network {'Mark Zuckerberg'} NO NaN NaN NaN NaN NaN NaN
3 2010 (83rd) Actor -- Leading Role Colin Firth The King's Speech {'King George VI'} YES NaN NaN NaN NaN NaN NaN
4 2010 (83rd) Actor -- Leading Role James Franco 127 Hours {'Aron Ralston'} NO NaN NaN NaN NaN NaN NaN

Filtering the data


In [127]:
# Check if unnamed columns contain useful data
for column in range(5, 11):
    unnamed = "Unnamed: " + str(column)
    print(academy_awards[unnamed].value_counts())


*                                                                                                               7
 error-prone measurements on sets. [Digital Imaging Technology]"                                                1
 resilience                                                                                                     1
 discoverer of stars                                                                                            1
 D.B. "Don" Keele and Mark E. Engebretson has resulted in the over 20-year dominance of constant-directivity    1
Name: Unnamed: 5, dtype: int64
*                                                                   9
 sympathetic                                                        1
 flexibility and water resistance                                   1
 direct radiator bass style cinema loudspeaker systems. [Sound]"    1
Name: Unnamed: 6, dtype: int64
*                                                     1
 kindly                                               1
 while requiring no dangerous solvents. [Systems]"    1
Name: Unnamed: 7, dtype: int64
*                                                 1
 understanding comedy genius - Mack Sennett.""    1
Name: Unnamed: 8, dtype: int64
*    1
Name: Unnamed: 9, dtype: int64
*    1
Name: Unnamed: 10, dtype: int64

In [128]:
# Cleanse the year column
print(academy_awards["Year"].value_counts().head())
academy_awards["Year"] = academy_awards["Year"].str[0:4].astype(int)


1941 (14th)    192
1942 (15th)    190
1943 (16th)    188
1944 (17th)    181
1945 (18th)    163
Name: Year, dtype: int64

In [129]:
# Filter data after 2000 and drop unnamed columns
later_than_2000 = academy_awards[academy_awards["Year"] >= 2000].ix[:,0:5]
preview(later_than_2000)


Dimensions: 1423 rows x 5 columns
Out[129]:
Year Category Nominee Additional Info Won?
0 2010 Actor -- Leading Role Javier Bardem Biutiful {'Uxbal'} NO
1 2010 Actor -- Leading Role Jeff Bridges True Grit {'Rooster Cogburn'} NO
2 2010 Actor -- Leading Role Jesse Eisenberg The Social Network {'Mark Zuckerberg'} NO
3 2010 Actor -- Leading Role Colin Firth The King's Speech {'King George VI'} YES
4 2010 Actor -- Leading Role James Franco 127 Hours {'Aron Ralston'} NO

In [130]:
# Filter data of particular award categories
award_categories = ["Actor -- Leading Role", "Actor -- Supporting Role", "Actress -- Leading Role", "Actress -- Supporting Role"]
nominations = later_than_2000[later_than_2000["Category"].isin(award_categories)]
preview(nominations)


Dimensions: 220 rows x 5 columns
Out[130]:
Year Category Nominee Additional Info Won?
0 2010 Actor -- Leading Role Javier Bardem Biutiful {'Uxbal'} NO
1 2010 Actor -- Leading Role Jeff Bridges True Grit {'Rooster Cogburn'} NO
2 2010 Actor -- Leading Role Jesse Eisenberg The Social Network {'Mark Zuckerberg'} NO
3 2010 Actor -- Leading Role Colin Firth The King's Speech {'King George VI'} YES
4 2010 Actor -- Leading Role James Franco 127 Hours {'Aron Ralston'} NO

Cleaning up the Won? and Unnamed columns


In [131]:
yes_no = {
    "YES": 1, 
    "NO": 0
}

print(nominations["Won?"].value_counts())
nominations["Won?"] = nominations["Won?"].map(yes_no)
print(nominations["Won?"].value_counts())

preview(nominations)


NO     176
YES     44
Name: Won?, dtype: int64
0    176
1     44
Name: Won?, dtype: int64
Dimensions: 220 rows x 5 columns
C:\Users\IBM_ADMIN\Anaconda2\lib\site-packages\ipykernel\__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[131]:
Year Category Nominee Additional Info Won?
0 2010 Actor -- Leading Role Javier Bardem Biutiful {'Uxbal'} 0
1 2010 Actor -- Leading Role Jeff Bridges True Grit {'Rooster Cogburn'} 0
2 2010 Actor -- Leading Role Jesse Eisenberg The Social Network {'Mark Zuckerberg'} 0
3 2010 Actor -- Leading Role Colin Firth The King's Speech {'King George VI'} 1
4 2010 Actor -- Leading Role James Franco 127 Hours {'Aron Ralston'} 0

Cleaning up the Additional Info column


In [132]:
# Break the additional info column into movie and character
nominations[["Movie", "Character"]] = nominations["Additional Info"].str.split(" {", expand=True)
nominations["Character"] = nominations["Character"].str.lstrip("'").str.rstrip("'}")
nominations = nominations.drop("Additional Info", axis=1)
preview(nominations)


Dimensions: 220 rows x 6 columns
C:\Users\IBM_ADMIN\Anaconda2\lib\site-packages\ipykernel\__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
Out[132]:
Year Category Nominee Won? Movie Character
0 2010 Actor -- Leading Role Javier Bardem 0 Biutiful Uxbal
1 2010 Actor -- Leading Role Jeff Bridges 0 True Grit Rooster Cogburn
2 2010 Actor -- Leading Role Jesse Eisenberg 0 The Social Network Mark Zuckerberg
3 2010 Actor -- Leading Role Colin Firth 1 The King's Speech King George VI
4 2010 Actor -- Leading Role James Franco 0 127 Hours Aron Ralston

Exporting to SQLite


In [133]:
import sqlite3

conn = sqlite3.connect("nominations.db")
nominations.to_sql("nominations", conn, index=False)

Verifying in SQL


In [138]:
query = "pragma table_info(nominations);"
results = conn.execute(query).fetchall()

for row in results:
    print(row)


(0, u'Year', u'INTEGER', 0, None, 0)
(1, u'Category', u'TEXT', 0, None, 0)
(2, u'Nominee', u'TEXT', 0, None, 0)
(3, u'Won?', u'INTEGER', 0, None, 0)
(4, u'Movie', u'TEXT', 0, None, 0)
(5, u'Character', u'TEXT', 0, None, 0)

In [137]:
query = "SELECT * FROM nominations LIMIT 10;"
for row in conn.execute(query).fetchall():
    print(row)

conn.close()


(2010, u'Actor -- Leading Role', u'Javier Bardem', 0, u'Biutiful', u'Uxbal')
(2010, u'Actor -- Leading Role', u'Jeff Bridges', 0, u'True Grit', u'Rooster Cogburn')
(2010, u'Actor -- Leading Role', u'Jesse Eisenberg', 0, u'The Social Network', u'Mark Zuckerberg')
(2010, u'Actor -- Leading Role', u'Colin Firth', 1, u"The King's Speech", u'King George VI')
(2010, u'Actor -- Leading Role', u'James Franco', 0, u'127 Hours', u'Aron Ralston')
(2010, u'Actor -- Supporting Role', u'Christian Bale', 1, u'The Fighter', u'Dicky Eklund')
(2010, u'Actor -- Supporting Role', u'John Hawkes', 0, u"Winter's Bone", u'Teardrop')
(2010, u'Actor -- Supporting Role', u'Jeremy Renner', 0, u'The Town', u'James Coughlin')
(2010, u'Actor -- Supporting Role', u'Mark Ruffalo', 0, u'The Kids Are All Right', u'Paul')
(2010, u'Actor -- Supporting Role', u'Geoffrey Rush', 0, u"The King's Speech", u'Lionel Logue')

In [ ]: