In [126]:
import pandas as pd
academy_awards = pd.read_csv("academy_awards.csv", encoding="ISO-8859-1")
def preview(df):
print("Dimensions: {0} rows x {1} columns".format(df.shape[0], df.shape[1]))
return df.head()
preview(academy_awards)
Out[126]:
In [127]:
# Check if unnamed columns contain useful data
for column in range(5, 11):
unnamed = "Unnamed: " + str(column)
print(academy_awards[unnamed].value_counts())
In [128]:
# Cleanse the year column
print(academy_awards["Year"].value_counts().head())
academy_awards["Year"] = academy_awards["Year"].str[0:4].astype(int)
In [129]:
# Filter data after 2000 and drop unnamed columns
later_than_2000 = academy_awards[academy_awards["Year"] >= 2000].ix[:,0:5]
preview(later_than_2000)
Out[129]:
In [130]:
# Filter data of particular award categories
award_categories = ["Actor -- Leading Role", "Actor -- Supporting Role", "Actress -- Leading Role", "Actress -- Supporting Role"]
nominations = later_than_2000[later_than_2000["Category"].isin(award_categories)]
preview(nominations)
Out[130]:
In [131]:
yes_no = {
"YES": 1,
"NO": 0
}
print(nominations["Won?"].value_counts())
nominations["Won?"] = nominations["Won?"].map(yes_no)
print(nominations["Won?"].value_counts())
preview(nominations)
Out[131]:
In [132]:
# Break the additional info column into movie and character
nominations[["Movie", "Character"]] = nominations["Additional Info"].str.split(" {", expand=True)
nominations["Character"] = nominations["Character"].str.lstrip("'").str.rstrip("'}")
nominations = nominations.drop("Additional Info", axis=1)
preview(nominations)
Out[132]:
In [133]:
import sqlite3
conn = sqlite3.connect("nominations.db")
nominations.to_sql("nominations", conn, index=False)
In [138]:
query = "pragma table_info(nominations);"
results = conn.execute(query).fetchall()
for row in results:
print(row)
In [137]:
query = "SELECT * FROM nominations LIMIT 10;"
for row in conn.execute(query).fetchall():
print(row)
conn.close()
In [ ]: