In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [4]:
# Load the main movie table; the file is decoded as latin1 rather than the default UTF-8.
df = pd.read_csv("../data_forlr.csv",encoding="latin1")
In [3]:
# Load the companion file from which the actor/director popularity columns are taken.
df_act_dir = pd.read_csv("FinalClean.csv",encoding="latin1")
In [2]:
# Keep only the two popularity scores and attach them to the main frame as new columns.
# NOTE(review): concat(axis=1) pairs rows by index label, so this presumably relies on
# both CSV files listing the movies in the same order - confirm.
popularity_cols = ["actor_popularity", "director_popularity"]
df_act_dir = df_act_dir.loc[:, popularity_cols]
df = pd.concat([df, df_act_dir], axis=1)
In [5]:
# Show (n_rows, n_columns) of the working frame.
df.shape
Out[5]:
In [6]:
# Count missing values per column.
df.isnull().sum()
Out[6]:
In [7]:
# Columns removed before any feature engineering.
unused_cols = ["Unnamed: 0", "imdb_id", "Title", "X.x", "X.y"]

# Original header -> snake_case name used by the rest of the notebook.
col_replace = {
    "Actors": "actors",
    "Country": "country",
    "Director": "director",
    "Genre": "genre",
    "IMDB.Rating": "imdb_rating",
    "IMDB.Votes": "imdb_votes",
    "Language": "language",
    "Released": "released",
    "Runtime": "runtime",
    "Year": "year",
    "Production": "production",
    "Rated": "rated",
}

df = df.drop(unused_cols, axis=1)
df = df.rename(columns=col_replace)
In [8]:
# Drop every row that is missing a value in at least one of the required columns.
# A single dropna(subset=...) replaces the previous per-column drop-by-index loop:
# it scans the frame once instead of copying it seven times, and it removes only the
# offending rows even if index labels are not unique.
mis_val_col = ["actors", "director", "genre", "imdb_votes", "runtime", "imdb_rating", "language"]
df = df.dropna(subset=mis_val_col)
In [9]:
# Show the per-column null counts together with the frame shape.
df.isnull().sum(),df.shape
Out[9]:
In [10]:
# Partition the column names by dtype: int64/float64 columns are treated as numeric
# features, everything else as categorical.
num_feat, cate_feat = [], []
for col_name in df.columns:
    is_numeric = df[col_name].dtype in ("int64", "float64")
    (num_feat if is_numeric else cate_feat).append(col_name)
In [11]:
# Print each feature list followed by its length.
print(num_feat,len(num_feat))
print(cate_feat, len(cate_feat))
In [12]:
# Plot the distribution of the IMDB rating column (distplot draws a histogram with a KDE overlay by default).
sns.distplot(df["imdb_rating"])
# df["imdb_rating"].hist()
Out[12]:
In [14]:
import math

# Replace budget with its natural logarithm.
# The column is overwritten in place, so running this cell twice logs it twice.
df["budget"] = df["budget"].map(math.log)
In [15]:
# Plot the distribution of budget.
# df["budget"].hist()
sns.distplot(df["budget"])
Out[15]:
In [16]:
# df["revenue"].hist()
# sns.distplot(df["revenue"],kde=False)
# Replace revenue (the regression target) with its natural logarithm.
# The column is overwritten in place, so running this cell twice logs it twice.
df["revenue"] = df["revenue"].map(math.log)
In [17]:
# Plot the distribution of revenue.
sns.distplot(df["revenue"])
Out[17]:
In [18]:
# Joint plot of budget against revenue with a fitted regression line (kind="reg").
sns.jointplot(x="budget",y="revenue",data=df,kind="reg")
Out[18]:
In [19]:
def _n_listed(value):
    """Return how many comma-separated entries the stringified value contains."""
    # A string with k commas always splits into k + 1 pieces.
    return str(value).count(",") + 1


# Replace the country list with the number of regions/countries involved in the production.
df["country"] = df["country"].map(_n_listed)
# df["country"].value_counts().plot.bar(figsize=(16,6))
# df["country"].nunique()
# /df.shape[0]).plot.bar(figsize=(16,6))
# sns.boxplot(x="country",y="revenue",data=df)
In [ ]:
# df["country"].value_counts()
# df = df.drop("country",axis=1)
In [20]:
# country now holds a count, so move it from the categorical list to the numeric list.
# list.remove raises ValueError if the name is already gone, so this cell cannot be re-run as is.
num_feat.append("country")
print(num_feat,len(num_feat))
cate_feat.remove("country")
print(cate_feat, len(cate_feat))
In [21]:
# One-hot encode the comma-separated genre labels: one 0/1 column per distinct label.
genre_dummies = df["genre"].str.get_dummies(sep=", ")
df = pd.concat([df, genre_dummies], axis=1)

# Fold related genres into a shared indicator and pool the listed genres into one bucket.
rare_genres = ["Music", "History", "Sport", "War", "Western", "Musical", "Documentary", "News"]
df["Thriller"] = df[["Thriller", "Horror"]].sum(axis=1)
df["Fantasy"] = df[["Fantasy", "Sci-Fi"]].sum(axis=1)
df["Other_genre"] = df[rare_genres].sum(axis=1)

# Summing indicators yields values above 1 when a film carries several folded labels, so
# clamp the merged columns back to 0/1. They are addressed by name: the previous
# positional slice list(df)[19:32] only hit them as long as the column layout never changed.
genre_lst = ["Thriller", "Fantasy", "Other_genre"]
df[genre_lst] = df[genre_lst].clip(upper=1)

# Remove the folded source columns and the raw genre string.
df = df.drop(rare_genres + ["Horror", "Sci-Fi", "genre"], axis=1)
In [22]:
# Lower-case the names of all columns from position 14 onward.
# NOTE(review): presumably positions 14+ are exactly the genre indicator columns - confirm
# the offset whenever the upstream column set changes.
genre_dict = {name: name.lower() for name in df.columns[14:]}
df = df.rename(columns=genre_dict)
In [23]:
# Register every column from position 14 onward as a numeric feature and retire the
# raw genre column from the categorical list.
num_feat.extend(df.columns[14:])
print(num_feat, len(num_feat))
cate_feat.remove("genre")
print(cate_feat, len(cate_feat))
In [24]:
# Strip every non-digit character (e.g. thousands separators) and convert the vote count to int.
# regex=True is passed explicitly because newer pandas treats the pattern as a literal
# string by default, which would leave the separators in place and make astype(int) fail.
# The raw-string prefix avoids the invalid "\D" escape warning.
df["imdb_votes"] = (
    df["imdb_votes"]
    .astype(str)
    .str.replace(r"\D+", "", regex=True)
    .astype(int)
)

# Histogram of the raw vote counts (no KDE overlay).
sns.distplot(df["imdb_votes"], kde=False)
Out[24]:
In [25]:
# Replace the vote count with its natural logarithm.
# The column is overwritten in place, so running this cell twice logs it twice.
df["imdb_votes"] = df["imdb_votes"].map(math.log)
In [26]:
# Plot the distribution of imdb_votes.
sns.distplot(df["imdb_votes"])
Out[26]:
In [27]:
# Move imdb_votes from the categorical list to the numeric list, then print both lists.
num_feat.append("imdb_votes")
cate_feat.remove("imdb_votes")
print(num_feat,len(num_feat))
print(cate_feat, len(cate_feat))
In [28]:
# Replace the language list with the number of comma-separated entries it contains.
df["language"] = [len(str(entry).split(",")) for entry in df["language"]]
In [29]:
# Show how many rows fall under each language count.
df["language"].value_counts()
Out[29]:
In [30]:
# Move language from the categorical list to the numeric list, then print both lists.
num_feat.append("language")
cate_feat.remove("language")
print(num_feat,len(num_feat))
print(cate_feat, len(cate_feat))
In [31]:
def _first_token(name):
    """Return the text before the first space; one-character names are returned unchanged."""
    if len(name) > 1:
        return name.split(" ")[0]
    return name


# First step of the frequency encoding: fill missing producers with "Unknown" and
# reduce each name to its first word.
production_filled = df["production"].replace(np.nan, "Unknown")
df["production"] = production_filled.map(_first_token)
# (df["production"].value_counts()/df.shape[0])[:100].plot.bar(figsize=(16,6))
# zip_freq = list(df_2014['addrzip'].value_counts()[:20].index)
# df_2014['addrzip'] = df_2014['addrzip'].map(lambda s:'others' if s not in zip_freq else s)
# list(df_2014['addrzip'].value_counts().index)
# zip_map = {'others':20,'750':0,'945':1,'112':2,'606':3,'300':4,'070':5,'331':6,'100':7,'770':8,
# '900':9,'117':10,'917':11,'104':12,'891':13,'330':14,'852':15,'921':16,'913':17,'926':18,'925':19}
# df_2014['addrzip'] = df_2014['addrzip'].map(lambda s: zip_map.get(s) if s in zip_map else s)
In [32]:
# The 20 most frequent production names (value_counts sorts by descending count).
prod_freq = df["production"].value_counts().head(20).index.tolist()
In [33]:
# Keep the names listed in prod_freq and pool every other producer into one bucket.
in_top_list = df["production"].isin(prod_freq)
df["production"] = df["production"].where(in_top_list, "other_productions")
In [34]:
# Count the rows per production bucket and keep the counts as a name -> count dict.
prod_counts = df["production"].value_counts()
prod_dict = prod_counts.to_dict()
In [35]:
# Frequency encoding: swap each production name for its row count from prod_dict.
# A name without an entry is passed through unchanged.
df["production"] = df["production"].map(lambda name: prod_dict.get(name, name))
In [36]:
# Inspect the distinct values of production after encoding.
df["production"].unique()
Out[36]:
In [37]:
# high cardinality: may use frequency encoding
# (df["production"].value_counts()/df.shape[0])[:].sum()
In [38]:
# Move production from the categorical list to the numeric list, then print both lists.
num_feat.append("production")
cate_feat.remove("production")
print(num_feat,len(num_feat))
print(cate_feat, len(cate_feat))
In [39]:
# Show how many rows carry each rating label.
df["rated"].value_counts()
Out[39]:
In [40]:
# Bar chart of the number of rows per rating label.
# df["rated"].value_counts().plot.bar()
sns.countplot(x="rated", data=df)
Out[40]:
In [41]:
# Box plot of revenue for each rating label.
sns.set_style('ticks')
a4_landscape_inches = (11.7, 8.27)  # the size of A4 paper
fig, ax = plt.subplots(figsize=a4_landscape_inches)
sns.boxplot(x="rated", y="revenue", data=df, ax=ax)
Out[41]:
In [42]:
# Treat a missing rating and the "NOT RATED" label as the same category, "UNRATED".
rated_filled = df["rated"].replace(np.nan, "UNRATED")
df["rated"] = rated_filled.replace("NOT RATED", "UNRATED")
In [43]:
# One-hot encode the rating label and append the indicator columns to the frame.
rated_dummies = df["rated"].str.get_dummies(sep=", ")
df = pd.concat([df, rated_dummies], axis=1)
In [44]:
# Inspect the columns from position 28 onward.
# NOTE(review): presumably these are exactly the rating indicator columns - confirm the offset if upstream columns change.
df.columns[28:]
Out[44]:
In [45]:
# Register every column from position 28 onward as a numeric feature and retire the
# raw rated column from the categorical list.
num_feat.extend(df.columns[28:])
cate_feat.remove("rated")
print(num_feat, len(num_feat))
print(cate_feat, len(cate_feat))
In [46]:
# Drop the raw rated column from the frame.
df = df.drop("rated",axis=1)
In [47]:
# Position of the release date column; the derived flags are inserted right after it.
index = df.columns.get_loc("released")

# Parse the release date strings into timestamps.
release_dates = pd.to_datetime(df["released"])

# 1 if the film was released on a Friday, Saturday or Sunday (dayofweek 4-6), else 0.
# Vectorised through the .dt accessor; a missing date compares False and yields 0,
# exactly as the previous element-wise loop did.
weekend_list = release_dates.dt.dayofweek.between(4, 6).astype(int).tolist()

# 1 if the film was released in one of the "dump months" listed below, else 0.
dump_months = [12, 1, 2, 8, 9]
dumpmonth_list = release_dates.dt.month.isin(dump_months).astype(int).tolist()

df.insert(loc=index + 1, column="released_on_weekend", value=weekend_list)
df.insert(loc=index + 2, column="released_on_dump_month", value=dumpmonth_list)
In [48]:
# Register the two release-date flags as numeric features and retire the raw released column
# from the categorical list, then print both lists.
num_feat.append("released_on_weekend")
num_feat.append("released_on_dump_month")
cate_feat.remove("released")
print(num_feat,len(num_feat))
print(cate_feat, len(cate_feat))
In [49]:
# Drop the raw released column from the frame.
df = df.drop("released",axis=1)
In [50]:
# Check the dtype of runtime.
df["runtime"].dtype
Out[50]:
In [51]:
# Convert runtime text to an integer: strip any leading/trailing 'm', 'i', 'n' characters,
# then let int() parse what is left (int() tolerates surrounding whitespace).
df["runtime"] = df["runtime"].str.strip("min").map(int)
In [52]:
# Plot the distribution of runtime.
sns.distplot(df["runtime"])
Out[52]:
In [53]:
# Move runtime from the categorical list to the numeric list, then print both lists.
num_feat.append("runtime")
cate_feat.remove("runtime")
print(num_feat,len(num_feat))
print(cate_feat, len(cate_feat))
In [54]:
# Drop the raw actors and director columns from the frame.
df = df.drop(["actors","director"],axis=1)
In [55]:
# Plot the distribution of actor_popularity.
sns.distplot(df["actor_popularity"])
Out[55]:
In [83]:
# Log-scale view of actor popularity (plot only; the column itself is not modified).
# The 0.01 offset is added inside the log so that a popularity of zero stays finite.
# Added after the log, it merely shifted the axis and log(0) still raised an error.
sns.distplot(np.log(df["actor_popularity"] + 0.01))
Out[83]:
In [81]:
# Log-scale view of director popularity (plot only; the column itself is not modified).
log_director_popularity = df["director_popularity"].map(math.log)
sns.distplot(log_director_popularity)
Out[81]:
In [57]:
# Plot the distribution of director_popularity.
sns.distplot(df["director_popularity"])
Out[57]:
In [58]:
# Retire actors and director from the categorical list, then print both lists.
cate_feat.remove("actors")
cate_feat.remove("director")
print(num_feat,len(num_feat))
print(cate_feat, len(cate_feat))
In [60]:
# Time-based split: films up to and including 2013 form the training set, later films the test set.
# revenue is the target; every other column is used as a feature.
features = df.drop("revenue", axis=1)
target = df["revenue"]

train_rows = df["year"] <= 2013
test_rows = df["year"] > 2013

x_train, y_train = features[train_rows], target[train_rows]
x_test, y_test = features[test_rows], target[test_rows]
In [61]:
# num_feat.remove("revenue")
# stand_feat = []
# nonstand_feat = []
# for feat in num_feat:
# if X[feat].nunique() > 2:
# stand_feat.append(feat)
# else:
# nonstand_feat.append(feat)
In [62]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaler_feat = scaler.fit_transform(X[stand_feat])
# X_feat = pd.DataFrame(scaler_feat,columns=X[stand_feat].columns)
# pd.concat([X_feat,X[nonstand_feat]],axis=1)
In [63]:
# Show (n_rows, n_columns) of the fully encoded frame.
df.shape
Out[63]:
In [65]:
# Heatmap of the pairwise correlations between all columns except the revenue target.
fig, ax = plt.subplots(figsize=(16, 10))
feature_corr = df.drop("revenue", axis=1).corr()
sns.heatmap(feature_corr, ax=ax)
Out[65]:
In [66]:
# sns.pairplot(df.drop("revenue",axis=1))
In [67]:
# from sklearn.model_selection import train_test_split
# x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
In [68]:
from sklearn.linear_model import LinearRegression
In [71]:
# Fit an ordinary least-squares model on the training split (fit returns the estimator itself).
lrm = LinearRegression().fit(x_train, y_train)
print(lrm.intercept_)

# One coefficient per feature, labelled by the feature name.
cdf = pd.DataFrame({"Coeff": lrm.coef_}, index=x_train.columns)

# Predict the held-out split and plot the residuals; they should look roughly normal.
predictions = lrm.predict(x_test)
residuals = y_test - predictions
sns.distplot(residuals)
Out[71]:
In [73]:
# Display the coefficient table.
cdf
Out[73]:
In [84]:
from sklearn import metrics

# Report MAE, MSE and RMSE of the test-set predictions, in that order.
print(metrics.mean_absolute_error(y_test, predictions))
mse = metrics.mean_squared_error(y_test, predictions)
print(mse)
print(np.sqrt(mse))
In [ ]: