In [1]:
import csv
datafile = open('/Users/kra7830/Desktop/MSDS_School/Info_Structures/dev/NightThree/tmdb_5000_movies.csv', 'r')
myreader = csv.reader(datafile)
In [338]:
#for i in myreader:
#    print i
##### this prints a lot of text, so it is left commented out
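A lighter-weight preview (a sketch): slice the reader with itertools so only the first few rows print.
import itertools
for row in itertools.islice(myreader, 3):
    print row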
In [2]:
import pandas as pd
# Read the CSV into a pandas data frame (df)
df = pd.read_csv('/Users/kra7830/Desktop/MSDS_School/Info_Structures/dev/NightThree/tmdb_5000_movies.csv', delimiter=',')
# read_csv already returns a DataFrame, so no extra pd.DataFrame() wrap is needed
"""
a budget
b genres -> embedded lists
c homepage
d id
e keywords -> embedded lists
f original_language
g original_title
h overview
i popularity
j production_companies -> embedded lists
k production_countries -> embedded lists
l release_date
m revenue
n runtime
o spoken_languages -> embedded lists
p status
q tagline
r title
s vote_average
t vote_count
"""
Out[2]:
In [3]:
import json
import pandas as pd
lst = df['genres'].values.tolist()
print lst[1]     # the raw JSON string of genres for the second movie
print type(lst)  # confirm we have a plain Python list
f1 is a dataframe that contains a non-JSON variable and an embedded JSON variable. The first test will be to enumerate through genres to unpack the JSON and loop over two lists simultaneously.
http://treyhunner.com/2016/04/how-to-loop-with-indexes-in-python/
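As the linked article explains, enumerate hands you an index to reach into a parallel list, while zip pairs the lists directly. A minimal sketch of both styles, with made-up data:
budgets = [237000000, 300000000]
genre_strings = ['[{"id": 28, "name": "Action"}]', '[{"id": 12, "name": "Adventure"}]']
# index style: use the position from enumerate to look up the parallel list
for i, g in enumerate(genre_strings):
    print budgets[i], g
# zip style: pair the two lists directly, no manual indexing
for b, g in zip(budgets, genre_strings):
    print b, g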
In [4]:
f1 = df[['budget','genres']]
f1.head()
Out[4]:
Testing the unpacking of JSON with enumeration...
In [5]:
### GOAL 2 / TEST 1: Unpack Collapsed Variables
ids = []
names = []
mel = []
dic = lst                    # list of JSON strings, one per movie
budget = list(f1['budget'])  # corresponding list of budgets, aligned by position
for i, d in enumerate(dic):
    d_lst = json.loads(d)    # parse the JSON string into a list of dicts
    budg = budget[i]         # budget for this movie
    for entry in d_lst:      # one entry per genre; the inner loop gets its own variable
        mel.append(budg)
        ids.append(entry['id'])
        names.append(entry['name'])
In [6]:
gf = pd.DataFrame({'Budget':mel, 'ID': ids, 'name': names})
print gf.head()
Looks like it worked... and the variables are unpacked into individual rows.
Organizing a little bit...
In [8]:
"""
a budget
b genres -> embedded lists
c homepage
d id
e keywords -> embedded lists
f original_language
g original_title
h overview
i popularity
j production_companies -> embedded lists
k production_countries -> embedded lists
l release_date
m revenue
n runtime
o spoken_languages -> embedded lists
p status
q tagline
r title
s vote_average
t vote_count
"""
df_new = df[['title','budget', 'homepage', 'id', 'original_language', 'original_title', 'overview', 'popularity',
'release_date', 'revenue', 'runtime', 'status', 'tagline', 'vote_average', 'vote_count' ]]
#df_new is the dataframe that has all variables EXCLUDING the JSON data...
Making lists of all the JSON variables...
In [9]:
import numpy as np
b_list = df['genres'].values.tolist()
e_list = df['keywords'].values.tolist()
j_list = df['production_companies'].values.tolist()
k_list = df['production_countries'].values.tolist()
o_list = df['spoken_languages'].values.tolist()
##thought: join each unpacked value to its movie title, make title the primary key, then left-merge the data frames by title
def unpack_json(x):
    global bf, ef, jf, kf, of  # global dataframes saved for further use
    ids = []
    names = []
    movie_key = []
    r_list = list(df_new['title'])  # titles, aligned by position with the collapsed data
    n = json.loads(x[1])
    xn = n[0].keys()  # infer the two key names (e.g. 'id' and 'name') from a sample row
    for i, row in enumerate(x):
        movie_title = r_list[i]  # movie title is the key
        js = json.loads(row)     # parse the JSON string for this movie
        for entry in js:
            movie_key.append(movie_title)
            ids.append(entry[xn[0]])
            names.append(entry[xn[1]])
    # identity checks decide which global dataframe the lists belong to
    if x is e_list:  # keywords
        ef = pd.DataFrame({'Title': movie_key, 'Keyword_ID': ids, 'Keyword_name': names})
        print 'Success'
    elif x is b_list:  # genres
        bf = pd.DataFrame({'Title': movie_key, 'Genres_ID': ids, 'Genres_name': names})
        print 'Success'
    elif x is j_list:  # production_companies
        # remember: dict.keys() order is arbitrary here, so ID and name may come out swapped
        jf = pd.DataFrame({'Title': movie_key, 'ProdComp_ID': ids, 'ProdComp_name': names})
        print 'Success'
    elif x is k_list:  # production_countries
        kf = pd.DataFrame({'Title': movie_key, 'ProdCty_ID': ids, 'ProdCty_name': names})
        print 'Success'
    elif x is o_list:  # spoken_languages
        of = pd.DataFrame({'Title': movie_key, 'Lang_ID': ids, 'Lang_name': names})
        print of.head()
        print 'Success'
    else:
        print "NOPE"
unpack_json(e_list)
unpack_json(b_list)
unpack_json(j_list)
unpack_json(k_list)
unpack_json(o_list)
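As an aside, a more idiomatic variant of the same unpacking (a sketch, not a drop-in replacement: it returns the frame instead of assigning to a global, and assumes the keys are literally 'id' and 'name'):
def unpack_json_v2(json_strings, titles, id_col, name_col):
    rows = []
    for title, s in zip(titles, json_strings):
        for entry in json.loads(s):
            rows.append({'Title': title, id_col: entry['id'], name_col: entry['name']})
    return pd.DataFrame(rows)
# e.g. bf_alt = unpack_json_v2(b_list, list(df_new['title']), 'Genres_ID', 'Genres_name')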
"Wide data sets are good for exploration, but 'long' data sets are good for training. Let's attempt to expand all the collapsed field vertically instead of horizontally. Does this result in data duplication? What do you think about that? Yes and No are both correct -- but what's the context?"
Yes, it duplicated a lot of values. Yes, that duplication can increase signal for some classes of models. However, wide data is also what many statistical techniques expect, such as logistic regression.
-- Now that I have the data unpacked, I will left-join it to the dataframe df_new on movie title. This elongates the data vertically, adding a row for each unique unpacked value.
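A tiny illustration (made-up data) of why the left join elongates: one movie row fans out into one row per genre, and the movie-level values are duplicated across them.
left = pd.DataFrame({'title': ['Avatar'], 'budget': [237000000]})
right = pd.DataFrame({'Title': ['Avatar', 'Avatar'], 'Genres_name': ['Action', 'Adventure']})
print left.join(right.set_index('Title'), on='title', how='left')
# two rows for Avatar; the budget is duplicated on both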
In [10]:
### JOIN to make long vertical Data Set
#caller.join(other.set_index('key'), on='key')
r = df_new.join(bf.set_index('Title'), on='title', how='left') #bf
r1 = r.join(ef.set_index('Title'), on='title', how='left') #ef
r2 = r1.join(jf.set_index('Title'), on='title', how='left') #jf
r3 = r2.join(kf.set_index('Title'), on='title', how='left') #kf
final_long_df = r3.join(of.set_index('Title'), on='title', how='left') #of = Last join, so final
In [11]:
print final_long_df.head(2)
#print final_long_df.count()
Looking at counts...
In [12]:
print str("#this is OG data with no JSON" "\n"), df_new.count()
In [13]:
print bf['Genres_name'].count()
print "\n"
print "#this is genres unpacked by movie\n", bf.head()
print "\n"
print "#this is genres unpacked by movie, counts\n", bf.count()
print "\n"
#print r.count()
In [14]:
pd.set_option('display.max_rows', 500)
In [15]:
#Getting all the unique values (the 'name' keys) to store for later use as features
def store_keys(x):
    global key_un, gen_un, prod_comp_un, prod_cty_un, lang_un
    ident = []
    n = json.loads(x[1])
    xn = n[0].keys()  # infer key names from a sample row
    for row in x:
        js = json.loads(row)  # parse the JSON string for this movie
        for entry in js:
            ident.append(entry[xn[1]])  # collect only the 'name' values
    if x is e_list:  # keywords
        key_un = pd.DataFrame(np.unique(ident))
        print key_un.head()
        print 'Success'
    elif x is b_list:  # genres
        gen_un = pd.DataFrame(np.unique(ident))
        print 'Success'
    elif x is j_list:  # production_companies
        prod_comp_un = pd.DataFrame(np.unique(ident))
        print 'Success'
    elif x is k_list:  # production_countries
        prod_cty_un = pd.DataFrame(np.unique(ident))
        print 'Success'
    elif x is o_list:  # spoken_languages
        lang_un = pd.DataFrame(np.unique(ident))
        print 'Success'
    else:
        print "NOPE"
store_keys(e_list)
store_keys(b_list)
store_keys(j_list)
store_keys(k_list)
store_keys(o_list)
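A possible shortcut, as an aside (assuming the bf/ef/jf/kf/of frames from unpack_json are still in scope): the unique names can come straight from the already-unpacked frames. Note that Series.unique preserves encounter order, whereas np.unique above sorts.
gen_un_alt = pd.DataFrame(bf['Genres_name'].unique())
key_un_alt = pd.DataFrame(ef['Keyword_name'].unique())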
Testing to see if the keys were captured... These will eventually be used as features in the wide data set.
In [16]:
lang_un[0].head()
Out[16]:
In [17]:
df_genres = pd.get_dummies(bf['Genres_name'])
df_keyword = pd.get_dummies(ef['Keyword_name'])
df_prod_name = pd.get_dummies(jf['ProdComp_ID'])
df_prod_country = pd.get_dummies(kf['ProdCty_name'])
df_lang = pd.get_dummies(of['Lang_name'])
In [18]:
df_prod_name.head() #nice function... very simple
Out[18]:
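What get_dummies is doing, shown on a toy series: one 0/1 indicator column per distinct value.
s = pd.Series(['Action', 'Comedy', 'Action'])
print pd.get_dummies(s)
#    Action  Comedy
# 0       1       0
# 1       0       1
# 2       1       0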
Concat directly back to the unpacked dataframes...
In [19]:
genres_wide = pd.concat([bf, df_genres], axis=1)
keyword_wide = pd.concat([ef, df_keyword], axis=1)
prod_name_wide = pd.concat([jf, df_prod_name], axis=1)
prod_country_wide = pd.concat([kf, df_prod_country], axis=1)
lang_wide = pd.concat([of, df_lang], axis=1)
genres_wide.head()
Out[19]:
Removing names and IDs...
In [20]:
df_gr = genres_wide.drop(['Genres_name', 'Genres_ID'], 1)
df_ky = keyword_wide.drop(['Keyword_ID', 'Keyword_name'], 1)
df_pn = prod_name_wide.drop(['ProdComp_ID', 'ProdComp_name'], 1)
df_pc = prod_country_wide.drop(['ProdCty_name', 'ProdCty_ID'], 1)
df_ln = lang_wide.drop(['Lang_ID', 'Lang_name'], 1)
In [21]:
df_ln.head()
Out[21]:
In [22]:
### create pivot tables as DFs via to_records, then rename Title for a simpler join...
def multi_table_creation(x):
    global gr_table, ky_table, pn_table, pc_table, ln_table
    if x is df_gr:
        gr_table = pd.pivot_table(x, index=['Title'], aggfunc=np.sum)
        gr_table = pd.DataFrame(gr_table.to_records())
        gr_table = gr_table.rename(index=str, columns={"Title": "title"})
        print "OK"
    elif x is df_ky:
        ky_table = pd.pivot_table(x, index=['Title'], aggfunc=np.sum)
        ky_table = pd.DataFrame(ky_table.to_records())
        ky_table = ky_table.rename(index=str, columns={"Title": "title"})
    elif x is df_pn:
        pn_table = pd.pivot_table(x, index=['Title'], aggfunc=np.sum)
        pn_table = pd.DataFrame(pn_table.to_records())
        pn_table = pn_table.rename(index=str, columns={"Title": "title"})
    elif x is df_pc:
        pc_table = pd.pivot_table(x, index=['Title'], aggfunc=np.sum)
        pc_table = pd.DataFrame(pc_table.to_records())
        pc_table = pc_table.rename(index=str, columns={"Title": "title"})
    elif x is df_ln:
        ln_table = pd.pivot_table(x, index=['Title'], aggfunc=np.sum)
        ln_table = pd.DataFrame(ln_table.to_records())
        ln_table = ln_table.rename(index=str, columns={"Title": "title"})
multi_table_creation(df_gr)
multi_table_creation(df_ky)
multi_table_creation(df_pn)
multi_table_creation(df_pc)
multi_table_creation(df_ln)
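An equivalent, arguably simpler recipe per table (a sketch): groupby + sum collapses the one-row-per-genre dummies back down to one row per title.
gr_table_alt = df_gr.groupby('Title').sum().reset_index().rename(columns={'Title': 'title'})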
In [23]:
ky_table.head() #title is now a column name, not a pivoted var...
Out[23]:
Now we have tables keyed on unique movie titles, each with a wide set of features.
In [24]:
t = df_new.join(gr_table.set_index('title'), on='title', how='left') #bf
In [27]:
t.head()
Out[27]:
The dummy values changed to float: the left join introduces NaNs for titles with no matching genre rows, and an integer column cannot hold NaN, so pandas upcasts it to float64.
In [27]:
t.iloc[:,15].head() ### Values changed to float64... see below for fix
Out[27]:
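A minimal demo of that upcasting behavior:
s = pd.Series([1, 0])
print s.dtype                     # int64
print s.reindex([0, 1, 2]).dtype  # float64: the new label has no value, and NaN forces a float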
In [29]:
#gr_table, ky_table, pn_table, pc_table, ln_table
t = df_new.join(gr_table.set_index('title'), on='title', how='left') #bf
ky_table = ky_table.sort_values(by = 'title')
t1 = t.join(ky_table.set_index('title'), on='title', how='left', lsuffix='_left', rsuffix='_right')
t2 = t1.join(pn_table.set_index('title'), on='title', how='left', lsuffix='_left', rsuffix='_right') #bf
t3 = t2.join(pc_table.set_index('title'), on='title', how='left', lsuffix='_left', rsuffix='_right') #bf
t4 = t3.join(ln_table.set_index('title'), on='title', how='left', lsuffix='_left', rsuffix='_right') #bf
In [30]:
t4.head()
Out[30]:
Filling NaNs before INT conversion...
In [31]:
t4.fillna(0, inplace=True)
In [32]:
#t['History'] = t['History'].apply(int)
#key_un, gen_un, prod_comp_un, prod_cty_un, lang_un  # the keys stored earlier
for i in gen_un[0]:
    t4[i] = t4[i].apply(int)
#for i in key_un[0]:
#    t4[i] = t4[i].apply(int)
#for i in prod_comp_un[0]:
#    t4[i] = t4[i].apply(int)
#for i in prod_cty_un[0]:
#    t4[i] = t4[i].apply(int)
#for i in lang_un[0]:
#    t4[i] = t4[i].apply(int)
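A vectorized alternative (a sketch, assuming every genre name survived the join as a column of t4): cast all genre columns in one call instead of looping.
genre_cols = list(gen_un[0])
t4[genre_cols] = t4[genre_cols].astype(int)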
In [51]:
f = pd.DataFrame(t4.iloc[0]) # for the example, select just a single row of the data
In [290]:
pd.set_option('display.max_columns', 100) # expand to see more variables
In [291]:
f.T
Out[291]:
Importing some stats packages...
In [71]:
import scipy.stats as stats
In [140]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
In [177]:
log_test = t4[['title','budget', 'original_language','runtime','vote_average','Action', 'Adventure',
'Comedy','Fantasy', 'Foreign', 'Family', 'History', 'Horror', 'Music', 'Thriller', 'War']]
In [320]:
results = smf.glm('Action ~ Adventure',
data=log_test, family=sm.families.Binomial()).fit()
#results = smf.glm('Action ~ Adventure + Comedy + Fantasy + Foreign + Family + History + Horror + Thriller + War',
# data=log_test, family=sm.families.Binomial()).fit()
In [321]:
print(results.summary())
In [322]:
results.params
Out[322]:
Unadjusted odds ratio example. Logistic regression is one technique that could be used to predict / analyze patterns across all 15,015 features in the wide data set.
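For reference, the odds ratio is just the exponential of the fitted coefficient; with a hypothetical coefficient of -1.2:
print np.round(np.exp(-1.2), 2)  # 0.3: membership in that genre multiplies the odds of 'Action' by 0.3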
In [332]:
for i, coef in enumerate(results.params):
    if i == 0:
        continue  # skip the intercept (the original bare 'next' was a no-op)
    print results.model.data.param_names[i], "multiplies the odds of also being an Action movie by", np.round(np.exp(coef), 1)