In [1]:
import pandas as pd
import numpy as np
In [53]:
df = pd.read_csv('SP_main.csv')
df1 = pd.read_csv('SP_main_1.csv')
df2 = pd.read_csv('SP_main_2.csv')
df3 = pd.read_csv('SP_main_3.csv')
In [54]:
print df.shape
df.head(3)
Out[54]:
In [55]:
df = df.drop('Unnamed: 0', 1)
In [56]:
print df1.shape
df1 = df1.drop('Unnamed: 0', 1)
df1.head(3)
Out[56]:
In [57]:
print df2.shape
df2 = df2.drop('Unnamed: 0', 1)
df2.head(3)
Out[57]:
In [58]:
print df3.shape
df3 = df3.drop('Unnamed: 0', 1)
df3.head(3)
Out[58]:
In [59]:
#concatenate the main tables.
SP_main= pd.concat([df, df1, df2, df3])
#create a new dataframe with selected columns
SP_main_reduced = SP_main.drop(['recipeName', 'sourceDisplayName'], axis = 1)
In [60]:
#peek at dataframe
print SP_main.shape
SP_main = SP_main.sort_values(by = 'id')
SP_main.head(3)
Out[60]:
In [61]:
for i in SP_main.duplicated('id'):
if i == True:
print i
SP_main = SP_main.drop_duplicates('id')
SP_main.shape
Out[61]:
In [62]:
fdf = pd.read_csv('SP_flavors.csv')
fdf1 = pd.read_csv('SP_flavors_1.csv')
fdf2 = pd.read_csv('SP_flavors_2.csv')
fdf3 = pd.read_csv('SP_flavors_3.csv')
In [63]:
print fdf.shape
fdf = fdf.drop('Unnamed: 0', 1)
fdf = fdf.rename(columns = {'index':'id'})
fdf.head(3)
Out[63]:
In [64]:
print fdf1.shape
fdf1 = fdf1.drop('Unnamed: 0', 1)
fdf1 = fdf1.rename(columns = {'index':'id'})
fdf1.head(3)
Out[64]:
In [65]:
print fdf2.shape
fdf2 = fdf2.drop('Unnamed: 0', 1)
fdf2 = fdf2.rename(columns = {'index':'id'})
fdf2.head(3)
Out[65]:
In [66]:
print fdf3.shape
fdf3 = fdf3.drop('Unnamed: 0', 1)
fdf3 = fdf3.rename(columns = {'index':'id'})
fdf3.head(3)
Out[66]:
In [67]:
#concatenate the flavors tables.
SP_flavors= pd.concat([fdf, fdf1, fdf2, fdf3])
In [68]:
#peek at dataframe
print SP_flavors.shape
SP_flavors = SP_flavors.sort_values(by= 'id')
SP_flavors.head(3)
Out[68]:
In [69]:
len([i for i, j in zip(SP_flavors.id, SP_main.id) if i == j])
Out[69]:
In [70]:
for i in SP_flavors.duplicated('id'):
if i == True:
print i
SP_flavors = SP_flavors.drop_duplicates('id')
SP_flavors.shape
Out[70]:
In [71]:
cdf = pd.read_csv('SP_cuisines.csv')
cdf1 = pd.read_csv('SP_cuisines_1.csv')
cdf2 = pd.read_csv('SP_cuisines_2.csv')
cdf3 = pd.read_csv('SP_cuisines_3.csv')
In [72]:
print cdf.shape
cdf = cdf.drop('Unnamed: 0', 1)
cdf = cdf.rename(columns = {'index':'id'})
print cdf.columns
cdf.head(3)
Out[72]:
In [73]:
print cdf1.shape
cdf1 = cdf1.drop('Unnamed: 0', 1)
cdf1 = cdf1.rename(columns = {'index':'id'})
print cdf1.columns
cdf1.head(3)
Out[73]:
In [74]:
print cdf2.shape
cdf2 = cdf2.drop('Unnamed: 0', 1)
cdf2 = cdf2.rename(columns = {'index':'id'})
print cdf2.columns
cdf2.head(3)
Out[74]:
In [75]:
print cdf3.shape
cdf3 = cdf3.drop('Unnamed: 0', 1)
cdf3 = cdf3.rename(columns = {'index':'id'})
print cdf3.columns
cdf3.head(3)
Out[75]:
In [76]:
#concatenate the cuisine tables.
SP_cuisines= pd.concat([cdf, cdf1, cdf2, cdf3])
In [77]:
#peek at dataframe
print SP_cuisines.shape
cols = list(SP_cuisines)
cols.insert(0, cols.pop(cols.index('id')))
SP_cuisines = SP_cuisines.ix[:, cols]
SP_cuisines = SP_cuisines.sort_values(by ='id')
SP_cuisines.head(3)
Out[77]:
In [78]:
len([i for i, j in zip(SP_main.id, SP_cuisines.id) if i == j])
Out[78]:
In [79]:
for i in SP_cuisines.duplicated('id'):
if i == True:
print i
SP_cuisines = SP_cuisines.drop_duplicates('id')
SP_cuisines.shape
Out[79]:
In [80]:
ddf = pd.read_csv('SP_details.csv')
ddf1 = pd.read_csv('SP_details_1.csv')
ddf2 = pd.read_csv('SP_details_2.csv')
ddf3 = pd.read_csv('SP_details_3.csv')
In [81]:
print len(set(ddf1.id) & set(df1.id))
print len(set(ddf2.id) & set(df2.id))
print len(set(ddf3.id) & set(df3.id))
In [82]:
print ddf.shape
ddf = ddf.drop('Unnamed: 0', 1)
print ddf.columns
ddf.head(3)
Out[82]:
In [83]:
print ddf1.shape
ddf1 = ddf1.drop('Unnamed: 0', 1)
print ddf1.columns
ddf1.head(3)
Out[83]:
In [84]:
print ddf2.shape
ddf2 = ddf2.drop('Unnamed: 0', 1)
print ddf2.columns
ddf2.head(3)
Out[84]:
In [85]:
print ddf3.shape
ddf3 = ddf3.drop('Unnamed: 0', 1)
print ddf3.columns
ddf3.head(3)
Out[85]:
In [86]:
#concatenate the details tables.
SP_details= pd.concat([ddf, ddf1, ddf2, ddf3])
In [87]:
#peek at dataframe
print SP_details.shape
SP_details = SP_details.sort_values(by = 'id')
SP_details.head(3)
Out[87]:
In [88]:
len([i for i, j in zip(SP_main.id, SP_details.id) if i == j])
Out[88]:
In [89]:
for i in SP_details.duplicated('id'):
if i == True:
print i
SP_details = SP_details.drop_duplicates('id')
SP_details.shape
Out[89]:
In [90]:
idf = pd.read_csv('SP_ingredients.csv')
idf1 = pd.read_csv('SP_ingredients_1.csv')
idf2 = pd.read_csv('SP_ingredients_2.csv')
idf3 = pd.read_csv('SP_ingredients_3.csv')
In [91]:
print len(set(idf1.id) & set(ddf1.id))
print len(set(idf2.id) & set(ddf2.id))
print len(set(idf3.id) & set(ddf3.id))
In [92]:
print idf.shape
idf = idf.drop('Unnamed: 0', 1)
print idf.columns
idf.head(3)
Out[92]:
In [93]:
print idf1.shape
idf1 = idf1.drop('Unnamed: 0', 1)
print idf1.columns
idf1.head(3)
Out[93]:
In [94]:
print idf2.shape
idf2 = idf2.drop('Unnamed: 0', 1)
print idf2.columns
idf2.head(3)
Out[94]:
In [95]:
print idf3.shape
idf3 = idf3.drop('Unnamed: 0', 1)
print idf3.columns
idf3.head(3)
Out[95]:
In [96]:
#concatenate the ingredients tables.
SP_ing= pd.concat([idf, idf1, idf2, idf3])
#create a new dataframe with selected columns
SP_ing_reduced = SP_ing[['id', 'ingredient_list']]
In [97]:
print SP_ing.shape
SP_ing.head(3)
Out[97]:
In [98]:
#drop unnamed column & make id first column
#SP_ing = SP_ing.drop('Unnamed: 0', 1)
cols = list(SP_ing)
cols.insert(0, cols.pop(cols.index('id')))
SP_ing = SP_ing.ix[:, cols]
In [99]:
print SP_ing.shape
SP_ing = SP_ing.sort_values(by = 'id')
SP_ing.head(3)
Out[99]:
In [100]:
len([i for i, j in zip(SP_main.id, SP_ing.id) if i != j])
Out[100]:
In [101]:
for i in SP_ing.duplicated('id'):
if i == True:
print i
SP_ing = SP_ing.drop_duplicates('id')
SP_ing.shape
Out[101]:
In [102]:
# set index to column 'id'
_df = [SP_main, SP_main_reduced, SP_cuisines, SP_flavors, SP_details, SP_ing, SP_ing_reduced]
for df in _df:
df.set_index('id', inplace = True)
In [103]:
# join dataframes
SP_data = SP_main.join([SP_cuisines, SP_flavors, SP_details, SP_ing])
SP_data_reduced = SP_main_reduced.join([SP_flavors, SP_details, SP_ing_reduced])
# create a course column
SP_data['course'] = 'soup'
SP_data_reduced['course'] = 'soup'
In [104]:
print SP_data.shape
SP_data.head(3)
Out[104]:
In [105]:
# save to csv
SP_data.to_csv('SP_data.csv')
SP_data_reduced.to_csv('SP_data_reduced.csv')