In [1]:
import pandas as pd
import numpy as np
In [2]:
df = pd.read_csv('SLD_main.csv')
df1 = pd.read_csv('SLD_main_1.csv')
df2 = pd.read_csv('SLD_main_2.csv')
df3 = pd.read_csv('SLD_main_3.csv')
In [3]:
print df.shape
df.head(3)
Out[3]:
In [4]:
df = df.drop('Unnamed: 0', 1)
In [5]:
print df1.shape
df1 = df1.drop('Unnamed: 0', 1)
df1.head(3)
Out[5]:
In [6]:
print df2.shape
df2 = df2.drop('Unnamed: 0', 1)
df2.head(3)
Out[6]:
In [7]:
print df3.shape
df3 = df3.drop('Unnamed: 0', 1)
df3.head(3)
Out[7]:
In [8]:
#concatenate the main tables.
SLD_main= pd.concat([df, df1, df2, df3])
#create a new dataframe with selected columns
SLD_main_reduced = SLD_main.drop(['recipeName', 'sourceDisplayName'], axis = 1)
In [9]:
#peek at dataframe
print SLD_main.shape
SLD_main.head(3)
Out[9]:
In [10]:
for i in SLD_main.duplicated('id'):
if i == True:
print i
SLD_main = SLD_main.drop_duplicates('id')
SLD_main.shape
Out[10]:
In [11]:
fdf = pd.read_csv('SLD_flavors.csv')
fdf1 = pd.read_csv('SLD_flavors_1.csv')
fdf2 = pd.read_csv('SLD_flavors_2.csv')
fdf3 = pd.read_csv('SLD_flavors_3.csv')
In [12]:
print fdf.shape
fdf = fdf.drop('Unnamed: 0', 1)
fdf = fdf.rename(columns = {'index':'id'})
fdf.head(3)
Out[12]:
In [13]:
print fdf1.shape
fdf1 = fdf1.drop('Unnamed: 0', 1)
fdf1 = fdf1.rename(columns = {'index':'id'})
fdf1.head(3)
Out[13]:
In [14]:
print fdf2.shape
fdf2 = fdf2.drop('Unnamed: 0', 1)
fdf2 = fdf2.rename(columns = {'index':'id'})
fdf2.head(3)
Out[14]:
In [15]:
print fdf3.shape
fdf3 = fdf3.drop('Unnamed: 0', 1)
fdf3 = fdf3.rename(columns = {'index':'id'})
fdf3.head(3)
Out[15]:
In [16]:
#concatenate the flavors tables.
SLD_flavors= pd.concat([fdf, fdf1, fdf2, fdf3])
In [17]:
#peek at dataframe
print SLD_flavors.shape
SLD_flavors.head(3)
Out[17]:
In [18]:
for i in SLD_flavors.duplicated('id'):
if i == True:
print i
SLD_flavors = SLD_flavors.drop_duplicates('id')
SLD_flavors.shape
Out[18]:
In [19]:
cdf = pd.read_csv('SLD_cuisines.csv')
cdf1 = pd.read_csv('SLD_cuisines_1.csv')
cdf2 = pd.read_csv('SLD_cuisines_2.csv')
cdf3 = pd.read_csv('SLD_cuisines_3.csv')
In [20]:
print cdf.shape
cdf = cdf.drop('Unnamed: 0', 1)
cdf = cdf.rename(columns = {'index':'id'})
print cdf.columns
cdf.head(3)
Out[20]:
In [21]:
print cdf1.shape
cdf1 = cdf1.drop('Unnamed: 0', 1)
cdf1 = cdf1.rename(columns = {'index':'id'})
print cdf1.columns
cdf1.head(3)
Out[21]:
In [22]:
print cdf2.shape
cdf2 = cdf2.drop('Unnamed: 0', 1)
cdf2 = cdf2.rename(columns = {'index':'id'})
print cdf2.columns
cdf2.head(3)
Out[22]:
In [23]:
print cdf3.shape
cdf3 = cdf3.drop('Unnamed: 0', 1)
cdf3 = cdf3.rename(columns = {'index':'id'})
print cdf3.columns
cdf3.head(3)
Out[23]:
In [24]:
#concatenate the cuisine tables.
SLD_cuisines= pd.concat([cdf, cdf1, cdf2, cdf3])
In [25]:
#peek at dataframe
print SLD_cuisines.shape
SLD_cuisines.head(3)
Out[25]:
In [26]:
for i in SLD_cuisines.duplicated('id'):
if i == True:
print i
SLD_cuisines = SLD_cuisines.drop_duplicates('id')
SLD_cuisines.shape
Out[26]:
In [27]:
ddf = pd.read_csv('SLD_details.csv')
ddf1 = pd.read_csv('SLD_details_1.csv')
ddf2 = pd.read_csv('SLD_details_2.csv')
ddf3 = pd.read_csv('SLD_details_3.csv')
In [28]:
print ddf.shape
ddf = ddf.drop('Unnamed: 0', 1)
print ddf.columns
ddf.head(3)
Out[28]:
In [29]:
print ddf1.shape
ddf1 = ddf1.drop('Unnamed: 0', 1)
print ddf1.columns
ddf1.head(3)
Out[29]:
In [30]:
print ddf2.shape
ddf2 = ddf2.drop('Unnamed: 0', 1)
print ddf2.columns
ddf2.head(3)
Out[30]:
In [31]:
print ddf3.shape
ddf3 = ddf3.drop('Unnamed: 0', 1)
print ddf3.columns
ddf3.head(3)
Out[31]:
In [32]:
#concatenate the details tables.
SLD_details= pd.concat([ddf, ddf1, ddf2, ddf3])
In [33]:
#peek at dataframe
print SLD_details.shape
SLD_details.head(3)
Out[33]:
In [34]:
for i in SLD_details.duplicated('id'):
if i == True:
print i
SLD_details = SLD_details.drop_duplicates('id')
SLD_details.shape
Out[34]:
In [35]:
idf = pd.read_csv('SLD_ingredients.csv')
idf1 = pd.read_csv('SLD_ingredients_1.csv')
idf2 = pd.read_csv('SLD_ingredients_2.csv')
idf3 = pd.read_csv('SLD_ingredients_3.csv')
In [36]:
print idf.shape
idf = idf.drop('Unnamed: 0', 1)
print idf.columns
idf.head(3)
Out[36]:
In [37]:
print idf1.shape
idf1 = idf1.drop('Unnamed: 0', 1)
print idf1.columns
idf1.head(3)
Out[37]:
In [38]:
print idf2.shape
idf2 = idf2.drop('Unnamed: 0', 1)
print idf2.columns
idf2.head(3)
Out[38]:
In [39]:
print idf3.shape
idf3 = idf3.drop('Unnamed: 0', 1)
print idf3.columns
idf3.head(3)
Out[39]:
In [40]:
#concatenate the ingredients tables.
SLD_ing= pd.concat([idf, idf1, idf2, idf3])
#create a new dataframe with selected columns
SLD_ing_reduced = SLD_ing[['id', 'ingredient_list']]
In [41]:
SLD_ing.head(3)
Out[41]:
In [45]:
#drop unnamed column & make id first column
cols = list(SLD_ing)
cols.insert(0, cols.pop(cols.index('id')))
SLD_ing = SLD_ing.ix[:, cols]
In [46]:
SLD_ing.head(3)
Out[46]:
In [47]:
for i in SLD_ing.duplicated('id'):
if i == True:
print i
SLD_ing = SLD_ing.drop_duplicates('id')
SLD_ing.shape
Out[47]:
In [49]:
# set index to column 'id'
_df = [SLD_main, SLD_main_reduced, SLD_cuisines, SLD_flavors, SLD_details, SLD_ing, SLD_ing_reduced]
for df in _df:
df.set_index('id', inplace = True)
In [50]:
# join dataframes
SLD_data = SLD_main.join([SLD_cuisines, SLD_flavors, SLD_details, SLD_ing])
SLD_data_reduced = SLD_main_reduced.join([SLD_flavors, SLD_details, SLD_ing_reduced])
# create a course column
SLD_data['course'] = 'salad'
SLD_data_reduced['course'] = 'salad'
In [51]:
SLD_data.head(3)
Out[51]:
In [52]:
#Save into a csv file
SLD_data.to_csv('SLD_data.csv')
SLD_data_reduced.to_csv('SLD_data_reduced.csv')