In [3]:
import pandas as pd
import numpy as np
In [211]:
# Read the four 'BB_main' CSV files into separate frames, in file order.
df, df1, df2, df3 = map(pd.read_csv, (
    'BB_main.csv',
    'BB_main_1.csv',
    'BB_main_2.csv',
    'BB_main_3.csv',
))
In [212]:
# Show the (rows, columns) shape of the first main table.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(df.shape)
# Count rows of df2 whose 'id' repeats an earlier row.
# NOTE(review): this inspects df2 although the shape printed above is df's -- confirm intended.
df2.id.duplicated().sum()
Out[212]:
In [213]:
# Remove the 'Unnamed: 0' column. The axis is passed by keyword because
# pandas 2.0 no longer accepts it as a positional argument.
df = df.drop('Unnamed: 0', axis=1)
In [214]:
# Show the shape of the second main table, then remove its 'Unnamed: 0' column.
print(df1.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
df1 = df1.drop('Unnamed: 0', axis=1)
# Count rows of df whose 'id' repeats an earlier row.
# NOTE(review): this inspects df, not the df1 frame cleaned above -- confirm intended.
df.id.duplicated().sum()
Out[214]:
In [215]:
# Show the shape of the third main table, remove its 'Unnamed: 0' column
# and preview the first three rows.
print(df2.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
df2 = df2.drop('Unnamed: 0', axis=1)
df2.head(3)
Out[215]:
In [216]:
# Show the shape of the fourth main table, remove its 'Unnamed: 0' column
# and preview the first three rows.
print(df3.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
df3 = df3.drop('Unnamed: 0', axis=1)
df3.head(3)
Out[216]:
In [217]:
# Stack the four main tables row-wise into one frame; ignore_index=True
# discards the per-file row labels and renumbers the combined rows.
BB_main = pd.concat(
    [df, df1, df2, df3],
    axis=0,
    join='outer',
    ignore_index=True,
)
In [218]:
# Peek at the combined main table: its shape, then the first three rows.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(BB_main.shape)
BB_main.head(3)
Out[218]:
In [219]:
# Report how many rows repeat an 'id' already seen earlier in the table.
# This single vectorised count replaces a Python loop that printed one
# "True" line per duplicated row.
print(BB_main.duplicated('id').sum())
# Drop the repeated ids (drop_duplicates keeps the first occurrence by
# default) and show the resulting shape.
BB_main = BB_main.drop_duplicates('id')
BB_main.shape
Out[219]:
In [220]:
# Read the four 'BB_flavors' CSV files into separate frames, in file order.
fdf, fdf1, fdf2, fdf3 = map(pd.read_csv, (
    'BB_flavors.csv',
    'BB_flavors_1.csv',
    'BB_flavors_2.csv',
    'BB_flavors_3.csv',
))
In [221]:
# Show the shape of the first flavors table, then tidy its columns.
print(fdf.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
fdf = fdf.drop('Unnamed: 0', axis=1)
# Rename the 'index' column to 'id'.
fdf = fdf.rename(columns={'index': 'id'})
# Count rows of fdf1 whose 'id' repeats an earlier row.
# NOTE(review): this inspects fdf1, not the fdf frame cleaned above -- confirm intended.
fdf1.id.duplicated().sum()
Out[221]:
In [222]:
# Show the shape of the second flavors table, remove its 'Unnamed: 0'
# column and preview the first three rows.
print(fdf1.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
fdf1 = fdf1.drop('Unnamed: 0', axis=1)
fdf1.head(3)
Out[222]:
In [223]:
# Show the shape of the third flavors table, then tidy its columns.
print(fdf2.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
fdf2 = fdf2.drop('Unnamed: 0', axis=1)
# Rename the 'index' column to 'id'.
fdf2 = fdf2.rename(columns={'index': 'id'})
# Count rows whose 'id' repeats an earlier row of this table.
fdf2.id.duplicated().sum()
Out[223]:
In [224]:
# Show the shape of the fourth flavors table, then tidy its columns.
print(fdf3.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
fdf3 = fdf3.drop('Unnamed: 0', axis=1)
# Rename the 'index' column to 'id'.
fdf3 = fdf3.rename(columns={'index': 'id'})
# Count rows whose 'id' repeats an earlier row of this table.
fdf3.id.duplicated().sum()
Out[224]:
In [225]:
# Stack the four flavors tables row-wise into one frame; ignore_index=True
# discards the per-file row labels and renumbers the combined rows.
BB_flavors = pd.concat(
    [fdf, fdf1, fdf2, fdf3],
    axis=0,
    join='outer',
    ignore_index=True,
)
In [226]:
# Peek at the combined flavors table: its shape, then the first two rows.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(BB_flavors.shape)
BB_flavors.head(2)
Out[226]:
In [227]:
# Report how many rows repeat an 'id' already seen earlier in the table.
# This single vectorised count replaces a Python loop that printed one
# "True" line per duplicated row.
print(BB_flavors.duplicated('id').sum())
# Drop the repeated ids (drop_duplicates keeps the first occurrence by
# default) and show the resulting shape.
BB_flavors = BB_flavors.drop_duplicates('id')
BB_flavors.shape
Out[227]:
In [228]:
# Read the four 'BB_cuisines' CSV files into separate frames, in file order.
cdf, cdf1, cdf2, cdf3 = map(pd.read_csv, (
    'BB_cuisines.csv',
    'BB_cuisines_1.csv',
    'BB_cuisines_2.csv',
    'BB_cuisines_3.csv',
))
In [229]:
# Show the shape of the first cuisines table, then tidy its columns.
print(cdf.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
cdf = cdf.drop('Unnamed: 0', axis=1)
# Rename the 'index' column to 'id'.
cdf = cdf.rename(columns={'index': 'id'})
print(cdf.columns)
# Count rows whose 'id' repeats an earlier row of this table.
cdf.id.duplicated().sum()
Out[229]:
In [230]:
# Show the shape of the second cuisines table, then remove its 'Unnamed: 0' column.
print(cdf1.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
cdf1 = cdf1.drop('Unnamed: 0', axis=1)
print(cdf1.columns)
# Count rows whose 'id' repeats an earlier row of this table.
cdf1.id.duplicated().sum()
Out[230]:
In [231]:
# Show the shape of the third cuisines table, then tidy its columns.
print(cdf2.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
cdf2 = cdf2.drop('Unnamed: 0', axis=1)
# Rename the 'index' column to 'id'.
cdf2 = cdf2.rename(columns={'index': 'id'})
print(cdf2.columns)
# Count rows whose 'id' repeats an earlier row of this table.
cdf2.id.duplicated().sum()
Out[231]:
In [232]:
# Show the shape of the fourth cuisines table, then tidy its columns.
print(cdf3.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
cdf3 = cdf3.drop('Unnamed: 0', axis=1)
# Rename the 'index' column to 'id'.
cdf3 = cdf3.rename(columns={'index': 'id'})
print(cdf3.columns)
# Count rows whose 'id' repeats an earlier row of this table.
cdf3.id.duplicated().sum()
Out[232]:
In [233]:
# Stack the four cuisines tables row-wise into one frame; ignore_index=True
# discards the per-file row labels and renumbers the combined rows.
BB_cuisines = pd.concat(
    [cdf, cdf1, cdf2, cdf3],
    axis=0,
    join='outer',
    ignore_index=True,
)
In [236]:
# Peek at the combined cuisines table: its shape, then the first three rows.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(BB_cuisines.shape)
BB_cuisines.head(3)
Out[236]:
In [237]:
# Report how many rows repeat an 'id' already seen earlier in the table.
# This single vectorised count replaces a Python loop that printed one
# "True" line per duplicated row.
print(BB_cuisines.duplicated('id').sum())
# Drop the repeated ids (drop_duplicates keeps the first occurrence by
# default) and show the resulting shape.
BB_cuisines = BB_cuisines.drop_duplicates('id')
BB_cuisines.shape
Out[237]:
In [238]:
# Read the four 'BB_details' CSV files into separate frames, in file order.
ddf, ddf1, ddf2, ddf3 = map(pd.read_csv, (
    'BB_details.csv',
    'BB_details_1.csv',
    'BB_details_2.csv',
    'BB_details_3.csv',
))
In [239]:
# Number of distinct ids that appear in both ddf3 and ddf2.
len(set(ddf3.id).intersection(ddf2.id))
Out[239]:
In [240]:
# Show the shape of the first details table, then remove its 'Unnamed: 0' column.
print(ddf.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
ddf = ddf.drop('Unnamed: 0', axis=1)
print(ddf.columns)
# Count rows whose 'id' repeats an earlier row of this table.
ddf.id.duplicated().sum()
Out[240]:
In [241]:
# Show the shape of the second details table, then remove its 'Unnamed: 0' column.
print(ddf1.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
ddf1 = ddf1.drop('Unnamed: 0', axis=1)
print(ddf1.columns)
# Count rows whose 'id' repeats an earlier row of this table.
ddf1.id.duplicated().sum()
Out[241]:
In [242]:
# Show the shape of the third details table, then remove its 'Unnamed: 0' column.
print(ddf2.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
ddf2 = ddf2.drop('Unnamed: 0', axis=1)
print(ddf2.columns)
# Count rows whose 'id' repeats an earlier row of this table.
ddf2.id.duplicated().sum()
Out[242]:
In [243]:
# Show the shape of the fourth details table, then remove its 'Unnamed: 0' column.
print(ddf3.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
ddf3 = ddf3.drop('Unnamed: 0', axis=1)
print(ddf3.columns)
# Count rows whose 'id' repeats an earlier row of this table.
ddf3.id.duplicated().sum()
Out[243]:
In [244]:
# Stack the four details tables row-wise into one frame; ignore_index=True
# discards the per-file row labels and renumbers the combined rows.
BB_details = pd.concat(
    [ddf, ddf1, ddf2, ddf3],
    axis=0,
    join='outer',
    ignore_index=True,
)
In [246]:
# Peek at the combined details table: its shape, then the first three rows.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(BB_details.shape)
BB_details.head(3)
Out[246]:
In [247]:
# Report how many rows repeat an 'id' already seen earlier in the table.
# This single vectorised count replaces a Python loop that printed one
# "True" line per duplicated row.
print(BB_details.duplicated('id').sum())
# Drop the repeated ids (drop_duplicates keeps the first occurrence by
# default) and show the resulting shape.
BB_details = BB_details.drop_duplicates('id')
BB_details.shape
Out[247]:
In [258]:
# Read the four 'BB_ingredients' CSV files into separate frames, in file order.
idf, idf1, idf2, idf3 = map(pd.read_csv, (
    'BB_ingredients.csv',
    'BB_ingredients_1.csv',
    'BB_ingredients_2.csv',
    'BB_ingredients_3.csv',
))
In [259]:
# Number of distinct ids that appear in both idf2 and idf.
len(set(idf2.id).intersection(idf.id))
Out[259]:
In [260]:
# Show the shape of the first ingredients table, then remove its 'Unnamed: 0' column.
print(idf.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
idf = idf.drop('Unnamed: 0', axis=1)
print(idf.columns)
# Count rows whose 'id' repeats an earlier row of this table.
idf.id.duplicated().sum()
Out[260]:
In [261]:
# Show the shape of the second ingredients table, then remove its 'Unnamed: 0' column.
print(idf1.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
idf1 = idf1.drop('Unnamed: 0', axis=1)
print(idf1.columns)
# Count rows whose 'id' repeats an earlier row of this table.
idf1.id.duplicated().sum()
Out[261]:
In [262]:
# Show the shape of the third ingredients table, then remove its 'Unnamed: 0' column.
print(idf2.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
idf2 = idf2.drop('Unnamed: 0', axis=1)
print(idf2.columns)
# Count rows whose 'id' repeats an earlier row of this table.
idf2.id.duplicated().sum()
Out[262]:
In [263]:
# Show the shape of the fourth ingredients table, then remove its 'Unnamed: 0' column.
print(idf3.shape)
# axis is given by keyword: pandas 2.0 removed the positional form of drop().
idf3 = idf3.drop('Unnamed: 0', axis=1)
print(idf3.columns)
# Count rows whose 'id' repeats an earlier row of this table.
idf3.id.duplicated().sum()
Out[263]:
In [264]:
# Stack the four ingredients tables row-wise into one frame; ignore_index=True
# discards the per-file row labels and renumbers the combined rows.
BB_ing = pd.concat(
    [idf, idf1, idf2, idf3],
    axis=0,
    join='outer',
    ignore_index=True,
)
In [267]:
# Move the 'id' column to the front, leaving the other columns in their
# current order.
cols = list(BB_ing)
cols.insert(0, cols.pop(cols.index('id')))
# .loc replaces the .ix indexer, which was removed in pandas 1.0; selecting
# columns by label is exactly what .ix did here.
BB_ing = BB_ing.loc[:, cols]
In [268]:
# Preview the first three rows of the reordered ingredients table.
BB_ing.head(n=3)
Out[268]:
In [269]:
# Report how many rows repeat an 'id' already seen earlier in the table.
# This single vectorised count replaces a Python loop that printed one
# "True" line per duplicated row.
print(BB_ing.duplicated('id').sum())
# Drop the repeated ids (drop_duplicates keeps the first occurrence by
# default) and show the resulting shape.
BB_ing = BB_ing.drop_duplicates('id')
BB_ing.shape
Out[269]:
In [270]:
# Use the 'id' column as the index of every combined table.
# Each name is rebound to a new frame instead of being mutated in place
# through a loop variable called `df`, so the earlier `df` frame is not
# overwritten. A frame whose index is already named 'id' is left alone,
# which makes re-running this cell safe.
_df = [
    frame if frame.index.name == 'id' else frame.set_index('id')
    for frame in (BB_main, BB_cuisines, BB_flavors, BB_details, BB_ing)
]
BB_main, BB_cuisines, BB_flavors, BB_details, BB_ing = _df
In [274]:
# Join the cuisines, flavors, details and ingredients tables onto the main
# table using the frames' index.
BB_data = BB_main.join(other=[BB_cuisines, BB_flavors, BB_details, BB_ing])
# Add a constant 'course' column labelling every row as Breakfast & Brunch.
BB_data = BB_data.assign(course='Breakfast_Brunch')
In [275]:
# Peek at the joined table: its shape, then the first three rows.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(BB_data.shape)
BB_data.head(3)
Out[275]:
In [276]:
# Write the joined table, including its index, to 'BB_Data.csv'.
BB_data.to_csv(path_or_buf='BB_Data.csv')