In [1]:
#supress warning message
import warnings; warnings.simplefilter("ignore")
In [2]:
#import some extension libraries
import pandas as pd #import pandas library
In [3]:
superstore = pd.ExcelFile(r'/Users/ishmaelamin/Documents/Northwestern_University/2016_Fall/Predict_480/480_Group_Final_Project/Superstore_Sales_Data.xlsx')
In [4]:
sheet_1 = pd.read_excel(superstore,sheetname=0)
In [5]:
sheet_2 = pd.read_excel(superstore,sheetname=1)
In [6]:
#display worksheet names in the workbook
read_sheets_name = superstore.sheet_names
print(read_sheets_name)
#count sheets in the workbook
print("There are %d sheets in the workbook" % len(read_sheets_name))
In [7]:
#Display the recordsin the first sheet
sheet_1.head()
len(sheet_1)
Out[7]:
In [8]:
len(sheet_2)
Out[8]:
In [9]:
sheet_1.ix[:,0:4].head()
Out[9]:
In [10]:
#sheet_1[["id","Order_Date"]].head()
sheet_1.set_index('Product_Type_ID',inplace=True)
In [11]:
sheet_2.set_index('Product_Type_ID',inplace=True)
In [12]:
sheet_1.head()
Out[12]:
In [13]:
sheet_2.head()
Out[13]:
In [14]:
#result = pd.merge(sheet_1, sheet_2, how='left', on=['Product_Type_ID'])
internal = sheet_1.join(sheet_2)
In [15]:
len(internal)
Out[15]:
In [16]:
internal = internal[["id","Order_Date","Internal_Cost","Internal_Sale_Price","Customer_Segment","Product_Category","Product_Sub_Category","Product_Type","Internal_Product_Name","Vendor"]]
In [17]:
internal.head()
Out[17]:
In [18]:
internal['Business_Type'] = pd.Series('internal', index=internal.index)
In [19]:
internal.head()
Out[19]:
In [20]:
internal.columns = ["id","Order_Date","Cost","Price","Customer_Segment","Product_Category","Product_Sub_Category","Product_Type","Product_Name","Vendor","Business_Type"]
In [21]:
internal.head()
Out[21]:
In [22]:
external=sheet_1
In [23]:
# append column to both datasets that distinguish them as either internal or external
external['Business_Type'] = pd.Series('external', index=external.index)
In [24]:
external.head()
Out[24]:
In [25]:
#Add cost column so that the two dataframes can be concatenated
external['Cost'] = pd.Series(None, index=external.index)
In [26]:
external.head()
Out[26]:
In [27]:
# get a list of columns
cols = list(external)
In [28]:
# move the column to head of list using index, pop and insert
cols.insert(2, cols.pop(cols.index('Cost')))
cols
Out[28]:
In [29]:
# use ix to reorder
external = external.ix[:, cols]
external.head()
Out[29]:
In [30]:
internal.head()
Out[30]:
In [31]:
#Preserve the value of product_type_id as a column
internal['Product_Type_ID1'] = internal.index
In [32]:
internal.head()
Out[32]:
In [33]:
external['Product_Type_ID1'] = external.index
In [34]:
external.head()
Out[34]:
In [35]:
internal.set_index('id',inplace=True)
external.set_index('id',inplace=True)
In [36]:
result = pd.concat([external, internal], ignore_index=True)
In [37]:
len(result)
Out[37]:
In [38]:
result.head()
Out[38]:
In [ ]:
#results.columns = ["Business_Type","Cost","Customer_Segment","Order_Date","Price","Product_Category","Product_Name","Product_Sub_Category","Product_Type",Product_Type_ID","Vendor"]
In [39]:
result.columns = ["Order_Date","Cost","Price","Customer_Segment","Product_Category","Product_Sub_Category","Product_Type","Product_Name","Vendor","Business_Type","Product_Type_ID"]
In [40]:
result.head()
Out[40]:
In [41]:
result.reset_index()
Out[41]:
In [42]:
#export to file .csv
result.to_csv(r'/Users/ishmaelamin/Documents/Northwestern_University/2016_Fall/Predict_480/superstore_businesstypes.csv')
In [ ]: