In [1]:
#supress warning message
import warnings; warnings.simplefilter("ignore")

In [2]:
#import some extension libraries
import pandas as pd #import pandas library

In [3]:
superstore = pd.ExcelFile(r'/Users/ishmaelamin/Documents/Northwestern_University/2016_Fall/Predict_480/480_Group_Final_Project/Superstore_Sales_Data.xlsx')

In [4]:
sheet_1 = pd.read_excel(superstore,sheetname=0)

In [5]:
sheet_2 = pd.read_excel(superstore,sheetname=1)

In [6]:
#display worksheet names in the workbook
read_sheets_name = superstore.sheet_names
print(read_sheets_name)

#count sheets in the workbook
print("There are %d sheets in the workbook" % len(read_sheets_name))


['External', 'Internal']
There are 2 sheets in the workbook

In [7]:
#Display the recordsin the first sheet
sheet_1.head()
len(sheet_1)


Out[7]:
403

In [8]:
len(sheet_2)


Out[8]:
18

In [9]:
sheet_1.ix[:,0:4].head()


Out[9]:
id Product_Type_ID Order_Date Price
0 1 1 2009-03-22 152.48
1 2 1 2010-12-22 152.48
2 3 1 2012-10-28 152.48
3 4 1 2012-09-24 152.48
4 5 1 2010-04-20 152.48

In [10]:
#sheet_1[["id","Order_Date"]].head()
sheet_1.set_index('Product_Type_ID',inplace=True)

In [11]:
sheet_2.set_index('Product_Type_ID',inplace=True)

In [12]:
sheet_1.head()


Out[12]:
id Order_Date Price Customer_Segment Product_Category Product_Sub_Category Product_Type Product_Name Vendor
Product_Type_ID
1 1 2009-03-22 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso
1 2 2010-12-22 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso
1 3 2012-10-28 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso
1 4 2012-09-24 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso
1 5 2010-04-20 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso

In [13]:
sheet_2.head()


Out[13]:
Internal_Product_Name Internal_Cost Internal_Sale_Price
Product_Type_ID
1 CPU 50 200
2 Graphics/video card 35 75
3 Speakers 15 25
6 Wi-Fi 20 50
7 Sound card 15 40

In [14]:
#result = pd.merge(sheet_1, sheet_2, how='left', on=['Product_Type_ID'])
internal = sheet_1.join(sheet_2)

In [15]:
len(internal)


Out[15]:
403

In [16]:
internal = internal[["id","Order_Date","Internal_Cost","Internal_Sale_Price","Customer_Segment","Product_Category","Product_Sub_Category","Product_Type","Internal_Product_Name","Vendor"]]

In [17]:
internal.head()


Out[17]:
id Order_Date Internal_Cost Internal_Sale_Price Customer_Segment Product_Category Product_Sub_Category Product_Type Internal_Product_Name Vendor
Product_Type_ID
1 1 2009-03-22 50 200 Enterprise Technology Computer Components CPU CPU Adesso
1 2 2010-12-22 50 200 Enterprise Technology Computer Components CPU CPU Adesso
1 3 2012-10-28 50 200 Enterprise Technology Computer Components CPU CPU Adesso
1 4 2012-09-24 50 200 Enterprise Technology Computer Components CPU CPU Adesso
1 5 2010-04-20 50 200 Enterprise Technology Computer Components CPU CPU Adesso

In [18]:
internal['Business_Type'] = pd.Series('internal', index=internal.index)

In [19]:
internal.head()


Out[19]:
id Order_Date Internal_Cost Internal_Sale_Price Customer_Segment Product_Category Product_Sub_Category Product_Type Internal_Product_Name Vendor Business_Type
Product_Type_ID
1 1 2009-03-22 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal
1 2 2010-12-22 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal
1 3 2012-10-28 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal
1 4 2012-09-24 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal
1 5 2010-04-20 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal

In [20]:
internal.columns = ["id","Order_Date","Cost","Price","Customer_Segment","Product_Category","Product_Sub_Category","Product_Type","Product_Name","Vendor","Business_Type"]

In [21]:
internal.head()


Out[21]:
id Order_Date Cost Price Customer_Segment Product_Category Product_Sub_Category Product_Type Product_Name Vendor Business_Type
Product_Type_ID
1 1 2009-03-22 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal
1 2 2010-12-22 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal
1 3 2012-10-28 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal
1 4 2012-09-24 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal
1 5 2010-04-20 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal

In [22]:
external=sheet_1

In [23]:
# append column to both datasets that distinguish them as either internal or external
external['Business_Type'] = pd.Series('external', index=external.index)

In [24]:
external.head()


Out[24]:
id Order_Date Price Customer_Segment Product_Category Product_Sub_Category Product_Type Product_Name Vendor Business_Type
Product_Type_ID
1 1 2009-03-22 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external
1 2 2010-12-22 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external
1 3 2012-10-28 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external
1 4 2012-09-24 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external
1 5 2010-04-20 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external

In [25]:
#Add cost column so that the two dataframes can be concatenated
external['Cost'] = pd.Series(None, index=external.index)

In [26]:
external.head()


Out[26]:
id Order_Date Price Customer_Segment Product_Category Product_Sub_Category Product_Type Product_Name Vendor Business_Type Cost
Product_Type_ID
1 1 2009-03-22 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external NaN
1 2 2010-12-22 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external NaN
1 3 2012-10-28 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external NaN
1 4 2012-09-24 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external NaN
1 5 2010-04-20 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external NaN

In [27]:
# get a list of columns
cols = list(external)

In [28]:
# move the column to head of list using index, pop and insert
cols.insert(2, cols.pop(cols.index('Cost')))
cols


Out[28]:
['id',
 'Order_Date',
 'Cost',
 'Price',
 'Customer_Segment',
 'Product_Category',
 'Product_Sub_Category',
 'Product_Type',
 'Product_Name',
 'Vendor',
 'Business_Type']

In [29]:
# use ix to reorder
external = external.ix[:, cols]
external.head()


Out[29]:
id Order_Date Cost Price Customer_Segment Product_Category Product_Sub_Category Product_Type Product_Name Vendor Business_Type
Product_Type_ID
1 1 2009-03-22 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external
1 2 2010-12-22 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external
1 3 2012-10-28 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external
1 4 2012-09-24 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external
1 5 2010-04-20 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external

In [30]:
internal.head()


Out[30]:
id Order_Date Cost Price Customer_Segment Product_Category Product_Sub_Category Product_Type Product_Name Vendor Business_Type
Product_Type_ID
1 1 2009-03-22 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal
1 2 2010-12-22 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal
1 3 2012-10-28 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal
1 4 2012-09-24 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal
1 5 2010-04-20 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal

In [31]:
#Preserve the value of product_type_id as a column
internal['Product_Type_ID1'] = internal.index

In [32]:
internal.head()


Out[32]:
id Order_Date Cost Price Customer_Segment Product_Category Product_Sub_Category Product_Type Product_Name Vendor Business_Type Product_Type_ID1
Product_Type_ID
1 1 2009-03-22 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal 1
1 2 2010-12-22 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal 1
1 3 2012-10-28 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal 1
1 4 2012-09-24 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal 1
1 5 2010-04-20 50 200 Enterprise Technology Computer Components CPU CPU Adesso internal 1

In [33]:
external['Product_Type_ID1'] = external.index

In [34]:
external.head()


Out[34]:
id Order_Date Cost Price Customer_Segment Product_Category Product_Sub_Category Product_Type Product_Name Vendor Business_Type Product_Type_ID1
Product_Type_ID
1 1 2009-03-22 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1
1 2 2010-12-22 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1
1 3 2012-10-28 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1
1 4 2012-09-24 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1
1 5 2010-04-20 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1

In [35]:
internal.set_index('id',inplace=True)
external.set_index('id',inplace=True)

In [36]:
result = pd.concat([external, internal], ignore_index=True)

In [37]:
len(result)


Out[37]:
806

In [38]:
result.head()


Out[38]:
Order_Date Cost Price Customer_Segment Product_Category Product_Sub_Category Product_Type Product_Name Vendor Business_Type Product_Type_ID1
0 2009-03-22 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1
1 2010-12-22 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1
2 2012-10-28 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1
3 2012-09-24 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1
4 2010-04-20 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1

In [ ]:
#results.columns = ["Business_Type","Cost","Customer_Segment","Order_Date","Price","Product_Category","Product_Name","Product_Sub_Category","Product_Type",Product_Type_ID","Vendor"]

In [39]:
result.columns = ["Order_Date","Cost","Price","Customer_Segment","Product_Category","Product_Sub_Category","Product_Type","Product_Name","Vendor","Business_Type","Product_Type_ID"]

In [40]:
result.head()


Out[40]:
Order_Date Cost Price Customer_Segment Product_Category Product_Sub_Category Product_Type Product_Name Vendor Business_Type Product_Type_ID
0 2009-03-22 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1
1 2010-12-22 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1
2 2012-10-28 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1
3 2012-09-24 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1
4 2010-04-20 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1

In [41]:
result.reset_index()


Out[41]:
index Order_Date Cost Price Customer_Segment Product_Category Product_Sub_Category Product_Type Product_Name Vendor Business_Type Product_Type_ID
0 0 2009-03-22 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1
1 1 2010-12-22 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1
2 2 2012-10-28 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1
3 3 2012-09-24 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1
4 4 2010-04-20 NaN 152.48 Enterprise Technology Computer Components CPU Adesso CPU Adesso external 1
5 5 2011-12-24 NaN 120.97 Enterprise Technology Computer Components Graphics/video card Canon BP1200DH 12-Digit graphics card Canon external 2
6 6 2009-09-17 NaN 120.97 Enterprise Technology Computer Components Graphics/video card Canon BP1200DH graphics card Canon external 2
7 7 2012-05-21 NaN 120.97 Enterprise Technology Computer Components Graphics/video card Canon BP1200DH graphics card Canon external 2
8 8 2010-06-07 NaN 140.99 Enterprise Technology Computer Components Graphics/video card Canon MP100DHII graphics card Canon external 2
9 9 2012-03-02 NaN 150.98 Enterprise Technology Computer Components Graphics/video card Canon MP41DH graphics card Canon external 2
10 10 2010-08-30 NaN 150.98 Enterprise Technology Computer Components Graphics/video card Canon MP41DH graphics card Canon external 2
11 11 2012-04-11 NaN 150.98 Enterprise Technology Computer Components Graphics/video card Canon MP41DH graphics card Canon external 2
12 12 2010-03-08 NaN 150.98 Enterprise Technology Computer Components Graphics/video card Canon MP41DH graphics card Canon external 2
13 13 2009-01-26 NaN 150.98 Enterprise Technology Computer Components Graphics/video card Canon MP41DH graphics card Canon external 2
14 14 2009-03-25 NaN 150.98 Enterprise Technology Computer Components Graphics/video card Canon MP41DH graphics card Canon external 2
15 15 2012-05-07 NaN 150.98 Enterprise Technology Computer Components Graphics/video card Canon MP41DH graphics card Canon external 2
16 16 2010-08-15 NaN 17.98 Enterprise Technology Computer Components Graphics/video card Canon P1-DHIII Palm graphics card Canon external 2
17 17 2012-05-28 NaN 17.98 Enterprise Technology Computer Components Graphics/video card Canon P1-DHIII Palm graphics card Canon external 2
18 18 2011-12-07 NaN 17.98 Enterprise Technology Computer Components Graphics/video card Canon P1-DHIII Palm graphics card Canon external 2
19 19 2011-12-02 NaN 17.98 Enterprise Technology Computer Components Graphics/video card Canon P1-DHIII Palm graphics card Canon external 2
20 20 2010-07-03 NaN 17.98 Enterprise Technology Computer Components Graphics/video card Canon P1-DHIII Palm graphics card Canon external 2
21 21 2009-04-07 NaN 40.48 Enterprise Technology Computer Components Graphics/video card Keytronic Designer graphics card Keytronic external 2
22 22 2011-04-17 NaN 40.48 Enterprise Technology Computer Components Graphics/video card Keytronic Designer graphics card Keytronic external 2
23 23 2012-12-21 NaN 20.97 Enterprise Technology Computer Components Wi-Fi Microsoft WiFi Microsoft external 6
24 24 2011-03-12 NaN 20.97 Enterprise Technology Computer Components Wi-Fi Microsoft WiFi Microsoft external 6
25 25 2011-04-20 NaN 20.97 Enterprise Technology Computer Components Wi-Fi Microsoft WiFi Microsoft external 6
26 26 2012-05-07 NaN 20.97 Enterprise Technology Computer Components Wi-Fi Microsoft WiFi Microsoft external 6
27 27 2011-07-22 NaN 20.97 Enterprise Technology Computer Components Wi-Fi Microsoft WiFi Microsoft external 6
28 28 2010-01-10 NaN 20.97 Enterprise Technology Computer Components Wi-Fi Microsoft WiFi Microsoft external 6
29 29 2009-05-13 NaN 30.42 Enterprise Technology Computer Components Wi-Fi Microsoft WiFi Microsoft external 6
... ... ... ... ... ... ... ... ... ... ... ... ...
776 776 2012-05-06 345.0 500.00 Enterprise Technology Computer Components RAM RAM Brother internal 19
777 777 2012-07-13 345.0 500.00 Enterprise Technology Computer Components RAM RAM Brother internal 19
778 778 2012-05-11 345.0 500.00 Enterprise Technology Computer Components RAM RAM Canon internal 19
779 779 2010-01-09 345.0 500.00 Enterprise Technology Computer Components RAM RAM Canon internal 19
780 780 2011-02-06 345.0 500.00 Enterprise Technology Computer Components RAM RAM Canon internal 19
781 781 2012-03-30 345.0 500.00 Enterprise Technology Computer Components RAM RAM Canon internal 19
782 782 2011-09-02 345.0 500.00 Enterprise Technology Computer Components RAM RAM Canon internal 19
783 783 2009-03-23 345.0 500.00 Enterprise Technology Computer Components RAM RAM Canon internal 19
784 784 2012-03-05 345.0 500.00 Enterprise Technology Computer Components RAM RAM Canon internal 19
785 785 2011-11-16 345.0 500.00 Enterprise Technology Computer Components RAM RAM Hewlett Packard internal 19
786 786 2010-02-27 345.0 500.00 Enterprise Technology Computer Components RAM RAM Hewlett Packard internal 19
787 787 2011-11-16 10.0 50.00 Enterprise Technology Computer Components microprocessor microprocessor Unknown internal 20
788 788 2012-03-03 10.0 50.00 Enterprise Technology Computer Components microprocessor microprocessor Unknown internal 20
789 789 2010-05-10 10.0 50.00 Enterprise Technology Computer Components microprocessor microprocessor Unknown internal 20
790 790 2011-09-15 10.0 50.00 Enterprise Technology Computer Components microprocessor microprocessor Unknown internal 20
791 791 2011-02-03 10.0 50.00 Enterprise Technology Computer Components microprocessor microprocessor Unknown internal 20
792 792 2009-09-19 10.0 50.00 Enterprise Technology Computer Components microprocessor microprocessor Unknown internal 20
793 793 2012-04-04 10.0 50.00 Enterprise Technology Computer Components microprocessor microprocessor Unknown internal 20
794 794 2012-10-27 10.0 50.00 Enterprise Technology Computer Components microprocessor microprocessor Unknown internal 20
795 795 2009-12-01 10.0 50.00 Enterprise Technology Computer Components microprocessor microprocessor Unknown internal 20
796 796 2010-09-19 10.0 50.00 Enterprise Technology Computer Components microprocessor microprocessor Unknown internal 20
797 797 2010-06-14 35.0 45.00 Enterprise Technology Computer Components harddrive harddrive Micro Innovations internal 21
798 798 2011-06-25 35.0 45.00 Enterprise Technology Computer Components harddrive harddrive Micro Innovations internal 21
799 799 2010-12-05 35.0 45.00 Enterprise Technology Computer Components harddrive harddrive Micro Innovations internal 21
800 800 2010-02-22 35.0 45.00 Enterprise Technology Computer Components harddrive harddrive Microsoft internal 21
801 801 2011-08-16 35.0 45.00 Enterprise Technology Computer Components harddrive harddrive Microsoft internal 21
802 802 2010-11-01 35.0 45.00 Enterprise Technology Computer Components harddrive harddrive Microsoft internal 21
803 803 2010-06-14 35.0 45.00 Enterprise Technology Computer Components harddrive harddrive PC Concepts internal 21
804 804 2012-02-10 35.0 45.00 Enterprise Technology Computer Components harddrive harddrive PC Concepts internal 21
805 805 2010-04-19 35.0 45.00 Enterprise Technology Computer Components harddrive harddrive PC Concepts internal 21

806 rows × 12 columns


In [42]:
#export to file .csv
result.to_csv(r'/Users/ishmaelamin/Documents/Northwestern_University/2016_Fall/Predict_480/superstore_businesstypes.csv')

In [ ]: