Use the Adventure Works dataset to create the following reports. The data files can be directlty accessed by all users on ARC (our Hadoop Cluster).
No need to download or copy into your home directory!
Reference to the original data set https://msdn.microsoft.com/en-us/library/hh403424.aspx
In [1]:
import pandas as pd
import numpy as np
In [2]:
%%time
Employees = pd.read_excel('/home/data/AdventureWorks/Employees.xls')
print("shape:", Employees.shape)
In [6]:
%%time
Territory = pd.read_excel('/home/data/AdventureWorks/SalesTerritory.xls')
print("shape:", Territory.shape)
In [11]:
%%time
Customers = pd.read_excel('/home/data/AdventureWorks/Customers.xls')
print("shape:", Customers.shape)
In [10]:
%%time
Orders = pd.read_excel('/home/data/AdventureWorks/ItemsOrdered.xls')
print("shape:", Orders.shape)
In [23]:
Employees.MaritalStatus == 'M'
Out[23]:
In [29]:
Employees.loc[Employees.MaritalStatus == 'M', ['EmployeeID', 'FirstName', 'LastName', 'MaritalStatus']]
Out[29]:
In [4]:
filter = Employees.LastName.map(lambda x: x[0] == 'R')
In [5]:
Employees[filter]
Out[5]:
In [12]:
df = Employees[Employees.LastName.map(lambda x: x[0] == 'R')]
df[['EmployeeID', 'FirstName', 'LastName']]
Out[12]:
In [35]:
Employees.loc[Employees.LastName.map(lambda x: x[-1] == 'r'), ['EmployeeID', 'FirstName', 'LastName']]
Out[35]:
In [16]:
help(pd.Series.between)
In [14]:
type(Employees.LastName)
dir(type(Employees.LastName))
Out[14]:
In [36]:
Employees.loc[Employees.LastName.map(lambda x: '-' in x), ['EmployeeID', 'FirstName', 'LastName']]
Out[36]:
In [38]:
Employees.loc[(Employees.SalariedFlag==1) & (Employees.VacationHours>35), ['EmployeeID', 'FirstName', 'LastName', 'SalariedFlag', 'VacationHours']]
Out[38]:
In [ ]:
Employees.loc[(Employees.SalariedFlag==1) & (Employees.VacationHours>35), ['EmployeeID', 'FirstName', 'LastName', 'SalariedFlag', 'VacationHours']]
In [39]:
Employees.loc[
(Employees.SalariedFlag==1) & (Employees.VacationHours>35) & (Employees.CountryName != 'United States')
, ['EmployeeID', 'FirstName', 'LastName', 'SalariedFlag', 'VacationHours']
]
Out[39]:
In [40]:
Employees.loc[
(Employees.SalariedFlag==1) & (Employees.VacationHours>35) & (Employees.StateProvinceName == 'Washington')
, ['EmployeeID', 'FirstName', 'LastName', 'SalariedFlag', 'VacationHours', 'StateProvinceName']
]
Out[40]:
In [41]:
Employees.loc[
(Employees.SalariedFlag==1) | (Employees.VacationHours>35) | (Employees.StateProvinceName == 'Washington')
, ['EmployeeID', 'FirstName', 'LastName', 'SalariedFlag', 'VacationHours', 'StateProvinceName']
]
Out[41]:
In [47]:
'abc'.startswith('ab')
Out[47]:
In [49]:
Employees.loc[ Employees.JobTitle.map(lambda x: x.startswith('Production Technician'))
& (
(Employees.SalariedFlag==1) | (Employees.VacationHours>35) | (Employees.StateProvinceName == 'Washington')
), ['EmployeeID', 'FirstName', 'LastName', 'SalariedFlag', 'VacationHours', 'StateProvinceName', 'JobTitle']
]
Out[49]:
In [53]:
Employees.loc[Employees.City.isin(['Redmond', 'Seattle', 'Bellevue']),
['EmployeeID', 'FirstName', 'LastName', 'City']].sort_values('City')
Out[53]:
In [54]:
Employees.loc[Employees.City.isin(['Redmond', 'Seattle', 'Bellevue'])
& (Employees.StateProvinceName=='Washington'),
['EmployeeID', 'FirstName', 'LastName', 'City']].sort_values('City')
Out[54]:
In [93]:
### Note: when exporting the SQL table to Excel the one row with a ' ' in the Title was already removed...Sorry
Employees.loc[pd.isnull(Employees.Title) | Employees.Title.map(lambda x: len(str(x).strip())==0)
,['EmployeeID', 'Title', 'FirstName', 'LastName', 'City']]
Out[93]:
In [111]:
Employees.loc[Employees.VacationHours >= 60
, ['EmployeeID', 'FirstName', 'LastName', 'VacationHours', 'SickLeaveHours']]
Out[111]:
In [106]:
Employees.loc[Employees.VacationHours < 60
, ['EmployeeID', 'FirstName', 'LastName', 'VacationHours', 'SickLeaveHours']]
Out[106]:
In [112]:
Employees.loc[(Employees.VacationHours > 20) & (Employees.VacationHours < 60)
, ['EmployeeID', 'FirstName', 'LastName', 'VacationHours', 'SickLeaveHours']]
Out[112]:
In [114]:
Employees.loc[Employees.VacationHours.between(20, 60, inclusive=False)
, ['EmployeeID', 'FirstName', 'LastName', 'VacationHours', 'SickLeaveHours']]
Out[114]:
In [115]:
Employees.BirthDate.min()
Out[115]:
In [117]:
(Employees.BirthDate.min(), Employees.BirthDate.max())
Out[117]:
In [131]:
Employees.groupby(by=['Gender']).BirthDate.aggregate([min, max])
Out[131]:
In [134]:
Employees.groupby(by=['Gender', 'SalariedFlag']).BirthDate.aggregate([min, max])
Out[134]:
In [135]:
Employees.VacationHours.mean()
Out[135]:
In [136]:
(Employees.VacationHours.mean(), Employees.VacationHours.min())
Out[136]:
In [157]:
Employees.groupby(by=['JobTitle']).VacationHours.aggregate([np.mean, np.min])
Out[157]:
In [158]:
Employees.groupby(by=['JobTitle', 'MaritalStatus']).VacationHours.aggregate([np.mean, np.min])
Out[158]:
In [159]:
Employees.groupby(by=['JobTitle', 'MaritalStatus']).VacationHours.aggregate([np.mean, np.min, np.max])
Out[159]:
In [160]:
Employees.groupby(by=['JobTitle', 'MaritalStatus', 'StateProvinceName']).VacationHours.aggregate([np.mean, np.min, np.max])
Out[160]:
In [164]:
Employees[Employees.CountryName=='United States'] \
.groupby(by=['JobTitle', 'MaritalStatus', 'StateProvinceName']).VacationHours.aggregate([np.mean, np.min, np.max])
Out[164]:
In [165]:
Employees[Employees.CountryName=='United States'] \
.groupby(by=['MaritalStatus', 'StateProvinceName']).VacationHours.aggregate([np.mean, np.min, np.max])
Out[165]:
In [180]:
Employees[Employees.CountryName=='United States'] \
.groupby(by=['StateProvinceName']).filter(lambda x: x.VacationHours.mean()>30) \
.groupby(by=['StateProvinceName']).VacationHours.aggregate([np.mean, np.min, np.max])
Out[180]:
In [182]:
Employees[Employees.CountryName=='United States'] \
.groupby(by=['StateProvinceName']).filter(lambda x: (x.VacationHours.mean() >30) and (x.VacationHours.max()<50)) \
.groupby(by=['StateProvinceName']).VacationHours.aggregate([np.mean, np.min, np.max])
Out[182]:
In [183]:
Employees[Employees.CountryName!='United States'] \
.groupby(by=['StateProvinceName']).filter(lambda x: (x.VacationHours.mean() >30) and (x.VacationHours.max()<50)) \
.groupby(by=['StateProvinceName']).VacationHours.aggregate([np.mean, np.min, np.max])
Out[183]:
In [185]:
Employees.shape[0]
Out[185]:
In [186]:
Employees.groupby(by=['ManagerID']).EmployeeID.aggregate(lambda x: x.shape[0])
Out[186]:
In [193]:
Employees.loc[pd.isnull(Employees.EmployeeID)==False] \
.groupby(by=['ManagerID']).EmployeeID.aggregate(lambda x: x.shape[0])
Out[193]:
In [195]:
Employees.loc[pd.isnull(Employees.EmployeeID)==False] \
.groupby(by=['ManagerID']).filter(lambda x: x.shape[0]>=5) \
.groupby(by=['ManagerID']).EmployeeID.aggregate(lambda x: x.shape[0])
Out[195]:
In [196]:
Employees.VacationHours.mean()
Out[196]:
In [197]:
Employees.groupby(by=['StateProvinceName']).VacationHours.mean()
Out[197]:
In [198]:
Employees.groupby(by=['StateProvinceName', 'City']).VacationHours.mean()
Out[198]:
In [17]:
aggregations = {
'VacationHours' : {
'AverageVacationHours' : 'mean'
},
'EmployeeID' : {
'NumEmployees' : 'count'
}
}
Employees.groupby(by=['StateProvinceName', 'City'])\
.aggregate(aggregations)
Out[17]:
In [217]:
### Since Pandas already sorts the results, we show here how show resutls in DESCENDING order
aggregations = {
'VacationHours' : {
'AverageVacationHours' : 'mean'
},
'EmployeeID' : {
'NumEmployees' : 'count'
}
}
Employees.groupby(by=['StateProvinceName', 'City']).aggregate(aggregations) \
.sort_index(ascending=False, axis=0)
Out[217]:
In [241]:
## This adds the row indices back as columns
df = Employees.groupby(by=['StateProvinceName', 'City']).aggregate(aggregations) \
.sort_index(ascending=False, axis=0)
df['SPN'] = df.index.get_level_values(0)
df['C'] = df.index.get_level_values(1)
Out[241]:
In [362]:
# Version 1: first create new column 'CityState' ... rest is easy
aggregations = {
'VacationHours' : {
'AverageVacationHours' : 'mean'
},
'EmployeeID' : {
'NumEmployees' : 'count'
},
}
df = Employees.copy()
df['CityState'] = Employees.City.str.cat(Employees.StateProvinceName, sep=', ')
df.groupby(by = 'CityState').aggregate(aggregations).sort_index(ascending=False, axis=0)
Out[362]:
In [301]:
# Version 2: using the multi-indexed result and adding index values as new column 'CityState'
aggregations = {
'VacationHours' : {
'AverageVacationHours' : 'mean'
},
'EmployeeID' : {
'NumEmployees' : 'count'
},
}
df = Employees.groupby(by=['StateProvinceName', 'City']).aggregate(aggregations) \
.sort_index(ascending=False, axis=0)
df['CityState'] = [
r.City+', '+r.State \
for i, r in pd.DataFrame({
'City': df.index.get_level_values(1),
'State': df.index.get_level_values(0)
}).iterrows()
]
df
Out[301]:
In [309]:
aggregations = {
'VacationHours' : {
'AverageVacationHours' : 'mean',
'MinVacationHours' : min,
'MaxVacationHours' : max,
'DeltaVacationHoues' : lambda x: x.max()-x.min()
},
'EmployeeID' : {
'NumEmployees' : 'count'
},
}
Employees.groupby(by=['StateProvinceName', 'City']).aggregate(aggregations) \
.sort_index(ascending=False, axis=0)
Out[309]:
In [326]:
aggregations = {
'VacationHours' : {
'AverageVacationHours' : 'mean',
'MinVacationHours' : min,
'MaxVacationHours' : max,
'DeltaVacationHours' : lambda x: x.max()-x.min()
},
'EmployeeID' : {
'NumEmployees' : 'count'
},
}
df = Employees.groupby(by=['StateProvinceName', 'City']).aggregate(aggregations)
df.iloc[np.argsort(df['VacationHours']['DeltaVacationHours'])]
Out[326]:
In [50]:
def my_own_delta(x):
return x.max()-x.min()
aggregations = {
'VacationHours' : {
'AverageVacationHours' : 'mean',
'MinVacationHours' : np.min,
'MaxVacationHours' : np.max,
'DeltaVacationHours' : my_own_delta, ##lambda x: x.max()-x.min(),
'cnt' : pd.Series.count
},
'EmployeeID' : {
'NumEmployees' : 'count'
},
}
df = Employees.groupby(by=['StateProvinceName', 'City']).aggregate(aggregations)
df['VacationHours']['DeltaVacationHours']>1
df2 = df[df['VacationHours']['DeltaVacationHours']>1]
df2.columns = df2.columns.droplevel(0)
df2.reset_index()
##df2.iloc[np.argsort(df2['VacationHours']['DeltaVacationHours'])]
##np.argsort(df2['VacationHours']['DeltaVacationHours'])
Out[50]:
In [336]:
aggregations = {
'VacationHours' : {
'AverageVacationHours' : 'mean',
'MinVacationHours' : min,
'MaxVacationHours' : max,
'DeltaVacationHours' : lambda x: x.max()-x.min()
},
'EmployeeID' : {
'NumEmployees' : 'count'
},
}
df = Employees[Employees.CountryName != 'United States'].groupby(by=['StateProvinceName', 'City']).aggregate(aggregations)
df2 = df[df['VacationHours']['DeltaVacationHours']>1]
df2.iloc[np.argsort(df2['VacationHours']['DeltaVacationHours'])]
Out[336]:
In [354]:
[cs for cs in map(lambda x,y: str(y)+', '+str(x), Employees.StateProvinceName.values, Employees.City.values)]
Out[354]:
In [356]:
Employees.City.str.cat(Employees.StateProvinceName, sep=', ')
Out[356]:
In [ ]:
# First ten rows
Employees[0:10]
In [5]:
# Rows satisfying a particular condition
Employees[Employees.MaritalStatus=='M']
Out[5]:
In [ ]:
# Select sertain columns
Employees[['EmployeeID', 'FirstName']]
In [9]:
Employees[0:10][Employees.MaritalStatus=='M'][['EmployeeID', 'FirstName', 'MaritalStatus']]
Out[9]:
In [10]:
Employees[Employees.MaritalStatus=='M'][['EmployeeID', 'FirstName', 'MaritalStatus']][0:10]
Out[10]:
Recommended to use .loc .iloc .at methods
In [14]:
Employees.groupby(by=['MaritalStatus']).VacationHours.sum()
Out[14]:
In [ ]:
aggregations = {
'VacationHours' : {
'AverageVacationHours' : 'mean',
'MinVacationHours' : min,
'MaxVacationHours' : max,
'DeltaVacationHoues' : lambda x: x.max()-x.min()
},
'EmployeeID' : {
'NumEmployees' : 'count'
},
}
Employees.groupby(by=['StateProvinceName', 'City']).aggregate(aggregations) \
.sort_index(ascending=False, axis=0)
In [18]:
help(np.min)
help(pd.Series.between)
In [ ]:
In [ ]: