In [1]:
# Import necessary packages and modules
import swat
%matplotlib inline
# Open a CAS connection. hostname, port, username and password are not defined
# in the cells shown here, so they must already exist in the kernel namespace.
conn = swat.CAS(hostname, port, username, password)
# Read the hmeq example CSV from the SAS support site into a CAS table named 'hmeq'
castbl = conn.read_csv(
    'http://support.sas.com/documentation/onlinedoc/viya/exampledatasets/hmeq.csv',
    casout='hmeq',
)
# Set replace on the table object; later cells pass castbl as their output table
# (casOut / casout) -- presumably so each step may overwrite 'hmeq'; TODO confirm
castbl.replace = True
In [2]:
# Bind df to a CASTable object that refers to the table named 'hmeq'
# (the name used as casout when the CSV was loaded)
df = conn.CASTable('hmeq')
# Perform the head method to return the first 5 rows
df.head()
Out[2]:
In [3]:
# How much of their mortgage have they paid off?
# MORTPAID is defined as VALUE minus MORTDUE and added as a new column on df
df['MORTPAID'] = df['VALUE'] - df['MORTDUE']
# Preview the table again so the new column is visible
df.head()
Out[3]:
In [4]:
# How often is MORTPAID negative (MORTDUE larger than VALUE)?
# The count of such rows is divided by the total row count, so the value
# displayed is a fraction rather than a percentage.
(
    df.query('MORTPAID < 0')['MORTPAID'].count()
    / len(df)
)
Out[4]:
In [5]:
# Use the pandas/matplotlib-style hist method to plot a histogram of all numeric variables.
# The trailing semicolon keeps the notebook from echoing the call's return value.
df.hist(figsize = (15, 10));
In [6]:
# Describe every column (include='all'), then flip the result so that each
# variable is a row and each statistic is a column
summary = df.describe(include='all').T
summary
Out[6]:
In [7]:
# Create the missing-value share per variable for plotting: describe's 'count'
# is treated as the number of non-missing values, so (rows - count) / rows is
# the fraction of rows where that variable is missing
summary['pctmiss'] = (len(df) - summary['count'])/len(df)
# Make a bar graph of only the variables that have missing values.
# kind must be passed by keyword: Series.plot() rejects positional arguments
# in pandas 1.0 and later, so plot('bar', ...) raises a TypeError there.
summary.query('pctmiss > 0')['pctmiss'].plot(kind = 'bar', title = 'Pct Missing Values', figsize = (10, 6), color = 'c');
In [8]:
# Call the CAS impute action once for all input variables:
# median for continuous variables, mode (most common value) for nominal ones.
# Results go to castbl, with all variables copied over.
df.impute(
    inputs=list(summary.index[1:]),  # skip the first entry, the target column
    methodContinuous='MEDIAN',
    methodNominal='MODE',
    copyAllVars=True,
    casOut=castbl,
)
Out[8]:
In [9]:
# Make the sampling action set available on this connection
conn.loadactionset('sampling')
# Simple random sample for a 70/30 split: samppct=30 with a partition
# indicator and a fixed seed; output is written to castbl with all variables
df.srs(
    samppct=30,
    partind=True,
    seed=1,
    output={'casout': castbl, 'copyvars': 'all'},
)
# Share of rows that landed in each partition
castbl['_PartInd_'].groupby('_PartInd_').count() / len(castbl)
Out[9]:
In [10]:
# Preview castbl, the table the impute and sampling steps wrote their output to
castbl.head()
Out[10]:
In [11]:
# Promote the table into the 'public' caslib as 'hmeq_prepped' so it can be
# shared across sessions and with other colleagues
conn.promote(
    name=castbl,
    targetlib='public',
    target='hmeq_prepped',
)
In [12]:
# Check the table listing of the 'public' caslib for an entry named HMEQ_PREPPED
(
    conn.tableinfo(caslib='public')['TableInfo']
    .query('Name == "HMEQ_PREPPED"')
)
Out[12]:
In [13]:
# End the CAS session; this is the last step, after the prepared table has
# been promoted and verified
conn.endsession()
Out[13]: