In [28]:
# Set up inline plotting. `%pylab inline` also populates the interactive
# namespace from numpy and matplotlib (see the message printed below), so bare
# names used later in the notebook may come from either library.
%pylab inline
import pandas as pd
# Load the Samsung dataset from a path relative to the notebook's directory.
df = pd.read_csv('../datasets/samsung/samsungdata.csv')


Populating the interactive namespace from numpy and matplotlib

In [29]:
# Verify that there are no duplicate column names: the number of unique
# labels must equal the total number of columns. The check is explicit so the
# cell fails loudly instead of relying on eyeballing the output.
assert df.columns.is_unique, "duplicate column names found"
# Display the column index (names and total count) for inspection.
df.columns


Out[29]:
<bound method Index.unique of Index(['Unnamed: 0', 'tBodyAcc-mean()-X', 'tBodyAcc-mean()-Y',
       'tBodyAcc-mean()-Z', 'tBodyAcc-std()-X', 'tBodyAcc-std()-Y',
       'tBodyAcc-std()-Z', 'tBodyAcc-mad()-X', 'tBodyAcc-mad()-Y',
       'tBodyAcc-mad()-Z', 
       ...
       'fBodyBodyGyroJerkMag-kurtosis()', 'angle(tBodyAccMean,gravity)',
       'angle(tBodyAccJerkMean),gravityMean)',
       'angle(tBodyGyroMean,gravityMean)',
       'angle(tBodyGyroJerkMean,gravityMean)', 'angle(X,gravityMean)',
       'angle(Y,gravityMean)', 'angle(Z,gravityMean)', 'subject', 'activity'],
      dtype='object', length=564)>

In [30]:
# Remove the literal "()" pair from every column name
# (e.g. 'tBodyAcc-mean()-X' -> 'tBodyAcc-mean-X').
# Only the adjacent pair is removed; names such as 'angle(X,gravityMean)'
# keep their parentheses.
df = df.rename(columns={old: old.replace('()', '') for old in df.columns})

In [31]:
# Inspect the column names after the "()" cleanup.
df.columns


Out[31]:
<bound method Index.unique of Index(['Unnamed: 0', 'tBodyAcc-mean-X', 'tBodyAcc-mean-Y', 'tBodyAcc-mean-Z',
       'tBodyAcc-std-X', 'tBodyAcc-std-Y', 'tBodyAcc-std-Z', 'tBodyAcc-mad-X',
       'tBodyAcc-mad-Y', 'tBodyAcc-mad-Z', 
       ...
       'fBodyBodyGyroJerkMag-kurtosis', 'angle(tBodyAccMean,gravity)',
       'angle(tBodyAccJerkMean),gravityMean)',
       'angle(tBodyGyroMean,gravityMean)',
       'angle(tBodyGyroJerkMean,gravityMean)', 'angle(X,gravityMean)',
       'angle(Y,gravityMean)', 'angle(Z,gravityMean)', 'subject', 'activity'],
      dtype='object', length=564)>

In [39]:
# Exploration
# Shorten the column names in a single pass: drop the '-' and ',' separators,
# then the 'BodyBody' and 'Body' markers. The replacements are applied to each
# name in this exact order.
df = df.rename(
    columns=lambda name: (
        name.replace('-', '')
            .replace(',', '')
            .replace('BodyBody', '')
            .replace('Body', '')
    )
)

In [40]:
# Variable Reduction
# 1. Mag and angle variables contain the same info as (= strongly correlated
#    with) the XYZ variables, and we choose the latter.
# Collect the matching names first and drop them in one call. The `columns=`
# keyword replaces the positional axis argument (`df.drop(col, 1)`), which
# newer pandas releases no longer accept.
mag_angle_cols = [col for col in df.columns if 'Mag' in col or 'angle' in col]
df = df.drop(columns=mag_angle_cols)

In [41]:
# 2. We ignore the band variables, as we have no simple way to interpret their
#    meaning and relate them to physical activities.
# Drop all matching columns in one call, using the `columns=` keyword instead
# of the positional axis argument that newer pandas releases no longer accept.
band_cols = [col for col in df.columns if 'band' in col]
df = df.drop(columns=band_cols)

In [42]:
# Eliminating confounders
# Drop every column whose name ends in an axis letter (X, Y or Z).
# `str.endswith` compares by value; the previous `'X' is col[-1]` compared
# object identity, which only works by accident of string interning, and
# `col[-1]` would raise on an empty name.
# Names where the axis letter is followed by a digit (e.g. 'tAccarCoeffX1')
# are not matched and stay in the frame.
axis_cols = [col for col in df.columns if col.endswith(('X', 'Y', 'Z'))]
df = df.drop(columns=axis_cols)

In [43]:
# Show the columns that remain after the renaming and reduction steps above.
df.columns


Out[43]:
Index(['Unnamed: 0', 'tAccsma', 'tAccarCoeffX1', 'tAccarCoeffX2',
       'tAccarCoeffX3', 'tAccarCoeffX4', 'tAccarCoeffY1', 'tAccarCoeffY2',
       'tAccarCoeffY3', 'tAccarCoeffY4', 'tAccarCoeffZ1', 'tAccarCoeffZ2',
       'tAccarCoeffZ3', 'tAccarCoeffZ4', 'tGravityAccsma',
       'tGravityAccarCoeffX1', 'tGravityAccarCoeffX2', 'tGravityAccarCoeffX3',
       'tGravityAccarCoeffX4', 'tGravityAccarCoeffY1', 'tGravityAccarCoeffY2',
       'tGravityAccarCoeffY3', 'tGravityAccarCoeffY4', 'tGravityAccarCoeffZ1',
       'tGravityAccarCoeffZ2', 'tGravityAccarCoeffZ3', 'tGravityAccarCoeffZ4',
       'tAccJerksma', 'tAccJerkarCoeffX1', 'tAccJerkarCoeffX2',
       'tAccJerkarCoeffX3', 'tAccJerkarCoeffX4', 'tAccJerkarCoeffY1',
       'tAccJerkarCoeffY2', 'tAccJerkarCoeffY3', 'tAccJerkarCoeffY4',
       'tAccJerkarCoeffZ1', 'tAccJerkarCoeffZ2', 'tAccJerkarCoeffZ3',
       'tAccJerkarCoeffZ4', 'tGyrosma', 'tGyroarCoeffX1', 'tGyroarCoeffX2',
       'tGyroarCoeffX3', 'tGyroarCoeffX4', 'tGyroarCoeffY1', 'tGyroarCoeffY2',
       'tGyroarCoeffY3', 'tGyroarCoeffY4', 'tGyroarCoeffZ1', 'tGyroarCoeffZ2',
       'tGyroarCoeffZ3', 'tGyroarCoeffZ4', 'tGyroJerksma',
       'tGyroJerkarCoeffX1', 'tGyroJerkarCoeffX2', 'tGyroJerkarCoeffX3',
       'tGyroJerkarCoeffX4', 'tGyroJerkarCoeffY1', 'tGyroJerkarCoeffY2',
       'tGyroJerkarCoeffY3', 'tGyroJerkarCoeffY4', 'tGyroJerkarCoeffZ1',
       'tGyroJerkarCoeffZ2', 'tGyroJerkarCoeffZ3', 'tGyroJerkarCoeffZ4',
       'fAccsma', 'fAccJerksma', 'fGyrosma', 'subject', 'activity'],
      dtype='object')

In [ ]: