In [ ]:
import pandas as pd
import os
import numpy as np

from collections import OrderedDict, defaultdict
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, LabelEncoder

In [ ]:
### TODO: keep the terminology consistent: 'labels' are the values to be predicted, 'responses' are the answers to survey questions, and 'predictors' are the questions that are not the label.

survey_data = pd.read_csv("https://raw.githubusercontent.com/samtalasila/e-bike-survey-response-results/master/E-Bike_Survey_Responses.csv")
print(survey_data.shape)
survey_data.head()

In [ ]:
response_q = 'Does your household have access to any of the following private motorized vehicles?'
positive_label = 'No - I do not have access to a private motorized vehicle'

labels = survey_data[response_q]
predictors = survey_data.drop(columns=[response_q])
missing_values_per_predictor = predictors.isna().sum()

print("Original data was shape: ", survey_data.shape)
print("Predictors have shape: ", predictors.shape)

print("Found ", labels.isna().sum(), " rows with response missing")
print("Found ", missing_values_per_predictor.sum(), " number of missing values total among all predictor columns")

In [ ]:
predictors = predictors.fillna(value='NA')
print("Found ", len(set(labels)), "different responses")

positives = 0
negatives = 0
for label in labels:
    if label.strip().startswith(positive_label):
        positives += 1
    else:
        negatives += 1
print("Got ", positives, " matches to specific answer")
print("Got ", negatives, " non-mathces to specific answer")
print("Expected total of ", labels.shape[0], " responses, got ", positives + negatives)

In [ ]:
question_response_complexity_dict = OrderedDict()

print("Number of questions: ", len(predictors.columns))
print("Cardinality of responses to each survey question:\n")
for k in predictors.columns:
    cardinality = predictors[k].nunique()
    question_response_complexity_dict[k] = cardinality
    print(k, "\t", cardinality)

In [ ]:
# Go through the predictors and, for each question, collect the answers to be grouped into 'other'.
# N.B.: the hand-picked count threshold of 4 is crude, but based on a manual reading of
# the data it seems an acceptable heuristic.

# One way to improve this would be to determine an appropriate threshold automatically via an
# information-theoretic measure for each predictor (a sketch follows this cell), but that is
# out of scope for a first attempt.

replace_as_other = {}
for k in predictors.keys():
    replace_as_other[k] = [n for n, d in predictors[k].value_counts().items() if d < 4]

# perform the in-place replacement
for k in predictors.keys():
    predictors.loc[predictors[k].isin(replace_as_other[k]), k] = 'unusual answer'
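
In [ ]:
# A minimal sketch of the information-theoretic idea mentioned above, under an assumed
# criterion: choose, per predictor, the largest count threshold whose 'other' grouping
# still retains 95% of the entropy of the original answer distribution. The retain=0.95
# ratio and the max_threshold=20 cap are arbitrary choices for illustration.
from scipy.stats import entropy

def entropy_threshold(counts, retain=0.95, max_threshold=20):
    full = entropy(counts)
    best = 1
    for t in range(2, max_threshold + 1):
        kept = [c for c in counts if c >= t]
        grouped = sum(c for c in counts if c < t)
        new_counts = kept + ([grouped] if grouped else [])
        # merging categories can only lower entropy, so the ratio is at most 1
        if full == 0 or entropy(new_counts) / full >= retain:
            best = t
    return best

example_q = list(predictors.keys())[0]
print(entropy_threshold(predictors[example_q].value_counts().values))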

In [ ]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Encode labels in the response
label_encoder = {positive_label: 1}
encoded_labels = [label_encoder.get(r, 0) for r in labels]
y = np.array(encoded_labels)

# Encode the features in the predictors, imputing missing values.
# (preprocessing.Imputer has been removed from scikit-learn; impute.SimpleImputer is the
# replacement and always imputes column-wise, so there is no axis argument.)
encoder_pipeline = Pipeline([
    ("vectorizer", DictVectorizer(sparse=False)),
    ("imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
])

features = predictors.to_dict('records')
X = encoder_pipeline.fit_transform(features)

# Make sure we get sensible features
print("shape of X: ", X.shape)
print("type of X: ", type(X))
print("Any NaN or infinity? ", np.isnan(X).sum())

In [ ]:
# Alternative feature encoding: vectorize the raw answer text instead of one-hot categories
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import EnglishStemmer
import re

stemmer = EnglishStemmer()
token_pattern = re.compile(r"(?u)\b\w\w+\b")

# When a callable is passed as `analyzer`, scikit-learn silently ignores stop_words and
# ngram_range, so we plug the stemmer in as the `tokenizer` instead: tokens are stemmed
# first, then stop-word filtering and n-gram construction run as usual. (scikit-learn may
# warn that the stop list is inconsistent with stemmed tokens; that warning is benign here.)
def stemmed_words(doc):
    return [stemmer.stem(w) for w in token_pattern.findall(doc)]

count_vectorizer = CountVectorizer(tokenizer=stemmed_words, stop_words='english', ngram_range=(1, 2))
tfidf_vectorizer = TfidfVectorizer(tokenizer=stemmed_words, stop_words='english', ngram_range=(1, 2))
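
In [ ]:
# A quick check that the stemming tokenizer behaves as expected on a made-up sample answer.
sample = "I ride electric bikes on shared cycling paths"
print(stemmed_words(sample))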

In [ ]:
test_data = predictors.head()
count_data_vec = []
tfidf_data_vec = []

# Note: each fit_transform call refits the vectorizer on a single column, so every
# stored matrix has its own vocabulary; here we only inspect shapes.
for name, values in test_data.items():
    count_transformed_data = count_vectorizer.fit_transform(values)
    count_data_vec.append(count_transformed_data)
    tfidf_transformed_data = tfidf_vectorizer.fit_transform(values)
    tfidf_data_vec.append(tfidf_transformed_data)

print("Count data")
print("Test data has length: ",len(count_data_vec))
print("Each element has shape: ", count_data_vec[0].shape)

print("Tfidf transformed data")
print("Test data has length: ", len(tfidf_data_vec))
print("Each element has shape: ", tfidf_data_vec[0].shape)