In [ ]:
import pandas as pd
import os
import numpy as np
from collections import OrderedDict, defaultdict
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, LabelEncoder
In [ ]:
### TODO: keep terminology consistent: 'labels' are what we want to predict, 'responses' are the answers given to survey questions, and 'predictors' are the questions that are not the label.
survey_data = pd.read_csv("https://raw.githubusercontent.com/samtalasila/e-bike-survey-response-results/master/E-Bike_Survey_Responses.csv")
print(survey_data.shape)
survey_data.head()
In [ ]:
response_q = 'Does your household have access to any of the following private motorized vehicles?'
positive_label = 'No - I do not have access to a private motorized vehicle'
labels = survey_data[response_q]
predictors = survey_data.drop(columns=[response_q])
missing_values_per_predictor = predictors.isna().sum()
print("Original data was shape: ", survey_data.shape)
print("Predictors have shape: ", predictors.shape)
print("Found ", labels.isna().sum(), " rows with response missing")
print("Found ", missing_values_per_predictor.sum(), " number of missing values total among all predictor columns")
In [ ]:
predictors = predictors.fillna(value='NA')
print("Found ", len(set(labels)), "different responses")
positives = 0
negatives = 0
for label in labels:
    if label.strip().startswith(positive_label):
        positives += 1
    else:
        negatives += 1
print("Got ", positives, " matches to the specific answer")
print("Got ", negatives, " non-matches to the specific answer")
print("Expected total of ", labels.shape[0], " responses, got ", positives + negatives)
In [ ]:
question_response_complexity_dict = OrderedDict()
for k in predictors.keys():
    question_response_complexity_dict[k] = ""
print("Number of questions: ", len(predictors.keys()))
print("Cardinality of responses to each survey question:\n")
for k in question_response_complexity_dict.keys():
    data = predictors[k]
    cardinality = len(set(data))
    question_response_complexity_dict[k] = cardinality
    print(k, "\t", cardinality)
In [ ]:
# Go through the predictors and find the answers in each question that should be grouped into 'other'.
# N.B.: a fixed count threshold of 4 is a crude heuristic, but from a manual read of
# the data it seems adequate for a first pass.
# One way to improve this would be to determine an appropriate threshold per predictor
# automatically (e.g. via an information-theoretic measure), but that is out of scope for a first attempt.
replace_as_other = {k: [] for k in predictors.keys()}
for k in predictors.keys():
    replace_as_other[k] = [n for n, d in predictors[k].value_counts().items() if d < 4]
# perform the in-place replacement
for k in predictors.keys():
    predictors.loc[predictors[k].isin(replace_as_other[k]), k] = 'unusual answer'
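In [ ]:
# Sanity check (a minimal sketch, reusing the cardinalities computed above): re-count the
# distinct responses per question to confirm that grouping rare answers into
# 'unusual answer' actually reduced the cardinality.
print("Cardinality before -> after grouping rare answers:\n")
for k in predictors.keys():
    print(k, "\t", question_response_complexity_dict[k], "->", len(set(predictors[k])))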
In [ ]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
# Imputer was removed from sklearn.preprocessing; SimpleImputer is its replacement
from sklearn.impute import SimpleImputer
# Encode labels in the response: 1 if the answer matches the positive label, 0 otherwise.
# Use the same strip/startswith matching as the counting loop above so the totals agree.
encoded_labels = [1 if str(r).strip().startswith(positive_label) else 0 for r in labels]
y = np.array(encoded_labels)
# Encode the features in the predictors, imputing missing values
encoder_pipeline = Pipeline([
    ("vectorizer", DictVectorizer(sparse=False)),
    ("imputer", SimpleImputer(missing_values=np.nan,
                              strategy="mean"))
])
features = predictors.to_dict('records')
X = encoder_pipeline.fit_transform(features)
# Make sure we get sensible features
print("shape of X: ", X.shape)
print("type of X: ", type(X))
print("Any NaN? ", np.isnan(X).sum(), " Any infinity? ", np.isinf(X).sum())
In [ ]:
# Alternative encoding: treat each survey response as free text and vectorize it
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import EnglishStemmer
stemmer = EnglishStemmer()
# A callable analyzer receives the raw document string, so build on the default
# analyzer (which handles lower-casing, stop words and n-grams) and stem each token.
base_analyzer = CountVectorizer(stop_words='english', ngram_range=(1, 2)).build_analyzer()
def stemmed_words(doc):
    return [stemmer.stem(w) for w in base_analyzer(doc)]
count_vectorizer = CountVectorizer(analyzer=stemmed_words)
tfidf_vectorizer = TfidfVectorizer(analyzer=stemmed_words)
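In [ ]:
# Quick check of the stemmed analyzer on a made-up example sentence (a minimal sketch):
# the tokens should come back lower-cased, stop-word filtered, stemmed, and include bigrams.
print(stemmed_words("I ride my electric bicycle to work most days"))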
In [ ]:
test_data = predictors.head()
count_data_vec = []
tfidf_data_vec = []
for name, values in test_data.items():
    count_transformed_data = count_vectorizer.fit_transform(values)
    count_data_vec.append(count_transformed_data)
    tfidf_transformed_data = tfidf_vectorizer.fit_transform(values)
    tfidf_data_vec.append(tfidf_transformed_data)
print("Count data")
print("Test data has length: ",len(count_data_vec))
print("Each element has shape: ", count_data_vec[0].shape)
print("Tfidf transformed data")
print("Test data has length: ", len(tfidf_data_vec))
print("Each element has shape: ", tfidf_data_vec[0].shape)