In [1]:
# imports
import numpy as np
import pandas as pd
import _pickle as cPickle # C-accelerated pickle (the Python 3 equivalent of Python 2's cPickle)
import random
from datetime import datetime
from time import time
import csv
from AssistmentsProperties import AssistmentsProperties
In [2]:
# data preprocessing class (currently supports the Assistments datasets)
class DataPreprocessor(object):
def __init__(self, dataset_str, version):
self.dataset = dataset_str
self.version = version
if ('Assistments' == dataset_str):
self.attr = AssistmentsProperties(version)
else:
print('{} dataset not yet realized'.format(dataset_str))
exit(1)
# default data config
self.config = {
'split_rate': 0.2,
'method': 'default',
'has_scaffolding': True,
'count_no_skill_id': True,
'has_test_mode': True,
'allow_multi_skills': True,
'window_length': 10,
'one_hot': True
}
def get_datapath(self, ext='csv', is_original=True, is_problem_contents=False, is_training=True):
return self.attr.get_datapath(ext, is_original, self.config, is_problem_contents, is_training)
def set_datapath(self, datapath):
self.attr.set_datapath(datapath)
# input: dataframe, split attribute, and split_rate
# input should be injected after assigning idx
# return a list
# list[0] = (test_df, test_num_problems, test_num_steps)
# list[1] = (train_df, train_num_problems, train_num_steps)
# list[2] = max_num_skills
def split_train_test(self, df, split_rate=0.2):
if ('Assistments' == self.dataset):
# attr_dict is populated by get_attributes_for_df(), which prepare_rnn() calls before this method
split_attr = self.attr_dict.get('split', None)
skill_id_attr = self.attr_dict.get('skill_id', None)
time_attr = self.attr_dict.get('time', None)
max_num_skills = __class__.get_nunique(df, skill_id_attr)
__class__.sort(df, [split_attr, time_attr], [True, True])
# cast skill_id to str so that multiple skills can be concatenated later
df.loc[:, skill_id_attr] = df.loc[:, skill_id_attr].astype('str')
# keep the column list for later use
columns = df.columns
# split dataframes according to split_attr
# note that groupby preserves the order of samples within groups (see documentation)
groupby_obj = df.groupby(split_attr, sort=False)
print('groupby done')
# bool to check if it is not one_hot
not_one_hot = (not self.config.get('one_hot', True)) and self.config.get('allow_multi_skills', True)
print('not_one_hot: ', not_one_hot)
if ('2009' == self.version and not_one_hot):
allow_multi_skill_attr = self.attr_dict.get('allow_multi_skills', None)
# TODO: the part below is the bottleneck. make it more efficient
# groupby again for each group according to time attribute; should be sorted
skill_joined = pd.DataFrame(columns=[time_attr, skill_id_attr])
for _, rows in groupby_obj:
time_groupby_obj = rows.groupby(time_attr, sort=True)
# concat skills with a space as a delimiter
joined = time_groupby_obj[skill_id_attr].apply(' '.join).reset_index()
skill_joined = pd.concat([skill_joined, joined])
# now drop duplicate multiple skills
df.drop_duplicates(subset=[allow_multi_skill_attr], inplace=True)
# drop skill_id_attr column
df.drop(skill_id_attr, axis=1, inplace=True)
# join multi_skill column with original df
df = df.join(skill_joined.set_index(time_attr),
on=time_attr, how='outer')
# now groupby again
groupby_obj = df.groupby(split_attr, sort=False)
# bool to know if there are two groupby obj's or only one
only_one_groupby_obj = True
# bool to know how to append for train / test group list
just_append_train = True
just_append_test = True
# process before shuffling
if ('default' == self.config['method']):
pass
elif ('sliding_window' == self.config['method']):
window_length = self.config.get('window_length', 10)
test_format = self.config.get('test_format', 'same_as_training')
if ('same_as_training' == test_format):
temp_df = groupby_obj.apply(lambda x: None
if len(x) < window_length
else __class__.rolling_window(x,
window_length,
overlap_stride=window_length-1))
# groupby again
groupby_obj = temp_df.groupby(split_attr, sort=False)
just_append_train = False
just_append_test = False
else:
# there are two groupby_obj's: train_groupby_obj and test_groupby_obj
only_one_groupby_obj = False
# just append for test, opposite for train
just_append_train = False
# need to split test and train dataset before applying sliding window
group_list = list(groupby_obj)
# number of elements for test df
num_groups = int(len(group_list) * split_rate) + 1
print('num_groups: ', num_groups)
test_groups = group_list[:num_groups]
train_groups = group_list[num_groups:]
test_df = pd.DataFrame(columns=columns)
for _, group in test_groups:
test_df = pd.concat([test_df, group])
train_df = pd.DataFrame(columns=columns)
for _, group in train_groups:
train_df = pd.concat([train_df, group])
test_groupby_obj = test_df.groupby(split_attr, sort=False)
train_groupby_obj = train_df.groupby(split_attr, sort=False)
# apply rolling window train_groupby_obj
temp_df = train_groupby_obj.apply(lambda x: None
if len(x) < window_length
else __class__.rolling_window(x,
window_length,
overlap_stride=window_length-1))
# groupby again
train_groupby_obj = temp_df.groupby(split_attr, sort=False)
# do nothing for default test_format
if ('default' == test_format):
pass
else:
just_append_test = False
# apply rolling window for testing dataset
if ('overlapping_last_element' == test_format):
overlap_stride = 1
elif ('partition' == test_format):
overlap_stride = 0
else:
print('{} test format not yet realized'.format(test_format))
exit(1)
temp_df = test_groupby_obj.apply(lambda x: None
if len(x) < window_length
else __class__.rolling_window(x,
window_length,
overlap_stride=overlap_stride))
test_groupby_obj = temp_df.groupby(split_attr, sort=False)
print('processed before shuffling')
if (only_one_groupby_obj):
# shuffle
group_list = list(groupby_obj)
random.shuffle(group_list)
print('shuffled')
# number of elements for test df
num_groups = int(len(group_list) * split_rate) + 1
print('num_groups: ', num_groups)
test_groups = group_list[:num_groups]
train_groups = group_list[num_groups:]
else:
# shuffle
test_groups = list(test_groupby_obj)
train_groups = list(train_groupby_obj)
random.shuffle(test_groups)
random.shuffle(train_groups)
print('shuffled')
test_dfs = self.groups_to_dflist(test_groups, columns, just_append_test)
train_dfs = self.groups_to_dflist(train_groups, columns, just_append_train)
print('dfs for testing and training created')
if ('default' == self.config['method']):
test_num_steps = np.max([len(rows) for rows in test_dfs])
train_num_steps = np.max([len(rows) for rows in train_dfs])
elif ('sliding_window' == self.config['method']):
train_num_steps = window_length
if ('default' == test_format):
test_num_steps = np.max([len(rows) for rows in test_dfs])
else:
test_num_steps = window_length
return [(test_dfs, test_num_steps),
(train_dfs, train_num_steps),
max_num_skills]
else:
print('{} dataset not yet realized'.format(self.dataset))
exit(1)
# group_list is a list of groupby obj
def groups_to_dflist(self, group_list, columns, just_append=True):
if (just_append):
dfs = []
for _, rows in group_list:
dfs.append(rows)
else:
window_length = self.config['window_length']
dfs = []
for _, rows in group_list:
dfs.extend([rows.iloc[window_length * i : window_length * (i + 1), :]
for i in range(len(rows) // window_length)])
return dfs
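# chunking example for the else branch above (hypothetical numbers): with window_length = 10,
# a 25-row student dataframe yields two chunks (rows 0-9 and 10-19); the 5 leftover rows are dropped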
# dfs: list of dataframes, each holding the interaction rows of a single student (or a single window)
def dfs_to_students(self, dfs):
if ('Assistments' == self.dataset):
correct_attr = self.attr_dict.get('correct', None)
skill_id_attr = self.attr_dict.get('skill_id', None)
if ('2009' == self.version):
not_one_hot = (not self.config.get('one_hot', True)) and self.config.get('allow_multi_skills', True)
print('not_one_hot: ', not_one_hot)
if (not not_one_hot):
students = [([len(rows)],
rows.loc[:, skill_id_attr].values.tolist(),
rows.loc[:, correct_attr].values.tolist())
for rows in dfs]
else:
time_attr = self.attr_dict.get('time', None)
students = []
for rows in dfs:
skill_ids = rows.loc[:, skill_id_attr].astype('str').values.tolist()
results = []
for skill_id in skill_ids:
results.append(list(map(int, skill_id.split(' '))))
skill_ids = results
correctness = rows.loc[:, correct_attr].values.tolist()
problem_num = len(rows)
students.append(([problem_num], skill_ids, correctness))
elif ('2012' == self.version):
problem_content_attr = self.attr_dict.get('problem_content', None)
students = [([len(rows)],
rows[skill_id_attr].values.tolist(),
rows[correct_attr].values.tolist(),
rows[problem_content_attr].values.tolist())
for rows in dfs]
elif ('2015' == self.version):
students = [([len(rows)],
rows[skill_id_attr].values.tolist(),
rows[correct_attr].values.tolist())
for rows in dfs]
else:
print('{} dataset not yet realized'.format(self.dataset))
exit(1)
return students
# setup for preparing rnn
def set_config(self, config):
self.config.update(config)
def get_attributes_for_df(self):
attr_dict = {}
if ('Assistments' == self.dataset):
if ('2009' == self.version):
attr_dict['split'] = self.attr.user_id
attr_dict['time'] = self.attr.order_id
attr_dict['correct'] = self.attr.correct
attr_dict['skill_id'] = self.attr.skill_id
# for data config
attr_dict['scaffolding'] = self.attr.original
attr_dict['tutor_mode'] = self.attr.tutor_mode
attr_dict['allow_multi_skills'] = self.attr.order_id
elif('2012' == self.version):
attr_dict['split'] = self.attr.user_id
attr_dict['time'] = self.attr.end_time
attr_dict['correct'] = self.attr.correct
attr_dict['skill_id'] = self.attr.skill_id
# for data config
attr_dict['scaffolding'] = self.attr.original
attr_dict['tutor_mode'] = self.attr.tutor_mode
# problem contents
attr_dict['problem_content'] = self.attr.problem_content
# TODO: how to represent multiple skills in 2012 dataset
else:
print('{} version not yet realized'.format(self.version))
exit(1)
else:
print('{} dataset not yet realized'.format(self.dataset))
self.attr_dict = attr_dict
return attr_dict
# builds self.test_rnn_data and self.train_rnn_data, each of the form (students, num_steps, num_skills)
# method: {default, sliding_window}
# max_num_steps: maximum number of steps
# max_num_skills: maximum number of skills
# students: list of tuples of 3 elements
# students[i][0]: a list of length 1 with num_problems
# students[i][1]: a list of skills
# students[i][2]: a list of correctness
# len(students[i][1]) == len(students[i][2]) == students[i][0][0]
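# e.g., a hypothetical student with 3 interactions: ([3], [12, 12, 7], [1, 0, 1]),
# i.e. 3 problems, skill ids 12, 12, 7, answered correct / wrong / correct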
def prepare_rnn(self):
# set proper attributes for df depending on dataset and version
attr_dict = self.get_attributes_for_df()
split_attr = attr_dict.get('split', None)
time_attr = attr_dict.get('time', None)
correct_attr = attr_dict.get('correct', None)
skill_id_attr = attr_dict.get('skill_id', None)
scaffolding_attr = attr_dict.get('scaffolding', None)
tutor_mode_attr = attr_dict.get('tutor_mode', None)
allow_multi_skill_attr = attr_dict.get('allow_multi_skills', None)
if ('Assistments' == self.dataset):
if ('2012' == self.version):
problem_content_attr = attr_dict.get('problem_content', None)
method = self.config['method']
has_scaffolding = self.config['has_scaffolding']
count_no_skill_id = self.config['count_no_skill_id']
has_test_mode = self.config['has_test_mode']
allow_multi_skills = self.config['allow_multi_skills']
one_hot = self.config['one_hot']
df = self.df.copy(deep=True)
if (not has_scaffolding):
if (None == scaffolding_attr):
print('{} dataset {} version needs scaffolding_attr'.format(self.dataset, self.version))
exit(1)
df = df.loc[df.loc[:, scaffolding_attr].astype('int') == 1]
if (not has_test_mode):
if (None == tutor_mode_attr):
print('{} dataset {} version needs tutor_mode_attr'.format(self.dataset, self.version))
exit(1)
df = df.loc[df.loc[:, tutor_mode_attr].astype('str') == 'tutor']
not_one_hot = (not one_hot) and allow_multi_skills
if (not allow_multi_skills):
if (None == allow_multi_skill_attr):
print('{} dataset {} version needs allow_multi_skill_attr'.format(self.dataset, self.version))
exit(1)
else:
df.drop_duplicates(subset=[allow_multi_skill_attr], inplace=True)
if (None == skill_id_attr):
print('{} dataset {} version needs skill_id_attr'.format(self.dataset, self.version))
exit(1)
if (not count_no_skill_id):
df = df.dropna(axis=0, subset=[skill_id_attr], how='any')
else:
# assign a fresh id to interactions without a skill id
# (avoid chained inplace fillna, which may not write back to df)
df.loc[:, skill_id_attr] = df.loc[:, skill_id_attr].fillna(np.max(df.loc[:, skill_id_attr]) + 1)
if ('2009' == self.version):
df = df.loc[:, [split_attr, skill_id_attr, correct_attr, time_attr]]
elif ('2012' == self.version):
df = df.loc[:, [split_attr, skill_id_attr, correct_attr, time_attr, problem_content_attr]]
df.loc[:, problem_content_attr] = df.loc[:, problem_content_attr].fillna('')
# use original correct values from 0 ~ 1
# if ('2012' == self.version):
# __class__.process_correct_attr(df=df, correct_attr=correct_attr)
# else:
# __class__.assign_idx_to_column_values(df, correct_attr)
__class__.assign_idx_to_column_values(df, split_attr)
__class__.assign_idx_to_column_values(df, skill_id_attr)
split_rate = self.config.get('split_rate', 0.2)
train_test_num_skill_list = self.split_train_test(df, split_rate=split_rate)
test_dfs = train_test_num_skill_list[0][0]
test_num_steps = train_test_num_skill_list[0][1]
train_dfs = train_test_num_skill_list[1][0]
train_num_steps = train_test_num_skill_list[1][1]
num_skills = train_test_num_skill_list[2]
print('convert dfs to rnn inputs')
# convert list of dataframes to rnn input
test_students = self.dfs_to_students(test_dfs)
train_students = self.dfs_to_students(train_dfs)
self.test_rnn_data = (test_students, test_num_steps, num_skills)
self.train_rnn_data = (train_students, train_num_steps, num_skills)
print('conversion done')
else:
print('{} dataset not yet realized'.format(self.dataset))
def get_save_path(self, ext='csv', is_problem_contents=False):
test_save_path = self.get_datapath(ext=ext,
is_original=False,
is_problem_contents=is_problem_contents,
is_training=False)
train_save_path = self.get_datapath(ext=ext,
is_original=False,
is_problem_contents=is_problem_contents,
is_training=True)
return test_save_path, train_save_path
# ext: extension - csv, etc.
def save(self, ext='pkl', is_problem_contents=False):
_ext = {
'csv': 'csv',
'pkl': 'pkl'
}.get(ext, None)
if (None == _ext):
print('{} extension not yet realized'.format(ext))
exit(1)
test_save_path, train_save_path = self.get_save_path(ext, is_problem_contents=is_problem_contents)
test_rnn_data = self.test_rnn_data
train_rnn_data = self.train_rnn_data
if ('csv' == ext):
__class__.save_rnn_data_as_csv(test_rnn_data, test_save_path)
__class__.save_rnn_data_as_csv(train_rnn_data, train_save_path)
elif ('pkl' == ext):
__class__.save_rnn_data_as_pkl(test_rnn_data, test_save_path)
__class__.save_rnn_data_as_pkl(train_rnn_data, train_save_path)
# for 2009 version ...
# self.config should include
# 1. method: default or sliding_window
# 2. has_scaffolding: True/False; include scaffolding problems or not; indicated by 'original' column
# 3. count_no_skill_id: True/False; include interactions with no skill id or not
# 4. has_test_mode: True/False; include test mode or not
# 5. allow_multi_skills: True/False; whether to allow multiple skills per interaction or not
# 6. window_length: if method is sliding_window, window_length should be provided. if not, default to 10
def generate_rnn_data(self, save=False, ext='csv', is_problem_contents=False, encoding='iso-8859-1'):
if ('Assistments' == self.dataset):
assert self.config != None, 'please set config first'
assert self.config['method'] != None, 'method is none'
assert type(self.config['has_scaffolding']) == bool, 'has_scaffolding is not boolean'
assert type(self.config['count_no_skill_id']) == bool, 'count_no_skill_id is not boolean'
assert type(self.config['has_test_mode']) == bool, 'has_test_mode is not boolean'
assert type(self.config['allow_multi_skills']) == bool, 'allow_multi_skills is not boolean'
assert type(self.config['one_hot']) == bool, 'one_hot is not boolean'
if ('df' not in self.__dict__):
self.df = __class__.read_data_from_csv(self.get_datapath(ext='csv',
is_original=True,
is_problem_contents=False,
is_training=True),
encoding)
if ('2012' == self.version):
# TODO: problem_contents is loaded here but not yet merged into self.df
problem_contents = __class__.read_data_from_csv(self.get_datapath(ext='csv',
is_original=True,
is_problem_contents=True,
is_training=True),
encoding)
self.prepare_rnn()
if (type(ext) == list and save):
for e in ext:
self.save(e)
elif (save):
self.save(ext)
return (self.test_rnn_data, self.train_rnn_data)
else:
print('{} dataset not yet realized'.format(self.dataset))
def load_rnn_data(self, is_training, ext='pkl', is_problem_contents=False):
_ext = {
'pkl': 'pkl'
}.get(ext, None)
if (None == _ext):
print('{} extension is not yet realized'.format(ext))
exit(1)
datapath = self.get_datapath(ext=ext,
is_original=False,
is_problem_contents=is_problem_contents,
is_training=is_training)
if ('pkl' == ext):
obj = __class__.load_rnn_data_from_pkl(datapath)
return obj
# attr_list: list of args for sorting from higher priorities
@staticmethod
def sort(df, attr_list, is_ascending_list=[True], inplace=True):
assert type(attr_list) == list, 'attr_list is not a list'
assert type(is_ascending_list) == list, 'is_ascending_list is not a list'
assert len(attr_list) == len(is_ascending_list), 'len of attr_list and is_ascending_list are not the same'
assert type(inplace) == bool, 'inplace is not a boolean'
result = df.sort_values(attr_list, ascending=is_ascending_list, inplace=inplace)
if (inplace):
return df
else:
return result
@staticmethod
def get_nunique(df, attr, dropna=True):
return df.loc[:, attr].nunique(dropna=dropna)
@staticmethod
def assign_idx_to_column_values(df, attr):
df.loc[:, attr] = pd.Categorical(df.loc[:, attr]).codes
@staticmethod
def read_data_from_csv(datapath, encoding='iso-8859-1'):
return pd.read_csv(datapath, encoding=encoding)
# applies a rolling window over the rows of df (df in, df out)
# the 3d (num_windows, window, num_columns) result is stacked back into a 2d dataframe,
# so rows [window*i : window*(i+1)] form the i-th window
# overlap_stride = 0 => partition
@staticmethod
def rolling_window(df, window, overlap_stride=0):
# first get the columns from df
columns = df.columns
# convert df to numpy array
a = df.values
rolled_a = __class__.rolling_window_np(a, window, overlap_stride)
rolled_df = pd.DataFrame(np.concatenate(rolled_a, axis=0), columns=columns)
return rolled_df
# 2d to 3d
# np to np
@staticmethod
def rolling_window_np(a, window, overlap_stride=0):
num_sequences = (a.shape[0] - overlap_stride) // (window - overlap_stride)
shape = (num_sequences, window, a.shape[-1])
strides = (a.strides[0] * (window - overlap_stride),) + a.strides
return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
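# sanity-check sketch for rolling_window_np (hypothetical values):
# a = np.arange(10).reshape(5, 2) # 5 time steps, 2 columns
# rolling_window_np(a, window=3, overlap_stride=1)
# -> step = window - overlap_stride = 2, giving 2 windows: rows [0, 1, 2] and rows [2, 3, 4]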
# for assistments 2012 dataset, correct values can be floating point between 0 and 1 for essay questions
# for convenience for now, if it is greater than or equal to 0.5, then it is processed as correct
# see "https://sites.google.com/site/assistmentsdata/how-to-interpret"
@staticmethod
def process_correct_attr(df, correct_attr):
df.loc[df.loc[:, correct_attr] >= 0.5, correct_attr] = 1
df.loc[:, correct_attr] = df.loc[:, correct_attr] // 1
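# e.g., hypothetical correct values [0.7, 0.3, 1.0] become 1, 0, 1 after thresholding and flooring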
@staticmethod
def save_rnn_data_as_csv(rnn_data, save_path):
# rnn_students include all the information including num_steps and num_skills
# just save rnn_students
with open(save_path, 'w', newline='') as f:
students = rnn_data[0]
writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_NONE, escapechar=' ')
for student in students:
# each student becomes consecutive csv rows: [num_problems], skills, correctness
# (plus problem contents for the 2012 version)
writer.writerows(list(student))
@staticmethod
def save_rnn_data_as_pkl(rnn_data, save_path):
obj = {
'students': rnn_data[0],
'num_steps': rnn_data[1],
'num_skills': rnn_data[2]
}
if (len(rnn_data) > 3):
obj['problem_contents'] = rnn_data[3]
with open(save_path, 'wb') as output_file:
cPickle.dump(obj, output_file)
@staticmethod
def load_rnn_data_from_pkl(pklpath):
with open(pklpath, 'rb') as input_file:
obj = cPickle.load(input_file)
return obj
@staticmethod
def make_data_config(method='default',
has_scaffolding=True,
count_no_skill_id=True,
has_test_mode=True,
allow_multi_skills=True,
window_length=10,
test_format='same_as_training',
one_hot=True,
split_rate=0.2):
config = {
'method': method,
'has_scaffolding': has_scaffolding,
'count_no_skill_id': count_no_skill_id,
'has_test_mode': has_test_mode,
'allow_multi_skills': allow_multi_skills,
'one_hot': one_hot,
'split_rate': split_rate
}
if ('sliding_window' == method):
config['window_length'] = window_length
config['test_format'] = test_format
return config
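In [ ]:
# usage sketch (not from the original runs): generate RNN-ready data for one
# hand-picked config; assumes AssistmentsProperties can resolve the csv paths locally
# a2009 = DataPreprocessor('Assistments', '2009')
# config = DataPreprocessor.make_data_config(method='sliding_window', window_length=10, test_format='partition')
# a2009.set_config(config)
# test_rnn_data, train_rnn_data = a2009.generate_rnn_data(save=True, ext='pkl')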
In [3]:
def generate_2009_one_hot_sliding_window_all(split_rate=0.2):
# file path to log result
log_path = 'a2009_one_hot_sliding_window_all.log'
# run one_hot sliding_window with different config
# first generate 2009 data
a2009 = DataPreprocessor('Assistments', '2009')
for config_index in range(16):
binary_index = format(config_index, '04b')
config_arr = []
with open(log_path, 'a') as f:
f.write('\n' + binary_index + '\n')
for i in binary_index:
i_int = int(i)
i_bool = bool(i_int)
config_arr.append(i_bool)
config = {
'split_rate': split_rate,
'method': 'sliding_window',
'has_scaffolding': config_arr[0],
'count_no_skill_id': config_arr[1],
'has_test_mode': config_arr[2],
'allow_multi_skills': config_arr[3],
'window_length': 10,
'one_hot': True,
}
# run each test format in turn and log the elapsed time
for test_format in ['overlapping_last_element', 'partition', 'default', 'same_as_training']:
config['test_format'] = test_format
start = time()
a2009.set_config(config)
a2009.generate_rnn_data(ext=['pkl', 'csv'], save=True, is_problem_contents=False)
end = time()
with open(log_path, 'a') as f:
f.write(config['test_format'] + ' : ' + str(end - start) + ' seconds\n')
In [4]:
def generate_2009_not_one_hot_sliding_window_all(split_rate=0.2):
# file path to log result
log_path = 'a2009_not_one_hot_sliding_window_all.log'
# run not_one_hot sliding_window with different config
# first generate 2009 data
a2009 = DataPreprocessor('Assistments', '2009')
for config_index in range(8):
binary_index = format(config_index, '03b')
config_arr = []
with open(log_path, 'a') as f:
f.write('\n' + binary_index + '1\n')
for i in binary_index:
i_int = int(i)
i_bool = bool(i_int)
config_arr.append(i_bool)
config = {
'split_rate': split_rate,
'method': 'sliding_window',
'has_scaffolding': config_arr[0],
'count_no_skill_id': config_arr[1],
'has_test_mode': config_arr[2],
'allow_multi_skills': True,
'window_length': 10,
'one_hot': False,
}
# run each test format in turn and log the elapsed time
for test_format in ['overlapping_last_element', 'partition', 'default', 'same_as_training']:
config['test_format'] = test_format
start = time()
a2009.set_config(config)
a2009.generate_rnn_data(ext=['pkl', 'csv'], save=True, is_problem_contents=False)
end = time()
with open(log_path, 'a') as f:
f.write(config['test_format'] + ' : ' + str(end - start) + ' seconds\n')
In [5]:
def generate_2009_one_hot_default_all(split_rate=0.2):
# file path to log result
log_path = 'a2009_one_hot_default_all.log'
# run one_hot default with different config
# first generate 2009 data
a2009 = DataPreprocessor('Assistments', '2009')
for config_index in range(16):
binary_index = format(config_index, '04b')
config_arr = []
for i in binary_index:
i_int = int(i)
i_bool = bool(i_int)
config_arr.append(i_bool)
config = {
'split_rate': split_rate,
'method': 'default',
'has_scaffolding': config_arr[0],
'count_no_skill_id': config_arr[1],
'has_test_mode': config_arr[2],
'allow_multi_skills': config_arr[3],
'window_length': 10,
'one_hot': True,
'test_format': 'overlapping_last_element'
}
start = time()
a2009.set_config(config)
a2009.generate_rnn_data(ext=['pkl', 'csv'], save=True, is_problem_contents=False)
end = time()
with open(log_path, 'a') as f:
f.write('\n' + binary_index + ' : ' + str(end - start) + ' seconds\n')
In [6]:
def generate_2009_not_one_hot_default_all(split_rate=0.2):
# file path to log result
log_path = 'a2009_not_one_hot_default_all.log'
# run not_one_hot default with different config
# first generate 2009 data
a2009 = DataPreprocessor('Assistments', '2009')
for config_index in range(8):
binary_index = format(config_index, '03b')
config_arr = []
for i in binary_index:
i_int = int(i)
i_bool = bool(i_int)
config_arr.append(i_bool)
config = {
'split_rate': split_rate,
'method': 'default',
'has_scaffolding': config_arr[0],
'count_no_skill_id': config_arr[1],
'has_test_mode': config_arr[2],
'allow_multi_skills': True,
'window_length': 10,
'one_hot': False,
'test_format': 'overlapping_last_element'
}
start = time()
a2009.set_config(config)
a2009.generate_rnn_data(ext=['pkl', 'csv'], save=True, is_problem_contents=False)
end = time()
with open(log_path, 'a') as f:
f.write('\n' + binary_index + '1 : ' + str(end - start) + ' seconds\n')
In [7]:
def generate_2012_one_hot_sliding_window_all(split_rate=0.2):
# file path to log result
log_path = 'a2012_one_hot_sliding_window_all.log'
# run one_hot sliding_window with different config
# first generate 2012 data
a2012 = DataPreprocessor('Assistments', '2012')
for config_index in range(8):
binary_index = format(config_index, '03b')
config_arr = []
with open(log_path, 'a') as f:
f.write('\n' + binary_index + '1\n')
for i in binary_index:
i_int = int(i)
i_bool = bool(i_int)
config_arr.append(i_bool)
config = {
'split_rate': split_rate,
'method': 'sliding_window',
'has_scaffolding': config_arr[0],
'count_no_skill_id': config_arr[1],
'has_test_mode': config_arr[2],
'allow_multi_skills': True,
'window_length': 10,
'one_hot': True,
}
# run each test format in turn and log the elapsed time
for test_format in ['overlapping_last_element', 'partition', 'default', 'same_as_training']:
config['test_format'] = test_format
start = time()
a2012.set_config(config)
a2012.generate_rnn_data(ext=['pkl', 'csv'], save=True, is_problem_contents=False)
end = time()
with open(log_path, 'a') as f:
f.write(config['test_format'] + ' : ' + str(end - start) + ' seconds\n')
In [8]:
def generate_2012_one_hot_default_all(split_rate=0.2):
# file path to log result
log_path = 'a2012_one_hot_default_all.log'
# first generate 2012 data
a2012 = DataPreprocessor('Assistments', '2012')
for config_index in range(8):
binary_index = format(config_index, '03b')
config_arr = []
for i in binary_index:
i_int = int(i)
i_bool = bool(i_int)
config_arr.append(i_bool)
config = {
'split_rate': split_rate,
'method': 'default',
'has_scaffolding': config_arr[0],
'count_no_skill_id': config_arr[1],
'has_test_mode': config_arr[2],
'allow_multi_skills': True,
'window_length': 10,
'one_hot': True,
'test_format': 'overlapping_last_element'
}
start = time()
a2012.set_config(config)
a2012.generate_rnn_data(ext=['pkl', 'csv'], save=True, is_problem_contents=False)
end = time()
with open(log_path, 'a') as f:
f.write('\n' + binary_index + '1 : ' + str(end - start) + ' seconds\n')
In [9]:
if ('__main__' == __name__):
split_rate = 0.2
#generate_2009_not_one_hot_default_all(split_rate)
#generate_2009_not_one_hot_sliding_window_all(split_rate)
#generate_2009_one_hot_default_all(split_rate)
#generate_2009_one_hot_sliding_window_all(split_rate)
#generate_2012_one_hot_default_all(split_rate)
#generate_2012_one_hot_sliding_window_all(split_rate)
pass
In [10]:
def test_performance(assistment_instance, config_list, ext='csv'):
now = datetime.now().strftime('__%Y-%m-%d_%H:%M:%S')
for config in config_list:
assistment_instance.set_config(config)
timestamp = time()
rnn_data = assistment_instance.generate_rnn_data()
new_timestamp = time()
with open('Assistments' + assistment_instance.version + now + '.log', 'a') as f:
f.write(config.get('method', None) + ' generation: ' + str(new_timestamp - timestamp) + ' seconds\n')
# save csv and pkl and time it
timestamp = time()
assistment_instance.save('csv')
new_timestamp = time()
with open('Assistments' + assistment_instance.version + now + '.log', 'a') as f:
f.write(config.get('method') + ' csv save: ' + str(new_timestamp - timestamp) + ' seconds\n')
timestamp = time()
assistment_instance.save('pkl')
new_timestamp = time()
with open('Assistments' + assistment_instance.version + now + '.log', 'a') as f:
f.write(config.get('method') + ' pkl save: ' + str(new_timestamp - timestamp) + ' seconds\n')
In [11]:
def print_result(rnn_data):
print('')
print('*' * 125)
print('')
rnn_students = rnn_data['students']
rnn_num_steps = rnn_data['num_steps']
rnn_num_skills = rnn_data['num_skills']
print('len(rnn_students): ', len(rnn_students))
print('')
print('rnn_num_steps: ', rnn_num_steps)
print('')
print('rnn_num_skills: ', rnn_num_skills)
print('')
print('rnn_students[0][0][0]: ', rnn_students[0][0][0])
print('len(rnn_students[0][1]): ', len(rnn_students[0][1]))
print('len(rnn_students[0][2]): ', len(rnn_students[0][2]))
print('')
print('correctness check: ')
print(rnn_students[0][2])
print('')
print('*' * 125)
print('')
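In [ ]:
# usage sketch: reload previously saved RNN data and inspect it with print_result
# (assumes a pkl was already generated with the same config by one of the drivers above)
# a2009 = DataPreprocessor('Assistments', '2009')
# a2009.set_config(DataPreprocessor.make_data_config(method='sliding_window', test_format='partition'))
# train_rnn_data = a2009.load_rnn_data(is_training=True, ext='pkl')
# print_result(train_rnn_data)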