In [1]:
# imports
import numpy as np
import pandas as pd
import _pickle as cPickle  # C implementation of pickle (the Python 3 counterpart of cPickle)
import random
from datetime import datetime
from time import time
import csv
from AssistmentsProperties import AssistmentsProperties

In [2]:
# DataPreprocessor class for data preprocessing (currently only Assistments is supported)
class DataPreprocessor(object):
    def __init__(self, dataset_str, version):
        self.dataset = dataset_str
        self.version = version
        
        if ('Assistments' == dataset_str):
            self.attr = AssistmentsProperties(version)
        else:
            print('{} dataset not yet implemented'.format(dataset_str))
            exit(1)
        
        # default data config
        self.config = {
            'split_rate': 0.2,
            'method': 'default',
            'has_scaffolding': True,
            'count_no_skill_id': True,
            'has_test_mode': True,
            'allow_multi_skills': True,
            'window_length': 10,
            'one_hot': True
        }
    
    def get_datapath(self, ext='csv', is_original=True, is_problem_contents=False, is_training=True):
        return self.attr.get_datapath(ext, is_original, self.config, is_problem_contents, is_training)
    
    def set_datapath(self, datapath):
        self.attr.set_datapath(datapath)
    
    # input: dataframe and split_rate
    # the dataframe should already have integer codes assigned to its columns
    # (see assign_idx_to_column_values)
    # returns a list:
    # list[0] = (test_dfs, test_num_steps)
    # list[1] = (train_dfs, train_num_steps)
    # list[2] = max_num_skills
    def split_train_test(self, df, split_rate=0.2):
        if ('Assistments' == self.dataset):
            # self.attr_dict is populated by get_attributes_for_df, which prepare_rnn calls first
            split_attr = self.attr_dict.get('split', None)
            skill_id_attr = self.attr_dict.get('skill_id', None)
            time_attr = self.attr_dict.get('time', None)
            
            max_num_skills = __class__.get_nunique(df, skill_id_attr)
            
            __class__.sort(df, [split_attr, time_attr], [True, True])
            
            # cast skill_id to str so that multiple skills can be concatenated later
            df.loc[:, skill_id_attr] = df.loc[:, skill_id_attr].astype('str')
            
            # keep the column names for later use
            columns = df.columns
            
            # split dataframes according to split_attr
            # note that groupby preserves the order of samples within groups (see documentation)
            groupby_obj = df.groupby(split_attr, sort=False)
            print('groupby done')
            
            # bool to check if it is not one_hot
            not_one_hot = (not self.config.get('one_hot', True)) and self.config.get('allow_multi_skills', True)
            print('not_one_hot: ', not_one_hot)
            
            if ('2009' == self.version and not_one_hot):
                allow_multi_skill_attr = self.attr_dict.get('allow_multi_skills', None)
                
                # TODO: the part below is the bottleneck. make it more efficient
                # groupby again for each group according to time attribute; should be sorted
                skill_joined = pd.DataFrame(columns=[time_attr, skill_id_attr])
                for _, rows in groupby_obj:
                    time_groupby_obj = rows.groupby(time_attr, sort=True)

                    # concat skills with a space as a delimiter
                    joined = time_groupby_obj[skill_id_attr].apply(' '.join).reset_index()
                    skill_joined = pd.concat([skill_joined, joined])

                # now drop duplicate multiple skills
                df.drop_duplicates(subset=[allow_multi_skill_attr], inplace=True)
                
                # drop skill_id_attr column
                df.drop(skill_id_attr, axis=1, inplace=True)
                # join multi_skill column with original df
                df = df.join(skill_joined.set_index(time_attr), 
                             on=time_attr, how='outer')
                
                # now groupby again
                groupby_obj = df.groupby(split_attr, sort=False)

            # bool to know if there are two groupby obj's or only one
            only_one_groupby_obj = True
            
            # bool to know how to append for train / test group list
            just_append_train = True
            just_append_test = True
            
            # process before shuffling
            if ('default' == self.config['method']):
                pass
            
            elif ('sliding_window' == self.config['method']):
                window_length = self.config.get('window_length', 10)
                test_format = self.config.get('test_format', 'same_as_training')

                if ('same_as_training' == test_format):
                    temp_df = groupby_obj.apply(lambda x: None 
                                           if len(x) < window_length
                                           else __class__.rolling_window(x,
                                                                         window_length, 
                                                                         overlap_stride=window_length-1))
                    
                    # groupby again
                    groupby_obj = temp_df.groupby(split_attr, sort=False)
                    
                    just_append_train = False
                    just_append_test = False
                
                else:
                    # there are two groupby_obj's: train_groupby_obj and test_groupby_obj
                    only_one_groupby_obj = False
                    
                    # just append for test, opposite for train
                    just_append_train = False
                    
                    # need to split test and train dataset before applying sliding window
                    group_list = list(groupby_obj)
                    
                    # number of groups assigned to the test df
                    num_groups = int(len(group_list) * split_rate) + 1
                    print('num_groups: ', num_groups)
                    
                    test_groups = group_list[:num_groups]
                    train_groups = group_list[num_groups:]
                    
                    test_df = pd.DataFrame(columns=columns)
                    for _, group in test_groups:
                        test_df = pd.concat([test_df, group])
                        
                    train_df = pd.DataFrame(columns=columns)
                    for _, group in train_groups:
                        train_df = pd.concat([train_df, group])    
                        
                    test_groupby_obj = test_df.groupby(split_attr, sort=False)
                    train_groupby_obj = train_df.groupby(split_attr, sort=False)
                    
                    # apply rolling window train_groupby_obj
                    temp_df = train_groupby_obj.apply(lambda x: None
                                                      if len(x) < window_length
                                                      else __class__.rolling_window(x,
                                                                                    window_length,
                                                                                    overlap_stride=window_length-1))
                    
                    # groupby again
                    train_groupby_obj = temp_df.groupby(split_attr, sort=False) 
                    
                    # do nothing for default test_format
                    if ('default' == test_format):
                        pass
                    else:
                        just_append_test = False
                        # apply rolling window for testing dataset
                        if ('overlapping_last_element' == test_format):
                            overlap_stride = 1
                        elif ('partition' == test_format):
                            overlap_stride = 0
                        else:
                            print('{} test format not yet implemented'.format(test_format))
                            exit(1)
                        
                        temp_df = test_groupby_obj.apply(lambda x: None
                                                         if len(x) < window_length
                                                         else __class__.rolling_window(x,
                                                                                       window_length, 
                                                                                       overlap_stride=overlap_stride))
                        test_groupby_obj = temp_df.groupby(split_attr, sort=False)

            print('processed before shuffling')
            
            if (only_one_groupby_obj):
                # shuffle
                group_list = list(groupby_obj)
                random.shuffle(group_list)
                print('shuffled')

                # number of groups assigned to the test df
                num_groups = int(len(group_list) * split_rate) + 1
                print('num_groups: ', num_groups)

                test_groups = group_list[:num_groups]
                train_groups = group_list[num_groups:]
            else:
                # shuffle
                test_groups = list(test_groupby_obj)
                train_groups = list(train_groupby_obj)
                random.shuffle(test_groups)
                random.shuffle(train_groups)
                print('shuffled')
                
            test_dfs = self.groups_to_dflist(test_groups, columns, just_append_test)
            train_dfs = self.groups_to_dflist(train_groups, columns, just_append_train)
            print('dfs for testing and training created')
            

            if ('default' == self.config['method']):
                test_num_steps = np.max([len(rows) for rows in test_dfs])
                train_num_steps = np.max([len(rows) for rows in train_dfs])
            elif ('sliding_window' == self.config['method']):
                train_num_steps = window_length
                
                if ('default' == test_format):
                    test_num_steps = np.max([len(rows) for rows in test_dfs])
                else:
                    test_num_steps = window_length

            return [(test_dfs, test_num_steps),
                    (train_dfs, train_num_steps),
                    max_num_skills]
        else:
            print('{} dataset not yet implemented'.format(self.dataset))
            exit(1)
    
    # group_list is a list of (key, group) pairs from a pandas groupby
    def groups_to_dflist(self, group_list, columns, just_append=True):
        if (just_append):
            dfs = []
            for _, rows in group_list:
                dfs.append(rows)
            
        else:
            window_length = self.config['window_length']
            dfs = []
            for _, rows in group_list:
                dfs.extend([rows.iloc[window_length * i : window_length * (i + 1), :] 
                            for i in range(len(rows) // window_length)])
        
        return dfs
    
    # dfs: list of dataframes, each holding the interactions of a single student
    def dfs_to_students(self, dfs):
        if ('Assistments' == self.dataset):
            correct_attr = self.attr_dict.get('correct', None)
            skill_id_attr = self.attr_dict.get('skill_id', None)

            if ('2009' == self.version):
                not_one_hot = (not self.config.get('one_hot', True)) and self.config.get('allow_multi_skills', True)
                print('not_one_hot: ', not_one_hot)
                
                if (not not_one_hot):
                    students = [([len(rows)], 
                                 rows.loc[:, skill_id_attr].values.tolist(), 
                                 rows.loc[:, correct_attr].values.tolist()) 
                                 for rows in dfs]
                else:
                    time_attr = self.attr_dict.get('time', None)
                    students = []
                    for rows in dfs:
                        skill_ids = rows.loc[:, skill_id_attr].astype('str').values.tolist()
                        results = []
                        
                        for skill_id in skill_ids:
                            results.append(list(map(int, skill_id.split(' '))))
                        
                        skill_ids = results
                        
                        correctness = rows.loc[:, correct_attr].values.tolist()
                        problem_num = len(rows)

                        students.append(([problem_num], skill_ids, correctness))
            
            elif ('2012' == self.version):
                problem_content_attr = self.attr_dict.get('problem_content', None)
                
                students = [([len(rows)], 
                             rows[skill_id_attr].values.tolist(), 
                             rows[correct_attr].values.tolist(),
                             rows[problem_content_attr].values.tolist())
                             for rows in dfs]
            
            elif ('2015' == self.version):
                students = [([len(rows)], 
                             rows[skill_id_attr].values.tolist(), 
                             rows[correct_attr].values.tolist()) 
                             for rows in dfs]
                
        else:
            print('{} dataset not yet implemented'.format(self.dataset))
            exit(1)
        
        return students

        
    
    # setup for preparing rnn
    def set_config(self, config):
        self.config.update(config)
    
    def get_attributes_for_df(self):
        attr_dict = {}
        if ('Assistments' == self.dataset):
            if ('2009' == self.version):
                attr_dict['split'] = self.attr.user_id
                attr_dict['time'] = self.attr.order_id
                attr_dict['correct'] = self.attr.correct
                attr_dict['skill_id'] = self.attr.skill_id

                # for data config
                attr_dict['scaffolding'] = self.attr.original
                attr_dict['tutor_mode'] = self.attr.tutor_mode
                attr_dict['allow_multi_skills'] = self.attr.order_id

            elif('2012' == self.version):
                attr_dict['split'] = self.attr.user_id
                attr_dict['time'] = self.attr.end_time
                attr_dict['correct'] = self.attr.correct
                attr_dict['skill_id'] = self.attr.skill_id
                
                # for data config
                attr_dict['scaffolding'] = self.attr.original
                attr_dict['tutor_mode'] = self.attr.tutor_mode
                
                # problem contents
                attr_dict['problem_content'] = self.attr.problem_content
                
                # TODO: how to represent multiple skills in 2012 dataset
            
            else:
                print('{} version not yet implemented'.format(self.version))
                exit(1)
        else:
            print('{} dataset not yet implemented'.format(self.dataset))
            exit(1)
        
        self.attr_dict = attr_dict
        
        return attr_dict
    
    # input
    # method: {default, sliding_window}
    # max_num_steps: maximum number of steps
    # max_num_skills: maximum number of skills
    # students: list of tuples of 3 elements (4 for the 2012 version, which appends problem contents)
    # students[i][0]: a list of length 1 with num_problems
    # students[i][1]: a list of skills
    # students[i][2]: a list of correctness
    # len(students[i][1]) == len(students[i][2]) == students[i][0][0]
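    # e.g. a hypothetical student with 4 interactions would look like
    # ([4], [12, 5, 5, 3], [1, 1, 0, 1]): number of problems, skill ids, correctness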
    def prepare_rnn(self):
        # set proper attributes for df depending on dataset and version
        attr_dict = self.get_attributes_for_df()
        
        split_attr = attr_dict.get('split', None)
        time_attr = attr_dict.get('time', None)
        correct_attr = attr_dict.get('correct', None)
        skill_id_attr = attr_dict.get('skill_id', None)
        scaffolding_attr = attr_dict.get('scaffolding', None)
        tutor_mode_attr = attr_dict.get('tutor_mode', None)
        allow_multi_skill_attr = attr_dict.get('allow_multi_skills', None)
        
        if ('Assistments' == self.dataset):
            if ('2012' == self.version):
                problem_content_attr = attr_dict.get('problem_content', None)
                
            method = self.config['method']
            has_scaffolding = self.config['has_scaffolding']
            count_no_skill_id = self.config['count_no_skill_id']
            has_test_mode = self.config['has_test_mode']
            allow_multi_skills = self.config['allow_multi_skills']
            one_hot = self.config['one_hot']
            df = self.df.copy(deep=True)
                
            if (not has_scaffolding):
                if (None == scaffolding_attr):
                    print('{} dataset {} version needs scaffolding_attr'.format(self.dataset, self.version))
                    exit(1)
                df = df.loc[df.loc[:, scaffolding_attr].astype('int') == 1]
            if (not has_test_mode):
                if (None == tutor_mode_attr):
                    print('{} dataset {} version needs tutor_mode_attr'.format(self.dataset, self.version))
                    exit(1)
                df = df.loc[df.loc[:, tutor_mode_attr].astype('str') == 'tutor']
            
            not_one_hot = (not one_hot) and allow_multi_skills
            
            if (not allow_multi_skills):
                if (None == allow_multi_skill_attr):
                    print('{} dataset {} version needs allow_multi_skill_attr'.format(self.dataset, self.version))
                    exit(1)
                else:
                    df.drop_duplicates(subset=[allow_multi_skill_attr], inplace=True)
            
            if (None == skill_id_attr):
                print('{} dataset {} version needs skill_id_attr'.format(self.dataset, self.version))
                exit(1)
            
            if (not count_no_skill_id):
                df = df.dropna(axis=0, subset=[skill_id_attr], how='any')
            else:
                df.loc[:, skill_id_attr] = df.loc[:, skill_id_attr].fillna(df.loc[:, skill_id_attr].max() + 1)
            
            if ('2009' == self.version):
                df = df.loc[:, [split_attr, skill_id_attr, correct_attr, time_attr]]
            elif ('2012' == self.version):
                df = df.loc[:, [split_attr, skill_id_attr, correct_attr, time_attr, problem_content_attr]]
                df.loc[:, problem_content_attr] = df.loc[:, problem_content_attr].fillna('')
            
            # use original correct values from 0 ~ 1
            # if ('2012' == self.version):
            #     __class__.process_correct_attr(df=df, correct_attr=correct_attr)
            # else:
            #     __class__.assign_idx_to_column_values(df, correct_attr)

            __class__.assign_idx_to_column_values(df, split_attr)
            __class__.assign_idx_to_column_values(df, skill_id_attr)

            split_rate = self.config.get('split_rate', 0.2)
            train_test_num_skill_list = self.split_train_test(df, split_rate=split_rate)

            test_dfs = train_test_num_skill_list[0][0]
            test_num_steps = train_test_num_skill_list[0][1]

            train_dfs = train_test_num_skill_list[1][0]
            train_num_steps = train_test_num_skill_list[1][1]

            num_skills = train_test_num_skill_list[2]
            
            print('convert dfs to rnn inputs')
            
            # convert list of dataframes to rnn input            
            test_students = self.dfs_to_students(test_dfs)
            train_students = self.dfs_to_students(train_dfs)

            self.test_rnn_data = (test_students, test_num_steps, num_skills)
            self.train_rnn_data = (train_students, train_num_steps, num_skills)
            
            print('conversion done')

        else:
            print('{} dataset not yet implemented'.format(self.dataset))
    
    def get_save_path(self, ext='csv', is_problem_contents=False):
        test_save_path = self.get_datapath(ext=ext, 
                                           is_original=False,                               
                                           is_problem_contents=is_problem_contents,
                                           is_training=False)
        train_save_path = self.get_datapath(ext=ext,
                                            is_original=False, 
                                            is_problem_contents=is_problem_contents,
                                            is_training=True)

        return test_save_path, train_save_path
    
    # ext: extension - csv, etc.
    def save(self, ext='pkl', is_problem_contents=False):
        _ext = {
            'csv': 'csv',
            'pkl': 'pkl'
        }.get(ext, None)
        
        if (None == _ext):
            print('{} extension not yet implemented'.format(ext))
            exit(1)
        
        test_save_path, train_save_path = self.get_save_path(ext, is_problem_contents=is_problem_contents)
        test_rnn_data = self.test_rnn_data
        train_rnn_data = self.train_rnn_data
        if ('csv' == ext):
            __class__.save_rnn_data_as_csv(test_rnn_data, test_save_path)
            __class__.save_rnn_data_as_csv(train_rnn_data, train_save_path)
        
        elif ('pkl' == ext):
            __class__.save_rnn_data_as_pkl(test_rnn_data, test_save_path)
            __class__.save_rnn_data_as_pkl(train_rnn_data, train_save_path)
    
    # for 2009 version ...
    # self.config should include
    # 1. method: default or sliding_window
    # 2. has_scaffolding: True/False; include scaffolding problems or not; indicated by 'original' column
    # 3. count_no_skill_id: True/False; include interactions with no skill id or not
    # 4. has_test_mode: True/False; include test mode or not
    # 5. allow_multi_skills: True/False; whether to allow multiple skills per interaction or not
    # 6. window_length: if method is sliding_window, window_length should be provided. if not, default to 10
    def generate_rnn_data(self, save=False, ext='csv', is_problem_contents=False, encoding='iso-8859-1'):
        if ('Assistments' == self.dataset):
            assert self.config != None, 'please set config first'
            assert self.config['method'] != None, 'method is none'
            assert type(self.config['has_scaffolding']) == bool, 'has_scaffolding is not boolean'
            assert type(self.config['count_no_skill_id']) == bool, 'count_no_skill_id is not boolean'
            assert type(self.config['has_test_mode']) == bool, 'has_test_mode is not boolean'
            assert type(self.config['allow_multi_skills']) == bool, 'allow_multi_skills is not boolean'
            assert type(self.config['one_hot']) == bool, 'one_hot is not boolean'
            
            if ('df' not in self.__dict__):
                self.df = __class__.read_data_from_csv(self.get_datapath(ext='csv', 
                                                                         is_original=True, 
                                                                         is_problem_contents=False, 
                                                                         is_training=True), 
                                                       encoding)
                if ('2012' == self.version):
                    problem_contents = __class__.read_data_from_csv(self.get_datapath(ext='csv', 
                                                                                      is_original=True, 
                                                                                      is_problem_contents=True, 
                                                                                      is_training=True),
                                                                    encoding)
                    
            
            self.prepare_rnn()
            
            if (type(ext) == list and save):
                for e in ext:
                    self.save(e)
            
            elif (save):
                self.save(ext)

            return (self.test_rnn_data, self.train_rnn_data)
        else:
            print('{} dataset not yet implemented'.format(self.dataset))
    
    def load_rnn_data(self, is_training, ext='pkl', is_problem_contents=False):
        _ext = {
            'pkl': 'pkl'
        }.get(ext, None)
        
        if (None == _ext):
            print('{} extension is not yet implemented'.format(ext))
            exit(1)
        
        datapath = self.get_datapath(ext=ext, 
                                     is_original=False,  
                                     is_problem_contents=is_problem_contents, 
                                     is_training=is_training)
        
        if ('pkl' == ext):
            obj = __class__.load_rnn_data_from_pkl(datapath)
            
        return obj
    
    # attr_list: list of attributes to sort by, in decreasing order of priority
    @staticmethod
    def sort(df, attr_list, is_ascending_list=[True], inplace=True):
        assert type(attr_list) == list, 'attr_list is not a list'
        assert type(is_ascending_list) == list, 'is_ascending_list is not a list'
        assert len(attr_list) == len(is_ascending_list), 'len of attr_list and is_ascending_list are not the same'
        assert type(inplace) == bool, 'inplace is not a boolean'
        
        result = df.sort_values(attr_list, ascending=is_ascending_list, inplace=inplace)
        
        if (inplace):
            return df
        else:
            return result
        
    @staticmethod
    def get_nunique(df, attr, dropna=True):
        return df.loc[:, attr].nunique(dropna=dropna)
    
    @staticmethod
    def assign_idx_to_column_values(df, attr):
        df.loc[:, attr] = pd.Categorical(df.loc[:, attr]).codes
    
    @staticmethod
    def read_data_from_csv(datapath, encoding='iso-8859-1'):
        return pd.read_csv(datapath, encoding=encoding)
    
    # 2d to 3d
    # df to df
    # result[i]: i-th sliding window 2d numpy array
    # overlap_stride = 0 => partition
    @staticmethod
    def rolling_window(df, window, overlap_stride=0):
        # first get the columns from df
        columns = df.columns
        
        # convert df to numpy array
        a = df.values
        rolled_a = __class__.rolling_window_np(a, window, overlap_stride)
        
        rolled_df = pd.DataFrame(np.concatenate(rolled_a, axis=0), columns=columns)

        return rolled_df
    
    # 2d to 3d
    # np to np
    @staticmethod
    def rolling_window_np(a, window, overlap_stride=0):
        num_sequences = (a.shape[0] - overlap_stride) // (window - overlap_stride)
        shape = (num_sequences, window, a.shape[-1])
        strides = (a.strides[0] * (window - overlap_stride),) + a.strides
        
        return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
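
    # worked example of the stride arithmetic above: for a 5-row array with
    # window=3, overlap_stride=1 gives (5 - 1) // (3 - 1) = 2 windows (rows 0-2
    # and rows 2-4), while overlap_stride=0 gives 5 // 3 = 1 window, i.e. a
    # partition; training uses overlap_stride=window_length-1, so consecutive
    # windows step forward by exactly one row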
    
    # for assistments 2012 dataset, correct values can be floating point between 0 and 1 for essay questions
    # for convenience for now, if it is greater than or equal to 0.5, then it is processed as correct
    # see "https://sites.google.com/site/assistmentsdata/how-to-interpret"
    @staticmethod
    def process_correct_attr(df, correct_attr):
        df.loc[df.loc[:, correct_attr] >= 0.5, correct_attr] = 1
        df.loc[:, correct_attr] = df.loc[:, correct_attr] // 1
    
    @staticmethod
    def save_rnn_data_as_csv(rnn_data, save_path):
        # only the student tuples are saved to csv; each tuple already carries
        # its own sequence length as the first element
        with open(save_path, 'w', newline='') as f:
            students = rnn_data[0]
            writer = csv.writer(f, delimiter=',', quoting=csv.QUOTE_NONE, escapechar=' ')
            
            for student in students:
                # each element of the student tuple becomes one csv row
                writer.writerows(student)
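    
    # e.g. a student tuple ([3], [7, 2, 7], [1, 0, 1]) is written as three csv lines:
    # 3
    # 7,2,7
    # 1,0,1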
    
    @staticmethod
    def save_rnn_data_as_pkl(rnn_data, save_path):
        obj = {
            'students': rnn_data[0], 
            'num_steps': rnn_data[1], 
            'num_skills': rnn_data[2]
        }
        
        if (len(rnn_data) > 3):
            obj['problem_contents'] = rnn_data[3]
        
        with open(save_path, 'wb') as output_file:
            cPickle.dump(obj, output_file)
    
    @staticmethod
    def load_rnn_data_from_pkl(pklpath):
        with open(pklpath, 'rb') as input_file:
            obj = cPickle.load(input_file)
        
        return obj
    
    @staticmethod
    def make_data_config(method='default', 
                         has_scaffolding=True, 
                         count_no_skill_id=True, 
                         has_test_mode=True, 
                         allow_multi_skills=True, 
                         window_length=10,
                         test_format='same_as_training',
                         one_hot=True, 
                         split_rate=0.2):
        config = {
            'method': method, 
            'has_scaffolding': has_scaffolding,
            'count_no_skill_id': count_no_skill_id, 
            'has_test_mode': has_test_mode,
            'allow_multi_skills': allow_multi_skills,
            'one_hot': one_hot,
            'split_rate': split_rate
        }
        
        if ('sliding_window' == method):
            config['window_length'] = window_length
            config['test_format'] = test_format
        
        return config
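
# A minimal usage sketch (illustrative, commented out like the calls in the
# __main__ cell below, since it needs the original Assistments csv reachable
# through AssistmentsProperties; the variable names are only examples):
# a2009 = DataPreprocessor('Assistments', '2009')
# a2009.set_config(DataPreprocessor.make_data_config(method='sliding_window',
#                                                    window_length=10,
#                                                    test_format='partition'))
# test_rnn_data, train_rnn_data = a2009.generate_rnn_data(save=True, ext='pkl')
# train_students, train_num_steps, num_skills = train_rnn_data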

In [3]:
def generate_2009_one_hot_sliding_window_all(split_rate=0.2):
    # file path to log results
    log_path = 'a2009_one_hot_sliding_window_all.log'
    
    # run one_hot sliding_window over every combination of the four boolean flags
    # first create the 2009 preprocessor
    a2009 = DataPreprocessor('Assistments', '2009')

    for config_index in range(16):
        # e.g. config_index 5 -> '0101' -> [False, True, False, True]
        binary_index = format(config_index, '04b')
        config_arr = [bool(int(i)) for i in binary_index]

        with open(log_path, 'a') as f:
            f.write('\n' + binary_index + '\n')

        config = {
            'split_rate': split_rate,
            'method': 'sliding_window',
            'has_scaffolding': config_arr[0],
            'count_no_skill_id': config_arr[1],
            'has_test_mode': config_arr[2],
            'allow_multi_skills': config_arr[3],
            'window_length': 10,
            'one_hot': True
        }

        # generate and time the data for each test format
        for test_format in ['overlapping_last_element', 'partition', 'default', 'same_as_training']:
            config['test_format'] = test_format

            start = time()
            a2009.set_config(config)
            a2009.generate_rnn_data(ext=['pkl', 'csv'], save=True, is_problem_contents=False)
            end = time()

            with open(log_path, 'a') as f:
                f.write(test_format + ' : ' + str(end - start) + ' seconds\n')

In [4]:
def generate_2009_not_one_hot_sliding_window_all(split_rate=0.2):
    # file path to log results
    log_path = 'a2009_not_one_hot_sliding_window_all.log'
    
    # run not_one_hot sliding_window over every combination of the three boolean flags
    # (allow_multi_skills is fixed to True, hence the trailing '1' in the log label)
    a2009 = DataPreprocessor('Assistments', '2009')

    for config_index in range(8):
        binary_index = format(config_index, '03b')
        config_arr = [bool(int(i)) for i in binary_index]

        with open(log_path, 'a') as f:
            f.write('\n' + binary_index + '1\n')

        config = {
            'split_rate': split_rate,
            'method': 'sliding_window',
            'has_scaffolding': config_arr[0],
            'count_no_skill_id': config_arr[1],
            'has_test_mode': config_arr[2],
            'allow_multi_skills': True,
            'window_length': 10,
            'one_hot': False
        }

        # generate and time the data for each test format
        for test_format in ['overlapping_last_element', 'partition', 'default', 'same_as_training']:
            config['test_format'] = test_format

            start = time()
            a2009.set_config(config)
            a2009.generate_rnn_data(ext=['pkl', 'csv'], save=True, is_problem_contents=False)
            end = time()

            with open(log_path, 'a') as f:
                f.write(test_format + ' : ' + str(end - start) + ' seconds\n')

In [5]:
def generate_2009_one_hot_default_all(split_rate=0.2):
    # file path to log result
    log_path = 'a2009_one_hot_default_all.log'
    
    # run one_hot default with different config
    # first generate 2009 data
    a2009 = DataPreprocessor('Assistments', '2009')

    for config_index in range(16):
        binary_index = format(config_index, '04b')
        config_arr = [bool(int(i)) for i in binary_index]

        config = {
            'split_rate': split_rate,
            'method': 'default',
            'has_scaffolding': config_arr[0],
            'count_no_skill_id': config_arr[1],
            'has_test_mode': config_arr[2],
            'allow_multi_skills': config_arr[3],
            'window_length': 10,
            'one_hot': True,
            'test_format': 'overlapping_last_element'
        }
    
        start = time()
        a2009.set_config(config)
        a2009.generate_rnn_data(ext=['pkl', 'csv'], save=True, is_problem_contents=False)
        end = time()
        
        with open(log_path, 'a') as f:
            f.write('\n' + binary_index + ' : ' + str(end - start) + ' seconds\n')

In [6]:
def generate_2009_not_one_hot_default_all(split_rate=0.2):
    # file path to log result
    log_path = 'a2009_not_one_hot_default_all.log'
    
    # run not_one_hot default with different config
    # first generate 2009 data
    a2009 = DataPreprocessor('Assistments', '2009')

    for config_index in range(8):
        binary_index = format(config_index, '03b')
        config_arr = [bool(int(i)) for i in binary_index]

        config = {
            'split_rate': split_rate,
            'method': 'default',
            'has_scaffolding': config_arr[0],
            'count_no_skill_id': config_arr[1],
            'has_test_mode': config_arr[2],
            'allow_multi_skills': True,
            'window_length': 10,
            'one_hot': False,
            'test_format': 'overlapping_last_element'
        }
    
        start = time()
        a2009.set_config(config)
        a2009.generate_rnn_data(ext=['pkl', 'csv'], save=True, is_problem_contents=False)
        end = time()
        
        with open(log_path, 'a') as f:
            f.write('\n' + binary_index + '1 : ' + str(end - start) + ' seconds\n')

In [7]:
def generate_2012_one_hot_sliding_window_all(split_rate=0.2):
    # file path to log results
    log_path = 'a2012_one_hot_sliding_window_all.log'
    
    # run one_hot sliding_window over every combination of the three boolean flags
    # (allow_multi_skills is fixed to True, hence the trailing '1' in the log label)
    a2012 = DataPreprocessor('Assistments', '2012')

    for config_index in range(8):
        binary_index = format(config_index, '03b')
        config_arr = [bool(int(i)) for i in binary_index]

        with open(log_path, 'a') as f:
            f.write('\n' + binary_index + '1\n')

        config = {
            'split_rate': split_rate,
            'method': 'sliding_window',
            'has_scaffolding': config_arr[0],
            'count_no_skill_id': config_arr[1],
            'has_test_mode': config_arr[2],
            'allow_multi_skills': True,
            'window_length': 10,
            'one_hot': True
        }

        # generate and time the data for each test format
        for test_format in ['overlapping_last_element', 'partition', 'default', 'same_as_training']:
            config['test_format'] = test_format

            start = time()
            a2012.set_config(config)
            a2012.generate_rnn_data(ext=['pkl', 'csv'], save=True, is_problem_contents=False)
            end = time()

            with open(log_path, 'a') as f:
                f.write(test_format + ' : ' + str(end - start) + ' seconds\n')

In [8]:
def generate_2012_one_hot_default_all(split_rate=0.2):
    # file path to log result
    log_path = 'a2012_one_hot_default_all.log'
    
    # first generate 2012 data
    a2012 = DataPreprocessor('Assistments', '2012')

    for config_index in range(8):
        binary_index = format(config_index, '03b')
        config_arr = [bool(int(i)) for i in binary_index]

        config = {
            'split_rate': split_rate,
            'method': 'default',
            'has_scaffolding': config_arr[0],
            'count_no_skill_id': config_arr[1],
            'has_test_mode': config_arr[2],
            'allow_multi_skills': True,
            'window_length': 10,
            'one_hot': True,
            'test_format': 'overlapping_last_element'
        }
    
        start = time()
        a2012.set_config(config)
        a2012.generate_rnn_data(ext=['pkl', 'csv'], save=True, is_problem_contents=False)
        end = time()
        
        with open(log_path, 'a') as f:
            f.write('\n' + binary_index + '1 : ' + str(end - start) + ' seconds\n')

In [9]:
if ('__main__' == __name__):
    split_rate = 0.2
    #generate_2009_not_one_hot_default_all(split_rate)
    #generate_2009_not_one_hot_sliding_window_all(split_rate)
    #generate_2009_one_hot_default_all(split_rate)
    #generate_2009_one_hot_sliding_window_all(split_rate)
    #generate_2012_one_hot_default_all(split_rate)
    #generate_2012_one_hot_sliding_window_all(split_rate)
    pass


/home/data/jleeae/anaconda3/envs/py3.6_tf1.1/lib/python3.6/site-packages/ipykernel_launcher.py:5: DtypeWarning: Columns (17,29) have mixed types. Specify dtype option on import or set low_memory=False.
  """
groupby done
not_one_hot:  False
processed before shuffling
shuffled
num_groups:  833
dfs for testing and training created
convert dfs to rnn inputs
not_one_hot:  False
not_one_hot:  False
conversion done

In [10]:
def test_performance(assistment_instance, config_list, ext='csv'):
    now = datetime.now().strftime('__%Y-%m-%d_%H:%M:%S')
    log_path = 'Assistments' + assistment_instance.version + now + '.log'
    
    for config in config_list:
        # generate the rnn data and time it
        assistment_instance.set_config(config)
        timestamp = time()
        rnn_data = assistment_instance.generate_rnn_data()
        new_timestamp = time()
        
        method = config.get('method', 'unknown')
        
        with open(log_path, 'a') as f:
            f.write(method + ' generation: ' + str(new_timestamp - timestamp) + ' seconds\n')

        # save as csv and as pkl, timing each separately
        for save_ext in ['csv', 'pkl']:
            timestamp = time()
            assistment_instance.save(save_ext)
            new_timestamp = time()
            
            with open(log_path, 'a') as f:
                f.write(method + ' ' + save_ext + ' save: ' + str(new_timestamp - timestamp) + ' seconds\n')
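
# A usage sketch for test_performance (commented out since it needs the
# original csv on disk; the names below are illustrative only):
# a2009 = DataPreprocessor('Assistments', '2009')
# config_list = [DataPreprocessor.make_data_config(method='default'),
#                DataPreprocessor.make_data_config(method='sliding_window', window_length=10)]
# test_performance(a2009, config_list)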

In [11]:
def print_result(rnn_data):
    print('')
    print('*' * 125)
    print('')
    rnn_students = rnn_data['students']
    
    rnn_num_steps = rnn_data['num_steps']
    rnn_num_skills = rnn_data['num_skills']
    
    print('len(rnn_students): ', len(rnn_students))
    print('')
    
    print('rnn_num_steps: ', rnn_num_steps)
    print('')
    
    print('rnn_num_skills: ', rnn_num_skills)
    print('')
    
    print('rnn_students[0][0][0]: ', rnn_students[0][0][0])
    print('len(rnn_students[0][1]): ', len(rnn_students[0][1]))
    print('len(rnn_students[0][2]): ', len(rnn_students[0][2]))
    print('')
    
    print('correctness check: ')
    print(rnn_students[0][2])
    print('')
    print('*' * 125)
    print('')
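
# A self-contained round-trip check of the pkl format: build a tiny toy
# rnn_data tuple, save it, reload it, and inspect it with print_result.
# The path 'toy_rnn_data.pkl' is made up for this demo.
toy_students = [([3], [0, 1, 0], [1, 0, 1])]
toy_rnn_data = (toy_students, 3, 2)
DataPreprocessor.save_rnn_data_as_pkl(toy_rnn_data, 'toy_rnn_data.pkl')
print_result(DataPreprocessor.load_rnn_data_from_pkl('toy_rnn_data.pkl'))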