In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

In [2]:
people=pd.read_csv("data/people.csv",sep=',',header=0,keep_default_na=True,parse_dates=['date'])

In [3]:
people.set_index(keys=['people_id'],drop=True,append=False,inplace=True)
people.head(5)


Out[3]:
char_1 group_1 char_2 date char_3 char_4 char_5 char_6 char_7 char_8 ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id
ppl_100 type 2 group 17304 type 2 2021-06-29 type 5 type 5 type 5 type 3 type 11 type 2 ... False True True False False True True True False 36
ppl_100002 type 2 group 8688 type 3 2021-01-06 type 28 type 9 type 5 type 3 type 11 type 2 ... False True True True True True True True False 76
ppl_100003 type 2 group 33592 type 3 2022-06-10 type 4 type 8 type 5 type 2 type 5 type 2 ... False False True True True True False True True 99
ppl_100004 type 2 group 22593 type 3 2022-07-20 type 40 type 25 type 9 type 4 type 16 type 2 ... True True True True True True True True True 76
ppl_100006 type 2 group 6534 type 3 2022-07-27 type 40 type 25 type 9 type 3 type 8 type 2 ... False False True False False False True True False 84

5 rows × 40 columns


In [4]:
act_train=pd.read_csv("data/act_train.csv",sep=',',header=0,keep_default_na=True,parse_dates=['date'])
act_train.set_index(keys=['people_id'],drop=True,append=False,inplace=True)

In [5]:
act_train.head(10)


Out[5]:
activity_id date activity_category char_1 char_2 char_3 char_4 char_5 char_6 char_7 char_8 char_9 char_10 outcome
people_id
ppl_100 act2_1734928 2023-08-26 type 4 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 76 0
ppl_100 act2_2434093 2022-09-27 type 2 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 1 0
ppl_100 act2_3404049 2022-09-27 type 2 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 1 0
ppl_100 act2_3651215 2023-08-04 type 2 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 1 0
ppl_100 act2_4109017 2023-08-26 type 2 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 1 0
ppl_100 act2_898576 2023-08-04 type 4 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 1727 0
ppl_100002 act2_1233489 2022-11-23 type 2 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 1 1
ppl_100002 act2_1623405 2022-11-23 type 2 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 1 1
ppl_100003 act2_1111598 2023-02-07 type 2 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 1 1
ppl_100003 act2_1177453 2023-06-28 type 2 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 1 1

In [6]:
act_test=pd.read_csv("data/act_test.csv",sep=',',header=0,keep_default_na=True,parse_dates=['date'])
act_test.set_index(keys=['people_id'],drop=True,append=False,inplace=True)

In [7]:
act_test.head(10)


Out[7]:
activity_id date activity_category char_1 char_2 char_3 char_4 char_5 char_6 char_7 char_8 char_9 char_10
people_id
ppl_100004 act1_249281 2022-07-20 type 1 type 5 type 10 type 5 type 1 type 6 type 1 type 1 type 7 type 4 NaN
ppl_100004 act2_230855 2022-07-20 type 5 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 682
ppl_10001 act1_240724 2022-10-14 type 1 type 12 type 1 type 5 type 4 type 6 type 1 type 1 type 13 type 10 NaN
ppl_10001 act1_83552 2022-11-27 type 1 type 20 type 10 type 5 type 4 type 6 type 1 type 1 type 5 type 5 NaN
ppl_10001 act2_1043301 2022-10-15 type 5 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 3015
ppl_10001 act2_112890 2022-11-27 type 5 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 4987
ppl_10001 act2_1169930 2022-10-15 type 5 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 3015
ppl_10001 act2_1924448 2022-10-15 type 5 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 3015
ppl_10001 act2_1953554 2022-10-15 type 5 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 3015
ppl_10001 act2_1971739 2022-11-28 type 5 NaN NaN NaN NaN NaN NaN NaN NaN NaN type 3015

In [8]:
train_data=act_train.merge(people,how='left',left_index=True,right_index=True,suffixes=('_act', '_people'))
train_data.head(10)


Out[8]:
activity_id date_act activity_category char_1_act char_2_act char_3_act char_4_act char_5_act char_6_act char_7_act ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id
ppl_100 act2_1734928 2023-08-26 type 4 NaN NaN NaN NaN NaN NaN NaN ... False True True False False True True True False 36
ppl_100 act2_2434093 2022-09-27 type 2 NaN NaN NaN NaN NaN NaN NaN ... False True True False False True True True False 36
ppl_100 act2_3404049 2022-09-27 type 2 NaN NaN NaN NaN NaN NaN NaN ... False True True False False True True True False 36
ppl_100 act2_3651215 2023-08-04 type 2 NaN NaN NaN NaN NaN NaN NaN ... False True True False False True True True False 36
ppl_100 act2_4109017 2023-08-26 type 2 NaN NaN NaN NaN NaN NaN NaN ... False True True False False True True True False 36
ppl_100 act2_898576 2023-08-04 type 4 NaN NaN NaN NaN NaN NaN NaN ... False True True False False True True True False 36
ppl_100002 act2_1233489 2022-11-23 type 2 NaN NaN NaN NaN NaN NaN NaN ... False True True True True True True True False 76
ppl_100002 act2_1623405 2022-11-23 type 2 NaN NaN NaN NaN NaN NaN NaN ... False True True True True True True True False 76
ppl_100003 act2_1111598 2023-02-07 type 2 NaN NaN NaN NaN NaN NaN NaN ... False False True True True True False True True 99
ppl_100003 act2_1177453 2023-06-28 type 2 NaN NaN NaN NaN NaN NaN NaN ... False False True True True True False True True 99

10 rows × 54 columns


In [9]:
test_data=act_test.merge(people,how='left',left_index=True,right_index=True,suffixes=('_act', '_people'))
test_data.head(10)


Out[9]:
activity_id date_act activity_category char_1_act char_2_act char_3_act char_4_act char_5_act char_6_act char_7_act ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id
ppl_100004 act1_249281 2022-07-20 type 1 type 5 type 10 type 5 type 1 type 6 type 1 type 1 ... True True True True True True True True True 76
ppl_100004 act2_230855 2022-07-20 type 5 NaN NaN NaN NaN NaN NaN NaN ... True True True True True True True True True 76
ppl_10001 act1_240724 2022-10-14 type 1 type 12 type 1 type 5 type 4 type 6 type 1 type 1 ... False True True True True True True True True 90
ppl_10001 act1_83552 2022-11-27 type 1 type 20 type 10 type 5 type 4 type 6 type 1 type 1 ... False True True True True True True True True 90
ppl_10001 act2_1043301 2022-10-15 type 5 NaN NaN NaN NaN NaN NaN NaN ... False True True True True True True True True 90
ppl_10001 act2_112890 2022-11-27 type 5 NaN NaN NaN NaN NaN NaN NaN ... False True True True True True True True True 90
ppl_10001 act2_1169930 2022-10-15 type 5 NaN NaN NaN NaN NaN NaN NaN ... False True True True True True True True True 90
ppl_10001 act2_1924448 2022-10-15 type 5 NaN NaN NaN NaN NaN NaN NaN ... False True True True True True True True True 90
ppl_10001 act2_1953554 2022-10-15 type 5 NaN NaN NaN NaN NaN NaN NaN ... False True True True True True True True True 90
ppl_10001 act2_1971739 2022-11-28 type 5 NaN NaN NaN NaN NaN NaN NaN ... False True True True True True True True True 90

10 rows × 53 columns


In [10]:
train_data.activity_category.value_counts()


Out[10]:
type 2    904683
type 5    490710
type 3    429408
type 4    207465
type 1    157615
type 6      4253
type 7      3157
Name: activity_category, dtype: int64

In [11]:
types=['type %d'%i for i in range(1,8)]
train_datas={}
test_datas={}
for _type in types:
    train_datas[_type]=train_data[train_data.activity_category==_type].dropna(axis=(0,1), how='all')
    test_datas[_type]=test_data[test_data.activity_category==_type].dropna(axis=(0,1), how='all')
    print(train_datas[_type].activity_category.unique())
    print(test_datas[_type].activity_category.unique())


['type 1']
['type 1']
['type 2']
['type 2']
['type 3']
['type 3']
['type 4']
['type 4']
['type 5']
['type 5']
['type 6']
['type 6']
['type 7']
['type 7']

In [12]:
train_datas['type 1'].head(2)


Out[12]:
activity_id date_act activity_category char_1_act char_2_act char_3_act char_4_act char_5_act char_6_act char_7_act ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id
ppl_100025 act1_9923 2022-11-25 type 1 type 3 type 5 type 1 type 1 type 6 type 3 type 3 ... False False False False False False False False False 76
ppl_100033 act1_198174 2022-07-26 type 1 type 36 type 11 type 5 type 1 type 6 type 1 type 1 ... False False False False False False False False False 0

2 rows × 53 columns


In [13]:
train_datas['type 2'].head(2)


Out[13]:
activity_id date_act activity_category char_10_act outcome char_1_people group_1 char_2_people date_people char_3_people ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id
ppl_100 act2_2434093 2022-09-27 type 2 type 1 0 type 2 group 17304 type 2 2021-06-29 type 5 ... False True True False False True True True False 36
ppl_100 act2_3404049 2022-09-27 type 2 type 1 0 type 2 group 17304 type 2 2021-06-29 type 5 ... False True True False False True True True False 36

2 rows × 45 columns


In [14]:
train_datas['type 3'].head(2)


Out[14]:
activity_id date_act activity_category char_10_act outcome char_1_people group_1 char_2_people date_people char_3_people ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id
ppl_100006 act2_2380649 2022-07-27 type 3 type 114 1 type 2 group 6534 type 3 2022-07-27 type 40 ... False False True False False False True True False 84
ppl_100019 act2_1681678 2023-03-26 type 3 type 2 1 type 2 group 45749 type 3 2023-03-26 type 40 ... False False False False False False False False False 84

2 rows × 45 columns


In [15]:
train_datas['type 4'].head(2)


Out[15]:
activity_id date_act activity_category char_10_act outcome char_1_people group_1 char_2_people date_people char_3_people ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id
ppl_100 act2_1734928 2023-08-26 type 4 type 76 0 type 2 group 17304 type 2 2021-06-29 type 5 ... False True True False False True True True False 36
ppl_100 act2_898576 2023-08-04 type 4 type 1727 0 type 2 group 17304 type 2 2021-06-29 type 5 ... False True True False False True True True False 36

2 rows × 45 columns


In [16]:
train_datas['type 5'].head(2)


Out[16]:
activity_id date_act activity_category char_10_act outcome char_1_people group_1 char_2_people date_people char_3_people ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id
ppl_100013 act2_1667803 2023-01-26 type 5 type 5493 1 type 2 group 4204 type 3 2023-01-24 type 4 ... False False True True True True False True True 91
ppl_100013 act2_2229 2023-01-25 type 5 type 5493 1 type 2 group 4204 type 3 2023-01-24 type 4 ... False False True True True True False True True 91

2 rows × 45 columns


In [17]:
train_datas['type 6'].head(2)


Out[17]:
activity_id date_act activity_category char_10_act outcome char_1_people group_1 char_2_people date_people char_3_people ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id
ppl_100175 act2_4192771 2023-02-28 type 6 type 110 1 type 2 group 18966 type 3 2023-02-27 type 7 ... True False True True False True False False False 93
ppl_100202 act2_4786070 2023-02-25 type 6 type 110 1 type 2 group 38766 type 3 2022-10-22 type 15 ... False False True True True True True True True 89

2 rows × 45 columns


In [18]:
train_datas['type 7'].head(2)


Out[18]:
activity_id date_act activity_category char_10_act outcome char_1_people group_1 char_2_people date_people char_3_people ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id
ppl_100133 act2_1723517 2022-11-30 type 7 type 194 0 type 2 group 17304 type 2 2022-11-29 type 4 ... False False False False False False False False False 7
ppl_100212 act2_450045 2023-06-21 type 7 type 194 1 type 2 group 31542 type 3 2023-06-20 type 5 ... False False True False True False False False False 87

2 rows × 45 columns


In [19]:
types=['type %d'%i for i in range(1,8)]
for _type in types:
    train_datas[_type].drop(['activity_category'],axis=1,inplace=True)
    test_datas[_type].drop(['activity_category'],axis=1,inplace=True)

In [20]:
train_datas['type 1'].head(2)


Out[20]:
activity_id date_act char_1_act char_2_act char_3_act char_4_act char_5_act char_6_act char_7_act char_8_act ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id
ppl_100025 act1_9923 2022-11-25 type 3 type 5 type 1 type 1 type 6 type 3 type 3 type 6 ... False False False False False False False False False 76
ppl_100033 act1_198174 2022-07-26 type 36 type 11 type 5 type 1 type 6 type 1 type 1 type 4 ... False False False False False False False False False 0

2 rows × 52 columns


In [21]:
train_datas['type 2'].head(2)


Out[21]:
activity_id date_act char_10_act outcome char_1_people group_1 char_2_people date_people char_3_people char_4_people ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id
ppl_100 act2_2434093 2022-09-27 type 1 0 type 2 group 17304 type 2 2021-06-29 type 5 type 5 ... False True True False False True True True False 36
ppl_100 act2_3404049 2022-09-27 type 1 0 type 2 group 17304 type 2 2021-06-29 type 5 type 5 ... False True True False False True True True False 36

2 rows × 44 columns


In [22]:
train_datas['type 3'].head(2)


Out[22]:
activity_id date_act char_10_act outcome char_1_people group_1 char_2_people date_people char_3_people char_4_people ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id
ppl_100006 act2_2380649 2022-07-27 type 114 1 type 2 group 6534 type 3 2022-07-27 type 40 type 25 ... False False True False False False True True False 84
ppl_100019 act2_1681678 2023-03-26 type 2 1 type 2 group 45749 type 3 2023-03-26 type 40 type 25 ... False False False False False False False False False 84

2 rows × 44 columns


In [23]:
types=['type %d'%i for i in range(1,8)]
for _type in types:
    train_datas[_type].set_index(keys=['activity_id'], drop=True, append=True, inplace=True)
    test_datas[_type].set_index(keys=['activity_id'], drop=True, append=True, inplace=True)

In [24]:
train_datas['type 1'].head(2)


Out[24]:
date_act char_1_act char_2_act char_3_act char_4_act char_5_act char_6_act char_7_act char_8_act char_9_act ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id activity_id
ppl_100025 act1_9923 2022-11-25 type 3 type 5 type 1 type 1 type 6 type 3 type 3 type 6 type 8 ... False False False False False False False False False 76
ppl_100033 act1_198174 2022-07-26 type 36 type 11 type 5 type 1 type 6 type 1 type 1 type 4 type 1 ... False False False False False False False False False 0

2 rows × 51 columns


In [25]:
train_datas['type 2'].head(2)


Out[25]:
date_act char_10_act outcome char_1_people group_1 char_2_people date_people char_3_people char_4_people char_5_people ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id activity_id
ppl_100 act2_2434093 2022-09-27 type 1 0 type 2 group 17304 type 2 2021-06-29 type 5 type 5 type 5 ... False True True False False True True True False 36
act2_3404049 2022-09-27 type 1 0 type 2 group 17304 type 2 2021-06-29 type 5 type 5 type 5 ... False True True False False True True True False 36

2 rows × 43 columns


In [26]:
train_datas['type 3'].head(2)


Out[26]:
date_act char_10_act outcome char_1_people group_1 char_2_people date_people char_3_people char_4_people char_5_people ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id activity_id
ppl_100006 act2_2380649 2022-07-27 type 114 1 type 2 group 6534 type 3 2022-07-27 type 40 type 25 type 9 ... False False True False False False True True False 84
ppl_100019 act2_1681678 2023-03-26 type 2 1 type 2 group 45749 type 3 2023-03-26 type 40 type 25 type 9 ... False False False False False False False False False 84

2 rows × 43 columns


In [27]:
types=['type %d'%i for i in range(1,8)]
for _type in types:
    print(train_datas[_type].index.is_unique,end=',')
    print(test_datas[_type].index.is_unique,end=',' )


True,True,True,True,True,True,True,True,True,True,True,True,True,True,

In [28]:
pd.DataFrame({'train_1':train_datas['type 1'].dtypes,'train_2':train_datas['type 2'].dtypes,
              'train_3':train_datas['type 3'].dtypes,'train_4':train_datas['type 4'].dtypes,
              'train_5':train_datas['type 5'].dtypes,'train_6':train_datas['type 6'].dtypes,
              'train_7':train_datas['type 7'].dtypes,
              'test_1':test_datas['type 1'].dtypes,'test_2':test_datas['type 2'].dtypes,
              'test_3':test_datas['type 3'].dtypes,'test_4':test_datas['type 4'].dtypes,
              'test_5':test_datas['type 5'].dtypes,'test_6':test_datas['type 6'].dtypes,
              'test_7':test_datas['type 7'].dtypes,})


Out[28]:
test_1 test_2 test_3 test_4 test_5 test_6 test_7 train_1 train_2 train_3 train_4 train_5 train_6 train_7
char_10_act NaN object object object object object object NaN object object object object object object
char_10_people bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_11 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_12 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_13 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_14 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_15 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_16 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_17 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_18 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_19 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_1_act object NaN NaN NaN NaN NaN NaN object NaN NaN NaN NaN NaN NaN
char_1_people object object object object object object object object object object object object object object
char_20 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_21 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_22 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_23 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_24 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_25 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_26 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_27 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_28 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_29 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_2_act object NaN NaN NaN NaN NaN NaN object NaN NaN NaN NaN NaN NaN
char_2_people object object object object object object object object object object object object object object
char_30 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_31 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_32 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_33 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_34 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_35 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_36 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_37 bool bool bool bool bool bool bool bool bool bool bool bool bool bool
char_38 int64 int64 int64 int64 int64 int64 int64 int64 int64 int64 int64 int64 int64 int64
char_3_act object NaN NaN NaN NaN NaN NaN object NaN NaN NaN NaN NaN NaN
char_3_people object object object object object object object object object object object object object object
char_4_act object NaN NaN NaN NaN NaN NaN object NaN NaN NaN NaN NaN NaN
char_4_people object object object object object object object object object object object object object object
char_5_act object NaN NaN NaN NaN NaN NaN object NaN NaN NaN NaN NaN NaN
char_5_people object object object object object object object object object object object object object object
char_6_act object NaN NaN NaN NaN NaN NaN object NaN NaN NaN NaN NaN NaN
char_6_people object object object object object object object object object object object object object object
char_7_act object NaN NaN NaN NaN NaN NaN object NaN NaN NaN NaN NaN NaN
char_7_people object object object object object object object object object object object object object object
char_8_act object NaN NaN NaN NaN NaN NaN object NaN NaN NaN NaN NaN NaN
char_8_people object object object object object object object object object object object object object object
char_9_act object NaN NaN NaN NaN NaN NaN object NaN NaN NaN NaN NaN NaN
char_9_people object object object object object object object object object object object object object object
date_act datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns]
date_people datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns] datetime64[ns]
group_1 object object object object object object object object object object object object object object
outcome NaN NaN NaN NaN NaN NaN NaN int64 int64 int64 int64 int64 int64 int64

In [29]:
str_col_list=['group_1']+['char_%d_act'%i for i in range(1,11)]+['char_%d_people'%i for i in range(1,10)]
bool_col_list=['char_10_people']+['char_%d'%i for i in range(11,38)]
types=['type %d'%i for i in range(1,8)]
for _type in types:
    for data_set in [train_datas,test_datas]:
        data_set[_type].date_act= (data_set[_type].date_act- np.datetime64('1970-01-01'))/ np.timedelta64(1, 'D')
        data_set[_type].date_people= (data_set[_type].date_people- np.datetime64('1970-01-01'))/ np.timedelta64(1,'D') 
        data_set[_type].group_1=data_set[_type].group_1.str.replace("group",'').str.strip().astype(np.float64)
        for col in bool_col_list:
               if col in data_set[_type]:data_set[_type][col]=data_set[_type][col].astype(np.float64)
        for col in str_col_list[1:]:
               if col in data_set[_type]:data_set[_type][col]=data_set[_type][col].str.replace("type",'').str.strip().astype(np.float64) 

        data_set[_type]= data_set[_type].astype(np.float64)

In [31]:
types=['type %d'%i for i in range(1,8)]
for _type in types:
    print((train_datas[_type].dtypes==np.float64).all(),end=',')
    print((test_datas[_type].dtypes==np.float64).all(),end=',')


True,True,True,True,True,True,True,True,True,True,True,True,True,True,

In [32]:
train_datas['type 1'].head(5)


Out[32]:
date_act char_1_act char_2_act char_3_act char_4_act char_5_act char_6_act char_7_act char_8_act char_9_act ... char_29 char_30 char_31 char_32 char_33 char_34 char_35 char_36 char_37 char_38
people_id activity_id
ppl_100025 act1_9923 19321.0 3.0 5.0 1.0 1.0 6.0 3.0 3.0 6.0 8.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 76.0
ppl_100033 act1_198174 19199.0 36.0 11.0 5.0 1.0 6.0 1.0 1.0 4.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
act1_214090 19523.0 24.0 6.0 6.0 3.0 1.0 3.0 4.0 5.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
act1_230588 19416.0 2.0 2.0 3.0 3.0 5.0 2.0 2.0 4.0 2.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
act1_271874 19199.0 2.0 5.0 3.0 2.0 6.0 1.0 1.0 6.0 8.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 51 columns


In [33]:
lambda_len=lambda x:len(x.unique())
lambda_data=lambda x:str(x.unique()) if(len(x.unique())<=3) else str(x.unique()[:3])+'...'
train_results={}
test_results={}
types=['type %d'%i for i in range(1,8)]
for _type in types:
    train_results[_type[-1]]=pd.DataFrame({'len':train_datas[_type].apply(lambda_len),
                        'data':train_datas[_type].apply(lambda_data)},
                        index=train_datas[_type].columns) 
    test_results[_type[-1]]=pd.DataFrame({'len':test_datas[_type].apply(lambda_len),
                        'data':train_datas[_type].apply(lambda_data)},
                        index=test_datas[_type].columns) 

train_12=train_results['1'].merge(train_results['2'],how='outer',left_index=True,right_index=True,suffixes=('_ta_1', '_ta_2')) 
train_34=train_results['3'].merge(train_results['4'],how='outer',left_index=True,right_index=True,suffixes=('_ta_3', '_ta_4')) 
train_56=train_results['5'].merge(train_results['6'],how='outer',left_index=True,right_index=True,suffixes=('_ta_5', '_ta_6')) 
train_test_77=train_results['7'].merge(test_results['7'],how='outer',left_index=True,right_index=True,suffixes=('_ta_7', '_tt_7')) 
test_12=test_results['1'].merge(test_results['2'],how='outer',left_index=True,right_index=True,suffixes=('_tt_1', '_tt_2')) 
test_34=test_results['3'].merge(test_results['4'],how='outer',left_index=True,right_index=True,suffixes=('_tt_3', '_tt_4')) 
test_56=test_results['5'].merge(test_results['6'],how='outer',left_index=True,right_index=True,suffixes=('_tt_5', '_tt_6')) 

train_12.merge(train_34,how='outer',left_index=True,right_index=True)\
    .merge(train_56,how='outer',left_index=True,right_index=True)  \
    .merge(train_test_77,how='outer',left_index=True,right_index=True)\
    .merge(test_12,how='outer',left_index=True,right_index=True) \
    .merge(test_34,how='outer',left_index=True,right_index=True) \
    .merge(test_56,how='outer',left_index=True,right_index=True)


Out[33]:
data_ta_1 len_ta_1 data_ta_2 len_ta_2 data_ta_3 len_ta_3 data_ta_4 len_ta_4 data_ta_5 len_ta_5 ... data_tt_2 len_tt_2 data_tt_3 len_tt_3 data_tt_4 len_tt_4 data_tt_5 len_tt_5 data_tt_6 len_tt_6
char_10_act NaN NaN [ 1.] 1.0 [ 114. 2. 23.]... 450.0 [ 76. 1727. 894.]... 3315.0 [ 5493. 489. 584.]... 2747.0 ... [ 1.] 1.0 [ 114. 2. 23.]... 282.0 [ 76. 1727. 894.]... 2267.0 [ 5493. 489. 584.]... 1409.0 [ 110.] 1.0
char_10_people [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 ... [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0
char_11 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 ... [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0
char_12 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 ... [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0
char_13 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 ... [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0
char_14 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 ... [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0
char_15 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 ... [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0
char_16 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 ... [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0
char_17 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 ... [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0
char_18 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 ... [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0
char_19 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 ... [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0
char_1_act [ 3. 36. 24.]... 51.0 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
char_1_people [ 2. 1.] 2.0 [ 2. 1.] 2.0 [ 2. 1.] 2.0 [ 2. 1.] 2.0 [ 2. 1.] 2.0 ... [ 2. 1.] 2.0 [ 2. 1.] 2.0 [ 2. 1.] 2.0 [ 2. 1.] 2.0 [ 2. 1.] 2.0
char_20 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 ... [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0
char_21 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 ... [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0
char_22 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 ... [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0
char_23 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 ... [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0
char_24 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 ... [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0
char_25 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 ... [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0
char_26 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 ... [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0
char_27 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 ... [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0
char_28 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 ... [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0
char_29 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 ... [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0
char_2_act [ 5. 11. 6.]... 32.0 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
char_2_people [ 3. 2. 1.] 3.0 [ 2. 3. 1.] 3.0 [ 3. 2. 1.] 3.0 [ 2. 3. 1.] 3.0 [ 3. 2. 1.] 3.0 ... [ 2. 3. 1.] 3.0 [ 3. 2. 1.] 3.0 [ 2. 3. 1.] 3.0 [ 3. 2. 1.] 3.0 [ 3. 2. 1.] 3.0
char_30 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0 ... [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0
char_31 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 ... [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0
char_32 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 ... [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0
char_33 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 ... [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0
char_34 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 ... [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0
char_35 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0 ... [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0
char_36 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 ... [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0
char_37 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 ... [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0
char_38 [ 76. 0. 100.]... 101.0 [ 36. 76. 99.]... 101.0 [ 84. 76. 0.]... 101.0 [ 36. 99. 84.]... 101.0 [ 91. 76. 68.]... 101.0 ... [ 36. 76. 99.]... 101.0 [ 84. 76. 0.]... 101.0 [ 36. 99. 84.]... 101.0 [ 91. 76. 68.]... 101.0 [ 93. 89. 0.]... 95.0
char_3_act [ 1. 5. 6.]... 11.0 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
char_3_people [ 14. 10. 4.]... 42.0 [ 5. 28. 4.]... 43.0 [ 40. 14. 4.]... 42.0 [ 5. 4. 40.]... 42.0 [ 4. 14. 5.]... 41.0 ... [ 5. 28. 4.]... 41.0 [ 40. 14. 4.]... 40.0 [ 5. 4. 40.]... 40.0 [ 4. 14. 5.]... 39.0 [ 7. 15. 9.]... 29.0
char_4_act [ 1. 3. 2.]... 7.0 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
char_4_people [ 6. 7. 10.]... 25.0 [ 5. 9. 8.]... 25.0 [ 25. 6. 7.]... 25.0 [ 5. 8. 25.]... 25.0 [ 8. 6. 5.]... 25.0 ... [ 5. 9. 8.]... 25.0 [ 25. 6. 7.]... 25.0 [ 5. 8. 25.]... 25.0 [ 8. 6. 5.]... 25.0 [ 2. 3. 6.]... 23.0
char_5_act [ 6. 1. 5.]... 7.0 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
char_5_people [ 8. 6. 4.]... 9.0 [ 5. 8. 9.]... 9.0 [ 9. 8. 4.]... 9.0 [ 5. 9. 8.]... 9.0 [ 4. 8. 5.]... 9.0 ... [ 5. 8. 9.]... 9.0 [ 9. 8. 4.]... 9.0 [ 5. 9. 8.]... 9.0 [ 4. 8. 5.]... 9.0 [ 7. 6. 5.]... 9.0
char_6_act [ 3. 1. 2.]... 5.0 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
char_6_people [ 3. 1. 2.]... 7.0 [ 3. 2. 4.]... 7.0 [ 3. 4. 1.]... 7.0 [ 3. 2. 1.]... 7.0 [ 1. 3. 2.]... 7.0 ... [ 3. 2. 4.]... 7.0 [ 3. 4. 1.]... 7.0 [ 3. 2. 1.]... 7.0 [ 1. 3. 2.]... 7.0 [ 2. 1. 4.]... 7.0
char_7_act [ 3. 1. 4.]... 8.0 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
char_7_people [ 9. 23. 6.]... 25.0 [ 11. 5. 9.]... 25.0 [ 8. 9. 20.]... 25.0 [ 11. 5. 8.]... 25.0 [ 7. 9. 23.]... 25.0 ... [ 11. 5. 9.]... 25.0 [ 8. 9. 20.]... 25.0 [ 11. 5. 8.]... 25.0 [ 7. 9. 23.]... 25.0 [ 6. 4. 19.]... 25.0
char_8_act [ 6. 4. 5.]... 18.0 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
char_8_people [ 6. 3. 2.]... 8.0 [ 2. 6. 3.]... 8.0 [ 2. 3. 6.]... 8.0 [ 2. 6. 3.]... 8.0 [ 2. 6. 3.]... 8.0 ... [ 2. 6. 3.]... 8.0 [ 2. 3. 6.]... 8.0 [ 2. 6. 3.]... 8.0 [ 2. 6. 3.]... 8.0 [ 4. 2. 3.]... 8.0
char_9_act [ 8. 1. 2.]... 19.0 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
char_9_people [ 6. 3. 2.]... 9.0 [ 2. 4. 6.]... 9.0 [ 2. 3. 6.]... 9.0 [ 2. 6. 3.]... 9.0 [ 3. 6. 5.]... 9.0 ... [ 2. 4. 6.]... 9.0 [ 2. 3. 6.]... 9.0 [ 2. 6. 3.]... 9.0 [ 3. 6. 5.]... 9.0 [ 4. 9. 3.]... 9.0
date_act [ 19321. 19199. 19523.]... 411.0 [ 19262. 19573. 19595.]... 386.0 [ 19200. 19442. 19230.]... 386.0 [ 19595. 19573. 19214.]... 386.0 [ 19383. 19382. 19251.]... 258.0 ... [ 19262. 19573. 19595.]... 387.0 [ 19200. 19442. 19230.]... 387.0 [ 19595. 19573. 19214.]... 386.0 [ 19383. 19382. 19251.]... 250.0 [ 19416. 19413. 19314.]... 278.0
date_people [ 19230. 19199. 19014.]... 1189.0 [ 18807. 18633. 19153.]... 1195.0 [ 19200. 19442. 19230.]... 1191.0 [ 18807. 19153. 19200.]... 1189.0 [ 19381. 19230. 19215.]... 986.0 ... [ 18807. 18633. 19153.]... 1165.0 [ 19200. 19442. 19230.]... 1106.0 [ 18807. 19153. 19200.]... 1116.0 [ 19381. 19230. 19215.]... 763.0 [ 19415. 19287. 19313.]... 441.0
group_1 [ 36096. 17304. 9439.]... 17008.0 [ 17304. 8688. 33592.]... 23030.0 [ 6534. 45749. 36096.]... 18955.0 [ 17304. 33592. 6534.]... 15958.0 [ 4204. 36096. 19662.]... 9231.0 ... [ 17304. 8688. 33592.]... 8532.0 [ 6534. 45749. 36096.]... 7216.0 [ 17304. 33592. 6534.]... 5592.0 [ 4204. 36096. 19662.]... 3370.0 [ 18966. 38766. 17304.]... 714.0
outcome [ 0. 1.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 [ 0. 1.] 2.0 [ 1. 0.] 2.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

52 rows × 28 columns


In [75]:
from scipy.sparse import hstack,csr_matrix
from sklearn.preprocessing  import OneHotEncoder
def onehot_encode(train_datas,test_datas): 

    train_results={}
    test_results={}
    types=['type %d'%i for i in range(1,8)]
    for _type in types:
        if _type=='type 1':
            one_hot_cols=['char_%d_act'%i for i in range(1,10)]+\
            ['char_%d_people'%i for i in range(1,10)]
            train_end_cols=['group_1','date_act','date_people','char_38','outcome']
            test_end_cols=['group_1','date_act','date_people','char_38']
        else:
            one_hot_cols=['char_%d_people'%i for i in range(1,10)]
            train_end_cols=['group_1','char_10_act','date_act','date_people','char_38','outcome']
            test_end_cols=['group_1','char_10_act','date_act','date_people','char_38']
        
        train_front_array=train_datas[_type][one_hot_cols].values #头部数组
        train_end_array=train_datas[_type][train_end_cols].values#末尾数组
        train_middle_array=train_datas[_type].drop(train_end_cols+one_hot_cols,axis=1,inplace=False).values#中间数组
        
        test_front_array=test_datas[_type][one_hot_cols].values #头部数组
        test_end_array=test_datas[_type][test_end_cols].values#末尾数组
        test_middle_array=test_datas[_type].drop(test_end_cols+one_hot_cols,axis=1,inplace=False).values#中间数组

        encoder=OneHotEncoder(categorical_features='all',sparse=True) # 一个稀疏矩阵,类型为 csr_matrix
        train_result=hstack([encoder.fit_transform(train_front_array),csr_matrix(train_middle_array),csr_matrix(train_end_array)])
        test_result=hstack([encoder.transform(test_front_array),csr_matrix(test_middle_array),csr_matrix(test_end_array)])
        train_results[_type]=train_result
        test_results[_type]=test_result
    return train_results,test_results

In [91]:
types=['type %d'%i for i in range(1,8)]

print('before encode:\n')
for _type in types:
    print('train(type=%s):shape='%_type,train_datas[_type].shape)
    print('test(type=%s):shape='%_type,test_datas[_type].shape)
print('==============\n\n')    
train_results,test_results=onehot_encode(train_datas,test_datas)
print('after encode:\n')
for _type in types:
    print('train(type=%s):shape='%_type,train_results[_type].shape)
    print('test(type=%s):shape='%_type,test_results[_type].shape)
print('==============\n\n')


before encode:

train(type=type 1):shape= (157615, 51)
test(type=type 1):shape= (40092, 50)
train(type=type 2):shape= (904683, 43)
test(type=type 2):shape= (223164, 42)
train(type=type 3):shape= (429408, 43)
test(type=type 3):shape= (59931, 42)
train(type=type 4):shape= (207465, 43)
test(type=type 4):shape= (50215, 42)
train(type=type 5):shape= (490710, 43)
test(type=type 5):shape= (123463, 42)
train(type=type 6):shape= (4253, 43)
test(type=type 6):shape= (1051, 42)
train(type=type 7):shape= (3157, 43)
test(type=type 7):shape= (771, 42)
==============


after encode:

train(type=type 1):shape= (157615, 321)
test(type=type 1):shape= (40092, 320)
train(type=type 2):shape= (904683, 165)
test(type=type 2):shape= (223164, 164)
train(type=type 3):shape= (429408, 164)
test(type=type 3):shape= (59931, 163)
train(type=type 4):shape= (207465, 164)
test(type=type 4):shape= (50215, 163)
train(type=type 5):shape= (490710, 163)
test(type=type 5):shape= (123463, 162)
train(type=type 6):shape= (4253, 155)
test(type=type 6):shape= (1051, 154)
train(type=type 7):shape= (3157, 161)
test(type=type 7):shape= (771, 160)
==============



In [89]:
from sklearn.preprocessing  import MaxAbsScaler
def scale(train_datas,test_datas): 
    train_results={}
    test_results={}
    types=['type %d'%i for i in range(1,8)]
    
    for _type in types:
        if _type=='type 1':
            train_last_index=5#最后5列为 group_1/date_act/date_people/char_38/outcome
            test_last_index=4#最后4列为 group_1/date_act/date_people/char_38 
        else:
            train_last_index=6#最后6列为 group_1/char_10_act/date_act/date_people/char_38/outcome
            test_last_index=5#最后5列为 group_1/char_10_act/date_act/date_people/char_38 
        
        scaler=MaxAbsScaler()
        train_array=train_datas[_type].toarray()        
        train_front=train_array[:,:-train_last_index]
        train_mid=scaler.fit_transform(train_array[:,-train_last_index:-1])#outcome 不需要归一化
        train_end=train_array[:,-1].reshape((-1,1)) #outcome
        train_results[_type]=np.hstack((train_front,train_mid,train_end))
        
        test_array=test_datas[_type].toarray()
        test_front=test_array[:,:-test_last_index]
        test_end=scaler.transform(test_array[:,-test_last_index:])
        test_results[_type]=np.hstack((test_front,test_end))

    return train_results,test_results

In [94]:
ta_results,tt_results=scale(train_results,test_results)
types=['type %d'%i for i in range(1,8)]
for _type in types:
    print("Train(type=%s):"%_type,np.unique(ta_results[_type].max(axis=1)),np.unique(ta_results[_type].min(axis=1)))
    print("Test(type=%s):"%_type,np.unique(tt_results[_type].max(axis=1)),np.unique(tt_results[_type].min(axis=1)))


Train(type=type 1): [ 1.] [ 0.]
Test(type=type 1): [ 1.] [ 0.]
Train(type=type 2): [ 1.] [ 0.]
Test(type=type 2): [ 1.] [ 0.]
Train(type=type 3): [ 1.] [ 0.]
Test(type=type 3): [ 1.] [ 0.]
Train(type=type 4): [ 1.] [ 0.]
Test(type=type 4): [ 1.] [ 0.]
Train(type=type 5): [ 1.] [ 0.]
Test(type=type 5): [ 1.] [ 0.]
Train(type=type 6): [ 1.] [ 0.]
Test(type=type 6): [ 1.] [ 0.]
Train(type=type 7): [ 1.] [ 0.]
Test(type=type 7): [ 1.] [ 0.]