In [2]:
import numpy as np
import pandas as pd
import gc

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from multiprocessing import Pool

import warnings
warnings.filterwarnings('ignore')

In [3]:
from tqdm import trange

In [4]:
# Read train and test files
# NOTE(review): only `train` is actually loaded. The `test` read below is
# commented out, yet several later cells index `test`; on a fresh kernel those
# cells raise NameError unless this line is re-enabled.
train = pd.read_csv('../input/train.csv')
# test = pd.read_csv('../input/test.csv')

In [5]:
# Hard-coded list of integers (presumably row labels of `train` -- TODO confirm).
# No other cell visible in this notebook reads `rows_2`.
rows_2 = [  4140, 2168, 1268, 4030,  2090, 2140, 1634,  606, 432, 1414, 3005,
            3917, 2630, 3145, 2813, 1441,  836,  886, 1351, 2696, 3785, 3514,
            3761, 1642, 1032, 2163, 1266,  928, 3471,  762, 2934, 2896, 605,
            1012, 2106, 1584, 3434, 3807, 2461, 1088, 3766, 3773, 4144, 3749,
            1761, 4092, 3138, 2632, 3926,  682,  480, 4292,  133, 1464, 1619,
            202,   618,   83, 3693, 3794, 1297, 3121, 4208, 2731, 1289, 1682,
            2676,  444, 1005, 2449, 1293, 2993, 2038, 2322, 1972, 4332, 933, 
            1168, 3232,  587, 2583, 3925, 2539, 1068,  687,  496,  158, 3856,
            3592, 1197, 3765,  225, 3678, 3745, 3522,  861, 3507, 3498, 3485,
            880,  4357, 3078, 3487, 4398, 4095, 3028,  473,  843, 3492, 4090,
            857,   467, 3489, 3064,  583,  899, 2997, 3544, 2872, 2885,  407,
            2894,  412, 2908,  973,  417,  966, 2924, 2925,  961, 2933, 4098,
             423,  433,  947, 3520,  945, 2944, 2955,  437,  921, 2975, 3516,
            2986,  911, 3512, 3511, 3526, 4396,  802,  816,  619, 4059, 3303,
            3302,  630,  634, 3406, 3279,  547, 3276,  649,  650, 3250, 3248,
            3277, 3244, 3399, 3316,  576, 3361,  585,  590,  564,  593,  615,
             561, 4384, 3393,  553, 3333,  609,  610, 3334, 3093, 3237, 3412,
             780,  784, 3472,  786, 3119, 3117, 3464,  797,  492, 4360, 4386,
             807]

In [6]:
# Show the 'f190486d6' entries of `train` that are exactly 178000.
train.loc[train['f190486d6'].eq(178000), 'f190486d6']


Out[6]:
Series([], Name: f190486d6, dtype: float64)

In [7]:
# Ordered row labels of `train`, used a few cells below to display `target`
# next to the first columns of colgroups[0]. In that rendered output each
# row's target reappears in column 'f190486d6' two rows further down.
frows = [1757,3809,511,3798,625,3303,4095,1283,4209,1696,3511,
         816,245,1383,2071,3492,378,2971,2366,4414,2790,3979,193,
         1189,3516,810,4443,3697,235,1382,4384,3418,4396,921,3176,650,
        ]

In [9]:
# Hard-coded list of integers (presumably column positions -- TODO confirm).
# No other visible cell reads `cols_sp`. The commented-out lines are the
# remains of an earlier variant that wrapped the list in np.flip(..., 0).
# cols_sp = np.flip([
cols_sp     =    [4003,  811, 4500, 2303, 4368, 2144, 2854, 3974,
                  2999, 4884,  148, 3917, 4540, 2686, 2520, 3576,
                  2743, 1555,        135, 2565, 4347,  641, 3374,
                  187, 2235,]
#     0)

In [70]:
# Column-name groups: one inner list per group, 16 groups in total.
# colgroups[0] holds 40 names; later code flattens the array and reshapes each
# row to (n_groups, 40), so every group is expected to hold 40 names.
# Within a group the order matters: later cells compare a row's leading columns
# with another row's columns shifted by a lag.
colgroups = np.array([
    ['f190486d6', '58e2e02e6', 'eeb9cd3aa', '9fd594eec', '6eef030c1', '15ace8c9f', 'fb0f5dbfe', '58e056e12', '20aa07010', '024c577b9', 'd6bb78916', 'b43a7cfd5', '58232a6fb', '1702b5bf0', '324921c7b', '62e59a501', '2ec5b290f', '241f0f867', 'fb49e4212', '66ace2992', 'f74e8f13d', '5c6487af1', '963a49cdc', '26fc93eb7', '1931ccfdd', '703885424', '70feb1494', '491b9ee45', '23310aa6f', 'e176a204a', '6619d81fc', '1db387535', 'fc99f9426', '91f701ba2', '0572565c2', '190db8488', 'adb64ff71', 'c47340d97', 'c5a231d81', '0ff32eb98'],
    ['266525925', '4b6dfc880', '2cff4bf0c', 'a3382e205', '6488c8200', '547d3135b', 'b46191036', '453128993', '2599a7eb7', '2fc60d4d9', '009319104', 'de14e7687', 'aa31dd768', '2b54cddfd', 'a67d02050', '37aab1168', '939cc02f5', '31f72667c', '6f951302c', '54723be01', '4681de4fd', '8bd53906a', '435f27009', 'f82167572', 'd428161d9', '9015ac21d', 'ec4dc7883', '22c7b00ef', 'd4cc42c3d', '1351bf96e', '1e8801477', 'b7d59d3b5', 'a459b5f7d', '580f5ff06', '39b3c553a', '1eec37deb', '692c44993', 'ce8ce671e', '88ef1d9a8', 'bf042d928'],
    ['9d5c7cb94', '197cb48af', 'ea4887e6b', 'e1d0e11b5', 'ac30af84a', 'ba4ceabc5', 'd4c1de0e2', '6d2ece683', '9c42bff81', 'cf488d633', '0e1f6696a', 'c8fdf5cbf', 'f14b57b8f', '3a62b36bd', 'aeff360c7', '64534cc93', 'e4159c59e', '429687d5a', 'c671db79e', 'd79736965', '2570e2ba9', '415094079', 'ddea5dc65', 'e43343256', '578eda8e0', 'f9847e9fe', '097c7841e', '018ab6a80', '95aea9233', '7121c40ee', '578b81a77', '96b6bd42b', '44cb9b7c4', '6192f193d', 'ba136ae3f', '8479174c2', '64dd02e44', '4ecc3f505', 'acc4a8e68', '994b946ad'],
    ['f1eeb56ae', '62ffce458', '497adaff8', 'ed1d5d137', 'faf7285a1', 'd83da5921', '0231f07ed', '7950f4c11', '051410e3d', '39e1796ab', '2e0148f29', '312832f30', '6f113540d', 'f3ee6ba3c', 'd9fc63fa1', '6a0b386ac', '5747a79a9', '64bf3a12a', 'c110ee2b7', '1bf37b3e2', 'fdd07cac1', '0872fe14d', 'ddef5ad30', '42088cf50', '3519bf4a4', 'a79b1f060', '97cc1b416', 'b2790ef54', '1a7de209c', '2a71f4027', 'f118f693a', '15e8a9331', '0c545307d', '363713112', '73e591019', '21af91e9b', '62a915028', '2ab5a56f5', 'a8ee55662', '316b978cd'],
    ['b26d16167', '930f989bf', 'ca58e6370', 'aebe1ea16', '03c589fd7', '600ea672f', '9509f66b0', '70f4f1129', 'b0095ae64', '1c62e29a7', '32a0342e2', '2fc5bfa65', '09c81e679', '49e68fdb9', '026ca57fd', 'aacffd2f4', '61483a9da', '227ff4085', '29725e10e', '5878b703c', '50a0d7f71', '0d1af7370', '7c1af7bbb', '4bf056f35', '3dd64f4c4', 'b9f75e4aa', '423058dba', '150dc0956', 'adf119b9a', 'a8110109e', '6c4f594e0', 'c44348d76', 'db027dbaf', '1fcba48d0', '8d12d44e1', '8d13d891d', '6ff9b1760', '482715cbd', 'f81c2f1dd', 'dda820122'],
    ['c928b4b74', '8e4d0fe45', '6c0e0801a', '02861e414', 'aac52d8d9', '041c5d0c9', 'd7875bb6c', 'e7c0cfd0f', 'd48c08bda', '0c9462c08', '57dd44c29', 'a93118262', '850027e38', 'db3839ab0', '27461b158', '32174174c', '9306da53f', '95742c2bf', '5831f4c76', '1e6306c7c', '06393096a', '13bdd610a', 'd7d314edc', '9a07d7b1f', '4d2671746', '822e49b95', '3c8a3ced0', '83635fb67', '1857fbccf', 'c4972742d', 'b6c0969a2', 'e78e3031b', '36a9a8479', 'e79e5f72c', '092271eb3', '74d7f2dc3', '277ef93fc', 'b30e932ba', '8f57141ec', '350473311'],
    ['06148867b', '4ec3bfda8', 'a9ca6c2f4', 'bb0408d98', '1010d7174', 'f8a437c00', '74a7b9e4a', 'cfd55f2b6', '632fed345', '518b5da24', '60a5b79e4', '3fa0b1c53', 'e769ee40d', '9f5f58e61', '83e3e2e60', '77fa93749', '3c9db4778', '42ed6824a', '761b8e0ec', 'ee7fb1067', '71f5ab59f', '177993dc6', '07df9f30c', 'b1c5346c4', '9a5cd5171', 'b5df42e10', 'c91a4f722', 'd93058147', '20a325694', 'f5e0f4a16', '5edd220bc', 'c901e7df1', 'b02dfb243', 'bca395b73', '1791b43b0', 'f04f0582d', 'e585cbf20', '03055cc36', 'd7f15a3ad', 'ccd9fc164'],
    ['df838756c', '2cb73ede7', '4dcf81d65', '61c1b7eb6', 'a9f61cf27', '1af4d24fa', 'e13b0c0aa', 'b9ba17eb6', '796c218e8', '37f57824c', 'd1e0f571b', 'f9e3b03b7', 'a3ef69ad5', 'e16a20511', '04b88be38', '99e779ee0', '9f7b782ac', '1dd7bca9f', '2eeadde2b', '6df033973', 'cdfc2b069', '031490e77', '5324862e4', '467bee277', 'a3fb07bfd', '64c6eb1cb', '8618bc1fd', '6b795a2bc', '956d228b9', '949ed0965', 'a4511cb0b', 'b64425521', '2e3c96323', '191e21b5f', 'bee629024', '1977eaf08', '5e645a169', '1d04efde3', '8675bec0b', '8337d1adc'],
    ['a1cd7b681', '9b490abb3', 'b10f15193', '05f54f417', 'a7ac690a8', 'ed6c300c2', 'd0803e3a1', 'b1bb8eac3', 'bd1c19973', 'a34f8d443', '84ec1e3db', '24018f832', '82e01a220', '4c2064b00', '0397f7c9b', 'ba42e41fa', '22d7ad48d', '9abffd22c', 'dbfa2b77f', '2c6c62b54', '9fa38def3', 'ecb354edf', '9c3154ae6', '2f26d70f4', '53102b93f', 'a36b95f78', '1fa0f78d0', '19915a6d3', 'c944a48b5', '482b04cba', '2ce77a58f', '86558e595', 'c3f400e36', '20305585c', 'f8ccfa064', 'dd771cb8e', '9aa27017e', 'cd7f0affd', '236cc1ff5', 'a3fc511cd'],
    ['920a04ee2', '93efdb50f', '15ea45005', '78c57d7cd', '91570fb11', 'c5dacc85b', '145c7b018', '590b24ab1', 'c283d4609', 'e8bd579ae', '7298ca1ef', 'ce53d1a35', 'a8f80f111', '2a9fed806', 'feb40ad9f', 'cfd255ee3', '31015eaab', '303572ae2', 'cd15bb515', 'cb5161856', 'a65b73c87', '71d64e3f7', 'ec5fb550f', '4af2493b6', '18b4fa3f5', '3d655b0ed', '5cc9b6615', '88c0ec0a6', '8722f33bb', '5ed0c24d0', '54f26ee08', '04ecdcbb3', 'ade8a5a19', 'd5efae759', 'ac7a97382', 'e1b20c3a6', 'b0fcfeab8', '438b8b599', '43782ef36', 'df69cf626'],
    ['50603ae3d', '48282f315', '090dfb7e2', '6ccaaf2d7', '1bf2dfd4a', '50b1dd40f', '1604c0735', 'e94c03517', 'f9378f7ef', '65266ad22', 'ac61229b6', 'f5723deba', '1ced7f0b4', 'b9a4f06cd', '8132d18b8', 'df28ac53d', 'ae825156f', '936dc3bc4', '5b233cf72', '95a2e29fc', '882a3da34', '2cb4d123e', '0e1921717', 'c83d6b24d', '90a2428a5', '67e6c62b9', '320931ca8', '900045349', 'bf89fac56', 'da3b0b5bb', 'f06078487', '56896bb36', 'a79522786', '71c2f04c9', '1af96abeb', '4b1a994cc', 'dee843499', '645b47cde', 'a8e15505d', 'cc9c2fc87'],
    ['b6daeae32', '3bdee45be', '3d6d38290', '5a1589f1a', '961b91fe7', '29c059dd2', 'cfc1ce276', '0a953f97e', '30b3daec2', 'fb5f5836e', 'c7525612c', '6fa35fbba', '72d34a148', 'dcc269cfe', 'bdf773176', '469630e5c', '23db7d793', 'dc10234ae', '5ac278422', '6cf7866c1', 'a39758dae', '45f6d00da', '251d1aa17', '84d9d1228', 'b98f3e0d7', '66146c12d', 'd6470c4ce', '3f4a39818', 'f16a196c6', 'b8f892930', '6f88afe65', 'ed8951a75', '371da7669', '4b9540ab3', '230a025ca', 'f8cd9ae02', 'de4e75360', '540cc3cd1', '7623d805a', 'c2dae3a5a'],
    ['d0d340214', '34d3715d5', '9c404d218', 'c624e6627', 'a1b169a3a', 'c144a70b1', 'b36a21d49', 'dfcf7c0fa', 'c63b4a070', '43ebb15de', '1f2a670dd', '3f07a4581', '0b1560062', 'e9f588de5', '65d14abf0', '9ed0e6ddb', '0b790ba3a', '9e89978e3', 'ee6264d2b', 'c86c0565e', '4de164057', '87ba924b1', '4d05e2995', '2c0babb55', 'e9375ad86', '8988e8da5', '8a1b76aaf', '724b993fd', '654dd8a3b', 'f423cf205', '3b54cc2cf', 'e04141e42', 'cacc1edae', '314396b31', '2c339d4f2', '3f8614071', '16d1d6204', '80b6e9a8b', 'a84cbdab5', '1a6d13c4a'],
    ['a9819bda9', 'ea26c7fe6', '3a89d003b', '1029d9146', '759c9e85d', '1f71b76c1', '854e37761', '56cb93fd8', '946d16369', '33e4f9a0e', '5a6a1ec1a', '4c835bd02', 'b3abb64d2', 'fe0dd1a15', 'de63b3487', 'c059f2574', 'e36687647', 'd58172aef', 'd746efbfe', 'ccf6632e6', 'f1c272f04', 'da7f4b066', '3a7771f56', '5807de036', 'b22eb2036', 'b77c707ef', 'e4e9c8cc6', 'ff3b49c1d', '800f38b6b', '9a1d8054b', '0c9b00a91', 'fe28836c3', '1f8415d03', '6a542a40a', 'd53d64307', 'e700276a2', 'bb6f50464', '988518e2d', 'f0eb7b98f', 'd7447b2c5'],
    ['87ffda550', '63c094ba4', '2e103d632', '1c71183bb', 'd5fa73ead', 'e078302ef', 'a6b6bc34a', 'f6eba969e', '0d51722ca', 'ce3d7595b', '6c5c8869c', 'dfd179071', '122c135ed', 'b4cfe861f', 'b7c931383', '44d5b820f', '4bcf15776', '51d4053c7', '1fe5d56b9', 'ea772e115', 'ad009c8b9', '68a945b18', '62fb56487', 'c10f31664', 'cbb673163', 'c8d582dd2', '8781e4b91', 'bd6da0cca', 'ca2b906e8', '11e12dbe8', 'bb0ce54e9', 'c0d2348b7', '77deffdf0', 'f97d9431e', 'a09a238d0', '935ca66a9', '9de83dc23', '861076e21', 'f02ecb19c', '166008929'],
    ['f3cf9341c', 'fa11da6df', 'd47c58fe2', '0d5215715', '555f18bd3', '134ac90df', '716e7d74d', 'c00611668', '1bf8c2597', '1f6b2bafa', '174edf08a', 'f1851d155', '5bc7ab64f', 'a61aa00b0', 'b2e82c050', '26417dec4', '53a550111', '51707c671', 'e8d9394a0', 'cbbc9c431', '6b119d8ce', 'f296082ec', 'be2e15279', '698d05d29', '38e6f8d32', '93ca30057', '7af000ac2', '1fd0a1f2a', '41bc25fef', '0df1d7b9a', '88d29cfaf', '2b2b5187e', 'bf59c51c3', 'cfe749e26', 'ad207f7bb', '11114a47a', '341daa7d1', 'a8dd5cea5', '7b672b310', 'b88e5de84'],
])

In [11]:
# Display `target` followed by the first nine columns of the first group for
# the hand-picked rows in `frows` (ten columns in total).
train.loc[frows, ['target'] + list(colgroups[0][:9])]


Out[11]:
target f190486d6 58e2e02e6 eeb9cd3aa 9fd594eec 6eef030c1 15ace8c9f fb0f5dbfe 58e056e12 20aa07010
1757 115636.36 1015000.00 1563411.76 1563411.76 1563411.76 1563411.76 1563411.76 1563411.76 1563411.76 1563411.76
3809 834800.00 540000.00 1015000.00 1563411.76 1563411.76 1563411.76 1563411.76 1563411.76 1563411.76 1563411.76
511 296444.44 115636.36 540000.00 1015000.00 1563411.76 1563411.76 1563411.76 1563411.76 1563411.76 1563411.76
3798 247166.66 834800.00 115636.36 540000.00 1015000.00 1563411.76 1563411.76 1563411.76 1563411.76 1563411.76
625 550000.00 296444.44 834800.00 115636.36 540000.00 1015000.00 1563411.76 1563411.76 1563411.76 1563411.76
3303 3076666.66 247166.66 296444.44 834800.00 115636.36 540000.00 1015000.00 1563411.76 1563411.76 1563411.76
4095 440000.00 550000.00 247166.66 296444.44 834800.00 115636.36 540000.00 1015000.00 1563411.76 1563411.76
1283 1600000.00 440000.00 3076666.66 550000.00 247166.66 296444.44 834800.00 115636.36 540000.00 1015000.00
4209 466461.54 0.00 440000.00 3076666.66 550000.00 247166.66 296444.44 834800.00 115636.36 540000.00
1696 3147200.00 1600000.00 0.00 440000.00 3076666.66 550000.00 247166.66 296444.44 834800.00 115636.36
3511 75000.00 466461.54 1600000.00 0.00 440000.00 3076666.66 550000.00 247166.66 296444.44 834800.00
816 1586888.88 3147200.00 466461.54 1600000.00 0.00 440000.00 3076666.66 550000.00 247166.66 296444.44
245 1477600.00 75000.00 3147200.00 466461.54 1600000.00 0.00 440000.00 3076666.66 550000.00 247166.66
1383 310000.00 1477600.00 1586888.88 75000.00 3147200.00 466461.54 1600000.00 0.00 440000.00 3076666.66
2071 3513333.34 0.00 1477600.00 1586888.88 75000.00 3147200.00 466461.54 1600000.00 0.00 440000.00
3492 160000.00 310000.00 0.00 1477600.00 1586888.88 75000.00 3147200.00 466461.54 1600000.00 0.00
378 2352551.72 3513333.34 310000.00 0.00 1477600.00 1586888.88 75000.00 3147200.00 466461.54 1600000.00
2971 280000.00 160000.00 3513333.34 310000.00 0.00 1477600.00 1586888.88 75000.00 3147200.00 466461.54
2366 5450500.00 2352551.72 160000.00 3513333.34 310000.00 0.00 1477600.00 1586888.88 75000.00 3147200.00
4414 1359000.00 280000.00 2352551.72 160000.00 3513333.34 310000.00 0.00 1477600.00 1586888.88 75000.00
2790 60000.00 5450500.00 280000.00 2352551.72 160000.00 3513333.34 310000.00 0.00 1477600.00 1586888.88
3979 12000000.00 1359000.00 5450500.00 280000.00 2352551.72 160000.00 3513333.34 310000.00 0.00 1477600.00
193 500000.00 60000.00 1359000.00 5450500.00 280000.00 2352551.72 160000.00 3513333.34 310000.00 0.00
1189 1878571.42 12000000.00 60000.00 1359000.00 5450500.00 280000.00 2352551.72 160000.00 3513333.34 310000.00
3516 814800.00 500000.00 12000000.00 60000.00 1359000.00 5450500.00 280000.00 2352551.72 160000.00 3513333.34
810 307000.00 1878571.42 500000.00 12000000.00 60000.00 1359000.00 5450500.00 280000.00 2352551.72 160000.00
4443 528666.66 814800.00 1878571.42 500000.00 12000000.00 60000.00 1359000.00 5450500.00 280000.00 2352551.72
3697 609200.00 307000.00 814800.00 1878571.42 500000.00 12000000.00 60000.00 1359000.00 5450500.00 280000.00
235 406000.00 528666.66 307000.00 814800.00 1878571.42 500000.00 12000000.00 60000.00 1359000.00 5450500.00
1382 448666.66 609200.00 528666.66 307000.00 814800.00 1878571.42 500000.00 12000000.00 60000.00 1359000.00
4384 400000.00 448666.66 406000.00 609200.00 528666.66 307000.00 814800.00 1878571.42 500000.00 12000000.00
3418 60666.66 0.00 448666.66 406000.00 609200.00 528666.66 307000.00 814800.00 1878571.42 500000.00
4396 267428.58 400000.00 0.00 448666.66 406000.00 609200.00 528666.66 307000.00 814800.00 1878571.42
921 1304800.00 60666.66 400000.00 0.00 448666.66 406000.00 609200.00 528666.66 307000.00 814800.00
3176 8120000.00 267428.58 60666.66 400000.00 0.00 448666.66 406000.00 609200.00 528666.66 307000.00
650 380000.00 1304800.00 267428.58 60666.66 400000.00 0.00 448666.66 406000.00 609200.00 528666.66

In [590]:
# groups = pd.read_csv('columns_grouped.csv')
# groups.groupby(by='position').count()

In [596]:
# Load 'lags.csv' from the working directory. The next cell drops its first
# column and treats the remaining column labels as an ordered list of names.
groups = pd.read_csv('lags.csv')

In [749]:
# All column labels of `groups` except the first, as a NumPy array.
# A later cell uses this array to select columns of `test`.
gr_list = groups.columns[1:].values

In [750]:
# Sanity check: the recorded output is (40,), matching the hard-coded width of
# 40 in the reshape a few cells below.
gr_list.shape


Out[750]:
(40,)

In [814]:
# Build the comparison array: the `gr_list` columns of every row, reshaped to
# (rows, n_groups, 40).
# NOTE(review): despite the name `mtrain`, the data comes from `test`, and the
# only read of `test` visible in this notebook is commented out.
n_groups= 1
mtrain = test.loc[:,gr_list].values.reshape(-1,n_groups,40)
# Row index where the chain search starts; 6487 is a previously used start.
cur = 17 # 6487 # random start
# Collected chain of row indices, seeded with the start row.
n_rows = [cur]

In [817]:
# Spot check on row 45: does its first `window` values equal the same row's
# values shifted by `shift` columns, in every group?
# NOTE(review): `window` is only assigned in the next cell, so on a fresh
# kernel this cell raises NameError when run in order.
shift = 10
(mtrain[45,:,:window] == mtrain[45,:,shift:window+shift]).sum() == n_groups*window


Out[817]:
False

In [818]:
# Greedy chain search over the rows of `mtrain`.
#
# A candidate row i continues the chain when more than 97% of the first
# `window` values of the current row equal row i's values shifted right by
# `shift` columns. Each match is printed, appended to `n_rows`, and becomes the
# new current row. When a full scan finds nothing, `shift` grows by one; the
# search stops once `shift` exceeds 14.
#
# Fixes compared with the previous version of this cell:
#   * `flag_find` is reset at the start of every scan. It used to stay True
#     after the first match, so `shift` never grew and the loop never ended.
#   * Rows already in the chain are skipped, so near-identical rows cannot send
#     the search around in a cycle.
#   * The scan is bounded by `mtrain.shape[0]`, the array actually indexed.
shift = 1
window = 25
seen = set(n_rows)
while True:
    flag_find = False
    for i in range(mtrain.shape[0]):
        if i == cur or i in seen:
            continue
        n_equal = (mtrain[cur, :, :window] == mtrain[i, :, shift:window + shift]).sum()
        if n_equal > n_groups * window * .97:
            print(i)
            n_rows.append(i)
            seen.add(i)
            cur = i
            flag_find = True
            # A match restarts the search at the smallest shift.
            shift = 1
    if not flag_find:
        shift += 1
    if shift > 14:
        break

In [27]:
# Toy frame for experimenting with lag comparisons: 5 rows of uniform random
# values in [0, 1), with 12 columns labelled '0' .. '11'.
seq = pd.DataFrame(
    np.random.random_sample((5, 12)),
    columns=list(map(str, range(12))),
)

In [28]:
# Preview the toy frame (it has only 5 rows, so this shows all of it).
seq.head()


Out[28]:
0 1 2 3 4 5 6 7 8 9 10 11
0 0.826275 0.540743 0.113430 0.794742 0.636555 0.472112 0.622190 0.920807 0.725347 0.319891 0.099129 0.234902
1 0.836836 0.247383 0.850434 0.859205 0.811625 0.656940 0.972340 0.971087 0.913129 0.639929 0.530023 0.298623
2 0.820177 0.199796 0.141831 0.050877 0.121558 0.062873 0.552418 0.844379 0.994383 0.031406 0.090501 0.760001
3 0.096935 0.142670 0.280082 0.453858 0.967749 0.356076 0.988922 0.046775 0.088428 0.915337 0.063829 0.081277
4 0.081789 0.293485 0.725756 0.699679 0.547469 0.333626 0.357223 0.865109 0.553969 0.096536 0.421046 0.356653

In [60]:
# Lag check on toy row 1, viewed as 3 groups of 4 values: within every group,
# do the first (4 - lag) values equal the last (4 - lag) values?
# The left slice must be `:-lag`. The earlier `:lag` only had the right shape
# for lag == 2 and otherwise relied on NumPy broadcasting, so it compared the
# wrong elements.
lag = 1
row_blocks = seq.iloc[1].values.reshape(3, 4)
(row_blocks[:, :-lag] == row_blocks[:, lag:]).all()


Out[60]:
False

In [61]:
# Same lag check as above, used as a condition: print 'hi' when every group of
# toy row 1 equals itself shifted by `lag` columns.
# The left slice is `:-lag` (not `:lag`) so both sides have 4 - lag columns.
lag = 1
row_blocks = seq.iloc[1].values.reshape(3, 4)
if (row_blocks[:, :-lag] == row_blocks[:, lag:]).all():
    print('hi')

In [48]:
# Show the first `lag` columns of toy row 1 viewed as 3 groups of 4 values.
# NOTE(review): the recorded output has two columns, so `lag` was 2 when this
# ran, not the 1 assigned in the cells above (out-of-order execution).
seq.iloc[1].values.reshape(3,4)[:,:lag]


Out[48]:
array([[0.83683646, 0.24738325],
       [0.81162451, 0.65694047],
       [0.91312916, 0.639929  ]])

In [102]:
# Display the chain of row indices found so far.
# NOTE(review): no active cell visible here assigns a global `sequence`; it is
# only set in the commented-out cell below and as a local inside shift_search.
# The recorded output starts at 625, the start_pos of the commented-out cell.
sequence


Out[102]:
[625,
 3303,
 4095,
 1283,
 4209,
 1696,
 3511,
 816,
 245,
 1383,
 2071,
 3492,
 378,
 2971,
 2366,
 4414,
 2790,
 3979,
 193,
 1189,
 3516,
 810,
 4443,
 3697,
 235,
 1382,
 4384,
 3418,
 4396,
 921,
 3176,
 650,
 3551,
 3850,
 423,
 813,
 1479,
 497,
 3227,
 1680,
 418,
 1986,
 3842,
 3558,
 1486,
 2555,
 1398,
 1430,
 521,
 4031,
 2848,
 1582,
 3509,
 626,
 2191,
 4261,
 3493,
 3340,
 156,
 4040,
 1681,
 1311,
 1125,
 774,
 2700,
 3742,
 3843,
 2162,
 1778,
 2217,
 4109,
 1918,
 1095,
 3075,
 4201,
 4204,
 180,
 3910,
 127,
 1220,
 2511,
 1440,
 4004,
 3620,
 2636,
 943,
 3920,
 1895,
 4412,
 1567,
 114,
 3226,
 2902,
 214,
 4298,
 2590,
 2824,
 2532,
 4281,
 1887]

In [ ]:
# df = train.loc[:, colgroups.flatten()]
# pool = np.arange(train.shape[0])
# start_pos = 625
# n_groups=16
# lag = 1
# sequence = [start_pos]
# pos = start_pos
# while True:
#     flag = False
#     for i in pool:
#         if (df.iloc[pos].values.reshape(n_groups, 40)[:,:-lag] == df.iloc[i].values.reshape(n_groups, 40)[:,lag:]).sum() == n_groups*(40-lag):
#             print(i)
#             sequence.append(i)
#             pos = i
#             flag = True
#             lag = 1
#     if not flag:
#         lag += 1
#         flag = False
#     if lag > 25:
#         break

In [ ]:
def shift_search(df, pool, start_pos, window=10, n_groups=16, max_lag=25):
    """Greedily chain rows whose column groups are lagged copies of each other.

    Every row of ``df`` is viewed as ``n_groups`` blocks of equal width. Row
    ``i`` continues the chain after the current row when, in every block, the
    current row's values without their last ``lag`` entries equal row ``i``'s
    values without their first ``lag`` entries. Lags are tried from 1 upward;
    each match is printed, becomes the new current row, and resets the lag to 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Its column count must be a multiple of ``n_groups``. Columns are taken
        in order, group after group.
    pool : iterable of int
        Positional row indices to consider as candidates.
    start_pos : int
        Positional index of the row the chain starts from.
    window : int, optional
        Accepted for backward compatibility; currently unused.
    n_groups : int, optional
        Number of equally sized column groups per row.
    max_lag : int, optional
        Largest lag tried before giving up. It is capped at ``width - 1``,
        because a larger lag leaves nothing to compare.

    Returns
    -------
    list
        ``start_pos`` followed by the matched indices in discovery order.

    Raises
    ------
    ValueError
        If the columns of ``df`` cannot be split evenly into ``n_groups``.

    Notes
    -----
    A row enters the chain at most once. Without this, constant rows (for
    example all zeros) match themselves and each other at every lag, and the
    search never terminates.
    """
    n_rows, n_cols = df.shape
    if n_groups <= 0 or n_cols % n_groups != 0:
        raise ValueError(
            "df has %d columns, which cannot be split into %d equal groups"
            % (n_cols, n_groups)
        )
    width = n_cols // n_groups

    # Reshape once up front instead of doing a pandas row lookup and a reshape
    # for every single comparison.
    blocks = df.values.reshape(n_rows, n_groups, width)
    # Materialise the pool so it can be scanned repeatedly, even if a generator
    # was passed in.
    candidates = list(pool)
    last_lag = min(max_lag, width - 1)

    pos = start_pos
    sequence = [start_pos]
    visited = {start_pos}
    lag = 1
    while lag <= last_lag:
        found = False
        for i in candidates:
            if i in visited:
                continue
            if np.array_equal(blocks[pos][:, :-lag], blocks[i][:, lag:]):
                print(i)
                sequence.append(i)
                visited.add(i)
                # Keep scanning the rest of the pool from the new row, starting
                # again at the smallest lag.
                pos = i
                found = True
                lag = 1
        if not found:
            lag += 1
    return sequence

In [ ]:
# Run the chain search over every row of `train`, restricted to the columns
# named in `colgroups` (flattened group after group), starting from row 0.
shift_search(train.loc[:, colgroups.flatten()], np.arange(train.shape[0]), start_pos = 0)

In [811]:
# Show the first 20 columns listed in `group_0` for the chained rows in `n_rows`.
# NOTE(review): neither `group_0` nor `test` is defined in any visible cell, and
# the recorded output starts at row 6487 rather than the current `cur = 17`.
test.loc[n_rows, group_0[0:20]]


Out[811]:
f190486d6 58e2e02e6 eeb9cd3aa 9fd594eec 6eef030c1 15ace8c9f fb0f5dbfe 58e056e12 20aa07010 024c577b9 d6bb78916 b43a7cfd5 58232a6fb 1702b5bf0 324921c7b 62e59a501 2ec5b290f 241f0f867 fb49e4212 66ace2992
6487 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 10000000.0 0.0 10000000.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
29352 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 10000000.0 0.0 10000000.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

In [ ]:
# Chain rows of `test` inside one column group.
#
# Row i follows the current row `cur` when the first `w` columns of `cur` equal
# w columns of row i taken (1 + lag) positions further right, and that shifted
# window of row i is not all zero (its sum is positive). Matches are printed,
# appended to `fr_n`, and become the new `cur`. After a scan with a match, `lag`
# resets to 0; otherwise it grows by one until the shifted window would no
# longer fit (lag > 40 - w - 2).
#
# `DataFrame.lookup` was deprecated in pandas 1.2 and removed in 2.0, so the
# row windows are read with label-based `.loc` instead.
# Relies on `w`, `group`, `cur` and `fr_n` from earlier cells.
lag = 0
ad_v = False
lead_cols = colgroups[group][0:w]
while True:
    # `lag` only changes between scans, so the shifted window is fixed here.
    shifted_cols = colgroups[group][1+lag:w+1+lag]
    cur_vals = test.loc[cur, lead_cols].values
    for i in range(test.shape[0]):
        cand_vals = test.loc[i, shifted_cols].values
        if (cur_vals == cand_vals).sum() == w and cand_vals.sum() > 0:
            print(i)
            fr_n.append(i)
            cur = i
            # The rest of this scan compares against the new current row.
            cur_vals = test.loc[cur, lead_cols].values
            ad_v = True
    if ad_v:
        lag = 0
        ad_v = False
    else:
        lag += 1
    if lag > 40-w-2:
        break

In [585]:
# Chain rows of `train` inside one column group.
#
# Row i follows the current row `cur` when the first `w` columns of `cur` equal
# w columns of row i taken (1 + lag) positions further right, and that shifted
# window of row i is not all zero (its sum is positive). Matches are printed,
# appended to `fr_n`, and become the new `cur`. After a scan with a match, `lag`
# resets to 0; otherwise it grows by one until the shifted window would no
# longer fit (lag > 40 - w - 2).
#
# `DataFrame.lookup` was deprecated in pandas 1.2 and removed in 2.0, so the
# row windows are read with label-based `.loc` instead.
# Relies on `w`, `group`, `cur` and `fr_n` from earlier cells.
lag = 0
ad_v = False
lead_cols = colgroups[group][0:w]
while True:
    # `lag` only changes between scans, so the shifted window is fixed here.
    shifted_cols = colgroups[group][1+lag:w+1+lag]
    cur_vals = train.loc[cur, lead_cols].values
    for i in range(train.shape[0]):
        cand_vals = train.loc[i, shifted_cols].values
        if (cur_vals == cand_vals).sum() == w and cand_vals.sum() > 0:
            print(i)
            fr_n.append(i)
            cur = i
            # The rest of this scan compares against the new current row.
            cur_vals = train.loc[cur, lead_cols].values
            ad_v = True
    if ad_v:
        lag = 0
        ad_v = False
    else:
        lag += 1
    if lag > 40-w-2:
        break


664
2882
2067
539
3160
3423
924
3175
1067
2356
3419
3341
2241
3258
68
1116
495
2886
334
3295
2261

In [463]:
# Show the first column group for the rows listed in `fr_test`.
# NOTE(review): `fr_test` and `test` are not defined in any visible cell. The
# recorded output also has a leading `target` column (41 columns), so
# `colgroups[0]` presumably held 'target' as well when this ran -- confirm.
test.loc[fr_test, colgroups[0]]


Out[463]:
target f190486d6 58e2e02e6 eeb9cd3aa 9fd594eec 6eef030c1 15ace8c9f fb0f5dbfe 58e056e12 20aa07010 ... 6619d81fc 1db387535 fc99f9426 91f701ba2 0572565c2 190db8488 adb64ff71 c47340d97 c5a231d81 0ff32eb98
44294 NaN 0.0 16000000.0 20000000.0 4333333.34 20000000.0 0.0 0.0 0.0 200000.0 ... 0.0 0.0 0.0 0.0 0.0 55000.0 0.0 550000.0 0.0 4000000.0
365 NaN 100000000.0 0.0 0.0 0.00 0.0 0.0 1200000.0 12500000.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2674 NaN 0.0 100000000.0 0.0 0.00 0.0 0.0 0.0 1200000.0 12500000.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3 rows × 41 columns


In [ ]: