In [9]:
import numpy as np
import string
import os
from collections import Counter
import itertools

In [11]:
# File names (not full paths) of the raw XML samples.
# NOTE: os.listdir returns entries in arbitrary order, so the row order of
# everything derived below follows whatever order the OS reports.
train_paths = os.listdir("../data/train/")
test_paths = os.listdir("../data/test/")

# Accumulators filled as a side effect of generate_xml_paths, but only when
# the matching flag below is switched on.
train_ids, train_class, test_ids = [], [], []
extract_id = False
extract_class = False

def generate_xml_paths(train_paths, test_paths, xml_processor=lambda x: x, i=0):
    """
    Lazily read every training and test XML file and yield its processed text.

    Files are visited in the order ``train_paths + test_paths``. Each file
    name is expected to look like ``<id>.<class>.xml``; test files must use
    ``X`` as their class. When the module-level flags ``extract_id`` /
    ``extract_class`` are truthy, ids and classes are appended to the
    module-level lists ``train_ids``, ``train_class`` and ``test_ids`` as a
    side effect.

    Args:
        train_paths: file names (not full paths) located in ``../data/train/``.
        test_paths: file names (not full paths) located in ``../data/test/``.
        xml_processor: callable that takes the raw file contents as a string
            and returns a ``str``. Defaults to the identity function.
        i: index into ``train_paths + test_paths`` at which to start.

    Yields:
        The ``str`` returned by ``xml_processor`` for each file, in order.

    Raises:
        AssertionError: if a file name does not match the expected pattern or
            ``xml_processor`` returns something other than ``str``.
    """
    paths = train_paths + test_paths
    n_train = len(train_paths)
    print("The length of the test data is {0}, training data {1}".format(
        len(test_paths), n_train
    ))

    # The data directories do not change between files; resolve them once.
    train_dir = os.path.abspath("../data/train/")
    test_dir = os.path.abspath("../data/test/")

    for index in range(i, len(paths)):
        file_name = paths[index]
        # Split the file name into a list of [id, class_name, 'xml'].
        id_class_xml = file_name.split('.')
        assert id_class_xml[2] == 'xml'

        if index >= n_train:
            # Past the end of the training names: this is a test-set file.
            if extract_id:
                test_ids.append(id_class_xml[0])
            assert id_class_xml[1] == 'X'
            abs_path = os.path.join(test_dir, file_name)
        else:
            # Training-set file: optionally record its id and class.
            if extract_id:
                train_ids.append(id_class_xml[0])
            if extract_class:
                train_class.append(id_class_xml[1])
            abs_path = os.path.join(train_dir, file_name)

        # Read the whole file and close it *before* yielding, so the handle
        # is not held open while the consumer works on the result (or leaked
        # if the generator is abandoned part-way through).
        with open(abs_path, 'r') as xml_file:
            raw_xml = xml_file.read()

        xml_content = xml_processor(raw_xml)
        assert type(xml_content) == str
        yield xml_content

        # Progress report every 500 files, emitted once the consumer asks
        # for the next item.
        if (index % 500) == 0:
            print("sent file {0}, named \n {1} to processing".format(index, file_name))

In [4]:
def remove_special_xml(xmlstr):
    """
    Normalise raw XML text so it can be tokenised with ``str.split``.

    The characters ``= \\ / < > . CR LF - "`` are each replaced by a single
    space, upper-case ASCII letters are folded to lower case, and the
    characters ``_ ? :`` are removed entirely. Everything else is kept.

    Works on both Python 2 (``string.maketrans``) and Python 3
    (``str.maketrans``); the original only ran on Python 2.

    Args:
        xmlstr: the XML document as a ``str``.

    Returns:
        The normalised ``str``.
    """
    # Ten separator characters that each become one space ...
    to_space = '=\\/<>.\r\n-"'
    # ... followed by the case-folding pairs A->a ... Z->z.
    source_chars = to_space + string.ascii_uppercase
    target_chars = ' ' * len(to_space) + string.ascii_lowercase
    delete = '_?:'

    if hasattr(string, 'maketrans'):
        # Python 2: 256-byte table plus a separate "delete" argument.
        table = string.maketrans(source_chars, target_chars)
        return xmlstr.translate(table, delete)

    # Python 3: deletions are folded into the translation table itself.
    return xmlstr.translate(str.maketrans(source_chars, target_chars, delete))

In [ ]:
# Stream every train/test file through the cleaner, then split each cleaned
# document on whitespace: one token list per XML file.
xml_corpus = generate_xml_paths(
    train_paths, test_paths, xml_processor=remove_special_xml)
xml_tokens = [xml.split() for xml in xml_corpus]

# Token frequencies across all documents.
counter = Counter()
counter.update(itertools.chain.from_iterable(xml_tokens))
print(counter.most_common(10))

In [18]:
def frequency_check(counts, index, cutoff):
    """Return ``index`` for a word seen at least ``cutoff`` times, else 0."""
    return 0 if counts < cutoff else index

cutoff = 3
# most_common() yields (word, count) pairs from most to least frequent.
# A word's index is its 1-based rank in that ordering; words seen fewer
# than `cutoff` times all map to 0.
word_to_int = {
    token: frequency_check(count, rank, cutoff)
    for rank, (token, count) in enumerate(counter.most_common(), 1)
}
print(len(word_to_int))
print(list(word_to_int.items())[:10])
print(word_to_int['windows'])


934615
[('j!&#x7f;q', 0), ('&#x3c;p[;j', 0), ('9l2u', 0), ('mdbm', 0), ('mdbi', 0), ('7cba6d393775bf994ac22813684055d704bb606d', 0), ('cmayj', 0), ('5t&#x5e;', 0), ('i3d&#x60;o&#x7e;', 180938), ('wt&#x24;g', 0)]
1

In [ ]:


In [14]:
# Number of distinct tokens. len(counter) gives the same value as
# len(counter.most_common()) without sorting the whole vocabulary first.
len(counter)


Out[14]:
934615

In [19]:
# Replace every token of every document with its integer index.
# Result layout:
# [ [1, 2, 3, ... (ints for the words of xml file 1)],
#   [(ints for the words of xml file 2)],
#   ... ]
xml_int_tokens = []
for xml_file in xml_tokens:
    xml_int_tokens.append([word_to_int[word] for word in xml_file])
print(len(xml_int_tokens[1]))
print(len(xml_int_tokens[2]))


8519
374

In [21]:
# Peek at the first 400 encoded tokens of the second document.
print(xml_int_tokens[1][:400])


[733, 565, 3, 4, 1149, 557, 739, 3, 784, 1732, 10, 2, 171061, 546, 11, 39, 803, 0, 945, 171061, 946, 0, 810, 112, 804, 4, 802, 1821, 7419, 807, 2975, 4948, 806, 1590, 808, 1006, 805, 653, 1589, 1591, 477, 603, 1761, 494, 843, 10, 2, 171061, 546, 9, 3, 5, 1134, 12, 3233, 6, 916, 11, 39, 13, 10, 2, 1, 8, 679, 7, 9, 3, 5, 699, 12, 701, 6, 697, 11, 695, 13, 10, 2, 1, 8, 133, 7, 9, 3, 5, 308, 12, 309, 6, 307, 11, 302, 13, 10, 2, 1, 8, 215, 7, 9, 3, 5, 475, 12, 476, 6, 474, 11, 473, 13, 10, 2, 1, 8, 773, 7, 9, 3, 5, 778, 12, 779, 6, 777, 11, 774, 13, 10, 2, 1, 8, 775, 7, 9, 3, 5, 782, 12, 783, 6, 596, 11, 776, 13, 10, 2, 1, 693, 708, 1, 677, 724, 4, 696, 722, 726, 9, 3, 5, 728, 12, 729, 6, 727, 11, 39, 13, 10, 2, 1, 8, 797, 7, 9, 3, 5, 817, 12, 816, 6, 791, 11, 801, 13, 10, 2, 1, 8, 741, 7, 9, 3, 5, 746, 12, 745, 6, 742, 11, 743, 13, 10, 2, 1, 8, 542, 7, 9, 3, 5, 632, 12, 635, 6, 629, 11, 634, 13, 10, 2, 1, 8, 811, 7, 9, 3, 5, 814, 12, 815, 6, 813, 11, 812, 13, 10, 2, 1, 8, 2313, 7, 9, 3, 5, 2317, 12, 2318, 6, 1431, 11, 2312, 13, 10, 2, 1, 8, 514, 7, 9, 3, 5, 610, 12, 611, 6, 609, 11, 608, 13, 10, 2, 1, 8, 492, 7, 9, 3, 5, 604, 12, 606, 6, 605, 11, 602, 13, 10, 2, 1, 8, 750, 7, 9, 3, 5, 754, 12, 753, 6, 747, 11, 751, 13, 10, 2, 1, 8, 599, 7, 9, 3, 5, 691, 12, 692, 6, 690, 11, 683, 13, 10, 2, 1, 8, 839, 7, 9, 3, 5, 841, 12, 842, 6, 827, 11, 840, 13, 10, 2, 1, 8, 833, 7, 9, 3, 5, 837, 12, 838, 6, 596, 11, 835, 13, 10, 2, 1, 8, 133, 7, 9, 3, 5, 308, 12, 309, 6, 307, 11, 302, 13, 10, 2, 1, 8, 215, 7, 9, 3, 5, 475, 12, 476, 6, 474, 11, 473, 13, 10, 2, 1, 693, 708, 1, 677, 724, 4, 696, 722]

In [25]:
# Pack the per-document index sequences into a 1-D object array with one
# variable-length integer array per document. The container is allocated
# explicitly: relying on np.array() to infer a ragged layout raises on
# current NumPy and would silently produce a 2-D array if every document
# happened to have the same length.
token_arr = np.empty(len(xml_int_tokens), dtype=object)
for row, xml in enumerate(xml_int_tokens):
    token_arr[row] = np.array(xml)

In [26]:
# Print the array of per-document index sequences for a visual check.
print(token_arr)


[ array([   733,    565,      3,      4,   1149,    557,    739,      3,
          784,   5848,     10,      2, 196135,    546,     11,     39,
          803,   2048,    945, 196135,    946,      0,    810,    112,
          804,      4,    802,     67,   3871,    807,   1826,   2758,
          806,   1590,    808,   1173,    805,    653,   1589,   1591,
          477,    603,   5652,    494,    843,     10,      2, 196135,
          546,      9,      3,      5,   1221,     12,   4797,      6,
         1982,     11,     39,     13,     10,      2,      1,      8,
          679,      7,      9,      3,      5,    699,     12,    701,
            6,    697,     11,    695,     13,     10,      2,      1,
            8,    133,      7,      9,      3,      5,    308,     12,
          309,      6,    307,     11,    302,     13,     10,      2,
            1,    693,    708,      1,    677,    724,      4,    696,
          722,    726,      9,      3,      5,    728,     12,    729,
            6,    727,     11,     39,     13,     10,      2,      1,
            8,    797,      7,      9,      3,      5,    817,     12,
          816,      6,    791,     11,    801,     13,     10,      2,
            1,      8,    215,      7,      9,      3,      5,    475,
           12,    476,      6,    474,     11,    473,     13,     10,
            2,      1,      8,    773,      7,      9,      3,      5,
          778,     12,    779,      6,    777,     11,    774,     13,
           10,      2,      1,      8,    775,      7,      9,      3,
            5,    782,     12,    783,      6,    596,     11,    776,
           13,     10,      2,      1,      8,    741,      7,      9,
            3,      5,    746,     12,    745,      6,    742,     11,
          743,     13,     10,      2,      1,      8,    542,      7,
            9,      3,      5,    632,     12,    635,      6,    629,
           11,    634,     13,     10,      2,      1,      8,    811,
            7,      9,      3,      5,    814,     12,    815,      6,
          813,     11,    812,     13,     10,      2,      1,      8,
          750,      7,      9,      3,      5,    754,     12,    753,
            6,    747,     11,    751,     13,     10,      2,      1,
            8,    599,      7,      9,      3,      5,    691,     12,
          692,      6,    690,     11,    683,     13,     10,      2,
            1,      8,    514,      7,      9,      3,      5,    610,
           12,    611,      6,    609,     11,    608,     13,     10,
            2,      1,      8,    839,      7,      9,      3,      5,
          841,     12,    842,      6,    827,     11,    840,     13,
           10,      2,      1,      8,    833,      7,      9,      3,
            5,    837,     12,    838,      6,    596,     11,    835,
         1103,    144,   1135,     13,     10,      2,      1,      8,
          789,      7,      9,      3,      5,    799,     12,    800,
            6,    798,     11,    790,    462,     28,     21,     23,
           22,     29,    243,    882, 196135,    546,     28,     21,
           23,     22,     29,    243,    676,     41,     21,     23,
           22,     29,    243,    676,     40,    934,    536,    132,
          243,   1264,    617,      3,    290,    151,    531,    548,
          550,    508,    467,      4,    536,    132,    243,   1263,
          617,      3,    290,    151,    531,    548,    550,    508,
          467,      4,    536,    132,    243,   1257,    617,      3,
          290,    151,    531,    548,    550,    508,    467,      4,
          536,    132,    243,   1236,    617,      3,    290,    151,
          531,    548,    550,    508,    467,      4,    536,    132,
          243,   1262,    617,      3,    290,    151,    531,    548,
          550,    508,    467,      4,     28,     21,     63,    623,
          575,    637,     41,     21,     63,    623,    575,    637,
           40,    844,    748,     41,     21,     63,    623,    575,
          637,     40,    575,    748,    462,     28,     21,     23,
           22,     29,    243,     41,     21,     23,     22,     29,
          243,     40,   1265,    536,    132,    243,   1259,   1272,
            3,    290,    151,    531,    548,    550,   1260,   1269,
            3,    290,    151,    531,    467,      4,    757,    760,
         1131,    554,   5652,    758,   1167,    759,    756,    757,
          760,   1129,    554,   5652,    758,   1166,    759,    756,
           28,     21,     23,     22,     29,      1,    441,     44,
         1102,     41,     21,     23,     22,     29,      1,    441,
           44,   1102,     40,    615,     31,     13,     10,      2,
            1,      8,    565,      7,      9,      3,      5,    786,
           12,    785,      6,    694,     11,    781,    462,     13,
           10,      2,      1,      8,    820,    615,      9,      3,
            5,  16494,     12,  16494,      6,      4,     11,    894,
          462,     13,     10,      2,      1,      8,    514,      7,
            9,      3,      5,    610,     12,    611,      6,    609,
           11,    608,     13,     10,      2,      1,      8,    820,
          615,      9,      3,      5,   1094,     12,   1093,      6,
          715,     11,    894,    462,    582,    328,  16567,    114,
          615,    170,    589,    615,    579,      4,    578,      4,
          574,      4,    577,      4,    488,    755,    584,    612,
          588,    585,    586,    583,    587,     28,     21,     63,
           22,     29,    243,     41,     21,     63,     22,     29,
          243,     40,   1204,    477,   1229,    631,     28,     21,
           23,     22,     29,    243,    676,     41,     21,     23,
           22,     29,    243,    676,     40,    934,    462,     13,
           10,      2,      1,      8,    820,    615,      9,      3,
            5,   1094,     12,   1093,      6,    715,     11,    894,
          582,    328,  17884,    114,    466,    170,   4341,   4341,
         4341,    579,   5781,    578,  18235,    574,    107,    577,
          353,    488,   1130,   1151,   1150,    584,    612,   1576,
         1063,    588,    585,  14553,   1430,   1624,    586,    583,
          587,   1147,     13,     10,      2,      1,      8,    789,
            7,      9,      3,      5,    799,     12,    800,      6,
          798,     11,    790,     13,     10,      2,      1,      8,
          750,      7,      9,      3,      5,    754,     12,    753,
            6,    747,     11,    751,    498,     31,    451,    447,
          485,     39,    483,   5377,  22749, 105871,     40, 235526,
          498,     31,    451,    447,    485,     39,    483,   5377,
        22749, 105871,     40, 281035,    582,    328,   9610,    114,
          820,   1515,    170,    253,    579,      4,    578,      4,
          574,      4,    577,      4,    488,    755,    584,    612,
          588,    585,    586,    583,    587,    155,    114,   2208,
          255,    582,    328,  11877,    114,    538,    170,  17246,
          579,      4,    578,      4,    574,      4,    577,      4,
          488,    755,    584,    612,    588,    585,    586,    583,
          587,    613,    328,  17884,   1250,   1388,  42664,    684,
          255,    613,    328,  17884,   1250,   1388,  42664,    684,
           28,     21,     63,    623,    575,    637,     41,     21,
           63,    623,    575,    637,     40,    844,    748,     41,
           21,     63,    623,    575,    637,     40,    575,    748,
           28,     21,     63,     22,     29,    243,   1371,     28,
           21,     23,     22,     29,    243,   1371,    536,    132,
         1068,    734,   1639,   6223,    467,      4,    462,    494,
          477,    557,   1149])
 array([ 733,  565,    3, ...,  477,  557, 1149])
 array([   733,    565,      3,      4,   1149,    557,    739,      3,
          784,   4804,     10,      2, 261843,    546,     11,     39,
          803, 175707,    945, 261843,    946,      0,    810,    112,
          804,      4,    802,     67,   7819,    807,   1821,   5124,
          806,   1590,    808,   1006,    805,    653,   1589,   1591,
          477,    603,   2054,    494,    843,     10,      2, 261843,
          546,      9,      3,      5,   1221,     12,  66349,      6,
         5061,     11,     39,     13,     10,      2,      1,      8,
          679,      7,      9,      3,      5,    699,     12,    701,
            6,    697,     11,    695,     13,     10,      2,      1,
            8,    133,      7,      9,      3,      5,    308,     12,
          309,      6,    307,     11,    302,     13,     10,      2,
            1,      8,    797,      7,      9,      3,      5,    817,
           12,    816,      6,    791,     11,    801,     13,     10,
            2,      1,      8,    215,      7,      9,      3,      5,
          475,     12,    476,      6,    474,     11,    473,     13,
           10,      2,      1,      8,    773,      7,      9,      3,
            5,    778,     12,    779,      6,    777,     11,    774,
           13,     10,      2,      1,      8,    775,      7,      9,
            3,      5,    782,     12,    783,      6,    596,     11,
          776,     13,     10,      2,      1,      8,    542,      7,
            9,      3,      5,    632,     12,    635,      6,    629,
           11,    634,     13,     10,      2,      1,      8,    741,
            7,      9,      3,      5,    746,     12,    745,      6,
          742,     11,    743,     13,     10,      2,      1,      8,
        11845,      7,      9,      3,      5,  12527,     12,  12518,
            6,   4923,     11,  11857,     13,     10,      2,      1,
            8,   1423,      7,      9,      3,      5,   1595,     12,
         1596,      6,    715,     11,   1581,     13,     10,      2,
            1,      8,   1175,      7,      9,      3,      5,   1434,
           12,   1435,      6,   1427,     11,   1395,     13,     10,
            2,      1,      8,   1279,      7,      9,      3,      5,
         1660,     12,   1659,      6,    972,     11,   1626,     13,
           10,      2,      1,      8,   1570,      7,      9,      3,
            5,   1584,     12,   1583,      6,   1538,     11,   1571,
           13,     10,      2,      1,      8,    514,      7,      9,
            3,      5,    610,     12,    611,      6,    609,     11,
          608,     13,     10,      2,      1,      8,    750,      7,
            9,      3,      5,    754,     12,    753,      6,    747,
           11,    751,     13,     10,      2,      1,      8,    599,
            7,      9,      3,      5,    691,     12,    692,      6,
          690,     11,    683,     13,     10,      2,      1,      8,
          839,      7,      9,      3,      5,    841,     12,    842,
            6,    827,     11,    840,     13,     10,      2,      1,
            8,    833,      7,      9,      3,      5,    837,     12,
          838,      6,    596,     11,    835,   1010,     45,   4804,
          144,   1011,    494,    477,    557,   1149])
 ..., array([ 733,  565,    3, ...,  477,  557, 1149])
 array([ 733,  565,    3, ...,  477,  557, 1149])
 array([ 733,  565,    3, ...,  477,  557, 1149])]

In [27]:
# One entry per document is expected, i.e. a 1-D shape.
np.shape(token_arr)


Out[27]:
(6810,)

In [28]:
# Persist the encoded sequences; the next cells load the file back to
# confirm that the save/load round trip works.
np.save(file="../data/features/3_cutoff_word_to_intseq.npy", arr=token_arr)

In [29]:
# token_arr is an object array, which np.save stores via pickle. NumPy
# >= 1.16.3 refuses to unpickle unless this is requested explicitly.
# Only do this for files produced locally: unpickling untrusted data can
# execute arbitrary code.
test = np.load("../data/features/3_cutoff_word_to_intseq.npy", allow_pickle=True)

In [30]:
# Print the reloaded array to compare it with what was saved.
print(test)


[ array([   733,    565,      3,      4,   1149,    557,    739,      3,
          784,   5848,     10,      2, 196135,    546,     11,     39,
          803,   2048,    945, 196135,    946,      0,    810,    112,
          804,      4,    802,     67,   3871,    807,   1826,   2758,
          806,   1590,    808,   1173,    805,    653,   1589,   1591,
          477,    603,   5652,    494,    843,     10,      2, 196135,
          546,      9,      3,      5,   1221,     12,   4797,      6,
         1982,     11,     39,     13,     10,      2,      1,      8,
          679,      7,      9,      3,      5,    699,     12,    701,
            6,    697,     11,    695,     13,     10,      2,      1,
            8,    133,      7,      9,      3,      5,    308,     12,
          309,      6,    307,     11,    302,     13,     10,      2,
            1,    693,    708,      1,    677,    724,      4,    696,
          722,    726,      9,      3,      5,    728,     12,    729,
            6,    727,     11,     39,     13,     10,      2,      1,
            8,    797,      7,      9,      3,      5,    817,     12,
          816,      6,    791,     11,    801,     13,     10,      2,
            1,      8,    215,      7,      9,      3,      5,    475,
           12,    476,      6,    474,     11,    473,     13,     10,
            2,      1,      8,    773,      7,      9,      3,      5,
          778,     12,    779,      6,    777,     11,    774,     13,
           10,      2,      1,      8,    775,      7,      9,      3,
            5,    782,     12,    783,      6,    596,     11,    776,
           13,     10,      2,      1,      8,    741,      7,      9,
            3,      5,    746,     12,    745,      6,    742,     11,
          743,     13,     10,      2,      1,      8,    542,      7,
            9,      3,      5,    632,     12,    635,      6,    629,
           11,    634,     13,     10,      2,      1,      8,    811,
            7,      9,      3,      5,    814,     12,    815,      6,
          813,     11,    812,     13,     10,      2,      1,      8,
          750,      7,      9,      3,      5,    754,     12,    753,
            6,    747,     11,    751,     13,     10,      2,      1,
            8,    599,      7,      9,      3,      5,    691,     12,
          692,      6,    690,     11,    683,     13,     10,      2,
            1,      8,    514,      7,      9,      3,      5,    610,
           12,    611,      6,    609,     11,    608,     13,     10,
            2,      1,      8,    839,      7,      9,      3,      5,
          841,     12,    842,      6,    827,     11,    840,     13,
           10,      2,      1,      8,    833,      7,      9,      3,
            5,    837,     12,    838,      6,    596,     11,    835,
         1103,    144,   1135,     13,     10,      2,      1,      8,
          789,      7,      9,      3,      5,    799,     12,    800,
            6,    798,     11,    790,    462,     28,     21,     23,
           22,     29,    243,    882, 196135,    546,     28,     21,
           23,     22,     29,    243,    676,     41,     21,     23,
           22,     29,    243,    676,     40,    934,    536,    132,
          243,   1264,    617,      3,    290,    151,    531,    548,
          550,    508,    467,      4,    536,    132,    243,   1263,
          617,      3,    290,    151,    531,    548,    550,    508,
          467,      4,    536,    132,    243,   1257,    617,      3,
          290,    151,    531,    548,    550,    508,    467,      4,
          536,    132,    243,   1236,    617,      3,    290,    151,
          531,    548,    550,    508,    467,      4,    536,    132,
          243,   1262,    617,      3,    290,    151,    531,    548,
          550,    508,    467,      4,     28,     21,     63,    623,
          575,    637,     41,     21,     63,    623,    575,    637,
           40,    844,    748,     41,     21,     63,    623,    575,
          637,     40,    575,    748,    462,     28,     21,     23,
           22,     29,    243,     41,     21,     23,     22,     29,
          243,     40,   1265,    536,    132,    243,   1259,   1272,
            3,    290,    151,    531,    548,    550,   1260,   1269,
            3,    290,    151,    531,    467,      4,    757,    760,
         1131,    554,   5652,    758,   1167,    759,    756,    757,
          760,   1129,    554,   5652,    758,   1166,    759,    756,
           28,     21,     23,     22,     29,      1,    441,     44,
         1102,     41,     21,     23,     22,     29,      1,    441,
           44,   1102,     40,    615,     31,     13,     10,      2,
            1,      8,    565,      7,      9,      3,      5,    786,
           12,    785,      6,    694,     11,    781,    462,     13,
           10,      2,      1,      8,    820,    615,      9,      3,
            5,  16494,     12,  16494,      6,      4,     11,    894,
          462,     13,     10,      2,      1,      8,    514,      7,
            9,      3,      5,    610,     12,    611,      6,    609,
           11,    608,     13,     10,      2,      1,      8,    820,
          615,      9,      3,      5,   1094,     12,   1093,      6,
          715,     11,    894,    462,    582,    328,  16567,    114,
          615,    170,    589,    615,    579,      4,    578,      4,
          574,      4,    577,      4,    488,    755,    584,    612,
          588,    585,    586,    583,    587,     28,     21,     63,
           22,     29,    243,     41,     21,     63,     22,     29,
          243,     40,   1204,    477,   1229,    631,     28,     21,
           23,     22,     29,    243,    676,     41,     21,     23,
           22,     29,    243,    676,     40,    934,    462,     13,
           10,      2,      1,      8,    820,    615,      9,      3,
            5,   1094,     12,   1093,      6,    715,     11,    894,
          582,    328,  17884,    114,    466,    170,   4341,   4341,
         4341,    579,   5781,    578,  18235,    574,    107,    577,
          353,    488,   1130,   1151,   1150,    584,    612,   1576,
         1063,    588,    585,  14553,   1430,   1624,    586,    583,
          587,   1147,     13,     10,      2,      1,      8,    789,
            7,      9,      3,      5,    799,     12,    800,      6,
          798,     11,    790,     13,     10,      2,      1,      8,
          750,      7,      9,      3,      5,    754,     12,    753,
            6,    747,     11,    751,    498,     31,    451,    447,
          485,     39,    483,   5377,  22749, 105871,     40, 235526,
          498,     31,    451,    447,    485,     39,    483,   5377,
        22749, 105871,     40, 281035,    582,    328,   9610,    114,
          820,   1515,    170,    253,    579,      4,    578,      4,
          574,      4,    577,      4,    488,    755,    584,    612,
          588,    585,    586,    583,    587,    155,    114,   2208,
          255,    582,    328,  11877,    114,    538,    170,  17246,
          579,      4,    578,      4,    574,      4,    577,      4,
          488,    755,    584,    612,    588,    585,    586,    583,
          587,    613,    328,  17884,   1250,   1388,  42664,    684,
          255,    613,    328,  17884,   1250,   1388,  42664,    684,
           28,     21,     63,    623,    575,    637,     41,     21,
           63,    623,    575,    637,     40,    844,    748,     41,
           21,     63,    623,    575,    637,     40,    575,    748,
           28,     21,     63,     22,     29,    243,   1371,     28,
           21,     23,     22,     29,    243,   1371,    536,    132,
         1068,    734,   1639,   6223,    467,      4,    462,    494,
          477,    557,   1149])
 array([ 733,  565,    3, ...,  477,  557, 1149])
 array([   733,    565,      3,      4,   1149,    557,    739,      3,
          784,   4804,     10,      2, 261843,    546,     11,     39,
          803, 175707,    945, 261843,    946,      0,    810,    112,
          804,      4,    802,     67,   7819,    807,   1821,   5124,
          806,   1590,    808,   1006,    805,    653,   1589,   1591,
          477,    603,   2054,    494,    843,     10,      2, 261843,
          546,      9,      3,      5,   1221,     12,  66349,      6,
         5061,     11,     39,     13,     10,      2,      1,      8,
          679,      7,      9,      3,      5,    699,     12,    701,
            6,    697,     11,    695,     13,     10,      2,      1,
            8,    133,      7,      9,      3,      5,    308,     12,
          309,      6,    307,     11,    302,     13,     10,      2,
            1,      8,    797,      7,      9,      3,      5,    817,
           12,    816,      6,    791,     11,    801,     13,     10,
            2,      1,      8,    215,      7,      9,      3,      5,
          475,     12,    476,      6,    474,     11,    473,     13,
           10,      2,      1,      8,    773,      7,      9,      3,
            5,    778,     12,    779,      6,    777,     11,    774,
           13,     10,      2,      1,      8,    775,      7,      9,
            3,      5,    782,     12,    783,      6,    596,     11,
          776,     13,     10,      2,      1,      8,    542,      7,
            9,      3,      5,    632,     12,    635,      6,    629,
           11,    634,     13,     10,      2,      1,      8,    741,
            7,      9,      3,      5,    746,     12,    745,      6,
          742,     11,    743,     13,     10,      2,      1,      8,
        11845,      7,      9,      3,      5,  12527,     12,  12518,
            6,   4923,     11,  11857,     13,     10,      2,      1,
            8,   1423,      7,      9,      3,      5,   1595,     12,
         1596,      6,    715,     11,   1581,     13,     10,      2,
            1,      8,   1175,      7,      9,      3,      5,   1434,
           12,   1435,      6,   1427,     11,   1395,     13,     10,
            2,      1,      8,   1279,      7,      9,      3,      5,
         1660,     12,   1659,      6,    972,     11,   1626,     13,
           10,      2,      1,      8,   1570,      7,      9,      3,
            5,   1584,     12,   1583,      6,   1538,     11,   1571,
           13,     10,      2,      1,      8,    514,      7,      9,
            3,      5,    610,     12,    611,      6,    609,     11,
          608,     13,     10,      2,      1,      8,    750,      7,
            9,      3,      5,    754,     12,    753,      6,    747,
           11,    751,     13,     10,      2,      1,      8,    599,
            7,      9,      3,      5,    691,     12,    692,      6,
          690,     11,    683,     13,     10,      2,      1,      8,
          839,      7,      9,      3,      5,    841,     12,    842,
            6,    827,     11,    840,     13,     10,      2,      1,
            8,    833,      7,      9,      3,      5,    837,     12,
          838,      6,    596,     11,    835,   1010,     45,   4804,
          144,   1011,    494,    477,    557,   1149])
 ..., array([ 733,  565,    3, ...,  477,  557, 1149])
 array([ 733,  565,    3, ...,  477,  557, 1149])
 array([ 733,  565,    3, ...,  477,  557, 1149])]

In [31]:
# The reloaded array should have the same shape as token_arr.
print(test.shape)


(6810,)

In [32]:
# Round trip looks fine. Rebind both names to a small value so the large
# arrays are no longer referenced and can be garbage collected.
test = token_arr = 0

In [33]:
# Rerun with a higher cutoff (to get a good set of options)
def frequency_check(counts, index, cutoff):
    """Map a word to ``index``, or to 0 when it occurs fewer than ``cutoff`` times.

    This is a verbatim re-definition of the helper from the cutoff=3 cell.
    """
    if counts < cutoff:
        return 0
    return index

cutoff = 10
# Rebuild the vocabulary with the stricter cutoff. Indices are still the
# 1-based rank in counter.most_common(); words below the cutoff map to 0.
word_to_int = dict(
    (token, frequency_check(count, rank, cutoff))
    for rank, (token, count) in enumerate(counter.most_common(), 1)
)
print(len(word_to_int))
print(list(word_to_int.items())[:10])
print(word_to_int['windows'])


934615
[('j!&#x7f;q', 0), ('&#x3c;p[;j', 0), ('9l2u', 0), ('mdbm', 0), ('mdbi', 0), ('7cba6d393775bf994ac22813684055d704bb606d', 0), ('cmayj', 0), ('5t&#x5e;', 0), ('i3d&#x60;o&#x7e;', 0), ('wt&#x24;g', 0)]
1

In [34]:
# xml_int_tokens is only rebuilt in a later cell, so (assuming in-order
# execution) this still shows the encoding from the previous cutoff.
print(xml_int_tokens[1][:400])


[733, 565, 3, 4, 1149, 557, 739, 3, 784, 1732, 10, 2, 171061, 546, 11, 39, 803, 0, 945, 171061, 946, 0, 810, 112, 804, 4, 802, 1821, 7419, 807, 2975, 4948, 806, 1590, 808, 1006, 805, 653, 1589, 1591, 477, 603, 1761, 494, 843, 10, 2, 171061, 546, 9, 3, 5, 1134, 12, 3233, 6, 916, 11, 39, 13, 10, 2, 1, 8, 679, 7, 9, 3, 5, 699, 12, 701, 6, 697, 11, 695, 13, 10, 2, 1, 8, 133, 7, 9, 3, 5, 308, 12, 309, 6, 307, 11, 302, 13, 10, 2, 1, 8, 215, 7, 9, 3, 5, 475, 12, 476, 6, 474, 11, 473, 13, 10, 2, 1, 8, 773, 7, 9, 3, 5, 778, 12, 779, 6, 777, 11, 774, 13, 10, 2, 1, 8, 775, 7, 9, 3, 5, 782, 12, 783, 6, 596, 11, 776, 13, 10, 2, 1, 693, 708, 1, 677, 724, 4, 696, 722, 726, 9, 3, 5, 728, 12, 729, 6, 727, 11, 39, 13, 10, 2, 1, 8, 797, 7, 9, 3, 5, 817, 12, 816, 6, 791, 11, 801, 13, 10, 2, 1, 8, 741, 7, 9, 3, 5, 746, 12, 745, 6, 742, 11, 743, 13, 10, 2, 1, 8, 542, 7, 9, 3, 5, 632, 12, 635, 6, 629, 11, 634, 13, 10, 2, 1, 8, 811, 7, 9, 3, 5, 814, 12, 815, 6, 813, 11, 812, 13, 10, 2, 1, 8, 2313, 7, 9, 3, 5, 2317, 12, 2318, 6, 1431, 11, 2312, 13, 10, 2, 1, 8, 514, 7, 9, 3, 5, 610, 12, 611, 6, 609, 11, 608, 13, 10, 2, 1, 8, 492, 7, 9, 3, 5, 604, 12, 606, 6, 605, 11, 602, 13, 10, 2, 1, 8, 750, 7, 9, 3, 5, 754, 12, 753, 6, 747, 11, 751, 13, 10, 2, 1, 8, 599, 7, 9, 3, 5, 691, 12, 692, 6, 690, 11, 683, 13, 10, 2, 1, 8, 839, 7, 9, 3, 5, 841, 12, 842, 6, 827, 11, 840, 13, 10, 2, 1, 8, 833, 7, 9, 3, 5, 837, 12, 838, 6, 596, 11, 835, 13, 10, 2, 1, 8, 133, 7, 9, 3, 5, 308, 12, 309, 6, 307, 11, 302, 13, 10, 2, 1, 8, 215, 7, 9, 3, 5, 475, 12, 476, 6, 474, 11, 473, 13, 10, 2, 1, 693, 708, 1, 677, 724, 4, 696, 722]

In [35]:
# Re-encode every document with the current word_to_int mapping.
# Result layout:
# [ [1, 22, 31, ... (ints for the words of xml file 1)],
#   [(ints for the words of xml file 2)],
#   ... ]
xml_int_tokens = [
    [word_to_int[word] for word in xml_file]
    for xml_file in xml_tokens
]
print(len(xml_int_tokens[1]))
print(len(xml_int_tokens[2]))


8519
374

In [36]:
# First 400 encoded tokens of the second document after re-encoding.
print(xml_int_tokens[1][:400])


[733, 565, 3, 4, 1149, 557, 739, 3, 784, 1732, 10, 2, 0, 546, 11, 39, 803, 0, 945, 0, 946, 0, 810, 112, 804, 4, 802, 1821, 7419, 807, 2975, 4948, 806, 1590, 808, 1006, 805, 653, 1589, 1591, 477, 603, 1761, 494, 843, 10, 2, 0, 546, 9, 3, 5, 1134, 12, 3233, 6, 916, 11, 39, 13, 10, 2, 1, 8, 679, 7, 9, 3, 5, 699, 12, 701, 6, 697, 11, 695, 13, 10, 2, 1, 8, 133, 7, 9, 3, 5, 308, 12, 309, 6, 307, 11, 302, 13, 10, 2, 1, 8, 215, 7, 9, 3, 5, 475, 12, 476, 6, 474, 11, 473, 13, 10, 2, 1, 8, 773, 7, 9, 3, 5, 778, 12, 779, 6, 777, 11, 774, 13, 10, 2, 1, 8, 775, 7, 9, 3, 5, 782, 12, 783, 6, 596, 11, 776, 13, 10, 2, 1, 693, 708, 1, 677, 724, 4, 696, 722, 726, 9, 3, 5, 728, 12, 729, 6, 727, 11, 39, 13, 10, 2, 1, 8, 797, 7, 9, 3, 5, 817, 12, 816, 6, 791, 11, 801, 13, 10, 2, 1, 8, 741, 7, 9, 3, 5, 746, 12, 745, 6, 742, 11, 743, 13, 10, 2, 1, 8, 542, 7, 9, 3, 5, 632, 12, 635, 6, 629, 11, 634, 13, 10, 2, 1, 8, 811, 7, 9, 3, 5, 814, 12, 815, 6, 813, 11, 812, 13, 10, 2, 1, 8, 2313, 7, 9, 3, 5, 2317, 12, 2318, 6, 1431, 11, 2312, 13, 10, 2, 1, 8, 514, 7, 9, 3, 5, 610, 12, 611, 6, 609, 11, 608, 13, 10, 2, 1, 8, 492, 7, 9, 3, 5, 604, 12, 606, 6, 605, 11, 602, 13, 10, 2, 1, 8, 750, 7, 9, 3, 5, 754, 12, 753, 6, 747, 11, 751, 13, 10, 2, 1, 8, 599, 7, 9, 3, 5, 691, 12, 692, 6, 690, 11, 683, 13, 10, 2, 1, 8, 839, 7, 9, 3, 5, 841, 12, 842, 6, 827, 11, 840, 13, 10, 2, 1, 8, 833, 7, 9, 3, 5, 837, 12, 838, 6, 596, 11, 835, 13, 10, 2, 1, 8, 133, 7, 9, 3, 5, 308, 12, 309, 6, 307, 11, 302, 13, 10, 2, 1, 8, 215, 7, 9, 3, 5, 475, 12, 476, 6, 474, 11, 473, 13, 10, 2, 1, 693, 708, 1, 677, 724, 4, 696, 722]

In [38]:
# Largest vocabulary index still in use. Flattening first avoids building
# a temporary list and does not fail when a single document has no tokens.
max(itertools.chain.from_iterable(xml_int_tokens))


Out[38]:
98228

In [39]:
# This is probably still too big. Save it, then try excluding more.
# The 1-D object array is allocated explicitly: relying on np.array() to
# infer a ragged layout raises on current NumPy and would silently produce
# a 2-D array if every document happened to have the same length.
token_arr = np.empty(len(xml_int_tokens), dtype=object)
for row, xml in enumerate(xml_int_tokens):
    token_arr[row] = np.array(xml)
np.save("../data/features/10_cutoff_word_to_intseq.npy", token_arr)
# Drop the reference so the array can be garbage collected.
token_arr = 0

In [40]:
# Release the large token array. The previous cell already ends with this
# same assignment, so running it again changes nothing.
token_arr = 0

In [41]:
# Rerun with a cutoff of 50 (to get a good set of options)
def frequency_check(counts, index, cutoff):
    """Choose the integer id for a word based on how often it occurs.

    counts: number of occurrences of the word.
    index: id to use when the word occurs often enough.
    cutoff: minimum number of occurrences needed to keep ``index``.

    Returns 0 when counts is below cutoff, otherwise index.
    """
    return 0 if counts < cutoff else index

cutoff = 50
# most common returns word, count pairs (as a tuple) check that the num
# counts is greater than the cutoff
word_to_int = {
    word[0]: frequency_check(word[1], i+1, cutoff) for i, word in enumerate(counter.most_common())}
print len(word_to_int.items())
print word_to_int.items()[:10]
print word_to_int['windows']


934615
[('j!&#x7f;q', 0), ('&#x3c;p[;j', 0), ('9l2u', 0), ('mdbm', 0), ('mdbi', 0), ('7cba6d393775bf994ac22813684055d704bb606d', 0), ('cmayj', 0), ('5t&#x5e;', 0), ('i3d&#x60;o&#x7e;', 0), ('wt&#x24;g', 0)]
1

In [42]:
# Convert every word in every token list into the correct index value.
# Should look like this:
#[ [ 1,22,31,...(sequence of ints representing words in xml file 1)]
# [(sequence of ints representing word in xml file 2)]
# ...]
xml_int_tokens = [[word_to_int[word] for word in xml_file] for xml_file in xml_tokens]
print len(xml_int_tokens[1])
print len(xml_int_tokens[2])


8519
374

In [43]:
# Eyeball the first 400 integer ids of the file at index 1 (cutoff 50).
print xml_int_tokens[1][:400]


[733, 565, 3, 4, 1149, 557, 739, 3, 784, 1732, 10, 2, 0, 546, 11, 39, 803, 0, 945, 0, 946, 0, 810, 112, 804, 4, 802, 1821, 7419, 807, 2975, 4948, 806, 1590, 808, 1006, 805, 653, 1589, 1591, 477, 603, 1761, 494, 843, 10, 2, 0, 546, 9, 3, 5, 1134, 12, 3233, 6, 916, 11, 39, 13, 10, 2, 1, 8, 679, 7, 9, 3, 5, 699, 12, 701, 6, 697, 11, 695, 13, 10, 2, 1, 8, 133, 7, 9, 3, 5, 308, 12, 309, 6, 307, 11, 302, 13, 10, 2, 1, 8, 215, 7, 9, 3, 5, 475, 12, 476, 6, 474, 11, 473, 13, 10, 2, 1, 8, 773, 7, 9, 3, 5, 778, 12, 779, 6, 777, 11, 774, 13, 10, 2, 1, 8, 775, 7, 9, 3, 5, 782, 12, 783, 6, 596, 11, 776, 13, 10, 2, 1, 693, 708, 1, 677, 724, 4, 696, 722, 726, 9, 3, 5, 728, 12, 729, 6, 727, 11, 39, 13, 10, 2, 1, 8, 797, 7, 9, 3, 5, 817, 12, 816, 6, 791, 11, 801, 13, 10, 2, 1, 8, 741, 7, 9, 3, 5, 746, 12, 745, 6, 742, 11, 743, 13, 10, 2, 1, 8, 542, 7, 9, 3, 5, 632, 12, 635, 6, 629, 11, 634, 13, 10, 2, 1, 8, 811, 7, 9, 3, 5, 814, 12, 815, 6, 813, 11, 812, 13, 10, 2, 1, 8, 2313, 7, 9, 3, 5, 2317, 12, 2318, 6, 1431, 11, 2312, 13, 10, 2, 1, 8, 514, 7, 9, 3, 5, 610, 12, 611, 6, 609, 11, 608, 13, 10, 2, 1, 8, 492, 7, 9, 3, 5, 604, 12, 606, 6, 605, 11, 602, 13, 10, 2, 1, 8, 750, 7, 9, 3, 5, 754, 12, 753, 6, 747, 11, 751, 13, 10, 2, 1, 8, 599, 7, 9, 3, 5, 691, 12, 692, 6, 690, 11, 683, 13, 10, 2, 1, 8, 839, 7, 9, 3, 5, 841, 12, 842, 6, 827, 11, 840, 13, 10, 2, 1, 8, 833, 7, 9, 3, 5, 837, 12, 838, 6, 596, 11, 835, 13, 10, 2, 1, 8, 133, 7, 9, 3, 5, 308, 12, 309, 6, 307, 11, 302, 13, 10, 2, 1, 8, 215, 7, 9, 3, 5, 475, 12, 476, 6, 474, 11, 473, 13, 10, 2, 1, 693, 708, 1, 677, 724, 4, 696, 722]

In [44]:
# Largest integer id that appears in any file (take each file's maximum,
# then the maximum of those).
max(max(file_ids) for file_ids in xml_int_tokens)


Out[44]:
29316

In [45]:
# A cutoff of 50 gives a maximum word id of 29316, which is starting to look
# like a reasonable vocabulary size, though it is probably still too big.
# I am going to save this version and then try excluding more
# (shooting for a vocabulary size of ~10000).
# NOTE(review): files differ in length, so the outer array is presumably an
# object array -- TODO confirm how it needs to be loaded back.
token_arr = np.array([np.array(xml) for xml in xml_int_tokens])
np.save("../data/features/50_cutoff_word_to_intseq.npy", token_arr)
# Rebind the name so this variable no longer references the array.
token_arr = 0

In [46]:
# Rerun with a cutoff of 100 (to get a good set of options)
def frequency_check(counts, index, cutoff):
    """Choose the integer id for a word based on how often it occurs.

    counts: number of occurrences of the word.
    index: id to use when the word occurs often enough.
    cutoff: minimum number of occurrences needed to keep ``index``.

    Returns 0 when counts is below cutoff, otherwise index.
    """
    return 0 if counts < cutoff else index

cutoff = 100
# most common returns word, count pairs (as a tuple) check that the num
# counts is greater than the cutoff
word_to_int = {
    word[0]: frequency_check(word[1], i+1, cutoff) for i, word in enumerate(counter.most_common())}
print len(word_to_int.items())
print word_to_int.items()[:10]
print word_to_int['windows']


934615
[('j!&#x7f;q', 0), ('&#x3c;p[;j', 0), ('9l2u', 0), ('mdbm', 0), ('mdbi', 0), ('7cba6d393775bf994ac22813684055d704bb606d', 0), ('cmayj', 0), ('5t&#x5e;', 0), ('i3d&#x60;o&#x7e;', 0), ('wt&#x24;g', 0)]
1

In [47]:
# Convert every word in every token list into the correct index value.
# Should look like this:
#[ [ 1,22,31,...(sequence of ints representing words in xml file 1)]
# [(sequence of ints representing word in xml file 2)]
# ...]
xml_int_tokens = [[word_to_int[word] for word in xml_file] for xml_file in xml_tokens]
print len(xml_int_tokens[1])
print len(xml_int_tokens[2])


8519
374

In [48]:
# Eyeball the first 400 integer ids of the file at index 1 (cutoff 100).
print xml_int_tokens[1][:400]


[733, 565, 3, 4, 1149, 557, 739, 3, 784, 1732, 10, 2, 0, 546, 11, 39, 803, 0, 945, 0, 946, 0, 810, 112, 804, 4, 802, 1821, 7419, 807, 2975, 4948, 806, 1590, 808, 1006, 805, 653, 1589, 1591, 477, 603, 1761, 494, 843, 10, 2, 0, 546, 9, 3, 5, 1134, 12, 3233, 6, 916, 11, 39, 13, 10, 2, 1, 8, 679, 7, 9, 3, 5, 699, 12, 701, 6, 697, 11, 695, 13, 10, 2, 1, 8, 133, 7, 9, 3, 5, 308, 12, 309, 6, 307, 11, 302, 13, 10, 2, 1, 8, 215, 7, 9, 3, 5, 475, 12, 476, 6, 474, 11, 473, 13, 10, 2, 1, 8, 773, 7, 9, 3, 5, 778, 12, 779, 6, 777, 11, 774, 13, 10, 2, 1, 8, 775, 7, 9, 3, 5, 782, 12, 783, 6, 596, 11, 776, 13, 10, 2, 1, 693, 708, 1, 677, 724, 4, 696, 722, 726, 9, 3, 5, 728, 12, 729, 6, 727, 11, 39, 13, 10, 2, 1, 8, 797, 7, 9, 3, 5, 817, 12, 816, 6, 791, 11, 801, 13, 10, 2, 1, 8, 741, 7, 9, 3, 5, 746, 12, 745, 6, 742, 11, 743, 13, 10, 2, 1, 8, 542, 7, 9, 3, 5, 632, 12, 635, 6, 629, 11, 634, 13, 10, 2, 1, 8, 811, 7, 9, 3, 5, 814, 12, 815, 6, 813, 11, 812, 13, 10, 2, 1, 8, 2313, 7, 9, 3, 5, 2317, 12, 2318, 6, 1431, 11, 2312, 13, 10, 2, 1, 8, 514, 7, 9, 3, 5, 610, 12, 611, 6, 609, 11, 608, 13, 10, 2, 1, 8, 492, 7, 9, 3, 5, 604, 12, 606, 6, 605, 11, 602, 13, 10, 2, 1, 8, 750, 7, 9, 3, 5, 754, 12, 753, 6, 747, 11, 751, 13, 10, 2, 1, 8, 599, 7, 9, 3, 5, 691, 12, 692, 6, 690, 11, 683, 13, 10, 2, 1, 8, 839, 7, 9, 3, 5, 841, 12, 842, 6, 827, 11, 840, 13, 10, 2, 1, 8, 833, 7, 9, 3, 5, 837, 12, 838, 6, 596, 11, 835, 13, 10, 2, 1, 8, 133, 7, 9, 3, 5, 308, 12, 309, 6, 307, 11, 302, 13, 10, 2, 1, 8, 215, 7, 9, 3, 5, 475, 12, 476, 6, 474, 11, 473, 13, 10, 2, 1, 693, 708, 1, 677, 724, 4, 696, 722]

In [49]:
# Largest integer id that appears in any file (take each file's maximum,
# then the maximum of those).
max(max(file_ids) for file_ids in xml_int_tokens)


Out[49]:
19679

In [50]:
# This is still too big probably (maximum id 19679 at a cutoff of 100).
# I'm going to save and then try excluding more.
# NOTE(review): files differ in length, so the outer array is presumably an
# object array -- TODO confirm how it needs to be loaded back.
token_arr = np.array([np.array(xml) for xml in xml_int_tokens])
np.save("../data/features/100_cutoff_alphabet_19679_word_to_intseq.npy", token_arr)
# Rebind the name so this variable no longer references the array.
token_arr = 0

In [ ]:
# Still not quite small enough. I will try one more time, at 200
# although I feel like significant info is being lost at this point
# Rerun with a cutoff of 200 (to get a good set of options)
def frequency_check(counts, index, cutoff):
    """Choose the integer id for a word based on how often it occurs.

    counts: number of occurrences of the word.
    index: id to use when the word occurs often enough.
    cutoff: minimum number of occurrences needed to keep ``index``.

    Returns 0 when counts is below cutoff, otherwise index.
    """
    return 0 if counts < cutoff else index

cutoff = 200
# most common returns word, count pairs (as a tuple) check that the num
# counts is greater than the cutoff
word_to_int = {
    word[0]: frequency_check(word[1], i+1, cutoff) for i, word in enumerate(counter.most_common())}
print len(word_to_int.items())
print word_to_int.items()[:10]
print word_to_int['windows']


934615
[('j!&#x7f;q', 0), ('&#x3c;p[;j', 0), ('9l2u', 0), ('mdbm', 0), ('mdbi', 0), ('7cba6d393775bf994ac22813684055d704bb606d', 0), ('cmayj', 0), ('5t&#x5e;', 0), ('i3d&#x60;o&#x7e;', 0), ('wt&#x24;g', 0)]
1

In [ ]:
# Convert every word in every token list into the correct index value.
# Should look like this:
#[ [ 1,22,31,...(sequence of ints representing words in xml file 1)]
# [(sequence of ints representing word in xml file 2)]
# ...]
xml_int_tokens = [[word_to_int[word] for word in xml_file] for xml_file in xml_tokens]
print len(xml_int_tokens[1])
print len(xml_int_tokens[2])

In [ ]:
# Eyeball the first 400 integer ids of the file at index 1 (cutoff 200).
print xml_int_tokens[1][:400]

In [ ]:
# Largest integer id that appears in any file (take each file's maximum,
# then the maximum of those).
max(max(file_ids) for file_ids in xml_int_tokens)

In [ ]:
# Wrap each file's id list in its own array, collect them in an outer array
# and write it to the "200_cutoff" feature file.
# NOTE(review): files differ in length, so the outer array is presumably an
# object array -- TODO confirm how it needs to be loaded back.
token_arr = np.array([np.array(xml) for xml in xml_int_tokens])
np.save("../data/features/200_cutoff_alphabet_to_intseq.npy", token_arr)
# Rebind the name so this variable no longer references the array.
token_arr = 0

In [ ]: