In [1]:
from __future__ import print_function
%matplotlib inline

Each line of `dimeter-mp.csv` has the following comma-separated structure: Line,Pada,Meter,MP1,MP2,MP3,MP4,MP5,MP6,MP7,MP8


In [2]:
# Group the metrical positions (MPs) of every line by hymn.
# `hymns` maps a hymn id to the list of its lines, each line being the list
# of the MP values found after the first three CSV columns.
hymns = {}
with open('dimeter-mp.csv', 'r') as csv_file:  # closes the file even on error
    for row in csv_file:
        row = row.strip()
        if not row:
            # tolerate blank lines (e.g. a trailing newline at end of file),
            # which would otherwise break the three-way unpacking below
            continue
        comps = row.split(',')
        # columns: Line, Pada, Meter, then one column per metrical position
        line_id, pada, meter = comps[0:3]
        mps = comps[3:]
        # the first two dot-separated components of the line id identify the hymn
        hymn_id = '-'.join(line_id.split('.')[:2])
        hymns.setdefault(hymn_id, []).append(mps)

How many hymns do we have in total?


In [3]:
# Every key of `hymns` is one hymn id, so the dict size is the number of hymns.
print(len(hymns))


369

How are their lengths distributed?


In [4]:
from seaborn.distributions import distplot
# Number of lines per hymn, in the dict's iteration order.
lens = list(map(len, hymns.values()))
distplot(lens)


Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x113ef6910>

Most hymns seem to have around 25 lines, each consisting of 8 MPs (metrical positions).

Let us extract n-grams from the lines and find out what plain clustering does.


In [5]:
# Turn every hymn into one string so that character n-grams can be extracted.
# Each line becomes '<first 4 MPs>%<remaining MPs>$': '%' marks the middle of
# the line and '$' its end, so n-grams can capture positions relative to them.
ids, texts = [], []
for hymn_id, hymn_lines in hymns.items():
    # the hymn id (not just the book) is used as the label of the text
    ids.append(hymn_id)
    # stringify the MPs of all lines; a single join avoids repeated string
    # concatenation inside the loop
    texts.append(''.join(
        ''.join(mps[:4]) + '%' + ''.join(mps[4:]) + '$'
        for mps in hymn_lines))

Let us turn this data into a corpus:


In [6]:
from pystyl.corpus import Corpus

# Wrap the stringified hymns in a pystyl corpus; the hymn ids serve both as
# titles and as target names.
corpus = Corpus(language='other')
corpus.add_texts(texts=texts, titles=ids, target_names=ids)

In [63]:
# Show a summary of the corpus.
# NOTE(review): this cell's execution count (63) is out of sequence and the
# stored output lists titles such as 'b0', 'b1', ... although the corpus is
# built with titles=ids; the output looks stale -- re-run on a fresh kernel.
print(corpus)


<Corpus(369 texts)> 
Untokenized texts:

	- b0	(cat: 6):	'LLLH%LHLH$LHLH%LHLH$LHHH%LHLL$'[...]
	- b1	(cat: 6):	'HHHH%LHLL$LLHL%LHLL$HHHH%LHLL$'[...]
	- b2	(cat: 6):	'LLHH%LHLL$LHLH%LHLL$HLHH%LLLH$'[...]
	- b3	(cat: 6):	'HHLH%LHLL$LLHL%LHLH$LHLH%LHLH$'[...]
	- b4	(cat: 6):	'LHLH%LHLL$LHHL%LHLL$LHLH%LHLH$'[...]
	- b5	(cat: 6):	'HLHL%LHLH$HLHH%LHLH$LHLH%LHLL$'[...]
	- b6	(cat: 6):	'HHHL%LHLL$HHLH%LHLL$LHHL%LLLL$'[...]
	- b7	(cat: 6):	'HHHH%LHLH$LHHH%LHLH$HLHH%LHLH$'[...]
	- b8	(cat: 3):	'HHHH%LHLL$HHLH%LHLL$HLHH%LHLH$'[...]
	- b9	(cat: 3):	'LHHH%LHLH$HHLH%LHLL$LHHH%LHLH$'[...]
	- b10	(cat: 3):	'HHHH%LHLH$HLHL%LHLL$LHLH%LHLL$'[...]
	- b11	(cat: 10):	'HHHL%LHLL$HLHH%LHLH$LHLH%LHLL$'[...]
	- b12	(cat: 10):	'LLLL%LHLH$HHHL%LHLL$LHHH%HHLL$'[...]
	- b13	(cat: 4):	'LLHL%LHLL$HHHH%LHLL$LHHH%LHLL$'[...]
	- b14	(cat: 2):	'HHHH%LHLH$LHHH%LHLL$LHHH%LHLH$'[...]
	- b15	(cat: 7):	'LHHH%LHLL$HHHH%LHLL$LHHH%LHLH$'[...]
	- b16	(cat: 10):	'LHHL%LLLL$LHHH%LHLL$HHLL%HHLL$'[...]
	- b17	(cat: 2):	'LHHL%LHLH$HHHH%LLLH$LHLH%LHLL$'[...]
	- b18	(cat: 1):	'LLHH%LHLL$HHLH%LHLL$LHHH%LHLL$'[...]
	- b19	(cat: 9):	'HLLL%LHLH$HHHH%LHLL$LHHH%LHLL$'[...]
	- b20	(cat: 9):	'HLHL%LHLH$LLLH%LHLH$HLHL%LLLL$'[...]
	- b21	(cat: 9):	'HHLH%LHLH$LHHH%LHLL$HLHH%LHLH$'[...]
	- b22	(cat: 9):	'LHHH%LHLH$LHLH%LHLL$LHLL%LLLL$'[...]
	- b23	(cat: 9):	'LHLH%LHLH$LHLH%LHLL$LHHH%LHLL$'[...]
	- b24	(cat: 9):	'LHHH%LHLL$LLHH%LHLL$LHHH%LHLL$'[...]
	- b25	(cat: 9):	'LHLL%LHLH$LHHL%LHLL$HHHH%LHLH$'[...]
	- b26	(cat: 9):	'LLHH%LHLH$HHLH%LHLL$HHHH%LHLL$'[...]
	- b27	(cat: 9):	'LHHH%LHLH$HHHH%HHLL$LHHH%LHLH$'[...]
	- b28	(cat: 9):	'HHLH%LHLH$LLHL%LHLL$LHHH%LHLL$'[...]
	- b29	(cat: 8):	'LHLH%LLHL$LHLH%LLLL$LHLH%LLHH$'[...]
	- b30	(cat: 8):	'HLHH%LLLL$LHLH%LHLL$HHLH%LHLL$'[...]
	- b31	(cat: 8):	'LHHH%LHLH$HHHH%LHLL$HHHH%LHLH$'[...]
	- b32	(cat: 8):	'LHHH%LLLL$LLHH%HHLL$LHLH%LHLL$'[...]
	- b33	(cat: 8):	'HHLH%LHLH$LLHL%LHLL$LHHH%LHLL$'[...]
	- b34	(cat: 8):	'HHHH%LHLL$HLHL%LHLL$HHLH%LHLH$'[...]
	- b35	(cat: 8):	'LHLH%HHLL$HHLH%LHLL$HHHH%LHLH$'[...]
	- b36	(cat: 10):	'LLHH%LLLH$LLHH%LHLL$LHHH%LHLL$'[...]
	- b37	(cat: 9):	'LHLH%LHLL$LLHH%LHLL$HHLH%LHLL$'[...]
	- b38	(cat: 10):	'LLHH%LHLL$HHLH%LHLL$LHHH%LHLH$'[...]
	- b39	(cat: 6):	'LHHH%LHLH$LLHH%LHLL$HHHH%LHLL$'[...]
	- b40	(cat: 6):	'LLLL%LHLL$HHHH%LHLL$LHHH%LHLH$'[...]
	- b41	(cat: 6):	'HHHH%LHLH$LHLH%LHLL$LHLL%LHLL$'[...]
	- b42	(cat: 10):	'LHLH%HLLL$LHHH%LHLH$LHHH%LHHH$'[...]
	- b43	(cat: 10):	'LHLH%HHHH$HHHL%LHLH$LHLH%HHLL$'[...]
	- b44	(cat: 10):	'LHHH%HHLH$LHHL%LHLL$LLHL%LHLH$'[...]
	- b45	(cat: 3):	'LHLH%LHLH$HHLH%LHLL$HLHH%LHLH$'[...]
	- b46	(cat: 10):	'HHLH%HLHL$LLHH%LHHH$LLHH%HLHL$'[...]
	- b47	(cat: 10):	'HLHL%LHLH$HHHH%LHLL$HHHH%LHLL$'[...]
	- b48	(cat: 10):	'LHLH%LLLH$LLHL%LHLL$HHLL%LHLH$'[...]
	- b49	(cat: 10):	'LHLH%LHLL$HLHL%LHLH$LHLH%HHLH$'[...]
	- b50	(cat: 10):	'LHLH%LHLL$HLHH%LHLH$HHHH%HHLH$'[...]
	- b51	(cat: 10):	'HLHL%LLLL$HLHH%LHLH$HHHH%LHLH$'[...]
	- b52	(cat: 10):	'LHLH%LLLL$LHHL%LHLL$HHLH%HHHH$'[...]
	- b53	(cat: 10):	'HHLH%LHLL$HHHL%LHLL$LHLL%LHLH$'[...]
	- b54	(cat: 5):	'HHHL%LHLL$LLHH%LHLL$LHHH%HHLH$'[...]
	- b55	(cat: 5):	'HLLH%LHLH$LHHH%LHLL$LHLH%LHLL$'[...]
	- b56	(cat: 5):	'HLHH%LLLL$LHHH%LHLL$HHHL%LHLL$'[...]
	- b57	(cat: 5):	'HHHH%LHLL$LLHH%LHLL$LHHH%LHHL$'[...]
	- b58	(cat: 1):	'HHHH%LHLH$LLHH%LHLL$LHLH%LHLL$'[...]
	- b59	(cat: 1):	'LLHH%LHLL$HHHL%LHLH$LHLH%LHLL$'[...]
	- b60	(cat: 1):	'HHLH%HHLH$HHLH%LHLL$HHHH%LHLL$'[...]
	- b61	(cat: 1):	'HHLH%LHLH$HHLL%LHLH$LHLL%LHLH$'[...]
	- b62	(cat: 1):	'HHHH%LHLH$HHHH%LHLL$HLHH%LHLL$'[...]
	- b63	(cat: 1):	'LLHH%LHLL$HHHH%LHLH$HHLH%LHLL$'[...]
	- b64	(cat: 1):	'HLHH%LHLH$HHHH%LHLH$HHHH%LHLL$'[...]
	- b65	(cat: 1):	'HLHH%LLLH$HHLH%LHLL$HLHH%LHLL$'[...]
	- b66	(cat: 1):	'HHLH%LLLH$LLHH%LHLH$HLHH%LHLL$'[...]
	- b67	(cat: 1):	'HHLL%LHLL$HHHL%LHLH$HHHH%LHLH$'[...]
	- b68	(cat: 10):	'HHHH%LHLH$LLHH%LHLL$LLHL%LHLL$'[...]
	- b69	(cat: 10):	'LHHH%LHLL$HHLH%LHLL$LHHH%LHLH$'[...]
	- b70	(cat: 5):	'HLHH%LHLH$LLLH%LLLL$HHLH%LHLH$'[...]
	- b71	(cat: 5):	'HLHH%LHHH$LHHH%HLHL$LHHH%HLHH$'[...]
	- b72	(cat: 5):	'HHHH%LHLL$LLHH%LHLL$HHHH%LHLL$'[...]
	- b73	(cat: 5):	'LHLH%LHLH$HHHH%LHLH$HHHL%LHLL$'[...]
	- b74	(cat: 5):	'HHHH%LHLL$HHHH%LHLH$HHLH%LHLH$'[...]
	- b75	(cat: 5):	'LHLH%LHLL$HHHH%LHLH$LHLH%LHLL$'[...]
	- b76	(cat: 5):	'HHHH%LHLH$HHHL%LHLL$HHHH%LHLH$'[...]
	- b77	(cat: 9):	'HHLL%LHLH$HHLH%LHLL$HHHH%LHLL$'[...]
	- b78	(cat: 9):	'LLHH%LHLH$LLHH%LHLL$LHHL%LHLL$'[...]
	- b79	(cat: 9):	'LHHH%LHLL$HLLH%HHLL$HHHH%LLLL$'[...]
	- b80	(cat: 9):	'LHHH%LHLL$HHLH%LHLL$HHLH%HHLL$'[...]
	- b81	(cat: 9):	'LHHH%LHLH$LHLH%LHLL$HHLH%LHLH$'[...]
	- b82	(cat: 9):	'LHHH%LHLH$HHHH%LLLL$LHHH%LHLL$'[...]
	- b83	(cat: 9):	'LLHL%LHLL$LHHH%LHLH$HLHH%LHLH$'[...]
	- b84	(cat: 9):	'LLHH%LHLL$HHLH%LHLL$LLHH%LHLL$'[...]
	- b85	(cat: 9):	'LHLH%LHLH$LHHH%LHLL$LHHL%LHLL$'[...]
	- b86	(cat: 9):	'HHHH%LHLL$LHHL%LHLL$HHLH%LHLH$'[...]
	- b87	(cat: 10):	'LHLH%HHHH$LLHL%LHLL$LHHL%LHLL$'[...]
	- b88	(cat: 10):	'HLHL%LHLH$HHHH%LHLH$LHLH%LHLH$'[...]
	- b89	(cat: 8):	'LHHL%LLHH$HLLH%LHHL$LHHH%HLHL$'[...]
	- b90	(cat: 8):	'HHLL%LHLL$HHLL%LHLL$LHLH%LHLH$'[...]
	- b91	(cat: 8):	'LHHH%LHLH$HHHH%LHLL$HLHH%LHLL$'[...]
	- b92	(cat: 8):	'LHLH%LHLL$HHHL%LHLL$HHHL%LHLL$'[...]
	- b93	(cat: 8):	'LHLH%HLHL$HLHH%LLHH$LLLH%HLHL$'[...]
	- b94	(cat: 8):	'HHHL%LHLL$LLLL%LHLL$LHHH%LHLH$'[...]
	- b95	(cat: 8):	'LHLH%LHLL$HLHH%LHLH$LHHH%LHLH$'[...]
	- b96	(cat: 8):	'HHLH%LHLH$HHHH%LHLL$LHHH%LHLL$'[...]
	- b97	(cat: 8):	'LHLH%HLLL$HLHH%LHLL$HHHL%LHLL$'[...]
	- b98	(cat: 1):	'LHLH%LLLL$HHLL%LHLL$HHHH%HHHL$'[...]
	- b99	(cat: 3):	'LHHL%LHLH$LHHH%LHLH$HHLH%LHLL$'[...]
	- b100	(cat: 3):	'HHLH%LLLH$LLHH%LHLL$HLHL%HLHH$'[...]
	- b101	(cat: 6):	'LHLH%LHLL$LHHL%LHLL$HHHH%LHLL$'[...]
	- b102	(cat: 3):	'LHLH%LHLL$HLHH%LHLL$LHHH%LHLH$'[...]
	- b103	(cat: 3):	'LLHH%LHLH$HHHH%LHLL$LHHH%LHLH$'[...]
	- b104	(cat: 6):	'HLLH%LHLH$HHHH%LHLL$HHLH%LHLH$'[...]
	- b105	(cat: 6):	'LHLH%LHLL$LHHH%LHLL$HHLH%LHLH$'[...]
	- b106	(cat: 6):	'LLHH%LHLH$HHLL%LHLL$LHLH%LHLH$'[...]
	- b107	(cat: 3):	'HHLH%LHLL$LHHH%HLHL$HHHH%LHLH$'[...]
	- b108	(cat: 10):	'LLHL%LHLL$HHHH%LLLL$HHHH%LHLH$'[...]
	- b109	(cat: 10):	'HHHH%LLLH$LHLH%LHLL$HHHH%LHLH$'[...]
	- b110	(cat: 10):	'LHHH%LLLL$LHLL%LLLL$HLHH%LHLL$'[...]
	- b111	(cat: 7):	'LLHH%LHLL$HLHL%LHLL$HHHH%LHLL$'[...]
	- b112	(cat: 10):	'HLHH%LHLL$HLLH%LHLH$LLHH%LHLL$'[...]
	- b113	(cat: 10):	'HLHH%LHLL$HHHH%LHLL$LHLH%LHLL$'[...]
	- b114	(cat: 10):	'LLHH%LHHH$LHHH%LHLL$HHHH%HHLL$'[...]
	- b115	(cat: 7):	'LHHH%LHLL$HHHH%LHLL$HHHL%LHLL$'[...]
	- b116	(cat: 10):	'LHHH%LHLH$LLHH%LHLL$HHHH%LHLL$'[...]
	- b117	(cat: 10):	'LHHH%LLLH$HHLL%LHLH$HHHH%LHLH$'[...]
	- b118	(cat: 10):	'LHHH%HLLH$HHLL%LHLH$HLHH%HLLL$'[...]
	- b119	(cat: 10):	'HHHL%HLHL$LHHH%LHLL$LHHH%HHHL$'[...]
	- b120	(cat: 10):	'LHLL%LHLL$HLHH%LHLL$LLHH%LHLL$'[...]
	- b121	(cat: 4):	'HHHH%LHLH$HHHL%LHLH$HHHH%LHLL$'[...]
	- b122	(cat: 10):	'HHHL%LHHH$LHHL%LHLH$HHLH%LHHL$'[...]
	- b123	(cat: 10):	'HHLH%LHLL$HHLH%LHLL$LLHH%LHLL$'[...]
	- b124	(cat: 5):	'HHHH%LLLL$HHLH%LLLL$HHHL%LHLL$'[...]
	- b125	(cat: 5):	'HHLH%LHLH$HHHH%LHLL$HHHH%LHLH$'[...]
	- b126	(cat: 5):	'LHLH%LHLH$HHLH%LHLL$HHHL%LHLL$'[...]
	- b127	(cat: 5):	'HLHH%LHLL$LLLH%LHLH$LHHL%LHLL$'[...]
	- b128	(cat: 1):	'LHHL%LLLH$LHHH%LHLL$HHHH%LHLH$'[...]
	- b129	(cat: 1):	'LHHH%LHLH$HHHH%LHLH$LHLH%LHLL$'[...]
	- b130	(cat: 1):	'HHHH%LHLL$HHHH%LLLH$HHHH%LHLL$'[...]
	- b131	(cat: 1):	'HHLH%LHLL$HLHH%LLLH$HLHH%LHLH$'[...]
	- b132	(cat: 1):	'HHLH%LHLH$LHLL%LLLL$LHLL%LHLL$'[...]
	- b133	(cat: 1):	'HHLL%HHLL$LLHH%LHLL$HHHH%LHLH$'[...]
	- b134	(cat: 1):	'HHLH%HLHL$HHLH%HLHL$HHHL%HLHH$'[...]
	- b135	(cat: 1):	'LHHH%LHLL$LHLH%LHLL$HHLH%LHLL$'[...]
	- b136	(cat: 1):	'HHLH%LHLL$LHLL%LHLH$LLLH%LHLL$'[...]
	- b137	(cat: 8):	'HHLL%LHLH$HHHH%LHLL$HHHH%LHLL$'[...]
	- b138	(cat: 8):	'LHLH%LLLL$LLHH%LHLL$LHLH%LHLH$'[...]
	- b139	(cat: 8):	'LLHL%LHLL$HHHH%LHLL$HHHH%LHLL$'[...]
	- b140	(cat: 8):	'LLHL%LHLL$HLHH%LHLH$LHHH%LHLL$'[...]
	- b141	(cat: 8):	'HLHL%LHLL$LLHL%LHLL$HHHH%LHLL$'[...]
	- b142	(cat: 8):	'LLHH%LHLH$HHHL%LHLL$HHLH%LHLL$'[...]
	- b143	(cat: 10):	'HLHH%LHLH$HLHH%LHLL$HHLH%LHLL$'[...]
	- b144	(cat: 10):	'HLHH%LHLL$LHLH%HHLL$LHLH%LHLH$'[...]
	- b145	(cat: 5):	'LLHH%HHHL$HHHH%LHLH$HHHH%LHLL$'[...]
	- b146	(cat: 5):	'HLLL%LHLL$HLHH%LHLH$LLLH%LHLH$'[...]
	- b147	(cat: 5):	'HHLH%LHLH$HLHH%LHLH$HHHH%LHLL$'[...]
	- b148	(cat: 5):	'LHHH%LHLL$LHHL%LHLL$HHLH%LHLH$'[...]
	- b149	(cat: 5):	'LHHH%LHLL$LHHH%LHLL$HHHH%LHLL$'[...]
	- b150	(cat: 5):	'HHLH%LHLL$HHLH%LHLL$HHHH%LHLL$'[...]
	- b151	(cat: 5):	'LHLH%LHLL$HHLH%LHLH$HHLH%LHLH$'[...]
	- b152	(cat: 5):	'LHHL%LLLH$HHHH%LLLL$LLHH%LHLL$'[...]
	- b153	(cat: 6):	'LHLH%LHLH$HHHH%LHLH$LHLH%LHLH$'[...]
	- b154	(cat: 10):	'HHHH%HLLH$HHLL%LHLL$HHHH%LHLL$'[...]
	- b155	(cat: 7):	'LHLH%HHLH$HHHH%LHLL$LHLH%LHLL$'[...]
	- b156	(cat: 7):	'HLLL%LHLL$LLHH%LHLL$HHHH%LHLL$'[...]
	- b157	(cat: 8):	'HHHL%LHLH$HHHH%LHLH$HHLL%LHLL$'[...]
	- b158	(cat: 8):	'LLHH%LHLH$LHLH%LHLH$LHLL%LHLH$'[...]
	- b159	(cat: 8):	'LHHH%HLLH$LLHH%HHLL$LHHH%LHLL$'[...]
	- b160	(cat: 8):	'LHHH%LHLL$HLHH%LHLH$LHHH%LHLL$'[...]
	- b161	(cat: 9):	'LHLH%LHLH$HHLH%LHLL$LHLH%LHLL$'[...]
	- b162	(cat: 9):	'HLHH%LHLH$LHLH%LHLL$HHHH%LHLL$'[...]
	- b163	(cat: 9):	'LHHH%LHLL$LLHH%LHLL$HHLH%LHLL$'[...]
	- b164	(cat: 9):	'LHLH%LHLL$LLHL%LLLL$LHHH%LHLL$'[...]
	- b165	(cat: 9):	'HLHH%LHLL$HLHL%LHLL$LLHH%LHLL$'[...]
	- b166	(cat: 9):	'LHLH%LHLL$LHHH%LHLH$HLHH%LHLL$'[...]
	- b167	(cat: 9):	'HHLH%LHLH$LHLH%LHLH$HHLH%LHLL$'[...]
	- b168	(cat: 9):	'LLLH%LHLL$LHLH%LHLL$HHHL%LHLL$'[...]
	- b169	(cat: 9):	'HHHL%LLLL$HHLH%LHLL$HHLH%LHLL$'[...]
	- b170	(cat: 6):	'LHHL%LHLH$HHLL%LHLH$HHLL%LHLH$'[...]
	- b171	(cat: 10):	'LHLL%LHLL$HHHH%LHLL$HHHH%LHLL$'[...]
	- b172	(cat: 4):	'LLHH%LHLH$LLHH%LLLL$LHLH%LLLH$'[...]
	- b173	(cat: 4):	'HLHH%LHHL$HHLH%HLHL$HLHH%HHLH$'[...]
	- b174	(cat: 4):	'HHLL%LHLL$LHHL%LHLL$HHHH%LHLH$'[...]
	- b175	(cat: 4):	'LHLL%LLLL$LHLH%LHLH$LLLL%LHLH$'[...]
	- b176	(cat: 10):	'LHLH%LHLH$LLHH%LHLH$LLHH%LHLH$'[...]
	- b177	(cat: 10):	'HLHH%HHLL$HHLH%LHLH$HHHH%LHLH$'[...]
	- b178	(cat: 10):	'LLHL%LLLL$HLHL%LHLL$LHHH%LHHH$'[...]
	- b179	(cat: 10):	'HHHH%LLHH$HHHL%LHLL$HHHH%LHLH$'[...]
	- b180	(cat: 1):	'LLHH%LHLH$HLHH%LHLH$HHLL%LHLL$'[...]
	- b181	(cat: 1):	'LHLH%LHLL$HHHL%LHLL$HHLH%LHLL$'[...]
	- b182	(cat: 1):	'LHLH%LHLL$HHHH%LHLH$HHHH%LHLH$'[...]
	- b183	(cat: 1):	'LHLH%LHLL$LHHH%LHLL$HHHH%LHLL$'[...]
	- b184	(cat: 5):	'HHHL%LHHL$LLHH%LHLL$HHLH%LHHH$'[...]
	- b185	(cat: 5):	'LHHH%LHLL$LLHL%LHLH$LHHH%LHLL$'[...]
	- b186	(cat: 5):	'HHLH%HLLH$LHLH%LHLL$LLHH%LHLL$'[...]
	- b187	(cat: 5):	'HLHL%LHLL$HHHL%LHLL$LLHH%LHLH$'[...]
	- b188	(cat: 5):	'LLHH%LHLL$LHHH%LHLH$LLLH%LHLH$'[...]
	- b189	(cat: 5):	'LHHH%LHLL$HHHL%LHLL$LLLH%LHLL$'[...]
	- b190	(cat: 5):	'HLLH%LHLH$HHHL%LHLH$LLHL%LHLH$'[...]
	- b191	(cat: 9):	'LHLH%HHLL$HHLH%LHLL$LHLH%HHLL$'[...]
	- b192	(cat: 9):	'LHLH%LHLL$HLHH%LHLL$LHHH%LHLL$'[...]
	- b193	(cat: 9):	'HLHH%LLLL$HHLH%HHLL$LHLH%LHLL$'[...]
	- b194	(cat: 9):	'LHLH%HHLH$HHHH%LLLL$HLHH%LHLH$'[...]
	- b195	(cat: 9):	'LLHL%LHLL$HHLH%LHLL$HHHH%LHLL$'[...]
	- b196	(cat: 9):	'LHHL%LHLH$LHLH%LHLL$LHHH%LHLL$'[...]
	- b197	(cat: 9):	'HHHH%LHLH$HHHH%LLLL$HHLH%LHLL$'[...]
	- b198	(cat: 9):	'HHHH%LHLL$HHLH%LHLL$LHHH%LHLH$'[...]
	- b199	(cat: 9):	'LLHH%LHLL$LHHH%HHLH$HHHL%LHLH$'[...]
	- b200	(cat: 9):	'LHLH%LHLH$LLHH%LHLH$LLLH%LHLH$'[...]
	- b201	(cat: 8):	'LHLH%LHLH$LHHL%LHLL$LLHH%LLLL$'[...]
	- b202	(cat: 8):	'LHLH%LHLL$LHHH%LHLL$HHLH%LHLL$'[...]
	- b203	(cat: 8):	'HLHH%LHHL$LLHH%LHLL$LHHL%HHLL$'[...]
	- b204	(cat: 8):	'LHLH%LHLL$HHHH%LHLH$LHHH%LHLH$'[...]
	- b205	(cat: 8):	'LHHH%LHLL$HLHH%HHLL$HHHL%LHLH$'[...]
	- b206	(cat: 8):	'HHLH%LHLH$HHLH%LHLL$LLHL%LHLL$'[...]
	- b207	(cat: 8):	'HLHH%LLLL$HHHH%LHLH$LHHH%LHLH$'[...]
	- b208	(cat: 10):	'LLHH%LLHL$HHHH%LHHL$LHHH%LLHL$'[...]
	- b209	(cat: 10):	'HHHH%HLLL$HHHH%LHLL$HHLL%LHLL$'[...]
	- b210	(cat: 1):	'LHHH%LHLH$LHHL%LHLH$LLLH%LHHL$'[...]
	- b211	(cat: 3):	'HLHH%HHHL$HHHH%LHLL$HHLH%LHLH$'[...]
	- b212	(cat: 3):	'HLHH%LLLH$LLHH%LHLL$HLHH%LHLL$'[...]
	- b213	(cat: 8):	'HLLH%LHHH$LLLL%HHLL$LHLL%LHLL$'[...]
	- b214	(cat: 8):	'HHLL%LHLL$LLHL%LHLL$HHLH%LHLL$'[...]
	- b215	(cat: 8):	'LLHH%LHLL$HHLH%LHLL$HLLL%LHLL$'[...]
	- b216	(cat: 8):	'HHLH%LHLL$HHLH%LHLL$LLHL%LHLL$'[...]
	- b217	(cat: 8):	'HLHH%LHLH$LHLH%LHLL$LLHH%LHLH$'[...]
	- b218	(cat: 1):	'HLHL%LHLL$LHHH%LHLH$HHHH%LHLL$'[...]
	- b219	(cat: 3):	'HHLH%LHLL$LHHH%LHLH$LHHH%LHLL$'[...]
	- b220	(cat: 3):	'HLHL%LHLL$LHHH%LHLH$LHLH%HHLL$'[...]
	- b221	(cat: 3):	'LLHL%LHLL$HLHL%LHLL$LHHH%LHLL$'[...]
	- b222	(cat: 10):	'LHLL%LHLH$LLHL%LHLL$LHLH%HHHL$'[...]
	- b223	(cat: 10):	'HHLH%LHLL$HLHH%LHLL$LHLH%LHLH$'[...]
	- b224	(cat: 10):	'HHHH%LHLH$LLHH%LHLL$HHLH%HHHL$'[...]
	- b225	(cat: 4):	'HHHH%LHLL$HLHL%LHLL$LHLH%LHLH$'[...]
	- b226	(cat: 4):	'HHHL%LHLL$LHHH%LHLL$LHLH%LHLL$'[...]
	- b227	(cat: 4):	'HHLH%LHLL$LHHH%LHLL$LHLH%LHLH$'[...]
	- b228	(cat: 1):	'LLHH%LLLH$HHLH%LLLL$LLHH%LHLL$'[...]
	- b229	(cat: 1):	'LHLH%LHLH$LHLH%LHLL$HHLH%LHLL$'[...]
	- b230	(cat: 1):	'HLHH%LHLH$HLHH%LHLL$HHHH%LHLL$'[...]
	- b231	(cat: 1):	'HHHH%HHLL$LHHH%LHLL$HLLL%LHLL$'[...]
	- b232	(cat: 1):	'HLHH%LHLL$LHHH%LHLH$LHHH%LHLL$'[...]
	- b233	(cat: 1):	'LHLH%LHLL$HHHL%LHLH$HHLH%LHLL$'[...]
	- b234	(cat: 8):	'HHLL%LHLH$HHHH%LHLL$LHLL%LHLL$'[...]
	- b235	(cat: 8):	'LHHH%LHLL$HHLH%LHLH$HHHH%LHLH$'[...]
	- b236	(cat: 8):	'LLHL%HHLL$LHHL%LHLH$LLHL%LHLH$'[...]
	- b237	(cat: 5):	'HHHH%HLLL$HLLH%LHLL$HHHH%LHLL$'[...]
	- b238	(cat: 5):	'LHLH%LHLH$HHHH%LHLH$LHHH%LHLH$'[...]
	- b239	(cat: 5):	'LHLH%LHLH$HLHH%LHLL$HHHH%LHLL$'[...]
	- b240	(cat: 9):	'HHLH%LHLL$LHHH%LHLL$LHHH%LHLL$'[...]
	- b241	(cat: 9):	'LHHL%LHLL$LHHL%LHLL$LHHH%LLLH$'[...]
	- b242	(cat: 9):	'LHHH%LHLL$HLHH%LHLH$LHLH%LHLL$'[...]
	- b243	(cat: 9):	'LHLH%LHLH$LLHH%LHLH$LHLH%LHLL$'[...]
	- b244	(cat: 9):	'LHHH%LLLL$HLHH%LHLH$LHHL%LHLL$'[...]
	- b245	(cat: 9):	'LHHH%LHLL$LLHH%LHLL$HHLH%LHLL$'[...]
	- b246	(cat: 9):	'HLHL%LHLL$LHHL%LHLL$HHLH%LHLL$'[...]
	- b247	(cat: 9):	'HHLH%LHLL$LHLH%LHLL$HHLL%LHLH$'[...]
	- b248	(cat: 7):	'LLHL%LHLL$HHLH%LHLL$HHHL%LHLL$'[...]
	- b249	(cat: 7):	'HHHL%LHLL$LHHH%LHLH$LHLL%LLLL$'[...]
	- b250	(cat: 7):	'HHLH%LLHH$HLHL%LHLL$HHHH%LHLH$'[...]
	- b251	(cat: 7):	'LHLH%LLLH$HHLH%LHLL$LHHL%LHLH$'[...]
	- b252	(cat: 8):	'HHHH%LHLL$LHLL%LHLL$HHHH%LHLL$'[...]
	- b253	(cat: 8):	'LLHH%LHLL$HLLH%LHLL$LHLL%LHLL$'[...]
	- b254	(cat: 9):	'LHHH%LLLL$HHHL%LHLL$HHHH%HHLL$'[...]
	- b255	(cat: 9):	'LHHH%LHLL$LLHH%LHLH$LLHH%LHLH$'[...]
	- b256	(cat: 9):	'HLLH%LHLL$LLLH%LHLH$HHLL%LHLH$'[...]
	- b257	(cat: 9):	'HHLH%HHLL$LHLH%HHLL$HHHH%LHLL$'[...]
	- b258	(cat: 9):	'LLHH%LHLL$HHHH%LLLL$HHHH%LHLL$'[...]
	- b259	(cat: 9):	'HLLH%HHLH$HHLH%LHLL$LHHH%LHLL$'[...]
	- b260	(cat: 9):	'LHHH%LHLH$LHLH%LHLH$HHLH%LHLL$'[...]
	- b261	(cat: 9):	'LHHH%LHLH$HHHH%LHLL$HLLH%LHLL$'[...]
	- b262	(cat: 9):	'LHHH%LHLL$LLLH%LHLL$LHLH%LLLL$'[...]
	- b263	(cat: 9):	'HHLH%LHLL$LLHH%LHLL$HHLH%LHLL$'[...]
	- b264	(cat: 10):	'HHHH%HLHH$HHHL%LHLL$HHHL%LHLH$'[...]
	- b265	(cat: 10):	'HHHL%LLHL$LHHH%LHLL$HHHH%LHLL$'[...]
	- b266	(cat: 6):	'HHLL%LHLL$HLHH%LHLH$HHHH%LHLH$'[...]
	- b267	(cat: 6):	'LHHH%LHLH$HHHH%LHLH$HHLH%LLLL$'[...]
	- b268	(cat: 3):	'HHHH%LLLL$LLHH%LHLL$HHLH%LHLL$'[...]
	- b269	(cat: 4):	'LHHL%LHLL$LHHH%LHLL$HHLH%LHLL$'[...]
	- b270	(cat: 4):	'HHHH%LHLH$HHHH%LHLL$HHLH%LHLH$'[...]
	- b271	(cat: 4):	'LHHH%LHLL$LLHH%LHLH$HHLH%LHLH$'[...]
	- b272	(cat: 10):	'HLHH%LHLL$LHHH%LHLH$LHHL%LLLL$'[...]
	- b273	(cat: 9):	'LHLH%HHLL$LHLH%LHLH$LLHH%LHLL$'[...]
	- b274	(cat: 10):	'HHLH%LHLH$LHHH%LHLL$HLLL%LHLL$'[...]
	- b275	(cat: 1):	'LHHH%LHLL$LHHH%LHLL$LHLL%LHLL$'[...]
	- b276	(cat: 1):	'HHHL%LHLH$HHLH%LHLH$HHLH%LHLH$'[...]
	- b277	(cat: 1):	'HHHH%LHLL$LHHL%LHLH$HHHL%LHLL$'[...]
	- b278	(cat: 1):	'HHHL%LHLH$LLHH%HHLH$HHLH%LHLL$'[...]
	- b279	(cat: 1):	'HHLL%LHLH$LLLL%LHLL$LHHH%LHLL$'[...]
	- b280	(cat: 1):	'LLHH%LHLL$HHHL%LHLL$LHLH%LHLL$'[...]
	- b281	(cat: 10):	'HLHL%LHLL$HHHH%LHLL$HLHH%LHLL$'[...]
	- b282	(cat: 3):	'HHLL%LHLL$HLLL%LHLL$HHHH%HHLL$'[...]
	- b283	(cat: 7):	'HLLL%LHLL$LHHH%LHLL$HHHH%LHLL$'[...]
	- b284	(cat: 10):	'HHLH%HHLL$LHLH%LHLL$HLHH%LHLL$'[...]
	- b285	(cat: 5):	'LLHL%LHLH$HHHH%LLLL$LHLH%LHLL$'[...]
	- b286	(cat: 5):	'LHLH%LHLL$LHHL%LHLH$HHLH%LLLH$'[...]
	- b287	(cat: 5):	'LHLH%HLHL$LHHH%HLLL$HLHL%HLLL$'[...]
	- b288	(cat: 5):	'HHHH%LHLH$LLLH%LHLH$LHHH%LHLL$'[...]
	- b289	(cat: 5):	'HHHH%LLLL$LLHH%LHLL$HLHH%LHLH$'[...]
	- b290	(cat: 9):	'LHLH%LHLL$HHLH%LHLH$HHLL%LHLL$'[...]
	- b291	(cat: 9):	'HLHH%LHLL$HLHL%LHLL$HHHH%LHLL$'[...]
	- b292	(cat: 9):	'HHHH%LHLL$HLHH%LHLL$HLHH%LHLL$'[...]
	- b293	(cat: 9):	'LLHH%LHLH$HHHH%LHLL$HHHL%LLLL$'[...]
	- b294	(cat: 9):	'HLLH%LHLH$LHLH%LHLH$LLHH%LHLH$'[...]
	- b295	(cat: 9):	'HHHH%LHLH$LLLL%LHLL$HHHL%LHLL$'[...]
	- b296	(cat: 9):	'LHLH%LHLH$HHHH%LHLH$LHHH%LHLL$'[...]
	- b297	(cat: 9):	'LHHL%LHLL$LLHH%LHLL$HHLH%LHLL$'[...]
	- b298	(cat: 9):	'HLLL%LHLL$LHHL%LHLH$LHHH%LLLL$'[...]
	- b299	(cat: 9):	'LLHH%LHLL$LHHL%LHLL$HHHH%LHLH$'[...]
	- b300	(cat: 8):	'LHHL%LLLL$HLHH%LHLH$LHHH%LHLL$'[...]
	- b301	(cat: 8):	'LHLH%LHLH$LHLH%LHLH$LHHL%LHHL$'[...]
	- b302	(cat: 8):	'LHHH%LHLL$HHLH%LHLL$HHHH%LHLH$'[...]
	- b303	(cat: 8):	'HHLL%LHLL$HLHH%LLLL$HHHH%LHLL$'[...]
	- b304	(cat: 8):	'LHHH%HLHH$HHHH%HHHL$LHLH%HHHL$'[...]
	- b305	(cat: 1):	'LHLH%LHLL$HHHH%LHLL$HHHH%LHLL$'[...]
	- b306	(cat: 1):	'HHLH%LHLH$LHLH%LLLH$HHLL%LHLL$'[...]
	- b307	(cat: 9):	'HHHH%LHLL$LLHL%LHLL$HHHH%LLLL$'[...]
	- b308	(cat: 6):	'LHLL%LHLL$HLHH%LHLH$LLLH%LHLL$'[...]
	- b309	(cat: 7):	'LLHH%LHLL$LLHH%LHLL$LHHH%LHLH$'[...]
	- b310	(cat: 10):	'HHHL%LHHL$HHLH%LLHH$LHHH%HHHL$'[...]
	- b311	(cat: 8):	'HHHH%LHLL$LHLL%LHLL$HHLL%LHLL$'[...]
	- b312	(cat: 8):	'HLLL%LHLH$HHLH%LHLL$HHLL%LHLL$'[...]
	- b313	(cat: 8):	'HHHL%LHLL$LHLH%LHLL$HHHH%LHLH$'[...]
	- b314	(cat: 8):	'LLHH%LHLL$HLHH%LHLH$LHLH%LHLL$'[...]
	- b315	(cat: 8):	'HHLH%LLHL$HHHH%HLHL$LHHH%HLHL$'[...]
	- b316	(cat: 8):	'LLLL%LHLL$HHHH%LLLH$LHLH%LHLL$'[...]
	- b317	(cat: 8):	'HHLH%HLLH$HLHL%LHLL$HHHH%LHLH$'[...]
	- b318	(cat: 1):	'LHLH%LHLL$LHLH%LHLL$HHHH%LHLL$'[...]
	- b319	(cat: 3):	'HHHH%LHLL$LHLH%LHLL$HLHH%LHLL$'[...]
	- b320	(cat: 3):	'LLHH%LLLH$LLHL%LHLL$HHHH%LHLL$'[...]
	- b321	(cat: 3):	'HHLL%LHLL$LHLL%LLLL$LHLH%LHLL$'[...]
	- b322	(cat: 3):	'HHLH%LHLH$LHHH%LHLL$HHHH%LHLL$'[...]
	- b323	(cat: 10):	'LHHH%HLHL$HHLH%LHLH$HHHH%LHLH$'[...]
	- b324	(cat: 10):	'LHHH%HLLL$HHHL%LHLL$LHHH%LHHL$'[...]
	- b325	(cat: 4):	'LHHL%LHLL$HHHH%LHLL$LLLH%LHLL$'[...]
	- b326	(cat: 10):	'HLHH%LHLL$HHHL%LHLH$HHHH%LHLL$'[...]
	- b327	(cat: 10):	'HLHH%HLLH$LLHL%LHLH$HHLL%LHLL$'[...]
	- b328	(cat: 10):	'LHLH%HLLH$LHLL%LHLH$LHLH%LHLL$'[...]
	- b329	(cat: 10):	'HHHH%LHLL$HLHL%LHLL$HLHL%LHLL$'[...]
	- b330	(cat: 1):	'LHLH%LHLL$HHHH%LHLH$HHLH%LHLH$'[...]
	- b331	(cat: 4):	'HHLH%LHLH$HHLH%LHLL$LHLH%LHLL$'[...]
	- b332	(cat: 4):	'LHHH%LHLL$HHLH%LHLH$LHLH%LHLH$'[...]
	- b333	(cat: 4):	'LLHL%LHLH$LHHH%LHLL$LLHH%LHLL$'[...]
	- b334	(cat: 4):	'LLLH%LHLL$HHHH%LHLL$HHHH%LHLH$'[...]
	- b335	(cat: 10):	'LHLH%HLLL$LHLL%LHLL$LHLH%HLLL$'[...]
	- b336	(cat: 10):	'LHLH%HLLL$LHHH%LHLL$LHLH%LHLL$'[...]
	- b337	(cat: 10):	'HHHH%LHLL$LHHL%LHLL$HLHH%LHLL$'[...]
	- b338	(cat: 1):	'HLHH%LLHH$HHLH%HLHL$LHHH%HHHH$'[...]
	- b339	(cat: 1):	'LHHH%LHLL$LHHH%LHLH$LHHH%LLLL$'[...]
	- b340	(cat: 1):	'HHHH%LHLH$LLHL%LHLL$LLHH%LHLL$'[...]
	- b341	(cat: 1):	'LLHH%LLLL$HHLH%LHLL$LLHH%LLLL$'[...]
	- b342	(cat: 2):	'HLHL%LHLH$HHHH%LLLL$LHLH%LHLL$'[...]
	- b343	(cat: 2):	'HHLH%LHLL$LHLH%LHLH$LHHH%LHLL$'[...]
	- b344	(cat: 2):	'LHLH%HLLL$LHLL%LHLH$LLLL%LHLL$'[...]
	- b345	(cat: 2):	'HHLH%LHLL$HHLH%LHLL$LHLH%LHLL$'[...]
	- b346	(cat: 1):	'HHLH%LHLH$HHHH%LHLL$LHLH%LHLH$'[...]
	- b347	(cat: 1):	'LHLH%LHLL$LHHH%LHLL$HHLH%LHLL$'[...]
	- b348	(cat: 1):	'HLHH%LHLL$HLHH%LHLL$HHHH%LHLL$'[...]
	- b349	(cat: 1):	'HHLH%LLLL$LHHL%LHLL$HHHH%LHLL$'[...]
	- b350	(cat: 1):	'HLHH%LHLL$HLLL%LHLL$LHLH%LHLL$'[...]
	- b351	(cat: 1):	'LHLH%LHLH$LLHL%LHLH$LHLL%LHLL$'[...]
	- b352	(cat: 1):	'HLHH%LHLH$LHHH%LHLH$LLLH%LHLL$'[...]
	- b353	(cat: 1):	'HLHH%LHLL$LHHL%LHLH$HHHL%LHLL$'[...]
	- b354	(cat: 1):	'HLHH%LHLL$HHLH%LHLL$HHHH%LHLL$'[...]
	- b355	(cat: 9):	'LLHH%LHLL$LLHL%LHLL$HHLH%LHLL$'[...]
	- b356	(cat: 9):	'LLLH%LHLH$HHLL%LHLH$LHLH%LHLH$'[...]
	- b357	(cat: 5):	'HHHH%LLLH$HHLH%LLLH$HHHH%LHLH$'[...]
	- b358	(cat: 5):	'LLHH%LHLH$LHHH%LHLL$HLHH%LHLH$'[...]
	- b359	(cat: 5):	'LHHH%LHHH$HHHH%LHLH$HHHH%LHLL$'[...]
	- b360	(cat: 8):	'HHLH%LLHL$LLHH%LHLH$HHLH%LHLL$'[...]
	- b361	(cat: 8):	'LLHH%LHLL$LHHL%LHLL$HHHH%LHLL$'[...]
	- b362	(cat: 8):	'HHHH%LHLH$LHLH%LHLL$HHHH%LHLH$'[...]
	- b363	(cat: 8):	'HHHH%LHLH$HHHL%LHLL$HHHH%LHLH$'[...]
	- b364	(cat: 8):	'LHHH%LHLH$HHHL%LHLL$LLHH%LHLH$'[...]
	- b365	(cat: 1):	'LHHH%LHLL$HHHH%LHLL$HHLH%LHLL$'[...]
	- b366	(cat: 1):	'HHHH%LHLH$LHLH%LHLL$LHHH%LHLL$'[...]
	- b367	(cat: 1):	'HHHH%LHLH$LLLH%LHLL$HHHH%LHLL$'[...]
	- b368	(cat: 1):	'HLHL%LHLL$HLHL%LHLL$HHHH%LHLH$'[...]

In [7]:
# Vectorization settings: character 4-grams, 'tf_std' vector space, no minimum
# document frequency, and a cap of 1000000 on the `mfi` argument.
vectorize_options = dict(
    mfi=1000000,
    ngram_type='char',
    ngram_size=4,
    vector_space='tf_std',
    min_df=0.0,
)
mfi = corpus.vectorize(**vectorize_options)


Warning: corpus has not been tokenized yet: running tokenization with default settings first

In [8]:
# Size of the value returned by corpus.vectorize() -- presumably the list of
# retained n-gram features; verify against the pystyl documentation.
print(len(mfi))


80

In [9]:
# Number of texts in the corpus (one text was added per hymn).
print(len(corpus))


369

In [11]:
import os

from pystyl.analysis import pca
from pystyl.visualization import scatterplot

# Reduce the vectorized corpus to two dimensions with PCA and plot it,
# colouring the points by 3 clusters and overlaying the feature loadings.
pca_coor, pca_loadings = pca(corpus, nb_dimensions=2)
# Resolve the home directory at run time instead of hard-coding one user's
# absolute path, so the notebook also runs on other machines.
pca_outputfile = os.path.join(os.path.expanduser('~'), 'Desktop', 'pca.pdf')
scatterplot(corpus, coor=pca_coor,
            nb_clusters=3, loadings=pca_loadings,
            save=True, outputfile=pca_outputfile)


<matplotlib.figure.Figure at 0x11bb54450>

In [12]:
import os

from pystyl.analysis import pca
from pystyl.visualization import scatterplot

# NOTE(review): this cell runs the same PCA + scatterplot as the preceding
# cell and writes to the same output file; one of the two can be dropped.
pca_coor, pca_loadings = pca(corpus, nb_dimensions=2)
# Resolve the home directory at run time instead of hard-coding one user's
# absolute path, so the notebook also runs on other machines.
pca_outputfile = os.path.join(os.path.expanduser('~'), 'Desktop', 'pca.pdf')
scatterplot(corpus, coor=pca_coor,
            nb_clusters=3, loadings=pca_loadings,
            save=True, outputfile=pca_outputfile)


<matplotlib.figure.Figure at 0x115f791d0>

In [13]:
import os

from pystyl.analysis import distance_matrix
from pystyl.analysis import hierarchical_clustering
from pystyl.visualization import scipy_dendrogram

# Hierarchical clustering of the hymns: pairwise 'minmax' distances, merged
# with Ward linkage, drawn as a dendrogram.
dm = distance_matrix(corpus, 'minmax')
cluster_tree = hierarchical_clustering(dm, linkage='ward')
# '~' is not expanded by Python's file APIs, so expand it explicitly rather
# than relying on the plotting helper to do so.
dendrogram_outputfile = os.path.expanduser('~/Desktop/scipy_dendrogram.pdf')
scipy_dendrogram(corpus=corpus, tree=cluster_tree,
                 outputfile=dendrogram_outputfile,
                 fontsize=3, save=True, return_svg=False)


<matplotlib.figure.Figure at 0x11f0fac50>

In [14]:
from sklearn.cluster import AgglomerativeClustering

# Split the hymns into two clusters with Ward-linkage agglomerative clustering.
# Ward linkage only works with Euclidean distances, which is also the default,
# so the distance argument is left out: the `affinity` keyword was renamed to
# `metric` in newer scikit-learn releases and is no longer accepted there.
clustering = AgglomerativeClustering(linkage='ward', n_clusters=2)
clustering.fit(corpus.vectorizer.X.toarray())

# List the titles of the hymns that were assigned to cluster 1.
for title, label in zip(corpus.titles, clustering.labels_):
    if label == 1:
        print(title)


8-002
10-020
5-019
8-079
8-071
1-027
5-040
5-068
10-185
1-043
1-041
5-070
8-016
8-081
1-090