In [76]:
import tensorflow as tf
import pandas as pd
import numpy as np
import nltk
import os
import gzip
import xmltodict
import string
import re

In [3]:
# Loop over every genre/year directory, decompress each gzipped XML,
# parse it, and collect the subtitle corpora into one DataFrame.
BASE_PATH = "../OpenSubtitles/raw/hu/"
frames = []
for genre in os.listdir(BASE_PATH)[:1]:  # only the first genre for now
    print(genre)
    for year in os.listdir(os.path.join(BASE_PATH, genre)):
        for gz_name in os.listdir(os.path.join(BASE_PATH, genre, year)):
            with gzip.GzipFile(os.path.join(BASE_PATH, genre, year, gz_name)) as xml_file:
                dicti = xmltodict.parse(xml_file.read())
                frames.append(pd.DataFrame.from_dict(dicti['letsmt']['body']))
# DataFrame.append is deprecated, so the per-file frames are concatenated once at the end
df_all = pd.concat(frames)


Action
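
For reference, each decompressed file is a letsmt XML whose body holds the sentence elements; xmltodict turns XML attributes into '@...' keys and element text into the '#text' key, which is what the code above and below relies on. A minimal, invented stand-in file just to show that shape (the real OpenSubtitles files are richer):

import xmltodict

sample_xml = """
<letsmt>
  <body>
    <s id="1">Mi a fene?!</s>
    <s id="2">Lezuhanunk!</s>
  </body>
</letsmt>
"""
parsed = xmltodict.parse(sample_xml)
# Each sentence arrives as e.g. {'@id': '1', '#text': 'Mi a fene?!'},
# which is why get_text below reads the '#text' key.
print(parsed['letsmt']['body']['s'][0])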

In [55]:
def get_text(x, remove_punct=False):
    # Pull the sentence text out of one parsed <s> element, normalise it,
    # and optionally strip the punctuation.
    translator = str.maketrans('', '', string.punctuation)
    words = x["#text"].replace("-", "").strip().lower()
    if remove_punct:
        words = words.translate(translator)
    return words
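
As a quick sanity check, the helper can be applied to a hand-made dict that mirrors the '#text' structure xmltodict produces (the contents here are illustrative):

sample = {'@id': '1', '#text': ' Mi a fene?! '}
print(get_text(sample))                     # 'mi a fene?!'  -- punctuation kept
print(get_text(sample, remove_punct=True))  # 'mi a fene'    -- punctuation stripped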

In [70]:
# The 'text' column is stored without punctuation; list_of_words keeps the
# punctuation so sentences can be re-split on it further down.
df_all['text'] = df_all['s'].map(lambda x: get_text(x, remove_punct=True))
list_of_words = df_all['s'].map(get_text).tolist()

In [48]:
text = " ".join(list_of_words)

In [49]:
# Index -> word vocabulary built from the unique tokens
# (note: set ordering is arbitrary, so the indices differ between runs).
list_for_dict = list(set(text.split(" ")))
dict_one_hot = {ix: word for ix, word in enumerate(list_for_dict)}

In [68]:
# Make the reverse dictionary (word -> index):
rev_dict_one_hot = dict(zip(dict_one_hot.values(), dict_one_hot.keys()))
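
A quick check that the two mappings invert each other (which word sits at index 0 is arbitrary, since it comes from a set):

ix = 0
word = dict_one_hot[ix]
assert rev_dict_one_hot[word] == ix
print(ix, word)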

In [50]:
len(dict_one_hot)


Out[50]:
73667

In [65]:
list_of_words[-10:]  # Twilight as an action movie, interesting...


Out[65]:
['mi a fene?!',
 'lezuhanunk!',
 'william!',
 'tudtam, hogy viktor nagy hibát követ el, hogy ölebként maga mellett tart.',
 'meg kellett volna ölnie a családoddal együtt!',
 'egy ismeretlen fejezet nyílt meg elõttünk.',
 'a klánokat elválasztó vonal most elmosódott.',
 'a káosz és a belharc már elkerülhetetlen.',
 'csak egy dolog biztos, hogy a sötétség megmarad.',
 'de most, elõször... a fényben egy új remény sugárzik.']

In [125]:
# Split the concatenated text back into sentences on ., ! and ?
pre_sentence = re.split(r'([.!?])', ' '.join(list_of_words))
# re.split with a capturing group alternates text and delimiter, so pairing each
# piece with its successor glues the punctuation back onto its sentence.
sentences = [(one + two).lstrip() if len(one) > 1 else None for one, two in zip(pre_sentence, pre_sentence[1:])]
sentences = [x for x in sentences if x is not None]
# TODO: handle abbreviations such as Mr., Mrs., Dr., whose trailing periods
# currently end a sentence (a possible workaround is sketched below).
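
One way to tackle that TODO, sketched here under the assumption that a small fixed list of abbreviations is enough: mask their trailing periods before splitting and restore them afterwards. The ABBREVS list and the <PRD> placeholder are illustrative choices, not part of the pipeline above.

ABBREVS = ['mr', 'mrs', 'dr']

def split_sentences(text):
    # Shield the period after known abbreviations so it no longer ends a sentence.
    protected = re.sub(r'\b(' + '|'.join(ABBREVS) + r')\.', r'\1<PRD>', text)
    parts = re.split(r'([.!?])', protected)
    sents = [(one + two).lstrip() for one, two in zip(parts, parts[1:]) if len(one) > 1]
    # Put the shielded periods back.
    return [s.replace('<PRD>', '.') for s in sents]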

In [134]:
# Keep only the longer sentences (more than three spaces, i.e. at least five words)
long_sentences = [x for x in sentences if x.count(' ') > 3]