In [76]:
import tensorflow as tf
import pandas as pd
import numpy as np
import nltk
import os
import gzip
import xmltodict
import string
import re
In [3]:
# Read every gzipped subtitle XML found under BASE_PATH (genre / year / file)
# and stack the parsed bodies into a single DataFrame.
BASE_PATH = "../OpenSubtitles/raw/hu/"

# NOTE: the [:1] slice limits the run to the first genre directory returned by
# os.listdir (whose order is arbitrary); remove the slice to load every genre.
frames = []
for genre in os.listdir(BASE_PATH)[:1]:
    print(genre)
    genre_dir = os.path.join(BASE_PATH, genre)
    for year in os.listdir(genre_dir):
        year_dir = os.path.join(genre_dir, year)
        for zipf in os.listdir(year_dir):
            with gzip.GzipFile(os.path.join(year_dir, zipf)) as xml_file:
                dicti = xmltodict.parse(xml_file.read())
            frames.append(pd.DataFrame.from_dict(dicti['letsmt']['body']))

# Concatenate once at the end: growing a DataFrame inside the loop copies all
# previous rows on every iteration, and DataFrame.append no longer exists in
# pandas >= 2.0. pd.concat raises on an empty list, hence the fallback.
df_all = pd.concat(frames) if frames else pd.DataFrame()
In [55]:
def get_text(x, remove_punct=False):
    """Return the cleaned, lower-cased text stored under ``x["#text"]``.

    Hyphens are removed, surrounding whitespace is stripped and the result is
    lower-cased.

    NOTE(review): the flag behaves opposite to what its name suggests.
    With ``remove_punct=False`` (the default) every character in
    ``string.punctuation`` is stripped; with ``remove_punct=True`` punctuation
    is kept. Callers in this notebook rely on that behaviour, so it is
    preserved here; rename or invert it together with the call sites.

    Parameters
    ----------
    x : mapping
        Object indexable with the key ``"#text"`` whose value is a string.
    remove_punct : bool, default False
        See the note above for the (inverted) meaning.

    Returns
    -------
    str
        The cleaned text.
    """
    cleaned = x["#text"].replace(r"-", "").strip().lower()
    if remove_punct:
        # Despite the flag name, this branch leaves punctuation in place.
        return cleaned
    return cleaned.translate(str.maketrans('', '', string.punctuation))
In [70]:
# 'text' column: get_text with its default flag, which strips punctuation.
df_all['text'] = df_all['s'].map(get_text)
# list_of_words: one cleaned string per entry of df_all['s'], produced with
# remove_punct=True, which (despite the name) keeps punctuation in the text.
list_of_words = [get_text(entry, remove_punct=True) for entry in df_all['s']]
In [48]:
# Join every cleaned entry of list_of_words into one space-separated string.
text = " ".join(list_of_words)
In [49]:
# Build the vocabulary: the unique whitespace-separated tokens of `text`,
# each assigned an integer index (index -> token).
# - split() without an argument splits on runs of whitespace, so repeated
#   spaces do not produce an empty-string token.
# - sorted() makes the index assignment reproducible; the iteration order of a
#   set of strings can differ between interpreter runs.
list_for_dict = sorted(set(text.split()))
dict_one_hot = dict(enumerate(list_for_dict))
In [68]:
# Reverse lookup: invert the index -> token mapping into token -> index.
rev_dict_one_hot = {word: ix for ix, word in dict_one_hot.items()}
In [50]:
# Vocabulary size: number of index -> token entries.
len(dict_one_hot)
Out[50]:
In [65]:
# Sanity check: show the last ten cleaned entries.
# Original author's note: Twilight shows up filed as an action movie, interesting.
list_of_words[-10:]
Out[65]:
In [125]:
# Make sentences:
# re.split with a capturing group returns
#   [text, terminator, text, terminator, ..., trailing text]
# so even positions hold sentence bodies and odd positions hold the '.', '!'
# or '?' that ended them. The pattern is a raw string so the regex contains no
# invalid string escapes.
pre_sentence = re.split(r'([.!?])', ' '.join(list_of_words))
# Pair each body with its terminator. Bodies of length <= 1 (for example the
# empty pieces between the dots of "...") are skipped, and trailing text with
# no terminator is dropped because it has no partner in the zip.
sentences = [
    (body + terminator).lstrip()
    for body, terminator in zip(pre_sentence[0::2], pre_sentence[1::2])
    if len(body) > 1
]
#TODO: fix Mr, mrs, dr tags
In [134]:
# Keep only the longer sentences: those containing more than three spaces.
long_sentences = list(filter(lambda sentence: sentence.count(' ') > 3, sentences))