In [76]:
import tensorflow as tf
import pandas as pd
import numpy as np
import nltk
import os
import gzip
import xmltodict
import string
import re

In [3]:
# Loop over every genre/year directory, decompress each gzipped XML,
# parse it, and collect the subtitle corpora into one DataFrame.
BASE_PATH = "../OpenSubtitles/raw/hu/"
frames = []
for genre in os.listdir(BASE_PATH)[:1]:  # only the first genre for now
    print(genre)
    for year in os.listdir(os.path.join(BASE_PATH, genre)):
        for gz_name in os.listdir(os.path.join(BASE_PATH, genre, year)):
            with gzip.GzipFile(os.path.join(BASE_PATH, genre, year, gz_name)) as xml_file:
                dicti = xmltodict.parse(xml_file.read())
                frames.append(pd.DataFrame.from_dict(dicti['letsmt']['body']))
# DataFrame.append is deprecated, so the per-file frames are concatenated once at the end
df_all = pd.concat(frames)


Action
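
For reference, each decompressed file is a letsmt XML whose body holds the sentence elements; xmltodict turns XML attributes into '@...' keys and element text into the '#text' key, which is what the code above and below relies on. A minimal, invented stand-in file just to show that shape (the real OpenSubtitles files are richer):

import xmltodict

sample_xml = """
<letsmt>
  <body>
    <s id="1">Mi a fene?!</s>
    <s id="2">Lezuhanunk!</s>
  </body>
</letsmt>
"""
parsed = xmltodict.parse(sample_xml)
# Each sentence arrives as e.g. {'@id': '1', '#text': 'Mi a fene?!'},
# which is why get_text below reads the '#text' key.
print(parsed['letsmt']['body']['s'][0])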

In [55]:
def get_text(x, remove_punct=False):
    # Pull the sentence text out of one parsed <s> element, normalise it,
    # and optionally strip the punctuation.
    translator = str.maketrans('', '', string.punctuation)
    words = x["#text"].replace("-", "").strip().lower()
    if remove_punct:
        words = words.translate(translator)
    return words
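
As a quick sanity check, the helper can be applied to a hand-made dict that mirrors the '#text' structure xmltodict produces (the contents here are illustrative):

sample = {'@id': '1', '#text': ' Mi a fene?! '}
print(get_text(sample))                     # 'mi a fene?!'  -- punctuation kept
print(get_text(sample, remove_punct=True))  # 'mi a fene'    -- punctuation stripped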

In [70]:
# The 'text' column is stored without punctuation; list_of_words keeps the
# punctuation so sentences can be re-split on it further down.
df_all['text'] = df_all['s'].map(lambda x: get_text(x, remove_punct=True))
list_of_words = df_all['s'].map(get_text).tolist()

In [48]:
text = " ".join(list_of_words)

In [49]:
# Index -> word vocabulary built from the unique tokens
# (note: set ordering is arbitrary, so the indices differ between runs).
list_for_dict = list(set(text.split(" ")))
dict_one_hot = {ix: word for ix, word in enumerate(list_for_dict)}

In [68]:
# Make the reverse dictionary (word -> index):
rev_dict_one_hot = dict(zip(dict_one_hot.values(), dict_one_hot.keys()))
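
A quick check that the two mappings invert each other (which word sits at index 0 is arbitrary, since it comes from a set):

ix = 0
word = dict_one_hot[ix]
assert rev_dict_one_hot[word] == ix
print(ix, word)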

In [50]:
len(dict_one_hot)


Out[50]:
73667

In [65]:
list_of_words[-10:]  # Twilight as an action movie, interesting...


Out[65]:
['mi a fene?!',
 'lezuhanunk!',
 'william!',
 'tudtam, hogy viktor nagy hibát követ el, hogy ölebként maga mellett tart.',
 'meg kellett volna ölnie a családoddal együtt!',
 'egy ismeretlen fejezet nyílt meg elõttünk.',
 'a klánokat elválasztó vonal most elmosódott.',
 'a káosz és a belharc már elkerülhetetlen.',
 'csak egy dolog biztos, hogy a sötétség megmarad.',
 'de most, elõször... a fényben egy új remény sugárzik.']

In [125]:
# Split the concatenated text back into sentences on ., ! and ?
pre_sentence = re.split(r'([.!?])', ' '.join(list_of_words))
# re.split with a capturing group alternates text and delimiter, so pairing each
# piece with its successor glues the punctuation back onto its sentence.
sentences = [(one + two).lstrip() if len(one) > 1 else None for one, two in zip(pre_sentence, pre_sentence[1:])]
sentences = [x for x in sentences if x is not None]
# TODO: handle abbreviations such as Mr., Mrs., Dr., whose trailing periods
# currently end a sentence (a possible workaround is sketched below).
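
One way to tackle that TODO, sketched here under the assumption that a small fixed list of abbreviations is enough: mask their trailing periods before splitting and restore them afterwards. The ABBREVS list and the <PRD> placeholder are illustrative choices, not part of the pipeline above.

ABBREVS = ['mr', 'mrs', 'dr']

def split_sentences(text):
    # Shield the period after known abbreviations so it no longer ends a sentence.
    protected = re.sub(r'\b(' + '|'.join(ABBREVS) + r')\.', r'\1<PRD>', text)
    parts = re.split(r'([.!?])', protected)
    sents = [(one + two).lstrip() for one, two in zip(parts, parts[1:]) if len(one) > 1]
    # Put the shielded periods back.
    return [s.replace('<PRD>', '.') for s in sents]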

In [134]:
# Keep only the longer sentences (more than three spaces, i.e. at least five words)
long_sentences = [x for x in sentences if x.count(' ') > 3]