In [1]:
from bs4 import BeautifulSoup
import bs4
import os
from os.path import isfile, join, isdir
from itertools import compress
import numpy as np
from unidecode import unidecode
In [2]:
dir_path = 'data'
#os.listdir(dir_path)
listdir = [dir_path + '/' + d for d in os.listdir(dir_path) if isdir(join(dir_path, d))]
listdir.sort()
print ('Number of books: ' + str(len(listdir)))
In [3]:
# HTML tag and paragraph CSS classes used by each book, aligned with the sorted book list
list_tag = ['p', 'p', 'div', 'p', 'p', 'p', 'p']
list_class = [['para-center', 'para-flush', 'para-indent', 'para-quote', 'para-ragged-left', 'para-ragged-right', 'sans-para-center', 'sans-para-flush', 'sans-para-indent', ''],
              ['para-flush', 'para-indent'],
              ['tx', 'tx1'],
              ['begin', 'para-indent', 'para-flush', 'para-ragged-left'],
              ['block', 'blockb', 'blocki', 'indent', 'indent1', 'indent1b', 'indenta', 'indentab', 'indentb', 'indenti', 'noindent', 'noindenta', 'noindentb'],
              ['block', 'blocki', 'blockt', 'blockti', 'indent', 'indenta', 'indentb', 'noindent', 'noindent1', 'noindenta', 'noindentb', 'poem', 'poem1', 'poema', 'right', 'right1'],
              ['indent', 'indentb', 'indentb1', 'noindent', 'noindentc', 'noindentn', 'noindentn1']
             ]
In [4]:
def validar_parrafo_no_tag(parrafo):
    tags = ['<', '>']
    result = [tag in parrafo for tag in tags]
    final = any(result)
    return not final

def print_text_if_class(parrafo, html_cl):
    if (html_cl in parrafo['class']):
        print(''.join(parrafo.findAll(text=True)))

def validar_parrafo_clases(parrafo, clases):
    l_valido = [c in parrafo['class'] for c in clases]
    return any(l_valido)
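
# Quick sanity check of the helpers on a synthetic tag (illustrative markup only,
# assuming html5lib is installed as used below):
_demo_tag = BeautifulSoup('<p class="tx">Hello world</p>', 'html5lib').find('p')
print(validar_parrafo_clases(_demo_tag, ['tx', 'tx1']))       # expected: True
print(validar_parrafo_no_tag('plain text, no markup here'))   # expected: True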
In [5]:
def leer_parrafos(soupObj, html_tag, html_clases):
    parrafos = soupObj.find_all(html_tag)
    parrafos_validos = [validar_parrafo_clases(p, html_clases) for p in parrafos]
    parrafos_final = list(compress(parrafos, parrafos_validos))
    parrafos_str = [unicode(''.join(p.findAll(text=True)))
                    for p in parrafos_final
                    if type(p.findAll(text=True)[0]) == bs4.element.NavigableString]
    return parrafos_str

def leer_parrafos_bs4(soupObj, html_tag):
    parrafos = soupObj.find_all(html_tag)
    return parrafos
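
# Minimal sketch of leer_parrafos on an in-memory document (class names here are
# illustrative, not taken from the real book files):
_demo_soup = BeautifulSoup('<div class="tx">First paragraph.</div>'
                           '<div class="other">Skipped.</div>', 'html5lib')
print(leer_parrafos(_demo_soup, 'div', ['tx', 'tx1']))   # expected: [u'First paragraph.']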
In [6]:
def leer_libro(book_path, html_tag, html_clases):
    list_files = [book_path + '/' + f for f in os.listdir(book_path) if isfile(join(book_path, f))]
    list_files.sort()
    print ('Number of files in <' + book_path + '>: ' + str(len(list_files)))
    list_html = [open(f, 'r') for f in list_files]
    l_BsOj = [BeautifulSoup(html, 'html5lib') for html in list_html]
    parrafos_matriz = [leer_parrafos(soupObj, html_tag, html_clases) for soupObj in l_BsOj]
    parrafos_array = [unidecode(p) for p_list in parrafos_matriz for p in p_list]
    l_vf = [validar_parrafo_no_tag(p) for p in parrafos_array]
    verificar = all(l_vf)
    total = len(parrafos_array)
    print('Valid paragraphs? ' + str(verificar))
    print('Total number of paragraphs: ' + str(total))
    return parrafos_array
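
# Usage sketch (the path below is hypothetical): read one book whose paragraphs live in
# <div> elements with classes 'tx'/'tx1', mirroring the zip() loop in the next cell.
# parrafos_demo = leer_libro('data/some_book', 'div', ['tx', 'tx1'])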
In [7]:
l_par = []
for book, html_tag, html_clases in zip(listdir, list_tag, list_class):
    print('HTML tag to analyze: <' + html_tag + '>')
    print('HTML classes to analyze: ' + ', '.join(html_clases))
    parrafos = leer_libro(book, html_tag, html_clases)
    l_par.append(parrafos)
    print
    print
In [8]:
import json
import nltk
from corenlp import *
from nltk.tag import StanfordNERTagger
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import sentiwordnet as swn
In [9]:
st = StanfordNERTagger('../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                       '../stanford-ner/stanford-ner.jar',
                       encoding='utf-8')
In [10]:
from jsonrpc import ServerProxy, JsonRpc20, TransportTcpIp
class StanfordNLP:
    def __init__(self):
        self.server = ServerProxy(JsonRpc20(),
                                  TransportTcpIp(addr=("127.0.0.1", 8080)))

    def parse(self, text):
        return json.loads(self.server.parse(text))

nlp = StanfordNLP()
In [11]:
alias = {
    'Roland': ['the gunslinger'],
    'Eddie': ['a drug addict', 'the prisoner'],
    'Jake': ['Bama', 'the boy'],
    'Oy': ['billy-bumbler', 'oy'],
    'Cuthbert': ['Bert'],
    'John': ['the good man'],
    'Randal': ['the man in black', 'the ageless stranger', 'the walking dude',
               'walter o\'dim', 'walter o`dim', 'marten broadcloak', 'richard faninn',
               'rudin filaro', 'legion', 'covenant man', 'walter padick', 'son of sam'],
    'Aballah': ['the crimson king', 'the ultimate evil', 'ram aballah'],
    'Jack': ['the pusher'],
    'Blaine': ['blaine the mono'],
    'Rhea': ['rhea of the cöos'],
    'Andrew': ['the tick-tock man']
}
global_entidades = set(['Roland', 'Eddie', 'Jake', 'Oy',
                        'Cuthbert', 'John', 'Randal', 'Aballah',
                        'Jack', 'Blaine', 'Rhea', 'Andrew'])
coincidencias = {
    'Roland': 0, 'Eddie': 0, 'Jake': 0, 'Oy': 0, 'Cuthbert': 0,
    'John': 0, 'Randal': 0, 'Aballah': 0, 'Jack': 0,
    'Blaine': 0, 'Rhea': 0, 'Andrew': 0
}
In [12]:
def revisar_parrafo_tok(p, alias):
    p_tok = word_tokenize(p.decode('utf-8'), language='english')
    for word in p_tok:
        if (alias == word):
            return True
    return False

def cambiar_alias(parrafo, alias, coincidencias):
    new_p = parrafo
    for entidad, lista_alias in alias.iteritems():
        for to_replace in lista_alias:
            # to lowercase
            p_low = new_p.lower()
            # look for the substring
            coincide = True
            if (len(to_replace.split(' ')) == 1):
                coincide = revisar_parrafo_tok(p_low, to_replace)
            ind_st = p_low.find(to_replace)
            if (ind_st < 0 or not coincide): continue
            ind_lt = ind_st + len(to_replace)
            # replace with the entity name
            new_p = new_p[:ind_st] + entidad + new_p[ind_lt:]
            # increment the appearance counter
            coincidencias[entidad] += 1
    return new_p
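
# Minimal sketch of the alias substitution on a synthetic sentence; a copy of the
# counters is passed so the real `coincidencias` totals are not affected:
_demo_par = 'The gunslinger followed the man in black across the desert.'
print(cambiar_alias(_demo_par, alias, dict(coincidencias)))
# expected: 'Roland followed Randal across the desert.'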
In [13]:
new_book_par = []
for book in l_par:
    new_b = [cambiar_alias(p, alias, coincidencias) for p in book]
    new_book_par.append(new_b)
In [14]:
def concatenar_parrafos_cortos(book, min_sent):
    new_book = []
    book_len = len(book)
    bef_short = False
    for p in book:
        if bef_short:
            last_char = new_book[-1][-1]
            if (last_char == '.' or last_char == '!' or last_char == '?' or last_char == '"'):
                new_book[-1] = new_book[-1] + ' ' + p
            else:
                new_book[-1] = new_book[-1] + '. ' + p
        else:
            new_book.append(p)
        bef_short = len(sent_tokenize(p.decode('utf-8'))) <= min_sent
    return new_book
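
# Sketch: a paragraph with <= min_sent sentences gets the following paragraph appended
# to it (assumes the NLTK punkt sentence model is available):
_demo_book = ['Short one.', 'It continues here. And here. And ends here.']
print(concatenar_parrafos_cortos(_demo_book, 3))
# expected: ['Short one. It continues here. And here. And ends here.']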
In [15]:
# Minimum number of sentences in a paragraph
min_sent = 3
newest_book_par = [concatenar_parrafos_cortos(b, min_sent) for b in new_book_par]
In [16]:
coincidencias
Out[16]:
In [17]:
def tokenizar_parrafo(text):
    tokenized_sentences = nltk.sent_tokenize(text)
    sents_token = [nltk.word_tokenize(sentence) for sentence in tokenized_sentences]
    return sents_token
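
# Sketch: two sentences become two token lists (assumes the punkt model is available):
print(tokenizar_parrafo('Roland drew his gun. Eddie laughed.'))
# expected: [['Roland', 'drew', 'his', 'gun', '.'], ['Eddie', 'laughed', '.']]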
In [18]:
def unificador_nnp_extractor_verbos(sents_token):
    tagged_sentences = [nltk.pos_tag(sentence) for sentence in sents_token]
    ner_sentences = [st.tag(sentence) for sentence in sents_token]
    paragraph_untokenized = [[] for s in sents_token]
    verbs = [[] for s in sents_token]
    entities = [{"PERSON": [], "ORGANIZATION": [], "LOCATION": []} for s in sents_token]
    nnp_before = False
    for sentenceNer, sentencePos, i in zip(ner_sentences, tagged_sentences, range(len(ner_sentences))):
        flag_aux = False
        flag_ver = False
        flag_final = False
        polarity = False
        for j, (wordNer, tagNer), (wordPos, tagPos) in zip(range(len(sentenceNer)), sentenceNer, sentencePos):
            if j > 0 and tagNer != 'O':
                if nnp_before:
                    paragraph_untokenized[i][-1] += wordNer
                    entities[i][tagNer][-1] += wordNer
                else:
                    paragraph_untokenized[i].append(wordNer)
                    entities[i][tagNer].append(wordNer)
                nnp_before = True
            else:
                nnp_before = False
                paragraph_untokenized[i].append(wordNer)
                polarity = flag_final
                if flag_final:
                    verbs[i].pop()
                flag_aux = ('RB' == tagPos) and (wordPos == "n't" or wordPos == "not")
                flag_final = flag_aux and flag_ver
                if 'VB' in tagPos:
                    verbs[i].append((wordPos, polarity))
                    flag_ver = True
                else:
                    flag_ver = False
    # Fill the global entity set
    for e in entities:
        for o in e['PERSON']:
            global_entidades.add(o)
        for o in e['ORGANIZATION']:
            global_entidades.add(o)
        for o in e['LOCATION']:
            global_entidades.add(o)
    return verbs, entities
    #return tagged_sentences
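
# Output sketch (needs the Stanford NER tagger configured above; the values below are
# only illustrative, actual tags depend on the models):
# verbs, entities = unificador_nnp_extractor_verbos(tokenizar_parrafo('Then Roland did not shoot.'))
# verbs    -> [[('shoot', True)]]   # (verb, negated?) pairs; the negated auxiliary is dropped
# entities -> [{'PERSON': ['Roland'], 'ORGANIZATION': [], 'LOCATION': []}]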
In [19]:
def extraer_dato_parrafo(parrafo):
    token_par = tokenizar_parrafo(parrafo)
    verbos, entidades = unificador_nnp_extractor_verbos(token_par)
    new_p = []
    for v, e in zip(verbos, entidades):
        new_p.append({
            'verbos': v,
            'entidades': e
        })
    return new_p
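
# extraer_dato_parrafo returns one dict per sentence of the paragraph:
# {'verbos': [(verb, negated?), ...],
#  'entidades': {'PERSON': [...], 'ORGANIZATION': [...], 'LOCATION': [...]}}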
In [21]:
import time
def convertir_libros_verbos_entidades(book, i):
    new_book = []
    start_book = time.time()
    print ('Book part: ' + str(i) + '\n')
    for j, p in enumerate(book):
        start_p = time.time()
        print ('Part ' + str(i) + ', paragraph ' + str(j) + '/' + str(len(book)) + '\n')
        new_book.append(extraer_dato_parrafo(p))
        end_p = time.time()
        print ('Paragraph time (part ' + str(i) + '): ' + str(end_p - start_p) + '\n')
    end_book = time.time()
    print ('Book time (part ' + str(i) + '): ' + str(end_book - start_book) + '\n')
    return new_book
In [25]:
# part 1
book2 = newest_book_par[2][0:100]
new_book_1 = convertir_libros_verbos_entidades(book2, 2)
In [26]:
with open('book2-0-100.json', 'w') as outfile:
    json.dump(new_book_1, outfile)
In [ ]:
# Read a book in parts
offset = 50
book = newest_book_par[0]
book_indice = 0
archivos = len(book) / offset + 1
for i in range(archivos):
    ini = i * offset
    fin = (i + 1) * offset
    book_part = book[ini:fin]
    new_book_part = convertir_libros_verbos_entidades(book_part, book_indice)
    file_name = 'book' + str(book_indice) + '-' + str(ini) + '-' + str(fin) + '.json'
    with open(file_name, 'w') as outfile:
        json.dump(new_book_part, outfile)
In [ ]:
# For paragraphs that contain more than one sentence
def reemplazar_referencias_parrafo(text, nlp):
    print 'Original text:'
    print(text)
    #nlp = StanfordCoreNLP()
    result = nlp.parse(text)
    tokenized_sentences = nltk.sent_tokenize(text)
    if (len(tokenized_sentences) == 1):
        print " > There is only one sentence, so there are no coreference relations."
        return
    tokenized_in_words = [nltk.word_tokenize(sentence) for sentence in tokenized_sentences]
    for block_to_replace in result["coref"]:
        sentence_index = block_to_replace[0][1][1]
        word_index = block_to_replace[0][1][2]
        replace_sent = block_to_replace[0][1][0]
        word_to_replace = tokenized_in_words[sentence_index][word_index]
        if word_to_replace not in global_entidades:  # only rewrite references to known character entities
            continue
        print 'Word_to_replace: ', word_to_replace
        for i, lines_to_replace in enumerate(block_to_replace):
            print "Converting sentence number ", i
            ix_sent = lines_to_replace[0][1]
            sent_to_replace = lines_to_replace[0][0]
            tokenized_sentences[ix_sent] = tokenized_sentences[ix_sent].replace(sent_to_replace, word_to_replace)
        tokenized_sentences[sentence_index] = tokenized_sentences[sentence_index].replace(replace_sent, word_to_replace)
        print
    final_text = ' '.join(tokenized_sentences)
    return final_text
In [ ]:
print reemplazar_referencias_parrafo(newest_book_par[0][10] , nlp)
In [ ]:
def senti_verb(verb):
    list_meanings = swn.senti_synsets(verb[0])
    pos_value = 0
    neg_value = 0
    n_vals = len(list_meanings)
    meaning_counter = 0
    for meaning in list_meanings:
        lit_string = str(meaning)
        if not '.v.' in lit_string:
            continue
        pos_value += meaning.pos_score()
        neg_value += meaning.neg_score()
        meaning_counter += 1
        if meaning_counter == 3: break
    if n_vals > 0:
        pos_value /= n_vals
        neg_value /= n_vals
    if (verb[1]):
        return neg_value, pos_value
    else:
        return pos_value, neg_value
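
# senti_verb takes a (verb, negated?) tuple like those produced above and averages the
# SentiWordNet verb senses; a negated occurrence swaps the positive/negative scores.
# Illustrative only, actual numbers depend on the SentiWordNet data:
# print senti_verb(('kill', False))
# print senti_verb(('kill', True))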
In [ ]:
# `verbos` is assumed to hold the verb lists produced by unificador_nnp_extractor_verbos
# for a previously processed paragraph.
print verbos[0][0]
print senti_verb(verbos[0][0])