In [1]:
from testdataextractor.testdataextractor.extractor import Extractor
# Input XML file handed to the Extractor (path is relative to the notebook).
ARTICLE_XML_PATH = '../test_data/1957284403.ofs.gold.xml'

# Build the extractor for that file and run it with verbose output enabled.
# The result is indexed by the 'sentences' key later in the notebook.
ext = Extractor(ARTICLE_XML_PATH)
article = ext.extract(verbose=True)
In [2]:
import pandas as pd
# orient='index' turns each key of article['sentences'] into a row label and
# each entry's fields into columns.
article_sentences = article['sentences']
frame_art = pd.DataFrame.from_dict(data=article_sentences, orient='index')
In [3]:
# Preview only the first rows; rendering the whole frame bloats the saved
# notebook output without adding information.
frame_art.head()
Out[3]:
In [45]:
def calc_row_len(row):
    """Return how many links the given row carries.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must support ``row['links']``.

    Returns
    -------
    int
        ``len(row['links'])`` when that value is a list, otherwise 0
        (for example when the cell holds NaN or None).
    """
    links = row['links']
    # isinstance instead of searching for 'list' in the type's repr: the
    # substring test also accepted unrelated types whose name merely
    # contains "list" and then failed (or miscounted) in len().
    if isinstance(links, list):
        return len(links)
    return 0
# How many of the most-linked sentences to keep, and how many characters of
# each sentence to show in the preview at the end of the cell.
TOP_N = 11
PREVIEW_CHARS = 100

# Link count per row; calc_row_len yields 0 for rows whose 'links' is not a list.
frame_num_links = frame_art.apply(calc_row_len, axis=1)

# concat leaves the unnamed count Series under the column label 0, which is
# the label the sort below keys on.
frame_with_lengths = pd.concat([frame_art, frame_num_links], axis=1)
top_sentences = frame_with_lengths.sort_values(
    by=0, axis=0, ascending=False
).head(TOP_N)

# NOTE(review): positional relabelling -- assumes frame_art's columns arrive
# in the order text, comment, links; confirm against the extractor output.
top_sentences.columns = ['text', 'comment', 'links', 'link length']

# These are the most linked sentences in the corpus.
# .loc replaces the deprecated .ix indexer (removed in pandas 1.0); the
# single-argument print(...) form behaves the same on Python 2 and 3.
print(top_sentences.loc[:, ['links', 'link length']])
print('\nCHUNKED SENTENCES')
for s in top_sentences['text']:
    print(s[:PREVIEW_CHARS])
In [ ]: