In [1]:
from collections import defaultdict
from dataflows import Flow
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
We used a library called dataflows, since it is the one used by the Public Knowledge Workshop.
The data download is done in parts: each committee protocol text is divided into parts and processed separately.
Downloading and analyzing the data took around 6 hours per Knesset.
This is why we kept a cache of the downloaded data and saved the analyzed data to a separate file for each Knesset.
In [2]:
# Limit processing of protocol parts for development, -1 means no limit.
PROCESS_PARTS_LIMIT = -1
# Knesset num to query
KNESSET_NUM = 18
# Enable caching of protocol parts data (not efficient, should only be used for local development with sensible PROCESS_PARTS_LIMIT)
PROCESS_PARTS_CACHE = True
# Filter the meetings to be processed, these kwargs are passed along to DataFlows filter_rows processor for meetings resource
MEETINGS_FILTER_ROWS_KWARGS = {'equals': [{'KnessetNum': KNESSET_NUM}]}
# Don't use local data - load everything from the Knesset data remote storage
# When set to False - also enables caching, so you won't download from remote storage on 2nd run.
USE_DATA = False
USE_CACHE = not USE_DATA
In [3]:
outfile = open(r"Extracted_data/meetings_topics_knesset_" + str(KNESSET_NUM) + ".csv", 'w')
outfile.write(",".join([
'KnessetNum',
'Year',
'Month',
'Day',
'Diplomacy_score',
'Ecologics_score',
'Economics_score',
'Education_score',
'Health_score',
'Security_score',
'CommitteeSessionID',
'Number',
'Topics',
'CommitteeID']) + "\n")
Out[3]:
We manually created Hebrew lexicons that describe the topics:
Diplomacy, Ecologics, Economics, Education, Health, Security
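Each lexicon is loaded from a UTF-8 text file named <topic>.txt under the lexicons/ directory, with one Hebrew word or short phrase per line. A minimal sanity-check sketch of that assumed layout (illustrative only, not part of the original analysis):

import os
# Assumed layout: lexicons/Diplomacy.txt, lexicons/Ecologics.txt, ..., lexicons/Security.txt,
# each holding one word or short phrase per line.
for file_name in sorted(os.listdir("lexicons")):
    with open(os.path.join("lexicons", file_name), encoding="utf-8") as f:
        entries = [line.strip().replace("\ufeff", "") for line in f if line.strip()]
    print(file_name, "->", len(entries), "entries")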
In [4]:
import os
def read_topic_to_set(topic_name):
    # Read one lexicon file (UTF-8, one word or phrase per line) into a set
    with open(os.path.join(dir_name, topic_name + ".txt"), 'r', encoding='utf-8') as f:
        return set(line.strip().replace("\ufeff", "") for line in f)

dir_name = "lexicons"
files = os.listdir(dir_name)
topics = [file.split('.')[0] for file in files]
lexicons = {}
for topic_name in topics:
    lexicons[topic_name] = read_topic_to_set(topic_name)
print(topics)
In [5]:
from dataflows import filter_rows, cache
from datapackage_pipelines_knesset.common_flow import load_knesset_data, load_member_names
# Loads a dict containing mapping between knesset member id and the member name
member_names = load_member_names(use_data=USE_DATA)
# define flow steps for loading the source committee meetings data
# the actual loading is done later in the Flow
load_steps = (
load_knesset_data('people/committees/meeting-attendees/datapackage.json', USE_DATA),
filter_rows(**MEETINGS_FILTER_ROWS_KWARGS)
)
if USE_CACHE:
    # when loading from URL - enable caching which will skip loading on 2nd run
    path = '../.cache/people-committee-meeting-attendees-knesset-{}'.format(KNESSET_NUM)
    load_steps = (cache(*load_steps, cache_path=path),)
In [6]:
running_index = 0
meeting_data_global = None
words_freq = defaultdict(int)
stats = defaultdict(int)
stats['processed parts'] = 0
member_attended_meetings = defaultdict(int)
Note that each committee meeting contains a 'topics' value, which is a summary of the topics discussed in that meeting.
Each meeting's topic score was calculated as follows:
1 × topic-word appearances in the meeting body +
3 × topic-word appearances in the meeting's given 'topics' field
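As a small worked sketch of this scoring rule (the lexicon and meeting text below are illustrative assumptions, not real data; the actual code matches unigrams, bigrams and trigrams against the Hebrew lexicons):

# Illustrative only: one toy topic lexicon and a tiny "meeting"
toy_lexicon = {"budget", "tax reform"}
body_words = "the budget debate covered tax reform and the budget again".split()
given_topic_words = "budget for tax reform".split()

def toy_count(words):
    # count unigram and bigram matches against the toy lexicon
    bigrams = [" ".join(words[i:i + 2]) for i in range(len(words) - 1)]
    return sum(w in toy_lexicon for w in words + bigrams)

score = 1 * toy_count(body_words) + 3 * toy_count(given_topic_words)
print(score)  # body: budget, tax reform, budget = 3; topics field: budget, tax reform = 2 -> 3 + 3*2 = 9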
In [7]:
def word_permutations(word):
    # Return the word itself plus a version with a leading Hebrew prefix
    # (ה / ב / ל) removed, so prefixed forms can still match the lexicon
    clean_word = word.strip()
    permutations = [clean_word]
    if len(clean_word) > 1 and (clean_word.startswith('ה') or clean_word.startswith('ב') or clean_word.startswith('ל')):
        permutations.append(clean_word[1:])
    return permutations

def in_lexicon(word, lexicon):
    global words_freq
    for p in word_permutations(word):
        if p in lexicon:
            words_freq[p] += 1
            return True
    return False

def lexicon_count(lexicon, words):
    # Number of words/n-grams in the given list that appear in the lexicon
    count = 0
    for word in words:
        if in_lexicon(word, lexicon):
            count += 1
    return count

def process_meeting_protocol_part(row):
    global meeting_data_global
    global running_index
    global stats
    stats['processed parts'] += 1
    if 'header' in row and row['header'] is not None:
        words = row['header'].split()
    else:
        words = []
    if 'body' in row and row['body'] is not None:
        words += row['body'].split()
    # Build 2-word and 3-word sequences so multi-word lexicon phrases can match
    words_size_2 = [" ".join(words[i:i+2]) for i in range(len(words) - 1)]
    words_size_3 = [" ".join(words[i:i+3]) for i in range(len(words) - 2)]
    for topic_name, lexicon in lexicons.items():
        meeting_data_global[topic_name + "_score"] += lexicon_count(lexicon, words)
        meeting_data_global[topic_name + "_score"] += lexicon_count(lexicon, words_size_2)
        meeting_data_global[topic_name + "_score"] += lexicon_count(lexicon, words_size_3)
In [8]:
def write_meeting_data_to_file():
    global meeting_data_global
    if meeting_data_global is None:
        print("first run, meeting_data_global is none")
        return
    array_to_write = [
        meeting_data_global['KnessetNum'],
        meeting_data_global['Year'],
        meeting_data_global['Month'],
        meeting_data_global['Day'],
        meeting_data_global['Diplomacy_score'],
        meeting_data_global['Ecologics_score'],
        meeting_data_global['Economics_score'],
        meeting_data_global['Education_score'],
        meeting_data_global['Health_score'],
        meeting_data_global['Security_score'],
        meeting_data_global['CommitteeSessionID'],
        meeting_data_global['Number'],
        meeting_data_global['Topics'],
        meeting_data_global['CommitteeID']
    ]
    # Strip commas so the values don't break the CSV format
    array_to_write = [str(w).replace(",", "") for w in array_to_write]
    outfile.write(",".join(array_to_write) + "\n")

def add_meeting_data_to_table(row):
    global topics_df
    global meeting_data_global
    # meeting_data_global holds scalars, so wrap it in a list to build a one-row frame
    if topics_df is None:
        topics_df = pd.DataFrame([meeting_data_global])
    else:
        topics_df = pd.concat([topics_df, pd.DataFrame([meeting_data_global])], ignore_index=True)
In [9]:
def update_score_with_meeting_given_topics(given_topics_string):
    topic_words = given_topics_string.split()
    topic_words_size_2 = [" ".join(topic_words[i:i+2]) for i in range(len(topic_words) - 1)]
    topic_words_size_3 = [" ".join(topic_words[i:i+3]) for i in range(len(topic_words) - 2)]
    for topic_name, lexicon in lexicons.items():
        count = lexicon_count(lexicon, topic_words) + lexicon_count(lexicon, topic_words_size_2) + lexicon_count(lexicon, topic_words_size_3)
        # Words from the meeting's given 'topics' field are weighted x3
        meeting_data_global[topic_name + "_score"] = count * 3

def initialize_meeting_data_global(meeting_row):
    global meeting_data_global
    topics_exists_in_meeting_data = meeting_row['topics'] is not None
    given_topics_string = " ; ".join(meeting_row['topics']).replace(",", "").replace("\n", "") if topics_exists_in_meeting_data else ""
    meeting_data_global = {
        'KnessetNum': meeting_row['KnessetNum'],
        'Year': str(meeting_row['StartDate']).split("-")[0],
        'Month': str(meeting_row['StartDate']).split("-")[1],
        'Day': str(meeting_row['StartDate']).split("-")[2].split(' ')[0],
        'CommitteeSessionID': meeting_row['CommitteeSessionID'],
        'Number': meeting_row['Number'],
        'Topics': given_topics_string,
        'CommitteeID': meeting_row['CommitteeID']
    }
    for topic_name in topics:
        meeting_data_global[topic_name + "_score"] = 0
    if topics_exists_in_meeting_data:
        update_score_with_meeting_given_topics(given_topics_string)

def process_meeting(row):
    global stats
    global meeting_data_global
    stats['total meetings'] += 1
    parts_filename = row['parts_parsed_filename']
    keep_processing = PROCESS_PARTS_LIMIT == -1 or stats['processed parts'] < PROCESS_PARTS_LIMIT
    # Process meeting
    if parts_filename and keep_processing:
        # Add the previous meeting's data to the output file
        write_meeting_data_to_file()
        # Initialize new global data for the current meeting
        initialize_meeting_data_global(row)
        # Define the meeting processing steps
        # load data step
        steps = (load_knesset_data('committees/meeting_protocols_parts/' + parts_filename, USE_DATA),)
        if USE_CACHE and PROCESS_PARTS_CACHE:
            steps = (cache(*steps, cache_path='../.cache/committee-meeting-protocol-parts/' + parts_filename),)
        # Text insights step
        steps += (process_meeting_protocol_part,)
        # Run meeting processing steps
        Flow(*steps).process()
In [10]:
process_steps = (
process_meeting,
)
In [ ]:
from dataflows import Flow, dump_to_path
Flow(*load_steps, *process_steps, dump_to_path('../data/committee-meeting-attendees-parts')).process()
In [12]:
outfile.close()
In [13]:
words = list(words_freq.keys())
freqs = list(words_freq.values())
word_freq_d = {"Lexicon words": words,
               "Frequency": freqs}
word_freq_df = pd.DataFrame(word_freq_d)
word_freq_df.to_csv("Extracted_data/words_freq_knesset_" + str(KNESSET_NUM) + ".csv")