Topics discussed in Knesset committees.

Based on transcripts of the Knesset committees.
The work was done at the Public Knowledge Workshop hackathon and won the 3rd place prize.


In [1]:
from collections import defaultdict
from dataflows import Flow
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

Collecting committee texts and analyzing them.

We used a library called dataflows, since it is the one used by the Public Knowledge Workshop. The data download is done in parts: each committee transcript is divided into parts and processed separately.
Downloading and analyzing the data for a single Knesset took around 6 hours.
This is why we kept a cache of the downloaded data and saved the analyzed data for each Knesset to files.
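
Since a full run per Knesset takes hours, caching is the backbone of the notebook. Here is a minimal sketch of the dataflows pattern used below - composing load, filter, and cache steps. The datapackage URL is the one logged later in this notebook; the cache path is illustrative:

# Minimal sketch of the dataflows caching pattern used throughout this notebook
from dataflows import Flow, load, filter_rows, cache, printer

url = 'https://storage.googleapis.com/knesset-data-pipelines/data/people/committees/meeting-attendees/datapackage.json'
steps = (
    load(url),
    filter_rows(equals=[{'KnessetNum': 18}]),
)
# cache() saves the wrapped steps' output locally, so a 2nd run
# skips the slow download and filtering entirely.
steps = (cache(*steps, cache_path='.cache/attendees-demo'),)

Flow(*steps, printer()).process()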

Constants


In [2]:
# Limit processing of protocol parts for development, -1 means no limit.
PROCESS_PARTS_LIMIT = -1

# Knesset num to query
KNESSET_NUM = 18

# Enable caching of protocol parts data (not efficient, should only be used for local development with sensible PROCESS_PARTS_LIMIT)
PROCESS_PARTS_CACHE = True

# Filter the meetings to be processed, these kwargs are passed along to DataFlows filter_rows processor for meetings resource
MEETINGS_FILTER_ROWS_KWARGS = {'equals': [{'KnessetNum': KNESSET_NUM}]}

# Don't use local data - load everything from the Knesset data remote storage.
# When set to False - also enables caching, so you won't download from remote storage on a 2nd run.
USE_DATA = False
USE_CACHE = not USE_DATA

Create a file to save the extracted data insights


In [3]:
# The file stays open across cells and is closed after the flow runs.
outfile = open("Extracted_data/meetings_topics_knesset_" + str(KNESSET_NUM) + ".csv", 'w')
outfile.write(",".join([
    'KnessetNum',
    'Year',
    'Month',
    'Day',
    'Diplomacy_score',
    'Ecologics_score',
    'Economics_score',
    'Education_score',
    'Health_score',
    'Security_score',
    'CommitteeSessionID',
    'Number',
    'Topics',
    'CommitteeID']) + "\n")


Out[3]:
163

(The 163 is the return value of outfile.write: the number of characters in the header row.)

Loading lexicons

We manually created Hebrew lexicons that describe the topics:
Diplomacy, Ecologics, Economics, Education, Health, Security


In [4]:
import os

dir_name = "lexicons"

def read_topic_to_set(topic_name):
    # Each lexicon file holds one term per line; strip whitespace and the UTF-8 BOM.
    with open(os.path.join(dir_name, topic_name + ".txt"), 'r') as f:
        return set(line.strip().replace("\ufeff", "") for line in f)

files = os.listdir(dir_name)
topics = [file.split('.')[0] for file in files]
lexicons = {topic_name: read_topic_to_set(topic_name) for topic_name in topics}

print(topics)


['Diplomacy', 'Ecologics', 'Economics', 'Education', 'Health', 'Security']
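
Each lexicon is a plain-text file named <Topic>.txt with one Hebrew term per line. A toy demonstration of the format and of the BOM stripping done by read_topic_to_set (the two terms below are made-up placeholders, not actual lexicon entries; assumes a UTF-8 locale):

# Illustration only: write a demo lexicon file with a UTF-8 BOM, then read it
# back the way read_topic_to_set does.
os.makedirs("lexicons_demo", exist_ok=True)
with open("lexicons_demo/Demo.txt", "w", encoding="utf-8-sig") as f:
    f.write("ביטחון\nצבא\n")

with open("lexicons_demo/Demo.txt", "r", encoding="utf-8") as f:
    print(set(line.strip().replace("\ufeff", "") for line in f))
# {'ביטחון', 'צבא'} - the BOM on the first line has been stripped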

Load source data


In [5]:
from dataflows import filter_rows, cache
from datapackage_pipelines_knesset.common_flow import load_knesset_data, load_member_names

# Loads a dict mapping each Knesset member ID to the member's name
member_names = load_member_names(use_data=USE_DATA)

# define flow steps for loading the source committee meetings data
# the actual loading is done later in the Flow
load_steps = (
    load_knesset_data('people/committees/meeting-attendees/datapackage.json', USE_DATA),
    filter_rows(**MEETINGS_FILTER_ROWS_KWARGS)
)

if USE_CACHE:
    # when loading from URL - enable caching which will skip loading on 2nd run
    path = '../.cache/people-committee-meeting-attendees-knesset-{}'.format(KNESSET_NUM)
    load_steps = (cache(*load_steps, cache_path=path),)


loading from url: https://storage.googleapis.com/knesset-data-pipelines/data/members/mk_individual/datapackage.json
using cache data from .cache/members-mk-individual-names
loading from url: https://storage.googleapis.com/knesset-data-pipelines/data/people/committees/meeting-attendees/datapackage.json

Globals


In [6]:
running_index = 0
meeting_data_global = None
topics_df = None  # optional DataFrame sink, used by add_meeting_data_to_table below

words_freq = defaultdict(int)
stats = defaultdict(int)
stats['processed parts'] = 0
member_attended_meetings = defaultdict(int)

Process row - extract topic appearance counts from each row

Note that each committee meeting has a 'topics' value, which is a summary of the topics discussed in the meeting.
Each meeting's topic score is calculated as:
1 × (topic-word appearances in the meeting body) + 3 × (topic-word appearances in the meeting's given 'topics' field)
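
For example, under this weighting, a hypothetical meeting whose protocol body contains Security-lexicon terms 4 times and whose given 'topics' field contains one such term would score:

# Hypothetical counts, for illustration only
body_appearances = 4    # each body match carries weight 1
topics_appearances = 1  # each 'topics' field match carries weight 3
security_score = 1 * body_appearances + 3 * topics_appearances
print(security_score)  # 7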


In [7]:
def word_permutations(word):
    # Hebrew words often carry a one-letter prefix: ה (the), ב (in), ל (to).
    # Try the word both as-is and with such a prefix stripped.
    clean_word = word.strip()
    permutations = [clean_word]
    if len(clean_word) > 1 and (clean_word.startswith('ה') or clean_word.startswith('ב') or clean_word.startswith('ל')):
        permutations.append(clean_word[1:])
    return permutations


def in_lexicon(word, lexicon):
    global words_freq
    for p in word_permutations(word):
        if p in lexicon:
            words_freq[p] += 1
            return True
    return False

            
def lexicon_count(lexicon, words):
    count = 0
    for word in words:
        if in_lexicon(word, lexicon):
            count += 1
    return count   


def process_meeting_protocol_part(row):
    global meeting_data_global
    global running_index
    global stats
    stats['processed parts'] += 1
    
    if 'header' in row and row['header'] is not None:
        words = row['header'].split()
    else:
        words = []
    if 'body' in row and row['body'] is not None:
        words += row['body'].split()      
    # Build all two- and three-word windows so multi-word lexicon terms can match.
    words_size_2 = [" ".join(words[i:i+2]) for i in range(len(words) - 1)]
    words_size_3 = [" ".join(words[i:i+3]) for i in range(len(words) - 2)]
                    
    for topic_name, lexicon in lexicons.items():
        meeting_data_global[topic_name + "_score"] += lexicon_count(lexicon, words)
        meeting_data_global[topic_name + "_score"] += lexicon_count(lexicon, words_size_2)  
        meeting_data_global[topic_name + "_score"] += lexicon_count(lexicon, words_size_3)
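
To make the sliding windows concrete, here is what the two- and three-word lists look like for a toy word list (the real input is Hebrew protocol text):

# Toy demonstration of the n-gram windows built above
demo = ["national", "security", "council", "budget"]
bigrams = [" ".join(demo[i:i+2]) for i in range(len(demo) - 1)]
trigrams = [" ".join(demo[i:i+3]) for i in range(len(demo) - 2)]
print(bigrams)   # ['national security', 'security council', 'council budget']
print(trigrams)  # ['national security council', 'security council budget']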

Append each meeting's insights to the output file


In [8]:
def write_meeting_data_to_file():
    global meeting_data_global
    if meeting_data_global is None:
        print("first run, meeting_data_global is none")
        return

    array_to_write = [
        meeting_data_global['KnessetNum'],
        meeting_data_global['Year'],
        meeting_data_global['Month'],
        meeting_data_global['Day'],
        meeting_data_global['Diplomacy_score'],
        meeting_data_global['Ecologics_score'],
        meeting_data_global['Economics_score'],
        meeting_data_global['Education_score'],
        meeting_data_global['Health_score'],
        meeting_data_global['Security_score'],
        meeting_data_global['CommitteeSessionID'],
        meeting_data_global['Number'],
        meeting_data_global['Topics'],
        meeting_data_global['CommitteeID']
    ]
    # Strip commas from every field so they don't break the comma-separated format.
    array_to_write = [str(w).replace(",","") for w in array_to_write]
    outfile.write(",".join(array_to_write) + "\n")
            
def add_meeting_data_to_table(row):
    # Alternative sink: accumulate the per-meeting insights in a DataFrame
    # instead of the CSV file. meeting_data_global holds scalars, so wrap it
    # in a list to build a single-row frame.
    global topics_df
    global meeting_data_global
    if topics_df is None:
        topics_df = pd.DataFrame([meeting_data_global])
    else:
        topics_df = pd.concat([topics_df, pd.DataFrame([meeting_data_global])], ignore_index=True)
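
For instance, given a toy meeting_data_global with made-up values, the line written to the CSV would look like this (the Topics string is a placeholder, not real data):

# Made-up values, for illustration only - shows the shape of one output row
demo = {'KnessetNum': 18, 'Year': '2010', 'Month': '01', 'Day': '05',
        'Diplomacy_score': 0, 'Ecologics_score': 2, 'Economics_score': 7,
        'Education_score': 0, 'Health_score': 1, 'Security_score': 4,
        'CommitteeSessionID': 123456, 'Number': 42,
        'Topics': 'hypothetical topic summary', 'CommitteeID': 21}
columns = ['KnessetNum', 'Year', 'Month', 'Day', 'Diplomacy_score',
           'Ecologics_score', 'Economics_score', 'Education_score',
           'Health_score', 'Security_score', 'CommitteeSessionID',
           'Number', 'Topics', 'CommitteeID']
print(",".join(str(demo[c]).replace(",", "") for c in columns))
# 18,2010,01,05,0,2,7,0,1,4,123456,42,hypothetical topic summary,21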

Process meeting


In [9]:
def update_score_with_meeting_given_topics(given_topics_string):
    topic_words = given_topics_string.split()
    topic_words_size_2 = [" ".join(topic_words[i:i+2]) for i in range(len(topic_words) - 1)]
    topic_words_size_3 = [" ".join(topic_words[i:i+3]) for i in range(len(topic_words) - 2)]

    for topic_name, lexicon in lexicons.items():
        count = lexicon_count(lexicon, topic_words) + lexicon_count(lexicon, topic_words_size_2) + lexicon_count(lexicon, topic_words_size_3)
        # Matches in the given 'topics' field carry a weight of 3 (body matches carry 1).
        meeting_data_global[topic_name + "_score"] = count * 3
    

def initialize_meeting_data_global(meeting_row):
    global meeting_data_global
    topics_exists_in_meeting_data = meeting_row['topics'] is not None
    given_topics_string = " ; ".join(meeting_row['topics']).replace(",", "").replace("\n", "") if topics_exists_in_meeting_data else ""

    meeting_data_global = {
        'KnessetNum': meeting_row['KnessetNum'],
        'Year': str(meeting_row['StartDate']).split("-")[0],
        'Month': str(meeting_row['StartDate']).split("-")[1],
        'Day': str(meeting_row['StartDate']).split("-")[2].split(' ')[0],
        'CommitteeSessionID': meeting_row['CommitteeSessionID'],
        'Number': meeting_row['Number'],
        'Topics': given_topics_string,
        'CommitteeID': meeting_row['CommitteeID']
    } 
    
    for topic_name in topics:
        meeting_data_global[topic_name + "_score"] = 0
    
    if topics_exists_in_meeting_data:
        update_score_with_meeting_given_topics(given_topics_string)

def process_meeting(row):
    global stats
    global meeting_data_global
    stats['total meetings'] += 1
    parts_filename = row['parts_parsed_filename']
    keep_processing = PROCESS_PARTS_LIMIT == -1 or stats['processed parts'] < PROCESS_PARTS_LIMIT
    
    # Process meeting
    if parts_filename and keep_processing:
        
        # Write the previous meeting's data to the file
        write_meeting_data_to_file()
        
        # Initialize the global data for this meeting
        initialize_meeting_data_global(row)
        
        # Define the meeting processing steps
        # load data step
        steps = (load_knesset_data('committees/meeting_protocols_parts/' + parts_filename, USE_DATA),)
        if USE_CACHE and PROCESS_PARTS_CACHE:
            steps = (cache(*steps, cache_path='../.cache/committee-meeting-protocol-parts/' + parts_filename),)
            
        # Text insights step
        steps += (process_meeting_protocol_part,)
        
        # Run meeting processing steps
        Flow(*steps).process()

Define flow


In [10]:
process_steps = (
    process_meeting,
)

Run flow


In [ ]:
from dataflows import Flow, dump_to_path

Flow(*load_steps, *process_steps, dump_to_path('../data/committee-meeting-attendees-parts')).process()

In [12]:
# Flush the last meeting's data - process_meeting only writes a meeting's data
# when the next meeting starts, so the final meeting must be written here.
write_meeting_data_to_file()
outfile.close()

Saving the lexicon words that influenced the topic scores the most.


In [13]:
words = list(words_freq.keys())
freqs = list(words_freq.values())
word_freq_d = {"Lexicon words": words,
               "Frequency": freqs}
# Sort so the lexicon words that matched most often come first.
word_freq_df = pd.DataFrame(word_freq_d).sort_values("Frequency", ascending=False)
word_freq_df.to_csv("Extracted_data/words_freq_knesset_" + str(KNESSET_NUM) + ".csv")
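
As a quick sanity check, the most influential terms can also be inspected directly (this relies on word_freq_df being sorted by frequency in the cell above):

# The ten lexicon terms that matched most often across all processed meetings
print(word_freq_df.head(10))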