In [23]:
import pandas as pd
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings('ignore')
In [22]:
# Normalize the topics' scores
def normalize_scores(scores):
    """Replace raw topic scores with ordinal ranks.

    The three highest positive scores receive 3, 2 and 1 (highest
    first) and every other position becomes 0.  Ties are broken in
    favour of the earlier position; non-positive scores are never
    ranked (they cannot beat the zero-initialised slots).

    Args:
        scores: sequence of numeric topic scores for one meeting row.

    Returns:
        list[int]: same length as ``scores``, containing only 0-3.
    """
    values = list(scores)
    # Stable sort by descending score keeps the earlier index first on
    # ties, matching the original strict-comparison cascade.
    ranked_indices = sorted(
        (i for i, score in enumerate(values) if score > 0),
        key=lambda i: -values[i],
    )
    normalized = [0] * len(values)
    for rank, index in zip((3, 2, 1), ranked_indices):
        normalized[index] = rank
    return normalized
def get_knesset_topics(knesset_num):
    """Load per-meeting topic scores for one Knesset and rank them.

    Reads ``Extracted_data/meetings_topics_knesset_<num>.csv``, keeps the
    identifying columns plus the six topic-score columns, replaces each
    row's raw scores with 3/2/1 ranks (see ``normalize_scores``) and adds
    a fractional ``Year.Month`` column for time-series plotting.

    Args:
        knesset_num: Knesset number used to build the CSV file name.

    Returns:
        pandas.DataFrame with columns KnessetNum, Year, Month, the six
        ranked *_score columns, and Year.Month.
    """
    score_cols = ['Diplomacy_score', 'Ecologics_score', 'Economics_score',
                  'Education_score', 'Health_score', 'Security_score']
    df_knesset = pd.read_csv("Extracted_data/meetings_topics_knesset_" + str(knesset_num) + ".csv")
    # .copy() so the column assignments below modify an independent frame
    # rather than a view of df_knesset (avoids SettingWithCopyWarning and
    # potential silent no-op writes).
    smaller_df = df_knesset[['KnessetNum', 'Year', 'Month'] + score_cols].copy()
    # Rank each row's scores; pass a plain list so normalize_scores uses
    # unambiguous positional indexing (int indexing on a label-indexed
    # Series is deprecated).
    topics = smaller_df.apply(lambda row: normalize_scores(list(row.iloc[3:])), axis=1)
    smaller_df[score_cols] = pd.DataFrame(topics.tolist(), index=smaller_df.index)
    # e.g. Year 2006, Month 7 -> 2006.5; gives a sortable time axis.
    smaller_df['Year.Month'] = smaller_df['Year'] + (smaller_df['Month'] - 1) / 12.0
    return smaller_df
In [2]:
def draw_knesset_topics_over_time(knesset_num):
    """Plot the monthly mean topic ranks of one Knesset as a line chart."""
    topic_cols = ['Diplomacy_score', 'Ecologics_score', 'Economics_score',
                  'Education_score', 'Health_score', 'Security_score']
    topics_df = get_knesset_topics(knesset_num)
    # Average the 0-3 ranks within each (year, month) bucket.
    monthly_means = topics_df.groupby(['Year.Month']).mean()
    monthly_means.plot(
        y=topic_cols,
        title="Knesset " + str(knesset_num) + " - Most popular topics over time",
        figsize=(10, 5))
In [3]:
def draw_knesset_topics(knesset_num):
    """Plot each topic's overall mean rank as a horizontal bar chart."""
    topic_cols = ['Diplomacy_score', 'Ecologics_score', 'Economics_score',
                  'Education_score', 'Health_score', 'Security_score']
    topics_df = get_knesset_topics(knesset_num)
    # One mean rank per topic across every meeting of this Knesset.
    mean_ranks = topics_df[topic_cols].mean()
    mean_ranks.plot(
        y=topic_cols,
        kind='barh',
        title="Knesset " + str(knesset_num) + " - Most popular topics",
        figsize=(10, 5))
In [4]:
import csv
from wordcloud import WordCloud
def get_and_flip_freq_dictionary_for_knesset(knesset_num):
    """Read the word-frequency CSV for one Knesset into a dict.

    Each word (CSV column 1) is reversed because the words are Hebrew
    and the word-cloud renderer does not support right-to-left text;
    column 2 holds the integer frequency.

    Args:
        knesset_num: Knesset number used to build the CSV file name.

    Returns:
        dict mapping reversed word -> frequency.
    """
    path = "Extracted_data/words_freq_knesset_" + str(knesset_num) + ".csv"
    freq_dictionary = {}
    with open(path, 'r', encoding="utf-8") as csv_file:
        rows = csv.reader(csv_file)
        next(rows, None)  # skip the header row
        for row in rows:
            freq_dictionary[row[1][::-1]] = int(row[2])
    return freq_dictionary
def draw_lexicon_word_cloud_per_knesset(knesset_num, font_path='C:/Windows/Fonts/Gisha.ttf'):
    """Render a word cloud of the most frequent words for one Knesset.

    Args:
        knesset_num: Knesset number whose word-frequency CSV is loaded.
        font_path: path to a .ttf font that supports Hebrew glyphs.
            Defaults to the original hardcoded Windows path for backward
            compatibility; pass a local font on other platforms.
    """
    wordcloud = WordCloud(width=800, height=800,
                          background_color='black',
                          min_font_size=10,
                          font_path=font_path)
    freq_dictionary = get_and_flip_freq_dictionary_for_knesset(knesset_num)
    wordcloud.generate_from_frequencies(frequencies=freq_dictionary)
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
In [56]:
import seaborn as sns
def get_correlation_between_topics(knesset_nums_list):
    """Draw a heatmap of, and return, the correlation matrix between
    topic ranks across every meeting of the given Knessets.

    Args:
        knesset_nums_list: Knesset numbers whose data is pooled.

    Returns:
        pandas.DataFrame: pairwise correlations of the six topic columns.
    """
    topic_cols = ['Diplomacy_score', 'Ecologics_score', 'Economics_score',
                  'Education_score', 'Health_score', 'Security_score']
    labels = ['Diplomacy', 'Ecologics', 'Economics',
              'Education', 'Health', 'Security']
    # Pool all meetings from every requested Knesset into one frame.
    combined = pd.concat([get_knesset_topics(num) for num in knesset_nums_list])
    corr_df = combined[topic_cols].corr()
    sns.set(rc={'figure.figsize': (8, 8)})
    sns.heatmap(corr_df,
                xticklabels=labels,
                yticklabels=labels)
    return corr_df
In [6]:
# Knesset 17: monthly topic popularity over the term
draw_knesset_topics_over_time(17)
In [7]:
# Knesset 17: overall mean rank per topic
draw_knesset_topics(17)
In [8]:
# Knesset 17: word cloud of the most frequent lexicon words
draw_lexicon_word_cloud_per_knesset(17)
In [78]:
# Knesset 18: monthly topic popularity over the term
draw_knesset_topics_over_time(18)
We were able to find the following trends in Knesset 18's topic distribution:
In [79]:
# Knesset 18: overall mean rank per topic
draw_knesset_topics(18)
In [80]:
# Knesset 18: word cloud of the most frequent lexicon words
draw_lexicon_word_cloud_per_knesset(18)
In [81]:
# Knesset 20: monthly topic popularity over the term
draw_knesset_topics_over_time(20)
This graph mostly shows inertia — the topic distribution stays close to its usual level — so it is interesting to see what that "normal" subject distribution actually looks like.
In [82]:
# Knesset 20: overall mean rank per topic
draw_knesset_topics(20)
In [83]:
# Knesset 20: word cloud of the most frequent lexicon words
draw_lexicon_word_cloud_per_knesset(20)
In [57]:
# Correlations between topic ranks pooled over Knessets 17, 18 and 20
get_correlation_between_topics([17,18,20])
Out[57]:
We were surprised by the correlations between the topics shown in this matrix.