In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
# Read the raw table from Pazmany_Ossz.csv (expected next to the notebook).
felv = pd.read_csv(filepath_or_buffer="Pazmany_Ossz.csv")
# Preview the first five rows as the cell output.
felv.head(n=5)
Out[2]:
In [3]:
# Keep only the rows whose Tipus column equals "A".
# NOTE(review): judging by the variable name this is presumably the
# state-funded category - confirm against the data dictionary.
allamiak = felv.loc[felv["Tipus"] == "A"]
allamiak.head(n=5)
Out[3]:
In [4]:
# Load the point-threshold table and rescale both threshold columns by the
# row's Max_pont, so they are expressed as a fraction of that value.
pont = pd.read_csv("Ponthatarok.csv").assign(
    Max_pontszam=lambda frame: frame["Max_pontszam"] / frame["Max_pont"],
    Min_pontszam=lambda frame: frame["Min_pontszam"] / frame["Max_pont"],
)
pont.head()
Out[4]:
In [5]:
## Inner-join the filtered admissions rows with the thresholds, matching on
## the (Ev, Kar) column pair.
joined = allamiak.merge(pont, on=["Ev", "Kar"])
In [6]:
# Distinct values of the Kar column that survived the join.
joined["Kar"].unique()
Out[6]:
In [7]:
from bokeh.plotting import figure, output_file, output_notebook, show, gridplot
from bokeh.models import ColumnDataSource
In [8]:
# Configure Bokeh to render its output inside the notebook (the Bokeh
# counterpart of %matplotlib inline). Must run before any show() call.
output_notebook()
In [9]:
# Toolbar tools attached to every figure below, passed to figure(tools=...)
# as one comma-separated string.
TOOLS = ",".join(
    [
        "pan",
        "hover",
        "wheel_zoom",
        "box_zoom",
        "reset",
        "save",
        "box_select",
        "lasso_select",
    ]
)
In [10]:
## One sub-frame per value of Kar, each wrapped in its own ColumnDataSource.
## Figures that reference the same source object share selections.
btk, itk, jak, vjk = (
    joined[joined["Kar"] == code] for code in ("BTK", "ITK", "JAK", "VJK")
)
btk_pl, itk_pl, jak_pl, vjk_pl = (
    ColumnDataSource(frame) for frame in (btk, itk, jak, vjk)
)
# Show which columns are available to the glyphs.
btk_pl.column_names
Out[10]:
In [11]:
## First Bokeh figure: Allami_Jel against Ev for the BTK rows, drawn as
## markers plus a connecting line that share one legend entry.
p = figure(
    title="Jelentkezok szama (allamilag tam.) karonkent",
    tools=TOOLS,
    plot_width=550,
    plot_height=450,
)
for draw in (p.scatter, p.line):
    draw("Ev", "Allami_Jel", source=btk_pl, legend="BTK", color="red")
show(p)
Out[11]:
In [12]:
## Linked subplot.
## Linked panning: p2 reuses p's x_range object, so both figures move
## together along the x axis.
p2 = figure(
    title="Felvettek szama (all. tam.) osszesen",
    tools=TOOLS,
    plot_width=450,
    plot_height=450,
    x_range=p.x_range,
)
## Linked brushing: both figures read from the same source (btk_pl).
btk_glyph_opts = dict(source=btk_pl, legend="BTK", color="red")
p2.scatter("Ev", "Allami_Felv", **btk_glyph_opts)
p2.line("Ev", "Allami_Felv", **btk_glyph_opts)
Out[12]:
In [13]:
# Lay the two figures out side by side in a single-row grid.
s = gridplot([[p,p2]])
# Both figures use the same data source, so a selection made in one
# (e.g. with the lasso tool) is highlighted in the other automatically.
show(s)
Out[13]:
In [14]:
## Allami_Felv against Ev for every faculty on one figure.
## Note: this rebinds p2, replacing the linked figure created earlier.
p2 = figure(
    title="Felvettek szama (all. tam.) osszesen",
    tools=TOOLS,
    plot_width=550,
    plot_height=450,
)
# (legend label, data source, colour) in drawing order.
for label, source, colour in (
    ("BTK", btk_pl, "red"),
    ("ITK", itk_pl, "orange"),
    ("VJK", vjk_pl, "green"),
    ("JAK", jak_pl, "blue"),
):
    p2.scatter("Ev", "Allami_Felv", source=source, legend=label, color=colour)
    p2.line("Ev", "Allami_Felv", source=source, legend=label, color=colour)
show(p2)
Out[14]:
In [15]:
## Histograms of the normalized maximum thresholds, one per faculty.
# All histograms share the same range and bin count, so their bin edges
# coincide and the bars can be compared directly.
x_range = [min(pont.Max_pontszam), max(pont.Max_pontszam)]
hist_opts = dict(bins=50, range=x_range, density=False)
hist_btk, edges_btk = np.histogram(btk.Max_pontszam, **hist_opts)
hist_itk, edges_itk = np.histogram(itk.Max_pontszam, **hist_opts)
hist_vjk, edges_vjk = np.histogram(vjk.Max_pontszam, **hist_opts)
hist_jak, edges_jak = np.histogram(jak.Max_pontszam, **hist_opts)
# Inspect one result (counts, edges) as the cell output.
hist_vjk, edges_vjk
Out[15]:
In [16]:
## Histogram figure of the maximum thresholds, one set of bars per faculty.
pont_figure = figure(title = "Maximum ponthatarok", tools=TOOLS, plot_width=550, plot_height=450)
# Each faculty is drawn from its own (counts, edges) pair, in the same colour
# it has in the other figures (BTK red, ITK orange, VJK green, JAK blue).
# The earlier version swapped the JAK/VJK colours and reused the BTK edges
# for every faculty.
for label, hist, edges, colour in (
    ("BTK", hist_btk, edges_btk, "red"),
    ("ITK", hist_itk, edges_itk, "orange"),
    ("JAK", hist_jak, edges_jak, "blue"),
    ("VJK", hist_vjk, edges_vjk, "green"),
):
    pont_figure.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
                     fill_color=colour, line_color="#033649", legend=label)
pont_figure.legend.location = 'top_left'
In [17]:
# Disabled alternative: show the histogram next to the admissions plot.
#s = gridplot([[p2,pont_figure]])
# Render the histogram figure on its own.
show(pont_figure)
Out[17]:
In [18]:
## Max_pontszam against Min_pontszam for each faculty, as markers plus a
## connecting line.
pontszamok = figure(
    title="Minimum es maximum ponthatarok",
    tools=TOOLS,
    plot_width=550,
    plot_height=450,
)
# (legend label, data source, colour) in drawing order.
for label, source, colour in (
    ("BTK", btk_pl, "red"),
    ("ITK", itk_pl, "orange"),
    ("VJK", vjk_pl, "green"),
    ("JAK", jak_pl, "blue"),
):
    pontszamok.scatter("Min_pontszam", "Max_pontszam", source=source, legend=label, color=colour)
    pontszamok.line("Min_pontszam", "Max_pontszam", source=source, legend=label, color=colour)
pontszamok.legend.location = "bottom_right"
In [19]:
# Show the all-faculty admissions plot and the min/max threshold plot side by
# side. They use the same per-faculty data sources, so selections are shared.
s = gridplot([[p2,pontszamok]])
show(s)
Out[19]:
In [20]:
## Columns included in the pairwise correlation analysis below.
## Their order here fixes the row/column order of the correlograms.
important_columns = [
    "Ossz_Jel",
    "Elso_Jel",
    "Ossz_Felv",
    "Allami_Felv",
    "Max_pontszam",
    "Min_pontszam",
]
In [21]:
## compute correlations and create a data frame
def computeCorrelations(df, columns=None):
    """Return the flattened correlation matrix of selected columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the numeric columns to correlate.
    columns : sequence of str, optional
        Column names to use. Defaults to the notebook-level
        ``important_columns`` list, which was the only previous behaviour.

    Returns
    -------
    pandas.DataFrame
        A single column ``"cor"`` with ``len(columns) ** 2`` rows: the
        correlation matrix in row-major order (row i, column j of the matrix
        is at position ``i * len(columns) + j``).
    """
    if columns is None:
        columns = important_columns
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available in
    # both old and new pandas. np.corrcoef treats rows as variables, hence .T.
    cor = np.corrcoef(df[list(columns)].values.T)
    # One long column is the shape ColumnDataSource wants for the heat map.
    return pd.DataFrame(data={"cor": cor.flatten()})
In [22]:
# Flattened correlation values for the BTK rows (one "cor" column).
tmp_df_btk = computeCorrelations(btk)
In [23]:
## Diverging palette: index 0 is used for correlation -1, the middle entry
## for 0 and the last entry for +1.
colors = [
    "#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce",
    "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]


def chooseColors(values):
    """Map correlation values in [-1, +1] to entries of ``colors``.

    The value is scaled linearly onto the palette indices and rounded up, so
    -1 maps to the first colour, 0 to the middle one and +1 to the last.

    Parameters
    ----------
    values : array-like of float
        Correlation coefficients.

    Returns
    -------
    list of str
        One colour per input value, in input order.

    Notes
    -----
    NaN (e.g. the correlation with a constant column) is drawn with the
    colour used for 0 instead of raising, and indices are clipped to the
    palette so values marginally outside [-1, 1] cannot overflow it.
    """
    last = len(colors) - 1
    half = last / 2.0
    vals = np.asarray(values, dtype=float).ravel()
    # Treat an undefined correlation as "no correlation".
    vals = np.where(np.isnan(vals), 0.0, vals)
    idx = np.clip(np.ceil(vals * half + half), 0, last)
    return [colors[int(i)] for i in idx]
In [24]:
# One palette colour per BTK correlation value, in the same order.
values = chooseColors(tmp_df_btk.cor.values)
In [25]:
## creating the input column data source
def createColumnDS(tmp_df, values):
    """Build the ColumnDataSource behind one correlogram.

    Parameters
    ----------
    tmp_df : pandas.DataFrame
        Flattened correlation values as produced by ``computeCorrelations``.
    values : sequence of str
        One colour per row of ``tmp_df``.

    Returns
    -------
    ColumnDataSource
        The columns of ``tmp_df`` plus ``colors``, ``x_range`` and
        ``y_range``. The last two are 1-based grid positions: ``x_range``
        cycles fastest, matching the row-major flattening of the matrix.
    """
    # Derive the grid size from the column list instead of hard-coding 6, so
    # editing important_columns keeps the positions in step with the data.
    n = len(important_columns)
    positions = np.arange(1, n + 1)

    corgram = ColumnDataSource(tmp_df)
    corgram.add(data=values, name="colors")
    # NOTE(review): the original comment says rect only accepts numeric x/y
    # here; how numeric positions land on a categorical range depends on the
    # Bokeh version - confirm against the installed release.
    corgram.add(data=np.tile(positions, n), name="x_range")
    corgram.add(data=np.repeat(positions, n), name="y_range")
    return corgram
In [26]:
# Data source for the BTK correlogram (values, colours and grid positions).
corgram = createColumnDS(tmp_df_btk, values)
In [27]:
## for tooltips
from bokeh.models import HoverTool
In [28]:
def createCorgramPlot(faculty_name, corgram):
    """Draw a correlogram (coloured square per variable pair) for one faculty.

    Parameters
    ----------
    faculty_name : str
        Used as the figure title.
    corgram : ColumnDataSource
        Must provide the ``x_range``, ``y_range``, ``colors`` and ``cor``
        columns (see ``createColumnDS``).

    Returns
    -------
    The configured Bokeh figure; the caller decides when to show it.
    """
    fig = figure(
        title=faculty_name,
        x_range=important_columns,
        y_range=important_columns,
        x_axis_location="above",
        plot_width=500,
        plot_height=500,
        toolbar_location="left",
        tools=TOOLS,
    )
    # One 1x1 rectangle per matrix cell, filled from the "colors" column.
    fig.rect("x_range", "y_range", 1, 1, source=corgram, color="colors")

    ## aesthetics: no grid, no axis lines or ticks, labels flush with the
    ## cells and slanted on the x axis
    fig.grid.grid_line_color = None
    axes = fig.axis
    axes.axis_line_color = None
    axes.major_tick_line_color = None
    axes.major_label_text_font_size = "10pt"
    axes.major_label_standoff = 0
    fig.xaxis.major_label_orientation = np.pi/3

    # Configure the hover tool that TOOLS added to the figure.
    hover = fig.select_one(HoverTool)
    hover.tooltips = [
        ('variables', '@x_range, @y_range'),
        ('value', '@cor'),
    ]
    return fig
In [29]:
# Build and display the correlogram for BTK.
corplot_btk = createCorgramPlot("BTK", corgram)
show(corplot_btk)
Out[29]:
In [30]:
# Correlograms for the remaining faculties, built with the same three steps
# as BTK: correlations -> colours + data source -> figure.
tmp_df_itk = computeCorrelations(itk)
tmp_df_jak = computeCorrelations(jak)
tmp_df_vjk = computeCorrelations(vjk)

corgram = createColumnDS(tmp_df_itk, chooseColors(tmp_df_itk.cor.values))
corplot_itk = createCorgramPlot("ITK", corgram)

corgram = createColumnDS(tmp_df_jak, chooseColors(tmp_df_jak.cor.values))
corplot_jak = createCorgramPlot("JAK", corgram)

# corgram is left bound to the VJK source after this cell.
corgram = createColumnDS(tmp_df_vjk, chooseColors(tmp_df_vjk.cor.values))
corplot_vjk = createCorgramPlot("VJK", corgram)
In [31]:
# Arrange the four correlograms in a 2x2 grid (BTK, ITK / JAK, VJK) and show it.
s = gridplot([[corplot_btk, corplot_itk], [corplot_jak, corplot_vjk]])
show(s)
Out[31]:
In [ ]: