In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the admissions table (relative path, resolved against the working
# directory) and preview its first five rows.
felv = pd.read_csv(filepath_or_buffer="Pazmany_Ossz.csv")
felv.head(n=5)


Out[2]:
Kar Ev Tipus Ossz_Jel Elso_Jel Nappali_Jel Allami_Jel Ossz_Felv Nappali_Felv Allami_Felv
0 BTK 2015 P 85 85 47 0 46 25 0
1 BTK 2015 A 5713 1781 4028 5365 1332 857 1185
2 BTK 2014 P 94 94 48 0 61 34 0
3 BTK 2014 A 4386 1314 2803 4117 1240 749 1123
4 BTK 2013 P 93 93 77 0 62 50 0

In [3]:
# Keep only the rows whose Tipus equals "A"; every other Tipus value
# (e.g. the "P" rows visible in the preview above) is left out.
allamiak = felv.loc[felv["Tipus"] == "A"]
allamiak.head(n=5)


Out[3]:
Kar Ev Tipus Ossz_Jel Elso_Jel Nappali_Jel Allami_Jel Ossz_Felv Nappali_Felv Allami_Felv
1 BTK 2015 A 5713 1781 4028 5365 1332 857 1185
3 BTK 2014 A 4386 1314 2803 4117 1240 749 1123
5 BTK 2013 A 3149 907 2111 2855 882 609 782
7 BTK 2012 A 4440 1195 3078 3856 1031 627 693
9 BTK 2011 A 3846 1192 3004 3378 1081 732 647

In [4]:
pont = pd.read_csv("Ponthatarok.csv")
# Normalise both thresholds by the maximum attainable score of the same row,
# so rows with a different Max_pont become comparable on a common scale.
max_attainable = pont["Max_pont"]
pont["Max_pontszam"] = pont["Max_pontszam"].div(max_attainable)
pont["Min_pontszam"] = pont["Min_pontszam"].div(max_attainable)
pont.head()


Out[4]:
Ev Kar Max_pontszam Min_pontszam Max_pont
0 2015 BTK 0.936000 0.560000 500
1 2014 BTK 0.930000 0.530000 500
2 2013 BTK 0.930000 0.512000 500
3 2012 BTK 0.912500 0.754167 480
4 2011 BTK 0.902083 0.764583 480

In [5]:
# Attach the normalised thresholds to the admission rows. Rows are matched on
# year ("Ev") and faculty ("Kar"); with the default inner join only the
# (Ev, Kar) pairs present in both frames are kept.
joined = allamiak.merge(pont, on=["Ev", "Kar"])

In [6]:
# Faculties that are present in the joined frame.
joined["Kar"].unique()


Out[6]:
array(['BTK', 'ITK', 'JAK', 'VJK'], dtype=object)

In [7]:
from bokeh.plotting import figure, output_file, output_notebook,  show, gridplot
from bokeh.models import ColumnDataSource

In [8]:
# Configure bokeh to display its output inside the notebook
# (similar to what `%matplotlib inline` does for matplotlib).
output_notebook()


Loading BokehJS ...

In [9]:
# Toolbar tools passed to every figure created in this notebook.
# bokeh expects them as one comma-separated string.
TOOLS = ",".join([
    "pan", "hover", "wheel_zoom", "box_zoom",
    "reset", "save", "box_select", "lasso_select",
])

In [10]:
# One sub-frame per faculty, selected by the value of the "Kar" column.
btk, itk, jak, vjk = (
    joined[joined["Kar"] == kar] for kar in ("BTK", "ITK", "JAK", "VJK")
)

# Wrap every sub-frame in a bokeh ColumnDataSource so glyphs can refer to
# its columns by name.
btk_pl, itk_pl, jak_pl, vjk_pl = (
    ColumnDataSource(frame) for frame in (btk, itk, jak, vjk)
)
btk_pl.column_names


Out[10]:
['Tipus',
 'Max_pont',
 'Min_pontszam',
 'Elso_Jel',
 'Nappali_Jel',
 'Allami_Felv',
 'Allami_Jel',
 'index',
 'Kar',
 'Ev',
 'Nappali_Felv',
 'Max_pontszam',
 'Ossz_Jel',
 'Ossz_Felv']

In [11]:
# First bokeh figure: the "Allami_Jel" column of the BTK source plotted
# against "Ev", drawn as markers joined by a line.
p = figure(
    title="Jelentkezok szama (allamilag tam.) karonkent",
    tools=TOOLS,
    plot_width=550,
    plot_height=450,
)

# Both glyphs read the same source and share one legend label and colour.
btk_jel_style = dict(source=btk_pl, legend="BTK", color="red")
p.scatter("Ev", "Allami_Jel", **btk_jel_style)
p.line("Ev", "Allami_Jel", **btk_jel_style)

show(p)


Out[11]:

<Bokeh Notebook handle for In[11]>


In [12]:
# Second figure, linked to `p` in two ways:
#  * it reuses p.x_range, so panning/zooming along x moves both figures;
#  * its glyphs read from the same source (btk_pl), which enables linked
#    brushing between the two figures.
p2 = figure(
    title="Felvettek szama (all. tam.) osszesen",
    tools=TOOLS,
    plot_width=450,
    plot_height=450,
    x_range=p.x_range,
)
btk_felv_style = dict(source=btk_pl, legend="BTK", color="red")
p2.scatter("Ev", "Allami_Felv", **btk_felv_style)
p2.line("Ev", "Allami_Felv", **btk_felv_style)


Out[12]:
<bokeh.models.renderers.GlyphRenderer at 0x1168daf28>

In [13]:
# Lay the two linked figures out side by side in a one-row grid.
linked_row = [p, p2]
s = gridplot([linked_row])
# Both figures draw from the same source, so linked brushing is enabled
# automatically (try the lasso tool).
show(s)


Out[13]:

<Bokeh Notebook handle for In[13]>


In [14]:
# One figure with all four faculties: "Allami_Felv" against "Ev", each
# faculty drawn as markers plus a connecting line in its own colour.
p2 = figure(
    title="Felvettek szama (all. tam.) osszesen",
    tools=TOOLS,
    plot_width=550,
    plot_height=450,
)

# (legend label, data source, colour) in drawing order.
for kar_label, kar_source, kar_color in (
    ("BTK", btk_pl, "red"),
    ("ITK", itk_pl, "orange"),
    ("VJK", vjk_pl, "green"),
    ("JAK", jak_pl, "blue"),
):
    p2.scatter("Ev", "Allami_Felv", source=kar_source, legend=kar_label, color=kar_color)
    p2.line("Ev", "Allami_Felv", source=kar_source, legend=kar_label, color=kar_color)

show(p2)


Out[14]:

<Bokeh Notebook handle for In[14]>


In [15]:
# Histograms of the normalised maximum thresholds, one per faculty.
# All four use the same value range (taken from the full `pont` table) and
# the same number of bins, so their bin edges coincide.
x_range = [min(pont.Max_pontszam), max(pont.Max_pontszam)]
shared_binning = dict(density=False, range=x_range, bins=50)

hist_btk, edges_btk = np.histogram(btk["Max_pontszam"], **shared_binning)
hist_itk, edges_itk = np.histogram(itk["Max_pontszam"], **shared_binning)
hist_vjk, edges_vjk = np.histogram(vjk["Max_pontszam"], **shared_binning)
hist_jak, edges_jak = np.histogram(jak["Max_pontszam"], **shared_binning)
hist_vjk, edges_vjk


Out[15]:
(array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0]),
 array([ 0.6       ,  0.60744444,  0.61488889,  0.62233333,  0.62977778,
         0.63722222,  0.64466667,  0.65211111,  0.65955556,  0.667     ,
         0.67444444,  0.68188889,  0.68933333,  0.69677778,  0.70422222,
         0.71166667,  0.71911111,  0.72655556,  0.734     ,  0.74144444,
         0.74888889,  0.75633333,  0.76377778,  0.77122222,  0.77866667,
         0.78611111,  0.79355556,  0.801     ,  0.80844444,  0.81588889,
         0.82333333,  0.83077778,  0.83822222,  0.84566667,  0.85311111,
         0.86055556,  0.868     ,  0.87544444,  0.88288889,  0.89033333,
         0.89777778,  0.90522222,  0.91266667,  0.92011111,  0.92755556,
         0.935     ,  0.94244444,  0.94988889,  0.95733333,  0.96477778,
         0.97222222]))

In [16]:
# Histogram figure of the maximum thresholds: one set of bars per faculty.
pont_figure = figure(title="Maximum ponthatarok", tools=TOOLS, plot_width=550, plot_height=450)

# (legend label, bin counts, bin edges, fill colour) in drawing order.
# The colours follow the other figures of this notebook (BTK red, ITK orange,
# VJK green, JAK blue), and every faculty is drawn with its own bin edges
# instead of borrowing the BTK ones.
for kar_label, kar_hist, kar_edges, kar_color in (
    ("BTK", hist_btk, edges_btk, "red"),
    ("ITK", hist_itk, edges_itk, "orange"),
    ("JAK", hist_jak, edges_jak, "blue"),
    ("VJK", hist_vjk, edges_vjk, "green"),
):
    # Each bar spans one bin: from its left edge to its right edge, height = count.
    pont_figure.quad(top=kar_hist, bottom=0,
                     left=kar_edges[:-1], right=kar_edges[1:],
                     fill_color=kar_color, line_color="#033649", legend=kar_label)

pont_figure.legend.location = 'top_left'

In [17]:
# Show the histogram figure on its own; a side-by-side grid with p2
# (gridplot([[p2, pont_figure]])) was left disabled here.
show(pont_figure)


Out[17]:

<Bokeh Notebook handle for In[17]>


In [18]:
# Normalised minimum threshold (x) against normalised maximum threshold (y),
# one marker-plus-line series per faculty.
pontszamok = figure(
    title="Minimum es maximum ponthatarok",
    tools=TOOLS,
    plot_width=550,
    plot_height=450,
)

# (legend label, data source, colour) in drawing order.
for kar_label, kar_source, kar_color in (
    ("BTK", btk_pl, "red"),
    ("ITK", itk_pl, "orange"),
    ("VJK", vjk_pl, "green"),
    ("JAK", jak_pl, "blue"),
):
    pontszamok.scatter("Min_pontszam", "Max_pontszam",
                       source=kar_source, legend=kar_label, color=kar_color)
    pontszamok.line("Min_pontszam", "Max_pontszam",
                    source=kar_source, legend=kar_label, color=kar_color)

pontszamok.legend.location = "bottom_right"

In [19]:
# Put the all-faculty figure and the threshold scatter plot in one row.
threshold_row = [p2, pontszamok]
s = gridplot([threshold_row])
show(s)


Out[19]:

<Bokeh Notebook handle for In[19]>


In [20]:
## What about basic correlations between variables?
# Columns that enter the correlation matrix; their order here is the order
# in which they are selected from the data frames.
important_columns = [
    "Ossz_Jel",
    "Elso_Jel",
    "Ossz_Felv",
    "Allami_Felv",
    "Max_pontszam",
    "Min_pontszam",
]

In [21]:
## compute correlations and create a data frame
def computeCorrelations(df, columns=None):
    """Return the pairwise correlations of selected columns as a one-column frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the observations, one per row.
    columns : list of str, optional
        Columns to correlate. Defaults to the notebook-level
        ``important_columns`` list.

    Returns
    -------
    pandas.DataFrame
        A single column named ``"cor"`` holding the n x n correlation matrix
        flattened row by row (n = number of selected columns).
    """
    if columns is None:
        columns = important_columns
    # `.values` replaces `DataFrame.as_matrix()`, which newer pandas releases
    # no longer provide. np.corrcoef treats every *row* of its input as one
    # variable, hence the transpose.
    cor = np.corrcoef(df[columns].values.T)
    return pd.DataFrame(data={"cor": cor.flatten()})

In [22]:
tmp_df_btk = computeCorrelations(btk)

In [23]:
## choosing colors
# Palette ordered from the colour used for -1 to the colour used for +1.
colors = [
    "#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce",
    "#ddb7b1", "#cc7878", "#933b41", "#550b1d"]

def chooseColors(values, nan_color="#ffffff"):
    """Map correlation values in [-1, +1] to palette colours.

    Parameters
    ----------
    values : array-like of float
        Values to colour; expected to lie in [-1, +1].
    nan_color : str, optional
        Colour returned for NaN entries (for example the correlation of a
        constant column).

    Returns
    -------
    list of str
        One colour per input value, in input order.
    """
    values = np.asarray(values, dtype=float)
    top = len(colors) - 1
    half = top / 2.0
    # Scale [-1, +1] onto the palette indices [0, top] and round up.
    idx = np.ceil(values * half + half)
    # Out-of-range inputs would otherwise wrap around to a negative index or
    # run past the end of the palette; pin them to the nearest end instead.
    # NaN passes through the clip and is handled below.
    idx = np.clip(idx, 0, top)
    return [nan_color if np.isnan(i) else colors[int(i)] for i in idx]

In [24]:
values = chooseColors(tmp_df_btk.cor.values)

In [25]:
## creating the input column data source
def createColumnDS(tmp_df, values):
    """Build the ColumnDataSource that backs one correlogram.

    Parameters
    ----------
    tmp_df : pandas.DataFrame
        Flattened correlation values with one row per matrix cell; expected
        to have len(important_columns) ** 2 rows.
    values : list of str
        One colour per row of ``tmp_df``.

    Returns
    -------
    ColumnDataSource
        The columns of ``tmp_df`` plus "colors", "x_range" and "y_range".
    """
    corgram = ColumnDataSource(tmp_df)
    corgram.add(data=values, name="colors")

    # Numeric cell positions 1..n derived from the number of correlated
    # columns (previously hard-coded for six columns). The x position cycles
    # fastest and the y position changes every n rows.
    # NOTE(review): presumably these positions are matched to the categorical
    # axis factors in createCorgramPlot; verify against the bokeh version in use.
    n_vars = len(important_columns)
    positions = np.arange(1, n_vars + 1)
    corgram.add(data=np.tile(positions, n_vars), name="x_range")
    corgram.add(data=np.repeat(positions, n_vars), name="y_range")
    return corgram

In [26]:
corgram = createColumnDS(tmp_df_btk, values)

In [27]:
## for tooltips
from bokeh.models import HoverTool

In [28]:
def createCorgramPlot(faculty_name, corgram):
    """Create the correlogram figure for one faculty.

    Parameters
    ----------
    faculty_name : str
        Used as the figure title.
    corgram : ColumnDataSource
        Source providing the "x_range", "y_range", "colors" and "cor" columns.

    Returns
    -------
    The configured bokeh figure (not yet shown).
    """
    plot = figure(
        title=faculty_name,
        x_range=important_columns,
        y_range=important_columns,
        x_axis_location="above",
        plot_width=500,
        plot_height=500,
        toolbar_location="left",
        tools=TOOLS,
    )

    # One 1x1 rectangle per source row, filled with that row's colour.
    plot.rect("x_range", "y_range", 1, 1, source=corgram, color="colors")

    # Tooltip of the hover tool requested through TOOLS.
    hover = plot.select_one(HoverTool)
    hover.tooltips = [
        ("variables", "@x_range, @y_range"),
        ("value", "@cor"),
    ]

    # Cosmetics: no grid, no axis lines or ticks, labels close to the cells,
    # x labels rotated by pi/3.
    plot.grid.grid_line_color = None
    plot.axis.axis_line_color = None
    plot.axis.major_tick_line_color = None
    plot.axis.major_label_text_font_size = "10pt"
    plot.axis.major_label_standoff = 0
    plot.xaxis.major_label_orientation = np.pi / 3

    return plot

In [29]:
# Correlogram of the BTK faculty, built from the source prepared above.
corplot_btk = createCorgramPlot(faculty_name="BTK", corgram=corgram)
show(corplot_btk)


Out[29]:

<Bokeh Notebook handle for In[29]>


In [30]:
# Correlograms for the other faculties.
# Each faculty keeps its own correlation frame, data source and figure.
# The sources get distinct names so the BTK source `corgram` is not
# overwritten; re-running the BTK cell afterwards still plots BTK data.

tmp_df_itk = computeCorrelations(itk)
corgram_itk = createColumnDS(tmp_df_itk, chooseColors(tmp_df_itk.cor.values))
corplot_itk = createCorgramPlot("ITK", corgram_itk)

tmp_df_jak = computeCorrelations(jak)
corgram_jak = createColumnDS(tmp_df_jak, chooseColors(tmp_df_jak.cor.values))
corplot_jak = createCorgramPlot("JAK", corgram_jak)

tmp_df_vjk = computeCorrelations(vjk)
corgram_vjk = createColumnDS(tmp_df_vjk, chooseColors(tmp_df_vjk.cor.values))
corplot_vjk = createCorgramPlot("VJK", corgram_vjk)

In [31]:
# 2 x 2 grid of the four faculty correlograms.
corplot_layout = [
    [corplot_btk, corplot_itk],
    [corplot_jak, corplot_vjk],
]
s = gridplot(corplot_layout)
show(s)


Out[31]:

<Bokeh Notebook handle for In[31]>


In [ ]: