notebook.community

Edit and run



In [1]:

    
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



In [2]:

    
# Read Pazmany_Ossz.csv into a data frame and preview its first five rows.
felv = pd.read_csv(filepath_or_buffer="Pazmany_Ossz.csv")
felv.head(n=5)









    Out[2]:






  
    
      
      Kar
      Ev
      Tipus
      Ossz_Jel
      Elso_Jel
      Nappali_Jel
      Allami_Jel
      Ossz_Felv
      Nappali_Felv
      Allami_Felv
    
  
  
    
      0
      BTK
      2015
      P
      85
      85
      47
      0
      46
      25
      0
    
    
      1
      BTK
      2015
      A
      5713
      1781
      4028
      5365
      1332
      857
      1185
    
    
      2
      BTK
      2014
      P
      94
      94
      48
      0
      61
      34
      0
    
    
      3
      BTK
      2014
      A
      4386
      1314
      2803
      4117
      1240
      749
      1123
    
    
      4
      BTK
      2013
      P
      93
      93
      77
      0
      62
      50
      0



In [3]:

    
# Keep only the rows whose Tipus column equals "A", then preview them.
allamiak = felv.loc[felv["Tipus"] == "A"]
allamiak.head(n=5)









    Out[3]:






  
    
      
      Kar
      Ev
      Tipus
      Ossz_Jel
      Elso_Jel
      Nappali_Jel
      Allami_Jel
      Ossz_Felv
      Nappali_Felv
      Allami_Felv
    
  
  
    
      1
      BTK
      2015
      A
      5713
      1781
      4028
      5365
      1332
      857
      1185
    
    
      3
      BTK
      2014
      A
      4386
      1314
      2803
      4117
      1240
      749
      1123
    
    
      5
      BTK
      2013
      A
      3149
      907
      2111
      2855
      882
      609
      782
    
    
      7
      BTK
      2012
      A
      4440
      1195
      3078
      3856
      1031
      627
      693
    
    
      9
      BTK
      2011
      A
      3846
      1192
      3004
      3378
      1081
      732
      647



In [4]:

    
# Read the point thresholds, then divide both threshold columns by the row's
# Max_pont so that they become fractions of Max_pont.
pont = pd.read_csv(filepath_or_buffer="Ponthatarok.csv")
pont["Max_pontszam"] = pont["Max_pontszam"].div(pont["Max_pont"])
pont["Min_pontszam"] = pont["Min_pontszam"].div(pont["Max_pont"])
pont.head(n=5)









    Out[4]:






  
    
      
      Ev
      Kar
      Max_pontszam
      Min_pontszam
      Max_pont
    
  
  
    
      0
      2015
      BTK
      0.936000
      0.560000
      500
    
    
      1
      2014
      BTK
      0.930000
      0.530000
      500
    
    
      2
      2013
      BTK
      0.930000
      0.512000
      500
    
    
      3
      2012
      BTK
      0.912500
      0.754167
      480
    
    
      4
      2011
      BTK
      0.902083
      0.764583
      480



In [5]:

    
## combine the filtered rows with the thresholds, matching on the Ev and Kar columns
joined = allamiak.merge(pont, on=["Ev", "Kar"])



In [6]:

    
# distinct values of the Kar column that survived the merge
joined["Kar"].unique()









    Out[6]:





array(['BTK', 'ITK', 'JAK', 'VJK'], dtype=object)



In [7]:

    
from bokeh.plotting import figure, output_file, output_notebook,  show, gridplot
from bokeh.models import ColumnDataSource



In [8]:

    
# send Bokeh output to the notebook itself (plays the role %matplotlib inline has for matplotlib)
output_notebook()









    





    
        
        Loading BokehJS ...



In [9]:

    
# names of the toolbar tools requested for every figure in this notebook;
# Bokeh expects them as one comma-separated string
TOOLS = ",".join([
    "pan",
    "hover",
    "wheel_zoom",
    "box_zoom",
    "reset",
    "save",
    "box_select",
    "lasso_select",
])



In [10]:

    
## one sub-frame per Kar value, each wrapped in its own column data source
btk, itk, jak, vjk = (
    joined[joined["Kar"] == kar_code] for kar_code in ("BTK", "ITK", "JAK", "VJK")
)
btk_pl, itk_pl, jak_pl, vjk_pl = (
    ColumnDataSource(faculty_df) for faculty_df in (btk, itk, jak, vjk)
)
# list the columns exposed by the data source
btk_pl.column_names









    Out[10]:





['Tipus',
 'Max_pont',
 'Min_pontszam',
 'Elso_Jel',
 'Nappali_Jel',
 'Allami_Felv',
 'Allami_Jel',
 'index',
 'Kar',
 'Ev',
 'Nappali_Felv',
 'Max_pontszam',
 'Ossz_Jel',
 'Ossz_Felv']



In [11]:

    
## first Bokeh figure: Allami_Jel against Ev for the BTK data source
p = figure(
    title="Jelentkezok szama (allamilag tam.) karonkent",
    tools=TOOLS,
    plot_width=550,
    plot_height=450,
)

# the markers and the connecting line share source, legend entry and colour
btk_glyph_style = dict(source=btk_pl, legend="BTK", color="red")
p.scatter("Ev", "Allami_Jel", **btk_glyph_style)
p.line("Ev", "Allami_Jel", **btk_glyph_style)

show(p)









    






    







    Out[11]:




<Bokeh Notebook handle for In[11]>



In [12]:

    
## second figure, linked to the first one in two ways:
##  - linked panning: it reuses the x range object of p
##  - linked brushing: its glyphs read from the same data source (btk_pl)
p2 = figure(
    title="Felvettek szama (all. tam.) osszesen",
    tools=TOOLS,
    plot_width=450,
    plot_height=450,
    x_range=p.x_range,
)
btk_glyph_style = dict(source=btk_pl, legend="BTK", color="red")
p2.scatter("Ev", "Allami_Felv", **btk_glyph_style)
p2.line("Ev", "Allami_Felv", **btk_glyph_style)









    Out[12]:





<bokeh.models.renderers.GlyphRenderer at 0x1168daf28>



In [13]:

    
# arrange both figures in a one-row grid and display it
s = gridplot([[p,p2]])
# the glyphs of both figures read from the same 'source', so selections are linked
# between them automatically (try the lasso tool)
show(s)









    






    







    Out[13]:




<Bokeh Notebook handle for In[13]>



In [14]:

    
## all four faculties on a single figure, one scatter + line pair each
p2 = figure(
    title="Felvettek szama (all. tam.) osszesen",
    tools=TOOLS,
    plot_width=550,
    plot_height=450,
)

# (data source, legend label, colour) -- glyphs are added in this order
faculty_series = [
    (btk_pl, "BTK", "red"),
    (itk_pl, "ITK", "orange"),
    (vjk_pl, "VJK", "green"),
    (jak_pl, "JAK", "blue"),
]
for faculty_source, faculty_label, faculty_color in faculty_series:
    p2.scatter("Ev", "Allami_Felv", source=faculty_source,
               legend=faculty_label, color=faculty_color)
    p2.line("Ev", "Allami_Felv", source=faculty_source,
            legend=faculty_label, color=faculty_color)

show(p2)









    






    







    Out[14]:




<Bokeh Notebook handle for In[14]>



In [15]:

    
## histogram counts of the maximal point thresholds, one histogram per faculty
# every faculty is binned over the same interval: the smallest and largest
# Max_pontszam found in pont
x_range = [min(pont.Max_pontszam), max(pont.Max_pontszam)]
shared_binning = dict(bins=50, range=x_range, density=False)

hist_btk, edges_btk = np.histogram(btk.Max_pontszam, **shared_binning)
hist_itk, edges_itk = np.histogram(itk.Max_pontszam, **shared_binning)
hist_vjk, edges_vjk = np.histogram(vjk.Max_pontszam, **shared_binning)
hist_jak, edges_jak = np.histogram(jak.Max_pontszam, **shared_binning)
# inspect the counts and bin edges obtained for VJK
hist_vjk, edges_vjk









    Out[15]:





(array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0]),
 array([ 0.6       ,  0.60744444,  0.61488889,  0.62233333,  0.62977778,
         0.63722222,  0.64466667,  0.65211111,  0.65955556,  0.667     ,
         0.67444444,  0.68188889,  0.68933333,  0.69677778,  0.70422222,
         0.71166667,  0.71911111,  0.72655556,  0.734     ,  0.74144444,
         0.74888889,  0.75633333,  0.76377778,  0.77122222,  0.77866667,
         0.78611111,  0.79355556,  0.801     ,  0.80844444,  0.81588889,
         0.82333333,  0.83077778,  0.83822222,  0.84566667,  0.85311111,
         0.86055556,  0.868     ,  0.87544444,  0.88288889,  0.89033333,
         0.89777778,  0.90522222,  0.91266667,  0.92011111,  0.92755556,
         0.935     ,  0.94244444,  0.94988889,  0.95733333,  0.96477778,
         0.97222222]))



In [16]:

    
## histogram figure of the maximal point thresholds, one set of bars per faculty
pont_figure = figure(title = "Maximum ponthatarok", tools=TOOLS, plot_width=550, plot_height=450)

# Each faculty is drawn from its own bin edges (not BTK's), and with the colour it
# has in the other multi-faculty figures of this notebook:
# BTK red, ITK orange, VJK green, JAK blue.
pont_figure.quad(top=hist_btk, bottom=0, left=edges_btk[:-1], right=edges_btk[1:],
        fill_color="red", line_color="#033649", legend = "BTK")
pont_figure.quad(top=hist_itk, bottom=0, left=edges_itk[:-1], right=edges_itk[1:],
        fill_color="orange", line_color="#033649", legend = "ITK")
pont_figure.quad(top=hist_jak, bottom=0, left=edges_jak[:-1], right=edges_jak[1:],
        fill_color="blue", line_color="#033649", legend = "JAK")
pont_figure.quad(top=hist_vjk, bottom=0, left=edges_vjk[:-1], right=edges_vjk[1:],
        fill_color="green", line_color="#033649", legend = "VJK")

pont_figure.legend.location = 'top_left'



In [17]:

    
# display the histogram figure on its own (it is not placed in a grid next to p2)
show(pont_figure)









    






    







    Out[17]:




<Bokeh Notebook handle for In[17]>



In [18]:

    
## Max_pontszam against Min_pontszam, one scatter + line pair per faculty
pontszamok = figure(
    title="Minimum es maximum ponthatarok",
    tools=TOOLS,
    plot_width=550,
    plot_height=450,
)

# (data source, legend label, colour) -- glyphs are added in this order
threshold_series = [
    (btk_pl, "BTK", "red"),
    (itk_pl, "ITK", "orange"),
    (vjk_pl, "VJK", "green"),
    (jak_pl, "JAK", "blue"),
]
for faculty_source, faculty_label, faculty_color in threshold_series:
    pontszamok.scatter("Min_pontszam", "Max_pontszam", source=faculty_source,
                       legend=faculty_label, color=faculty_color)
    pontszamok.line("Min_pontszam", "Max_pontszam", source=faculty_source,
                    legend=faculty_label, color=faculty_color)

pontszamok.legend.location = "bottom_right"



In [19]:

    
# place p2 and pontszamok side by side in a one-row grid and display it
s = gridplot([[p2,pontszamok]])
show(s)









    






    







    Out[19]:




<Bokeh Notebook handle for In[19]>



In [20]:

    
## Basic correlations between variables:
## these are the columns whose pairwise correlations are examined below
important_columns = [
    "Ossz_Jel",
    "Elso_Jel",
    "Ossz_Felv",
    "Allami_Felv",
    "Max_pontszam",
    "Min_pontszam",
]



In [21]:

    
## compute correlations and create a data frame from them
def computeCorrelations(df, columns=None):
    """Return the pairwise correlation coefficients of selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the columns to correlate.
    columns : list of str, optional
        Names of the columns to use. When omitted, the module-level
        ``important_columns`` list is used.

    Returns
    -------
    pandas.DataFrame
        A frame with a single column named "cor" that holds the correlation
        matrix flattened row by row (``len(columns) ** 2`` entries).
    """
    if columns is None:
        columns = important_columns
    ## `.values` is used instead of `DataFrame.as_matrix()`, which newer pandas
    ## releases no longer provide; the transpose puts one variable per row,
    ## which is the layout np.corrcoef expects by default
    cor = np.corrcoef(df[list(columns)].values.T)
    ## transform the matrix into a data frame with one column
    return pd.DataFrame(data={"cor": cor.flatten()})



In [22]:

    
# flattened correlation table computed from the BTK frame
tmp_df_btk = computeCorrelations(btk)



In [23]:

    
## nine-entry palette used to colour the correlation values
colors = [
    "#75968f",
    "#a5bab7",
    "#c9d9d3",
    "#e2e2e2",
    "#dfccce",
    "#ddb7b1",
    "#cc7878",
    "#933b41",
    "#550b1d",
]


def chooseColors(values):
    """Map an array of correlation values to entries of ``colors``.

    Each value v receives the colour at index ceil(4 * v + 4), so the value
    range [-1, +1] spans the nine palette entries (index 0 to 8).

    Parameters
    ----------
    values : numpy.ndarray
        Values to translate; the scaling relies on element-wise arithmetic.

    Returns
    -------
    list of str
        One palette entry per input value, in input order.
    """
    palette_indices = np.ceil(4 * values + 4)
    picked = []
    for palette_index in palette_indices:
        picked.append(colors[int(palette_index)])
    return picked



In [24]:

    
# one palette colour per entry of the BTK correlation table
values = chooseColors(tmp_df_btk["cor"].values)



In [25]:

    
## creating the input column data source
def createColumnDS(tmp_df, values):
    """Build the ColumnDataSource that feeds a correlogram plot.

    Parameters
    ----------
    tmp_df : pandas.DataFrame
        Frame holding the flattened correlation values.
    values : sequence
        Colours to store in the "colors" column (expected: one per row of tmp_df).

    Returns
    -------
    ColumnDataSource
        The columns of ``tmp_df`` plus the added "colors", "x_range" and
        "y_range" columns.
    """
    ## creation of a columnDS from the pandas data frame containing the correlation values
    corgram = ColumnDataSource(tmp_df)
    ## adding the colors
    corgram.add(data=values, name="colors")

    ## numeric cell coordinates 1..n for the rect glyph; n is taken from
    ## important_columns instead of a hard-coded 1..6 list, so both columns
    ## always hold n * n entries whatever the length of that list
    n_vars = len(important_columns)
    positions = np.arange(1, n_vars + 1)
    corgram.add(data=np.tile(positions, n_vars), name="x_range")
    corgram.add(data=np.repeat(positions, n_vars), name="y_range")
    return corgram



In [26]:

    
# wrap the BTK correlation table and its colours into a column data source
corgram = createColumnDS(tmp_df_btk, values)



In [27]:

    
## for tooltips
from bokeh.models import HoverTool



In [28]:

    
def createCorgramPlot(faculty_name, corgram):
    """Create a correlogram figure from a prepared column data source.

    Parameters
    ----------
    faculty_name : str
        Used as the figure title.
    corgram : ColumnDataSource
        Source providing the "x_range", "y_range", "colors" and "cor" columns
        referenced by the glyph and the tooltips.

    Returns
    -------
    The configured Bokeh figure.
    """
    corplot = figure(
        title=faculty_name,
        x_range=important_columns,
        y_range=important_columns,
        x_axis_location="above",
        plot_width=500,
        plot_height=500,
        toolbar_location="left",
        tools=TOOLS,
    )

    # one unit-sized rectangle per row of the source, filled with its "colors" entry
    corplot.rect("x_range", "y_range", 1, 1, source=corgram, color="colors")

    # appearance: drop grid lines, axis lines and major ticks; tune the labels
    corplot.grid.grid_line_color = None
    corplot.axis.axis_line_color = None
    corplot.axis.major_tick_line_color = None
    corplot.axis.major_label_text_font_size = "10pt"
    corplot.axis.major_label_standoff = 0
    corplot.xaxis.major_label_orientation = np.pi / 3

    # the hover tool exists because "hover" is requested through TOOLS
    hover_tool = corplot.select_one(HoverTool)
    hover_tool.tooltips = [
        ("variables", "@x_range, @y_range"),
        ("value", "@cor"),
    ]

    return corplot



In [29]:

    
# build the correlogram titled "BTK" from the current corgram source and display it
corplot_btk = createCorgramPlot("BTK", corgram)
show(corplot_btk)









    






    







    Out[29]:




<Bokeh Notebook handle for In[29]>



In [30]:

    
# Correlograms for the remaining faculties (ITK, JAK, VJK):
# correlation table -> colours -> column data source -> figure

tmp_df_itk, tmp_df_jak, tmp_df_vjk = (
    computeCorrelations(faculty_df) for faculty_df in (itk, jak, vjk)
)

corgram = createColumnDS(tmp_df_itk, chooseColors(tmp_df_itk["cor"].values))
corplot_itk = createCorgramPlot("ITK", corgram)

corgram = createColumnDS(tmp_df_jak, chooseColors(tmp_df_jak["cor"].values))
corplot_jak = createCorgramPlot("JAK", corgram)

corgram = createColumnDS(tmp_df_vjk, chooseColors(tmp_df_vjk["cor"].values))
corplot_vjk = createCorgramPlot("VJK", corgram)



In [31]:

    
# arrange the four correlograms in a 2 x 2 grid and display it
s = gridplot([[corplot_btk, corplot_itk], [corplot_jak, corplot_vjk]])
show(s)









    






    







    Out[31]:




<Bokeh Notebook handle for In[31]>



In [ ]:

	Kar	Ev	Tipus	Ossz_Jel	Elso_Jel	Nappali_Jel	Allami_Jel	Ossz_Felv	Nappali_Felv	Allami_Felv
0	BTK	2015	P	85	85	47	0	46	25	0
1	BTK	2015	A	5713	1781	4028	5365	1332	857	1185
2	BTK	2014	P	94	94	48	0	61	34	0
3	BTK	2014	A	4386	1314	2803	4117	1240	749	1123
4	BTK	2013	P	93	93	77	0	62	50	0

	Ev	Kar	Max_pontszam	Min_pontszam	Max_pont
0	2015	BTK	0.936000	0.560000	500
1	2014	BTK	0.930000	0.530000	500
2	2013	BTK	0.930000	0.512000	500
3	2012	BTK	0.912500	0.754167	480
4	2011	BTK	0.902083	0.764583	480