Probabilistic Graphical Models with `pgmpy`



In [1]:

    
!pip install pgmpy









    



Collecting pgmpy
  Downloading pgmpy-0.1.2.tar.gz (147kB)
    100% |████████████████████████████████| 153kB 5.9MB/s 
Requirement already satisfied (use --upgrade to upgrade): networkx>=1.8.1 in /opt/conda/lib/python3.5/site-packages (from pgmpy)
Requirement already satisfied (use --upgrade to upgrade): scipy>=0.12.1 in /opt/conda/lib/python3.5/site-packages (from pgmpy)
Requirement already satisfied (use --upgrade to upgrade): numpy>=1.7.0 in /opt/conda/lib/python3.5/site-packages (from pgmpy)
Requirement already satisfied (use --upgrade to upgrade): nose>=1.3.0 in /opt/conda/lib/python3.5/site-packages (from pgmpy)
Collecting coveralls>=0.4 (from pgmpy)
  Downloading coveralls-1.1-py2.py3-none-any.whl
Requirement already satisfied (use --upgrade to upgrade): decorator>=3.4.0 in /opt/conda/lib/python3.5/site-packages (from networkx>=1.8.1->pgmpy)
Collecting docopt>=0.6.1 (from coveralls>=0.4->pgmpy)
  Downloading docopt-0.6.2.tar.gz
Collecting coverage>=3.6 (from coveralls>=0.4->pgmpy)
  Downloading coverage-4.3.4-cp35-cp35m-manylinux1_x86_64.whl (191kB)
    100% |████████████████████████████████| 194kB 4.7MB/s 
Requirement already satisfied (use --upgrade to upgrade): requests>=1.0.0 in /opt/conda/lib/python3.5/site-packages (from coveralls>=0.4->pgmpy)
Building wheels for collected packages: pgmpy, docopt
  Running setup.py bdist_wheel for pgmpy ... - \ | done
  Stored in directory: /home/jovyan/.cache/pip/wheels/d3/21/0f/5b1fc282ee2ab16b693c1a0ed9cb8fde44dbaa28d907c90ff4
  Running setup.py bdist_wheel for docopt ... - \ done
  Stored in directory: /home/jovyan/.cache/pip/wheels/b2/16/5f/c33a2bb5f2dce71205f8e65cbfd05647d79d441282be31fd82
Successfully built pgmpy docopt
Installing collected packages: docopt, coverage, coveralls, pgmpy
Successfully installed coverage-4.3.4 coveralls-1.1 docopt-0.6.2 pgmpy-0.1.2
You are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.



In [78]:

    
from pgmpy.factors import TabularCPD



In [79]:

    
# Declare a CPD 
grade_cpd = TabularCPD(variable="G",
    variable_card=3,
    values=[[0.3, 0.05, 0.9, 0.5],
    [0.4, 0.25, 0.08, 0.3],
    [0.3, 0.7, 0.02, 0.2]],
    evidence=["I", "D"],
    evidence_card=[2, 2])
grade_cpd









    Out[79]:




TabularCPD for GD D_0 D_1
I I_0 I_1 I_0 I_1
G_0 0.3000 0.0500 0.9000 0.5000
G_1 0.4000 0.2500 0.0800 0.3000
G_2 0.3000 0.7000 0.0200 0.2000



In [80]:

    
# Declare the sudent model in pgmpy

from pgmpy.models import BayesianModel
from pgmpy.factors import TabularCPD

# Define nodes and edges
student_model = BayesianModel([("D", "G"),
("I", "G"),
("G", "L"),
("I", "S")])

#Define CPDs

grade_cpd = TabularCPD(
variable="G",
variable_card=3,
values=[[0.3, 0.05, 0.9, 0.5],
[0.4, 0.25, 0.08, 0.3],
[0.3, 0.7, 0.02, 0.2]],
evidence=["I", "D"],
evidence_card=[2, 2])

difficulty_cpd = TabularCPD(
variable="D",
variable_card=2,
values=[[0.6, 0.4]])

intel_cpd = TabularCPD(
variable="I",
variable_card=2,
values=[[0.7, 0.3]])

letter_cpd = TabularCPD(
variable="L",
variable_card=2,
values=[[0.1, 0.4, 0.99],
[0.9, 0.6, 0.01]],
evidence=["G"],
evidence_card=[3])

sat_cpd = TabularCPD(
variable="S",
variable_card=2,
values=[[0.95, 0.2],
[0.05, 0.8]],
evidence=["I"],
evidence_card=[2])


#Add CPDs to nodes and edges
student_model.add_cpds(grade_cpd, difficulty_cpd,
intel_cpd, letter_cpd,
sat_cpd)

grade_cpd









    Out[80]:




TabularCPD for GD D_0 D_1
I I_0 I_1 I_0 I_1
G_0 0.3000 0.0500 0.9000 0.5000
G_1 0.4000 0.2500 0.0800 0.3000
G_2 0.3000 0.7000 0.0200 0.2000



In [81]:

    
student_model.get_cpds('G')









    Out[81]:




TabularCPD for GD D_0 D_1
I I_0 I_1 I_0 I_1
G_0 0.3000 0.0500 0.9000 0.5000
G_1 0.4000 0.2500 0.0800 0.3000
G_2 0.3000 0.7000 0.0200 0.2000



In [85]:

    
student_model.get_parents('G')









    Out[85]:





['D', 'I']



In [56]:

    
from pgmpy.inference import VariableElimination

student_infer = VariableElimination(student_model)

prob_G = student_infer.query(variables='G')

print(prob_G['G'])









    



╒═════╤══════════╕
│ G   │   phi(G) │
╞═════╪══════════╡
│ G_0 │   0.4470 │
├─────┼──────────┤
│ G_1 │   0.2714 │
├─────┼──────────┤
│ G_2 │   0.2816 │
╘═════╧══════════╛



In [57]:

    
prob_G = student_infer.query(variables='G', evidence={'I': 1, 'D' : 0})

print(prob_G['G'])









    



╒═════╤══════════╕
│ G   │   phi(G) │
╞═════╪══════════╡
│ G_0 │   0.0500 │
├─────┼──────────┤
│ G_1 │   0.2500 │
├─────┼──────────┤
│ G_2 │   0.7000 │
╘═════╧══════════╛



In [58]:

    
prob_G = student_infer.query(variables='G', evidence={'I': 0, 'D' : 1})

print(prob_G['G'])









    



╒═════╤══════════╕
│ G   │   phi(G) │
╞═════╪══════════╡
│ G_0 │   0.9000 │
├─────┼──────────┤
│ G_1 │   0.0800 │
├─────┼──────────┤
│ G_2 │   0.0200 │
╘═════╧══════════╛



In [67]:

    
#Train Model from Data

from pgmpy.models import BayesianModel
import pandas as pd
import numpy as np
# Considering that each variable have only 2 states,
# we can generate some random data.

raw_data = np.random.randint(low=0,high=2,size=(1000, 5))


data = pd.DataFrame(raw_data,columns=["D", "I", "G","L", "S"])

print(data[: int(data.shape[0]*0.75)])

data_train = data[: int(data.shape[0] * 0.75)]

student_model = BayesianModel([("D", "G"),("I", "G"),("I", "S"),("G", "L")])
student_model.fit(data_train)
student_model.get_cpds('D')









    



     D  I  G  L  S
0    0  1  1  1  1
1    0  0  0  0  1
2    0  1  1  0  0
3    0  0  0  0  1
4    1  1  0  1  1
5    1  0  0  0  0
6    1  1  0  1  1
7    1  0  0  0  1
8    1  1  0  0  1
9    1  0  0  0  0
10   1  1  1  1  0
11   1  1  0  0  0
12   1  1  1  1  0
13   1  0  0  1  1
14   0  1  1  1  1
15   1  0  0  1  1
16   1  1  0  1  1
17   0  0  0  1  0
18   0  0  0  0  0
19   1  1  1  1  1
20   0  0  0  1  1
21   0  0  0  0  0
22   0  0  1  0  0
23   0  0  0  1  0
24   0  0  1  1  1
25   1  0  1  1  1
26   0  1  1  1  0
27   0  1  0  0  1
28   1  1  0  0  1
29   1  0  0  0  0
..  .. .. .. .. ..
720  1  1  1  0  1
721  0  0  1  1  1
722  1  1  1  0  1
723  1  0  0  0  0
724  1  0  0  1  1
725  0  0  1  1  1
726  0  0  1  0  1
727  0  0  1  0  0
728  1  1  1  0  0
729  1  1  0  0  0
730  0  0  0  0  0
731  1  1  1  0  0
732  0  0  1  0  1
733  1  1  0  1  0
734  0  1  0  1  1
735  0  1  1  1  0
736  1  0  1  0  0
737  1  1  1  1  0
738  1  1  1  1  1
739  0  1  1  0  0
740  1  1  1  1  1
741  1  0  1  1  0
742  0  1  1  0  1
743  1  0  1  1  0
744  0  1  1  1  1
745  1  0  0  0  1
746  1  0  0  1  0
747  0  1  1  0  1
748  1  0  0  1  1
749  0  0  1  1  1

[750 rows x 5 columns]






    Out[67]:




TabularCPD for DD_0 0.4400
D_1 0.5600



In [60]:

    
student_model.get_cpds('L')









    Out[60]:




TabularCPD for LG G_0 G_1
L_0 0.4545 0.5000
L_1 0.5455 0.5000



In [25]:

    
student_model.active_trail_nodes('D')









    Out[25]:





{'D', 'G', 'L'}



In [26]:

    
student_model.local_independencies('G')









    Out[26]:





(G _|_ S | D, I)



In [27]:

    
student_model.get_independencies()









    Out[27]:





(G _|_ L, I, S | D)
(G _|_ L, I, D | S)
(G _|_ I, S, D | L)
(G _|_ L, D | I)
(D _|_ I, S | G)
(D _|_ G, L | S)
(D _|_ G, I, S | L)
(D _|_ G, L | I)
(S _|_ I, D | G)
(S _|_ G, I, L | D)
(S _|_ G, I, D | L)
(L _|_ G, I, S | D)
(L _|_ G, I, D | S)
(L _|_ G, D | I)
(I _|_ D, S | G)
(I _|_ G, S, L | D)
(I _|_ G, L | S)
(I _|_ G, D, S | L)



In [77]:

    
data_test = data[int(0.75 * data.shape[0]) : data.shape[0]]

data_test.drop('G', axis=1, inplace=True)

student_model.predict(data_test)









    



/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()






    Out[77]:






  
    
      
      G
    
  
  
    
      750
      1
    
    
      751
      0
    
    
      752
      0
    
    
      753
      0
    
    
      754
      0
    
    
      755
      0
    
    
      756
      0
    
    
      757
      1
    
    
      758
      0
    
    
      759
      1
    
    
      760
      0
    
    
      761
      0
    
    
      762
      1
    
    
      763
      1
    
    
      764
      0
    
    
      765
      1
    
    
      766
      1
    
    
      767
      0
    
    
      768
      0
    
    
      769
      1
    
    
      770
      1
    
    
      771
      1
    
    
      772
      0
    
    
      773
      1
    
    
      774
      1
    
    
      775
      0
    
    
      776
      1
    
    
      777
      1
    
    
      778
      0
    
    
      779
      0
    
    
      ...
      ...
    
    
      970
      1
    
    
      971
      0
    
    
      972
      1
    
    
      973
      1
    
    
      974
      1
    
    
      975
      0
    
    
      976
      1
    
    
      977
      0
    
    
      978
      1
    
    
      979
      1
    
    
      980
      1
    
    
      981
      1
    
    
      982
      1
    
    
      983
      0
    
    
      984
      1
    
    
      985
      1
    
    
      986
      1
    
    
      987
      0
    
    
      988
      0
    
    
      989
      1
    
    
      990
      0
    
    
      991
      0
    
    
      992
      0
    
    
      993
      0
    
    
      994
      0
    
    
      995
      1
    
    
      996
      1
    
    
      997
      1
    
    
      998
      0
    
    
      999
      0
    
  

250 rows × 1 columns

D	D_0		D_1
I	I_0	I_1	I_0	I_1
G_0	0.3000	0.0500	0.9000	0.5000
G_1	0.4000	0.2500	0.0800	0.3000
G_2	0.3000	0.7000	0.0200	0.2000

	G
750	1
751	0
752	0
753	0
754	0
755	0
756	0
757	1
758	0
759	1
760	0
761	0
762	1
763	1
764	0
765	1
766	1
767	0
768	0
769	1
770	1
771	1
772	0
773	1
774	1
775	0
776	1
777	1
778	0
779	0
...	...
970	1
971	0
972	1
973	1
974	1
975	0
976	1
977	0
978	1
979	1
980	1
981	1
982	1
983	0
984	1
985	1
986	1
987	0
988	0
989	1
990	0
991	0
992	0
993	0
994	0
995	1
996	1
997	1
998	0
999	0

TabularCPD for D
D_0	0.4400
D_1	0.5600

	G
750	1
751	0
752	0
753	0
754	0
755	0
756	0
757	1
758	0
759	1
760	0
761	0
762	1
763	1
764	0
765	1
766	1
767	0
768	0
769	1
770	1
771	1
772	0
773	1
774	1
775	0
776	1
777	1
778	0
779	0
...	...
970	1
971	0
972	1
973	1
974	1
975	0
976	1
977	0
978	1
979	1
980	1
981	1
982	1
983	0
984	1
985	1
986	1
987	0
988	0
989	1
990	0
991	0
992	0
993	0
994	0
995	1
996	1
997	1
998	0
999	0

Probabilistic Graphical Models with pgmpy

Probabilistic Graphical Models with `pgmpy`

	G
750	1
751	0
752	0
753	0
754	0
755	0
756	0
757	1
758	0
759	1
760	0
761	0
762	1
763	1
764	0
765	1
766	1
767	0
768	0
769	1
770	1
771	1
772	0
773	1
774	1
775	0
776	1
777	1
778	0
779	0
...	...
970	1
971	0
972	1
973	1
974	1
975	0
976	1
977	0
978	1
979	1
980	1
981	1
982	1
983	0
984	1
985	1
986	1
987	0
988	0
989	1
990	0
991	0
992	0
993	0
994	0
995	1
996	1
997	1
998	0
999	0