Probabilistic Graphical Models with pgmpy


In [1]:
!pip install pgmpy


Collecting pgmpy
  Downloading pgmpy-0.1.2.tar.gz (147kB)
    100% |████████████████████████████████| 153kB 5.9MB/s 
Requirement already satisfied (use --upgrade to upgrade): networkx>=1.8.1 in /opt/conda/lib/python3.5/site-packages (from pgmpy)
Requirement already satisfied (use --upgrade to upgrade): scipy>=0.12.1 in /opt/conda/lib/python3.5/site-packages (from pgmpy)
Requirement already satisfied (use --upgrade to upgrade): numpy>=1.7.0 in /opt/conda/lib/python3.5/site-packages (from pgmpy)
Requirement already satisfied (use --upgrade to upgrade): nose>=1.3.0 in /opt/conda/lib/python3.5/site-packages (from pgmpy)
Collecting coveralls>=0.4 (from pgmpy)
  Downloading coveralls-1.1-py2.py3-none-any.whl
Requirement already satisfied (use --upgrade to upgrade): decorator>=3.4.0 in /opt/conda/lib/python3.5/site-packages (from networkx>=1.8.1->pgmpy)
Collecting docopt>=0.6.1 (from coveralls>=0.4->pgmpy)
  Downloading docopt-0.6.2.tar.gz
Collecting coverage>=3.6 (from coveralls>=0.4->pgmpy)
  Downloading coverage-4.3.4-cp35-cp35m-manylinux1_x86_64.whl (191kB)
    100% |████████████████████████████████| 194kB 4.7MB/s 
Requirement already satisfied (use --upgrade to upgrade): requests>=1.0.0 in /opt/conda/lib/python3.5/site-packages (from coveralls>=0.4->pgmpy)
Building wheels for collected packages: pgmpy, docopt
  Running setup.py bdist_wheel for pgmpy ... - \ | done
  Stored in directory: /home/jovyan/.cache/pip/wheels/d3/21/0f/5b1fc282ee2ab16b693c1a0ed9cb8fde44dbaa28d907c90ff4
  Running setup.py bdist_wheel for docopt ... - \ done
  Stored in directory: /home/jovyan/.cache/pip/wheels/b2/16/5f/c33a2bb5f2dce71205f8e65cbfd05647d79d441282be31fd82
Successfully built pgmpy docopt
Installing collected packages: docopt, coverage, coveralls, pgmpy
Successfully installed coverage-4.3.4 coveralls-1.1 docopt-0.6.2 pgmpy-0.1.2
You are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.

In [78]:
from pgmpy.factors import TabularCPD

In [79]:
# Declare a CPD 
grade_cpd = TabularCPD(variable="G",
    variable_card=3,
    values=[[0.3, 0.05, 0.9, 0.5],
    [0.4, 0.25, 0.08, 0.3],
    [0.3, 0.7, 0.02, 0.2]],
    evidence=["I", "D"],
    evidence_card=[2, 2])
grade_cpd


Out[79]:
TabularCPD for G
DD_0D_1
II_0I_1I_0I_1
G_00.30000.05000.90000.5000
G_10.40000.25000.08000.3000
G_20.30000.70000.02000.2000

In [80]:
# Declare the sudent model in pgmpy

from pgmpy.models import BayesianModel
from pgmpy.factors import TabularCPD

# Define nodes and edges
student_model = BayesianModel([("D", "G"),
("I", "G"),
("G", "L"),
("I", "S")])

#Define CPDs

grade_cpd = TabularCPD(
variable="G",
variable_card=3,
values=[[0.3, 0.05, 0.9, 0.5],
[0.4, 0.25, 0.08, 0.3],
[0.3, 0.7, 0.02, 0.2]],
evidence=["I", "D"],
evidence_card=[2, 2])

difficulty_cpd = TabularCPD(
variable="D",
variable_card=2,
values=[[0.6, 0.4]])

intel_cpd = TabularCPD(
variable="I",
variable_card=2,
values=[[0.7, 0.3]])

letter_cpd = TabularCPD(
variable="L",
variable_card=2,
values=[[0.1, 0.4, 0.99],
[0.9, 0.6, 0.01]],
evidence=["G"],
evidence_card=[3])

sat_cpd = TabularCPD(
variable="S",
variable_card=2,
values=[[0.95, 0.2],
[0.05, 0.8]],
evidence=["I"],
evidence_card=[2])


#Add CPDs to nodes and edges
student_model.add_cpds(grade_cpd, difficulty_cpd,
intel_cpd, letter_cpd,
sat_cpd)

grade_cpd


Out[80]:
TabularCPD for G
DD_0D_1
II_0I_1I_0I_1
G_00.30000.05000.90000.5000
G_10.40000.25000.08000.3000
G_20.30000.70000.02000.2000

In [81]:
student_model.get_cpds('G')


Out[81]:
TabularCPD for G
DD_0D_1
II_0I_1I_0I_1
G_00.30000.05000.90000.5000
G_10.40000.25000.08000.3000
G_20.30000.70000.02000.2000

In [85]:
student_model.get_parents('G')


Out[85]:
['D', 'I']

In [56]:
from pgmpy.inference import VariableElimination

student_infer = VariableElimination(student_model)

prob_G = student_infer.query(variables='G')

print(prob_G['G'])


╒═════╤══════════╕
│ G   │   phi(G) │
╞═════╪══════════╡
│ G_0 │   0.4470 │
├─────┼──────────┤
│ G_1 │   0.2714 │
├─────┼──────────┤
│ G_2 │   0.2816 │
╘═════╧══════════╛

In [57]:
prob_G = student_infer.query(variables='G', evidence={'I': 1, 'D' : 0})

print(prob_G['G'])


╒═════╤══════════╕
│ G   │   phi(G) │
╞═════╪══════════╡
│ G_0 │   0.0500 │
├─────┼──────────┤
│ G_1 │   0.2500 │
├─────┼──────────┤
│ G_2 │   0.7000 │
╘═════╧══════════╛

In [58]:
prob_G = student_infer.query(variables='G', evidence={'I': 0, 'D' : 1})

print(prob_G['G'])


╒═════╤══════════╕
│ G   │   phi(G) │
╞═════╪══════════╡
│ G_0 │   0.9000 │
├─────┼──────────┤
│ G_1 │   0.0800 │
├─────┼──────────┤
│ G_2 │   0.0200 │
╘═════╧══════════╛

In [67]:
#Train Model from Data

from pgmpy.models import BayesianModel
import pandas as pd
import numpy as np
# Considering that each variable have only 2 states,
# we can generate some random data.

raw_data = np.random.randint(low=0,high=2,size=(1000, 5))


data = pd.DataFrame(raw_data,columns=["D", "I", "G","L", "S"])

print(data[: int(data.shape[0]*0.75)])

data_train = data[: int(data.shape[0] * 0.75)]

student_model = BayesianModel([("D", "G"),("I", "G"),("I", "S"),("G", "L")])
student_model.fit(data_train)
student_model.get_cpds('D')


     D  I  G  L  S
0    0  1  1  1  1
1    0  0  0  0  1
2    0  1  1  0  0
3    0  0  0  0  1
4    1  1  0  1  1
5    1  0  0  0  0
6    1  1  0  1  1
7    1  0  0  0  1
8    1  1  0  0  1
9    1  0  0  0  0
10   1  1  1  1  0
11   1  1  0  0  0
12   1  1  1  1  0
13   1  0  0  1  1
14   0  1  1  1  1
15   1  0  0  1  1
16   1  1  0  1  1
17   0  0  0  1  0
18   0  0  0  0  0
19   1  1  1  1  1
20   0  0  0  1  1
21   0  0  0  0  0
22   0  0  1  0  0
23   0  0  0  1  0
24   0  0  1  1  1
25   1  0  1  1  1
26   0  1  1  1  0
27   0  1  0  0  1
28   1  1  0  0  1
29   1  0  0  0  0
..  .. .. .. .. ..
720  1  1  1  0  1
721  0  0  1  1  1
722  1  1  1  0  1
723  1  0  0  0  0
724  1  0  0  1  1
725  0  0  1  1  1
726  0  0  1  0  1
727  0  0  1  0  0
728  1  1  1  0  0
729  1  1  0  0  0
730  0  0  0  0  0
731  1  1  1  0  0
732  0  0  1  0  1
733  1  1  0  1  0
734  0  1  0  1  1
735  0  1  1  1  0
736  1  0  1  0  0
737  1  1  1  1  0
738  1  1  1  1  1
739  0  1  1  0  0
740  1  1  1  1  1
741  1  0  1  1  0
742  0  1  1  0  1
743  1  0  1  1  0
744  0  1  1  1  1
745  1  0  0  0  1
746  1  0  0  1  0
747  0  1  1  0  1
748  1  0  0  1  1
749  0  0  1  1  1

[750 rows x 5 columns]
Out[67]:
TabularCPD for D
D_00.4400
D_10.5600

In [60]:
student_model.get_cpds('L')


Out[60]:
TabularCPD for L
GG_0G_1
L_00.45450.5000
L_10.54550.5000

In [25]:
student_model.active_trail_nodes('D')


Out[25]:
{'D', 'G', 'L'}

In [26]:
student_model.local_independencies('G')


Out[26]:
(G _|_ S | D, I)

In [27]:
student_model.get_independencies()


Out[27]:
(G _|_ L, I, S | D)
(G _|_ L, I, D | S)
(G _|_ I, S, D | L)
(G _|_ L, D | I)
(D _|_ I, S | G)
(D _|_ G, L | S)
(D _|_ G, I, S | L)
(D _|_ G, L | I)
(S _|_ I, D | G)
(S _|_ G, I, L | D)
(S _|_ G, I, D | L)
(L _|_ G, I, S | D)
(L _|_ G, I, D | S)
(L _|_ G, D | I)
(I _|_ D, S | G)
(I _|_ G, S, L | D)
(I _|_ G, L | S)
(I _|_ G, D, S | L)

In [77]:
data_test = data[int(0.75 * data.shape[0]) : data.shape[0]]

data_test.drop('G', axis=1, inplace=True)

student_model.predict(data_test)


/opt/conda/lib/python3.5/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
Out[77]:
G
750 1
751 0
752 0
753 0
754 0
755 0
756 0
757 1
758 0
759 1
760 0
761 0
762 1
763 1
764 0
765 1
766 1
767 0
768 0
769 1
770 1
771 1
772 0
773 1
774 1
775 0
776 1
777 1
778 0
779 0
... ...
970 1
971 0
972 1
973 1
974 1
975 0
976 1
977 0
978 1
979 1
980 1
981 1
982 1
983 0
984 1
985 1
986 1
987 0
988 0
989 1
990 0
991 0
992 0
993 0
994 0
995 1
996 1
997 1
998 0
999 0

250 rows × 1 columns