In [1]:
!pip install pandas-profiling
Requirement already satisfied: pandas-profiling in /Users/peta/anaconda3/lib/python3.6/site-packages
Requirement already satisfied: six>=1.9 in /Users/peta/anaconda3/lib/python3.6/site-packages (from pandas-profiling)
Requirement already satisfied: matplotlib>=1.4 in /Users/peta/anaconda3/lib/python3.6/site-packages (from pandas-profiling)
Requirement already satisfied: jinja2>=2.8 in /Users/peta/anaconda3/lib/python3.6/site-packages (from pandas-profiling)
Requirement already satisfied: pandas>=0.19 in /Users/peta/anaconda3/lib/python3.6/site-packages (from pandas-profiling)
Requirement already satisfied: numpy>=1.7.1 in /Users/peta/anaconda3/lib/python3.6/site-packages (from matplotlib>=1.4->pandas-profiling)
Requirement already satisfied: python-dateutil in /Users/peta/anaconda3/lib/python3.6/site-packages (from matplotlib>=1.4->pandas-profiling)
Requirement already satisfied: pytz in /Users/peta/anaconda3/lib/python3.6/site-packages (from matplotlib>=1.4->pandas-profiling)
Requirement already satisfied: cycler>=0.10 in /Users/peta/anaconda3/lib/python3.6/site-packages (from matplotlib>=1.4->pandas-profiling)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=1.5.6 in /Users/peta/anaconda3/lib/python3.6/site-packages (from matplotlib>=1.4->pandas-profiling)
Requirement already satisfied: MarkupSafe>=0.23 in /Users/peta/anaconda3/lib/python3.6/site-packages (from jinja2>=2.8->pandas-profiling)
In [2]:
import pandas as pd
import pandas_profiling
In [3]:
df = pd.read_csv("diabetesqld-AgegroupRegistration-171020.csv", parse_dates=['Registration dates'], encoding='UTF-8')
In [4]:
pandas_profiling.ProfileReport(df)
Out[4]:
Overview
Dataset info
Number of variables
12
Number of observations
363
Total Missing (%)
22.6%
Total size in memory
34.1 KiB
Average record size in memory
96.2 B
Variables types
Numeric
0
Categorical
11
Date
1
Text (Unique)
0
Rejected
0
Warnings
00-15
has 331 / 91.2% missing values Missing16-20
has 288 / 79.3% missing values Missing21-29
has 188 / 51.8% missing values Missing30-39
has 112 / 30.9% missing values Missing30-39
has a high cardinality: 62 distinct values Warning40-49
has 44 / 12.1% missing values Missing40-49
has a high cardinality: 116 distinct values Warning50-59
has 15 / 4.1% missing values Missing50-59
has a high cardinality: 176 distinct values Warning60-69
has a high cardinality: 211 distinct values Warning70-79
has a high cardinality: 180 distinct values Warning80-89
has a high cardinality: 110 distinct values Warning90+
has 5 / 1.4% missing values MissingGrand Total
has a high cardinality: 301 distinct values Warning
Variables
00-15
Categorical
Distinct count
2
Unique (%)
6.2%
Missing (%)
91.2%
Missing (n)
331
.
32
(Missing)
331
Value
Count
Frequency (%)
.
32
8.8%
(Missing)
331
91.2%
16-20
Categorical
Distinct count
2
Unique (%)
2.7%
Missing (%)
79.3%
Missing (n)
288
.
75
(Missing)
288
Value
Count
Frequency (%)
.
75
20.7%
(Missing)
288
79.3%
21-29
Categorical
Distinct count
16
Unique (%)
9.1%
Missing (%)
51.8%
Missing (n)
188
.
131
11
9
12
7
Other values (12)
28
(Missing)
188
Value
Count
Frequency (%)
.
131
36.1%
11
9
2.5%
12
7
1.9%
15
5
1.4%
16
4
1.1%
17
4
1.1%
25
3
0.8%
24
2
0.6%
13
2
0.6%
22
2
0.6%
Other values (5)
6
1.7%
(Missing)
188
51.8%
30-39
Categorical
Distinct count
62
Unique (%)
24.7%
Missing (%)
30.9%
Missing (n)
112
.
111
11
7
51
6
Other values (58)
127
(Missing)
112
Value
Count
Frequency (%)
.
111
30.6%
11
7
1.9%
51
6
1.7%
15
6
1.7%
23
6
1.7%
14
5
1.4%
50
5
1.4%
66
4
1.1%
21
4
1.1%
36
4
1.1%
Other values (51)
93
25.6%
(Missing)
112
30.9%
40-49
Categorical
Distinct count
116
Unique (%)
36.4%
Missing (%)
12.1%
Missing (n)
44
.
56
11
11
18
8
Other values (112)
244
(Missing)
44
Value
Count
Frequency (%)
.
56
15.4%
11
11
3.0%
18
8
2.2%
14
7
1.9%
17
7
1.9%
16
6
1.7%
15
6
1.7%
33
6
1.7%
26
5
1.4%
12
5
1.4%
Other values (105)
202
55.6%
(Missing)
44
12.1%
50-59
Categorical
Distinct count
176
Unique (%)
50.6%
Missing (%)
4.1%
Missing (n)
15
.
53
78
8
30
6
Other values (172)
281
(Missing)
15
Value
Count
Frequency (%)
.
53
14.6%
78
8
2.2%
30
6
1.7%
33
5
1.4%
32
5
1.4%
35
5
1.4%
38
4
1.1%
106
4
1.1%
264
4
1.1%
28
4
1.1%
Other values (165)
250
68.9%
(Missing)
15
4.1%
60-69
Categorical
Distinct count
211
Unique (%)
58.4%
Missing (%)
0.6%
Missing (n)
2
.
46
14
6
35
5
Other values (207)
304
Value
Count
Frequency (%)
.
46
12.7%
14
6
1.7%
35
5
1.4%
248
5
1.4%
12
4
1.1%
173
4
1.1%
27
4
1.1%
63
3
0.8%
15
3
0.8%
315
3
0.8%
Other values (200)
278
76.6%
70-79
Categorical
Distinct count
180
Unique (%)
49.6%
Missing (%)
0.0%
Missing (n)
0
.
36
187
7
232
5
Other values (177)
315
Value
Count
Frequency (%)
.
36
9.9%
187
7
1.9%
232
5
1.4%
11
5
1.4%
212
5
1.4%
36
5
1.4%
44
4
1.1%
25
4
1.1%
205
4
1.1%
219
4
1.1%
Other values (170)
284
78.2%
80-89
Categorical
Distinct count
110
Unique (%)
30.3%
Missing (%)
0.0%
Missing (n)
0
.
44
88
9
106
8
Other values (107)
302
Value
Count
Frequency (%)
.
44
12.1%
88
9
2.5%
106
8
2.2%
22
8
2.2%
96
7
1.9%
109
6
1.7%
97
6
1.7%
23
6
1.7%
58
6
1.7%
83
6
1.7%
Other values (100)
257
70.8%
90+
Categorical
Distinct count
23
Unique (%)
6.4%
Missing (%)
1.4%
Missing (n)
5
.
149
12
28
11
25
Other values (19)
156
Value
Count
Frequency (%)
.
149
41.0%
12
28
7.7%
11
25
6.9%
14
23
6.3%
13
20
5.5%
16
19
5.2%
15
14
3.9%
19
13
3.6%
17
12
3.3%
20
12
3.3%
Other values (12)
43
11.8%
Grand Total
Categorical
Distinct count
301
Unique (%)
82.9%
Missing (%)
0.0%
Missing (n)
0
15
6
14
5
.
4
Other values (298)
348
Value
Count
Frequency (%)
15
6
1.7%
14
5
1.4%
.
4
1.1%
13
4
1.1%
22
4
1.1%
253
3
0.8%
475
3
0.8%
205
3
0.8%
548
3
0.8%
11
3
0.8%
Other values (291)
325
89.5%
Registration dates
Date
Distinct count
363
Unique (%)
100.0%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Minimum
1987-08-01 00:00:00
Maximum
2017-10-01 00:00:00
Sample
Registration dates
00-15
16-20
21-29
30-39
40-49
50-59
60-69
70-79
80-89
90+
Grand Total
0
1988-04-01
NaN
NaN
NaN
NaN
NaN
NaN
.
.
.
.
13
1
1989-04-01
NaN
NaN
NaN
NaN
NaN
NaN
.
.
.
.
14
2
1990-04-01
NaN
NaN
NaN
NaN
NaN
.
.
.
.
.
22
3
1991-04-01
NaN
NaN
NaN
NaN
NaN
.
.
15
12
.
38
4
1992-04-01
NaN
NaN
NaN
NaN
.
.
20
21
21
12
78
In [ ]:
pfr = pandas_profiling.ProfileReport(df)
pfr.to_file("diabetesqld-agegroupregistration-profile.html")
In [ ]:
#### Print existing ProfileReport object inline
pfr
Content source: bigdatabigheart/Datasets
Similar notebooks: