In [1]:
# Standard-library helpers
import os
import tempfile

# Import py_entitymatching package
import py_entitymatching as em
import pandas as pd
import pandas_profiling
Then, read the (sample) input table.
In [2]:
# Locate the 'datasets' directory under the py_entitymatching install path.
# os.path.join inserts the platform separator and avoids a doubled separator
# if the install path already ends with one.
datasets_dir = os.path.join(em.get_install_path(), 'datasets')
# Build the path of the input table (the dblp_demo.csv sample file)
path_A = os.path.join(datasets_dir, 'dblp_demo.csv')
In [4]:
# Read the CSV file at path_A into table A and set the 'id' column as its key attribute
A = em.read_csv_metadata(path_A, key='id')
# Preview the first rows of A as the cell output
A.head()
Metadata file is not present in the given path; proceeding to read the csv file.
Out[4]:
id
title
authors
venue
year
0
l0
Paradise: A Database System for GIS Applications
Paradise Team
SIGMOD Conference
1995
1
l1
A Query Language and Optimization Techniques for Unstructured Data
Gerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan Suciu
SIGMOD Conference
1996
2
l2
Turbo-charging Vertical Mining of Large Databases
Jayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav Bhalotia
SIGMOD Conference
2000
3
l3
Maintenance of Data Cubes and Summary Tables in a Warehouse
Inderpal Singh Mumick, Dallan Quass, Barinderpal Singh Mumick
SIGMOD Conference
1997
4
l4
On Relational Support for XML Publishing: Beyond Sorting and Tagging
Raghav Kaushik, Jeffrey F. Naughton, Surajit Chaudhuri
SIGMOD Conference
2003
In [5]:
# Build a profiling report for table A; as the last expression of the cell,
# the report is rendered inline as the cell output.
pandas_profiling.ProfileReport(A)
Out[5]:
Overview
Dataset info
Number of variables
5
Number of observations
1800
Total Missing (%)
0.0%
Total size in memory
70.4 KiB
Average record size in memory
40.0 B
Variables types
Numeric
1
Categorical
3
Date
0
Text (Unique)
1
Rejected
0
Warnings
authors
has a high cardinality: 1703 distinct values Warning
title
has a high cardinality: 1797 distinct values Warning
Variables
authors
Categorical
Distinct count
1703
Unique (%)
94.6%
Missing (%)
0.0%
Missing (n)
0
Dan Suciu
7
C. Mohan
6
Andrew Eisenberg, Jim Melton
5
Other values (1700)
1782
Value
Count
Frequency (%)
Dan Suciu
7
0.4%
C. Mohan
6
0.3%
Andrew Eisenberg, Jim Melton
5
0.3%
Xiaolei Qian
5
0.3%
Joseph M. Hellerstein
4
0.2%
Richard T. Snodgrass
4
0.2%
Praveen Seshadri
3
0.2%
H. V. Jagadish
3
0.2%
Nam Huyn
3
0.2%
Viswanath Poosala, Yannis E. Ioannidis
3
0.2%
Other values (1693)
1757
97.6%
id
Categorical, Unique
First 3 values
l415
l1574
l1364
Last 3 values
l492
l273
l92
First 10 values
Value
Count
Frequency (%)
l0
1
0.1%
l1
1
0.1%
l10
1
0.1%
l100
1
0.1%
l1000
1
0.1%
Last 10 values
Value
Count
Frequency (%)
l995
1
0.1%
l996
1
0.1%
l997
1
0.1%
l998
1
0.1%
l999
1
0.1%
title
Categorical
Distinct count
1797
Unique (%)
99.8%
Missing (%)
0.0%
Missing (n)
0
Editorial
2
Guest editorial
2
Keynote Address
2
Other values (1794)
1794
Value
Count
Frequency (%)
Editorial
2
0.1%
Guest editorial
2
0.1%
Keynote Address
2
0.1%
Integrating Modelling Systems for Environmental Management Information Systems
1
0.1%
Historical Queries Along Multiple Lines of Time Evolution
1
0.1%
Selectivity Estimation Without the Attribute Value Independence Assumption
1
0.1%
Analysis of existing databases at the logical level: the DBA companion project
1
0.1%
Using Versions in Update Transactions: Application to Integrity Checking
1
0.1%
Power efficient data gathering and aggregation in wireless sensor networks
1
0.1%
Instance-based attribute identification in database integration
1
0.1%
Other values (1787)
1787
99.3%
venue
Categorical
Distinct count
5
Unique (%)
0.3%
Missing (%)
0.0%
Missing (n)
0
SIGMOD Conference
654
VLDB
512
SIGMOD Record
381
Other values (2)
253
Value
Count
Frequency (%)
SIGMOD Conference
654
36.3%
VLDB
512
28.4%
SIGMOD Record
381
21.2%
VLDB J.
146
8.1%
ACM Trans. Database Syst.
107
5.9%
year
Numeric
Distinct count
10
Unique (%)
0.6%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
1998.4
Minimum
1994
Maximum
2003
Zeros (%)
0.0%
Quantile statistics
Minimum
1994
5-th percentile
1994
Q1
1996
Median
1998
Q3
2001
95-th percentile
2003
Maximum
2003
Range
9
Interquartile range
5
Descriptive statistics
Standard deviation
2.8231
Coef of variation
0.0014127
Kurtosis
-1.2004
Mean
1998.4
MAD
2.4525
Skewness
-0.007014
Sum
3597166
Variance
7.97
Memory size
14.1 KiB
Value
Count
Frequency (%)
2001
218
12.1%
1998
194
10.8%
2000
191
10.6%
1995
188
10.4%
1996
182
10.1%
1994
182
10.1%
1999
176
9.8%
1997
164
9.1%
2003
154
8.6%
2002
151
8.4%
Minimum 5 values
Value
Count
Frequency (%)
1994
182
10.1%
1995
188
10.4%
1996
182
10.1%
1997
164
9.1%
1998
194
10.8%
Maximum 5 values
Value
Count
Frequency (%)
1999
176
9.8%
2000
191
10.6%
2001
218
12.1%
2002
151
8.4%
2003
154
8.6%
Sample
id
title
authors
venue
year
0
l0
Paradise: A Database System for GIS Applications
Paradise Team
SIGMOD Conference
1995
1
l1
A Query Language and Optimization Techniques for Unstructured Data
Gerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan Suciu
SIGMOD Conference
1996
2
l2
Turbo-charging Vertical Mining of Large Databases
Jayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav Bhalotia
SIGMOD Conference
2000
3
l3
Maintenance of Data Cubes and Summary Tables in a Warehouse
Inderpal Singh Mumick, Dallan Quass, Barinderpal Singh Mumick
SIGMOD Conference
1997
4
l4
On Relational Support for XML Publishing: Beyond Sorting and Tagging
Raghav Kaushik, Jeffrey F. Naughton, Surajit Chaudhuri
SIGMOD Conference
2003
In [6]:
# Build the profiling report for table A and keep a handle to it so it can be
# both saved and displayed.
pfr = pandas_profiling.ProfileReport(A)
# Save the report as HTML in the platform's temporary directory instead of a
# hard-coded '/tmp', which does not exist on every operating system.
pfr.to_file(os.path.join(tempfile.gettempdir(), "example.html"))
In [7]:
# Display the previously built report object inline as the cell output
pfr
Out[7]:
Overview
Dataset info
Number of variables
5
Number of observations
1800
Total Missing (%)
0.0%
Total size in memory
70.4 KiB
Average record size in memory
40.0 B
Variables types
Numeric
1
Categorical
3
Date
0
Text (Unique)
1
Rejected
0
Warnings
authors
has a high cardinality: 1703 distinct values Warning
title
has a high cardinality: 1797 distinct values Warning
Variables
authors
Categorical
Distinct count
1703
Unique (%)
94.6%
Missing (%)
0.0%
Missing (n)
0
Dan Suciu
7
C. Mohan
6
Andrew Eisenberg, Jim Melton
5
Other values (1700)
1782
Value
Count
Frequency (%)
Dan Suciu
7
0.4%
C. Mohan
6
0.3%
Andrew Eisenberg, Jim Melton
5
0.3%
Xiaolei Qian
5
0.3%
Joseph M. Hellerstein
4
0.2%
Richard T. Snodgrass
4
0.2%
Praveen Seshadri
3
0.2%
H. V. Jagadish
3
0.2%
Nam Huyn
3
0.2%
Viswanath Poosala, Yannis E. Ioannidis
3
0.2%
Other values (1693)
1757
97.6%
id
Categorical, Unique
First 3 values
l415
l1574
l1364
Last 3 values
l492
l273
l92
First 10 values
Value
Count
Frequency (%)
l0
1
0.1%
l1
1
0.1%
l10
1
0.1%
l100
1
0.1%
l1000
1
0.1%
Last 10 values
Value
Count
Frequency (%)
l995
1
0.1%
l996
1
0.1%
l997
1
0.1%
l998
1
0.1%
l999
1
0.1%
title
Categorical
Distinct count
1797
Unique (%)
99.8%
Missing (%)
0.0%
Missing (n)
0
Editorial
2
Guest editorial
2
Keynote Address
2
Other values (1794)
1794
Value
Count
Frequency (%)
Editorial
2
0.1%
Guest editorial
2
0.1%
Keynote Address
2
0.1%
Integrating Modelling Systems for Environmental Management Information Systems
1
0.1%
Historical Queries Along Multiple Lines of Time Evolution
1
0.1%
Selectivity Estimation Without the Attribute Value Independence Assumption
1
0.1%
Analysis of existing databases at the logical level: the DBA companion project
1
0.1%
Using Versions in Update Transactions: Application to Integrity Checking
1
0.1%
Power efficient data gathering and aggregation in wireless sensor networks
1
0.1%
Instance-based attribute identification in database integration
1
0.1%
Other values (1787)
1787
99.3%
venue
Categorical
Distinct count
5
Unique (%)
0.3%
Missing (%)
0.0%
Missing (n)
0
SIGMOD Conference
654
VLDB
512
SIGMOD Record
381
Other values (2)
253
Value
Count
Frequency (%)
SIGMOD Conference
654
36.3%
VLDB
512
28.4%
SIGMOD Record
381
21.2%
VLDB J.
146
8.1%
ACM Trans. Database Syst.
107
5.9%
year
Numeric
Distinct count
10
Unique (%)
0.6%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
1998.4
Minimum
1994
Maximum
2003
Zeros (%)
0.0%
Quantile statistics
Minimum
1994
5-th percentile
1994
Q1
1996
Median
1998
Q3
2001
95-th percentile
2003
Maximum
2003
Range
9
Interquartile range
5
Descriptive statistics
Standard deviation
2.8231
Coef of variation
0.0014127
Kurtosis
-1.2004
Mean
1998.4
MAD
2.4525
Skewness
-0.007014
Sum
3597166
Variance
7.97
Memory size
14.1 KiB
Value
Count
Frequency (%)
2001
218
12.1%
1998
194
10.8%
2000
191
10.6%
1995
188
10.4%
1996
182
10.1%
1994
182
10.1%
1999
176
9.8%
1997
164
9.1%
2003
154
8.6%
2002
151
8.4%
Minimum 5 values
Value
Count
Frequency (%)
1994
182
10.1%
1995
188
10.4%
1996
182
10.1%
1997
164
9.1%
1998
194
10.8%
Maximum 5 values
Value
Count
Frequency (%)
1999
176
9.8%
2000
191
10.6%
2001
218
12.1%
2002
151
8.4%
2003
154
8.6%
Sample
id
title
authors
venue
year
0
l0
Paradise: A Database System for GIS Applications
Paradise Team
SIGMOD Conference
1995
1
l1
A Query Language and Optimization Techniques for Unstructured Data
Gerd G. Hillebrand, Peter Buneman, Susan B. Davidson, Dan Suciu
SIGMOD Conference
1996
2
l2
Turbo-charging Vertical Mining of Large Databases
Jayant R. Haritsa, Devavrat Shah, S. Sudarshan, Pradeep Shenoy, Mayank Bawa, Gaurav Bhalotia
SIGMOD Conference
2000
3
l3
Maintenance of Data Cubes and Summary Tables in a Warehouse
Inderpal Singh Mumick, Dallan Quass, Barinderpal Singh Mumick
SIGMOD Conference
1997
4
l4
On Relational Support for XML Publishing: Beyond Sorting and Tagging
Raghav Kaushik, Jeffrey F. Naughton, Surajit Chaudhuri
SIGMOD Conference
2003
In [ ]:
Content source: anhaidgroup/py_entitymatching
Similar notebooks: