In [1]:
from platform import python_version

# Display the version string of the interpreter running this kernel.
python_version()


Out[1]:
'3.6.7'

In [2]:
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

In [3]:
# Show the pandas and matplotlib versions side by side as one tuple.
(pd.__version__, matplotlib.__version__)


Out[3]:
('0.25.1', '3.0.3')

In [4]:
# Sample data written one record per line: rows 0/1 and 4/5 are exact copies,
# while rows 2/3 share title and year but differ in contents.
records = [
    ('bar', 'Sed mollis tempor accumsan.', 2010),
    ('bar', 'Sed mollis tempor accumsan.', 2010),
    ('baz', 'Nullam et feugiat turpis, non condimentum dolor.', 2005),
    ('baz', 'Aenean eu aliquam nunc.', 2005),
    ('foo', 'Lorem ipsum dolor sit amet.', 2011),
    ('foo', 'Lorem ipsum dolor sit amet.', 2011),
]
df = pd.DataFrame(records, columns=['title', 'contents', 'year'])

df


Out[4]:
title contents year
0 bar Sed mollis tempor accumsan. 2010
1 bar Sed mollis tempor accumsan. 2010
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005
3 baz Aenean eu aliquam nunc. 2005
4 foo Lorem ipsum dolor sit amet. 2011
5 foo Lorem ipsum dolor sit amet. 2011

show duplicate rows


In [5]:
# Select the rows that repeat an earlier row across all columns; the first
# occurrence of each group is not flagged, so it is left out of the result.
df.loc[df.duplicated(keep='first')]


Out[5]:
title contents year
1 bar Sed mollis tempor accumsan. 2010
5 foo Lorem ipsum dolor sit amet. 2011

count duplicate rows


In [6]:
# Count repeated rows by summing the boolean mask (True counts as 1) rather
# than building a filtered copy of the frame only to take its length.
# int() keeps the displayed result a plain Python integer.
int(df.duplicated().sum())


Out[6]:
2

show duplicate rows, comparing only some columns


In [7]:
# Only 'title' and 'year' are compared, so rows whose 'contents' differ
# (the two 'baz' rows) still count as repeats of each other.
df.loc[df.duplicated(subset=['title', 'year'], keep='first')]


Out[7]:
title contents year
1 bar Sed mollis tempor accumsan. 2010
3 baz Aenean eu aliquam nunc. 2005
5 foo Lorem ipsum dolor sit amet. 2011

drop duplicates, keep the first occurrence


In [8]:
# keep='first' is the default, spelled out here: the first row of each
# group of identical rows survives and the later copies are removed.
df.drop_duplicates(keep='first')


Out[8]:
title contents year
0 bar Sed mollis tempor accumsan. 2010
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005
3 baz Aenean eu aliquam nunc. 2005
4 foo Lorem ipsum dolor sit amet. 2011

drop duplicates based on some columns


In [9]:
# Rows are treated as duplicates when 'title' and 'year' match, whatever
# their 'contents'; the first row of each such group is kept.
df.drop_duplicates(subset=['title', 'year'], keep='first')


Out[9]:
title contents year
0 bar Sed mollis tempor accumsan. 2010
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005
4 foo Lorem ipsum dolor sit amet. 2011

drop rows that are or have duplicates


In [10]:
# keep=False removes every member of a duplicate group, so only rows that
# never repeat (compared on all columns, subset=None) remain.
df.drop_duplicates(subset=None, keep=False)


Out[10]:
title contents year
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005
3 baz Aenean eu aliquam nunc. 2005

mark duplicates


In [11]:
# Add a boolean 'is_duplicate' column (False on the first occurrence, True on
# later copies), then sort and renumber the rows so copies sit next to each other.
(
    df
    .assign(is_duplicate=df.duplicated(keep='first'))
    .sort_values(by=['title', 'contents', 'year'])
    .reset_index(drop=True)
)


Out[11]:
title contents year is_duplicate
0 bar Sed mollis tempor accumsan. 2010 False
1 bar Sed mollis tempor accumsan. 2010 True
2 baz Aenean eu aliquam nunc. 2005 False
3 baz Nullam et feugiat turpis, non condimentum dolor. 2005 False
4 foo Lorem ipsum dolor sit amet. 2011 False
5 foo Lorem ipsum dolor sit amet. 2011 True

custom keep logic: for each title/contents pair, keep the row with the latest year


In [12]:
# Rebind `df` to a variant of the sample data in which rows sharing the same
# title and contents carry different years.
records = [
    ('bar', 'Sed mollis tempor accumsan.', 2009),
    ('bar', 'Sed mollis tempor accumsan.', 2019),
    ('baz', 'Nullam et feugiat turpis, non condimentum dolor.', 2005),
    ('baz', 'Aenean eu aliquam nunc.', 2005),
    ('foo', 'Lorem ipsum dolor sit amet.', 2015),
    ('foo', 'Lorem ipsum dolor sit amet.', 1995),
]
df = pd.DataFrame(records, columns=['title', 'contents', 'year'])

df


Out[12]:
title contents year
0 bar Sed mollis tempor accumsan. 2009
1 bar Sed mollis tempor accumsan. 2019
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005
3 baz Aenean eu aliquam nunc. 2005
4 foo Lorem ipsum dolor sit amet. 2015
5 foo Lorem ipsum dolor sit amet. 1995

In [13]:
# Sorting with 'year' as the final key places the highest year last within each
# (title, contents) group, so keep='last' retains that row and drops the others.
(
    df
    .sort_values(by=['title', 'contents', 'year'])
    .drop_duplicates(subset=['title', 'contents'], keep='last')
)


Out[13]:
title contents year
1 bar Sed mollis tempor accumsan. 2019
3 baz Aenean eu aliquam nunc. 2005
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005
4 foo Lorem ipsum dolor sit amet. 2015

In [ ]: