In [1]:
# Record the interpreter version this notebook was executed with.
from platform import python_version

python_version()


Out[1]:
'3.6.7'

In [15]:
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

In [27]:
# Library versions used to produce the outputs in this notebook.
(pd.__version__, matplotlib.__version__)


Out[27]:
('0.25.1', '3.0.3')

In [28]:
# Sample frame, written one row per tuple so the repeated rows are easy to spot:
# rows 0/1 and rows 4/5 are identical in every column, while rows 2/3 share
# title and year but differ in contents.
df = pd.DataFrame(
    [
        ('bar', 'Sed mollis tempor accumsan.', 2010),
        ('bar', 'Sed mollis tempor accumsan.', 2010),
        ('baz', 'Nullam et feugiat turpis, non condimentum dolor.', 2005),
        ('baz', 'Aenean eu aliquam nunc.', 2005),
        ('foo', 'Lorem ipsum dolor sit amet.', 2011),
        ('foo', 'Lorem ipsum dolor sit amet.', 2011),
    ],
    columns=['title', 'contents', 'year'],
)

df


Out[28]:
title contents year
0 bar Sed mollis tempor accumsan. 2010
1 bar Sed mollis tempor accumsan. 2010
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005
3 baz Aenean eu aliquam nunc. 2005
4 foo Lorem ipsum dolor sit amet. 2011
5 foo Lorem ipsum dolor sit amet. 2011

show duplicate rows (the first occurrence is not flagged)


In [29]:
# duplicated() flags every repeat of an earlier row; select those rows by label mask.
df.loc[df.duplicated()]


Out[29]:
title contents year
1 bar Sed mollis tempor accumsan. 2010
5 foo Lorem ipsum dolor sit amet. 2011

show duplicate rows, including the first occurrence


In [31]:
# keep=False flags every member of a duplicate group, including the first one.
df.loc[df.duplicated(keep=False)]


Out[31]:
title contents year
0 bar Sed mollis tempor accumsan. 2010
1 bar Sed mollis tempor accumsan. 2010
4 foo Lorem ipsum dolor sit amet. 2011
5 foo Lorem ipsum dolor sit amet. 2011

count duplicate rows


In [30]:
# Sum the boolean mask (True counts as 1) rather than filtering the frame
# and measuring its length; int() keeps the result a plain Python int.
int(df.duplicated().sum())


Out[30]:
2

show duplicate rows, comparing only some columns


In [20]:
# Only 'title' and 'year' are compared, so rows with different 'contents'
# can still be flagged as duplicates.
df.loc[df.duplicated(subset=['title', 'year'])]


Out[20]:
title contents year
1 bar Sed mollis tempor accumsan. 2010
3 baz Aenean eu aliquam nunc. 2005
5 foo Lorem ipsum dolor sit amet. 2011

drop duplicates, keep original


In [21]:
# keep='first' is the default: the first row of each duplicate group survives.
df.drop_duplicates(keep='first')


Out[21]:
title contents year
0 bar Sed mollis tempor accumsan. 2010
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005
3 baz Aenean eu aliquam nunc. 2005
4 foo Lorem ipsum dolor sit amet. 2011

drop duplicates based on some columns


In [22]:
# Rows count as duplicates when 'title' and 'year' match; the first row of
# each such group is kept (keep='first' is the default).
df.drop_duplicates(subset=['title', 'year'], keep='first')


Out[22]:
title contents year
0 bar Sed mollis tempor accumsan. 2010
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005
4 foo Lorem ipsum dolor sit amet. 2011

drop every row that has a duplicate (no copy is kept)


In [23]:
# keep=False removes every member of a duplicate group, so only rows that
# occur exactly once remain. subset=None (the default) compares all columns.
df.drop_duplicates(subset=None, keep=False)


Out[23]:
title contents year
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005
3 baz Aenean eu aliquam nunc. 2005

mark duplicates


In [24]:
# Add a boolean flag column; assign() returns a new frame and leaves df untouched.
df.assign(is_duplicate=df.duplicated())


Out[24]:
title contents year is_duplicate
0 bar Sed mollis tempor accumsan. 2010 False
1 bar Sed mollis tempor accumsan. 2010 True
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005 False
3 baz Aenean eu aliquam nunc. 2005 False
4 foo Lorem ipsum dolor sit amet. 2011 False
5 foo Lorem ipsum dolor sit amet. 2011 True

custom keep logic: for each title/contents pair, keep the row with the latest year


In [25]:
# Rebinds `df` to a variant of the sample frame in which rows that share
# title and contents carry different years, written one row per tuple.
df = pd.DataFrame(
    [
        ('bar', 'Sed mollis tempor accumsan.', 2009),
        ('bar', 'Sed mollis tempor accumsan.', 2019),
        ('baz', 'Nullam et feugiat turpis, non condimentum dolor.', 2005),
        ('baz', 'Aenean eu aliquam nunc.', 2005),
        ('foo', 'Lorem ipsum dolor sit amet.', 2015),
        ('foo', 'Lorem ipsum dolor sit amet.', 1995),
    ],
    columns=['title', 'contents', 'year'],
)

df


Out[25]:
title contents year
0 bar Sed mollis tempor accumsan. 2009
1 bar Sed mollis tempor accumsan. 2019
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005
3 baz Aenean eu aliquam nunc. 2005
4 foo Lorem ipsum dolor sit amet. 2015
5 foo Lorem ipsum dolor sit amet. 1995

In [26]:
# Sort so that, within each title/contents pair, the highest year comes last,
# then keep only that last row of every pair.
(
    df
    .sort_values(by=['title', 'contents', 'year'])
    .drop_duplicates(subset=['title', 'contents'], keep='last')
)


Out[26]:
title contents year
1 bar Sed mollis tempor accumsan. 2019
3 baz Aenean eu aliquam nunc. 2005
2 baz Nullam et feugiat turpis, non condimentum dolor. 2005
4 foo Lorem ipsum dolor sit amet. 2015

In [ ]:


In [ ]:


In [ ]: