In [1]:
# Record which Python interpreter executed this notebook so the outputs
# below can be tied to a concrete runtime.
from platform import python_version

python_version()
Out[1]:
In [2]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
In [3]:
# Library versions, displayed as a (pandas, matplotlib) tuple.
(pd.__version__, matplotlib.__version__)
Out[3]:
In [4]:
# Sample frame for the duplicate-detection demos: rows 0/1 and rows 4/5 are
# identical in every column, while rows 2/3 share title and year but differ
# in contents.
df = pd.DataFrame(
    data={
        "title": ["bar", "bar", "baz", "baz", "foo", "foo"],
        "contents": [
            "Sed mollis tempor accumsan.",
            "Sed mollis tempor accumsan.",
            "Nullam et feugiat turpis, non condimentum dolor.",
            "Aenean eu aliquam nunc.",
            "Lorem ipsum dolor sit amet.",
            "Lorem ipsum dolor sit amet.",
        ],
        "year": [2010, 2010, 2005, 2005, 2011, 2011],
    }
)
df
Out[4]:
In [5]:
# Rows that repeat an earlier row in every column (the first occurrence of
# each group is not flagged by duplicated()).
df.loc[df.duplicated()]
Out[5]:
In [6]:
# Number of fully duplicated rows: count the True entries of the mask
# directly; int() keeps the displayed value a plain Python integer.
int(df.duplicated().sum())
Out[6]:
In [7]:
# Rows whose (title, year) pair already appeared earlier; the contents
# column is ignored for the comparison.
df.loc[df.duplicated(subset=["title", "year"])]
Out[7]:
In [8]:
# Remove fully duplicated rows, keeping the first occurrence of each group
# (keep="first" is the default, spelled out for clarity).
df.drop_duplicates(keep="first")
Out[8]:
In [9]:
# De-duplicate on (title, year) only, keeping the first row of each pair
# (keep="first" is the default, spelled out for clarity).
df.drop_duplicates(subset=["title", "year"], keep="first")
Out[9]:
In [10]:
# keep=False discards every member of a duplicate group, so only rows that
# are unique across all columns (subset=None, the default) remain.
df.drop_duplicates(subset=None, keep=False)
Out[10]:
In [11]:
# Add a boolean is_duplicate column flagging rows that repeat an earlier
# row, then sort so duplicate groups sit next to each other and renumber
# the index from zero.
(
    df
    .assign(is_duplicate=lambda frame: frame.duplicated())
    .sort_values(by=["title", "contents", "year"])
    .reset_index(drop=True)
)
Out[11]:
In [12]:
# Second sample frame: same titles and contents as before, but every row now
# has a distinct year, so no row is a full duplicate while (title, contents)
# pairs still repeat for "bar" and "foo".
df = pd.DataFrame(
    data={
        "title": ["bar", "bar", "baz", "baz", "foo", "foo"],
        "contents": [
            "Sed mollis tempor accumsan.",
            "Sed mollis tempor accumsan.",
            "Nullam et feugiat turpis, non condimentum dolor.",
            "Aenean eu aliquam nunc.",
            "Lorem ipsum dolor sit amet.",
            "Lorem ipsum dolor sit amet.",
        ],
        "year": [2009, 2019, 2005, 2005, 2015, 1995],
    }
)
df
Out[12]:
In [13]:
# Keep one row per (title, contents) pair. Sorting ascending with year as the
# final key puts the largest year last within each pair, so keep="last"
# retains the most recent entry.
(
    df
    .sort_values(by=["title", "contents", "year"])
    .drop_duplicates(subset=["title", "contents"], keep="last")
)
Out[13]:
In [ ]: