In [1]:
# Show the version of the Python interpreter running this notebook,
# so the results below can be tied to a specific environment.
from platform import python_version
python_version()
Out[1]:
In [15]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
In [27]:
# Display the pandas and matplotlib versions side by side as a tuple.
(pd.__version__, matplotlib.__version__)
Out[27]:
In [28]:
# Sample frame for the duplicate-handling examples.
# Rows 0/1 ('bar') and rows 4/5 ('foo') are identical in every column;
# the two 'baz' rows share title and year but differ in contents.
df = pd.DataFrame(
    data=dict(
        title=['bar', 'bar', 'baz', 'baz', 'foo', 'foo'],
        contents=[
            'Sed mollis tempor accumsan.',
            'Sed mollis tempor accumsan.',
            'Nullam et feugiat turpis, non condimentum dolor.',
            'Aenean eu aliquam nunc.',
            'Lorem ipsum dolor sit amet.',
            'Lorem ipsum dolor sit amet.',
        ],
        year=[2010, 2010, 2005, 2005, 2011, 2011],
    )
)
df
Out[28]:
In [29]:
# Rows that repeat an earlier row in every column
# (duplicated() defaults to keep='first', so the first occurrence is not flagged).
df.loc[df.duplicated()]
Out[29]:
In [31]:
# keep=False flags every member of a duplicate group, including the first occurrence.
df.loc[df.duplicated(keep=False)]
Out[31]:
In [30]:
# Count the duplicated rows by summing the boolean mask directly
# (True counts as 1), instead of building a filtered DataFrame just to measure it.
# int() keeps the displayed value a plain Python integer.
int(df.duplicated().sum())
Out[30]:
In [20]:
# Judge duplication on 'title' and 'year' only; 'contents' is ignored.
df.loc[df.duplicated(subset=['title', 'year'])]
Out[20]:
In [21]:
# Remove rows that repeat an earlier row in every column,
# keeping the first occurrence (the default, spelled out here).
df.drop_duplicates(keep='first')
Out[21]:
In [22]:
# De-duplicate on 'title' and 'year' only, keeping the first row of each pair.
df.drop_duplicates(subset=['title', 'year'], keep='first')
Out[22]:
In [23]:
# keep=False drops every row that has a duplicate, leaving only rows that occur once.
# subset=None (the default) compares all columns.
df.drop_duplicates(subset=None, keep=False)
Out[23]:
In [24]:
# Return a copy of df with a boolean 'is_duplicate' column
# marking rows that repeat an earlier row; df itself is left unchanged.
df.assign(is_duplicate=df.duplicated())
Out[24]:
In [25]:
# Rebinds `df` to a second sample frame: same titles and contents as before,
# but the 'bar' rows and the 'foo' rows now differ from each other only by year.
df = pd.DataFrame(
    data=dict(
        title=['bar', 'bar', 'baz', 'baz', 'foo', 'foo'],
        contents=[
            'Sed mollis tempor accumsan.',
            'Sed mollis tempor accumsan.',
            'Nullam et feugiat turpis, non condimentum dolor.',
            'Aenean eu aliquam nunc.',
            'Lorem ipsum dolor sit amet.',
            'Lorem ipsum dolor sit amet.',
        ],
        year=[2009, 2019, 2005, 2005, 2015, 1995],
    )
)
df
Out[25]:
In [26]:
# For each (title, contents) pair keep only the row with the greatest year:
# the ascending sort puts the highest year last within each pair,
# and keep='last' retains that row.
(
    df
    .sort_values(by=['title', 'contents', 'year'])
    .drop_duplicates(subset=['title', 'contents'], keep='last')
)
Out[26]:
In [ ]:
In [ ]:
In [ ]: