In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the sample table, then overwrite the row at position 1 with NaN so
# that every column carries one missing value.
csv_path = 'data/src/sample_pandas_normal.csv'
df = pd.read_csv(csv_path)
df.iloc[1] = np.nan
print(df)


      name   age state  point
0    Alice  24.0    NY   64.0
1      NaN   NaN   NaN    NaN
2  Charlie  18.0    CA   70.0
3     Dave  68.0    TX   70.0
4    Ellen  24.0    CA   88.0
5    Frank  30.0    NY   57.0

In [3]:
# unique() hands back the distinct values of the column as a NumPy array;
# the missing value is kept as one of the entries.
u = df['state'].unique()
print(u, type(u), sep='\n')


['NY' nan 'CA' 'TX']
<class 'numpy.ndarray'>

In [4]:
# value_counts() tallies how often each value occurs and returns a Series
# indexed by value; NaN is not part of the tally by default.
vc = df['state'].value_counts()
print(vc, type(vc), sep='\n')


NY    2
CA    2
TX    1
Name: state, dtype: int64
<class 'pandas.core.series.Series'>

In [5]:
# ascending=True lists the least frequent value first.
counts_ascending = df['state'].value_counts(ascending=True)
print(counts_ascending)


TX    1
CA    2
NY    2
Name: state, dtype: int64

In [6]:
# sort=False turns off ordering of the tally by count.
counts_unsorted = df['state'].value_counts(sort=False)
print(counts_unsorted)


CA    2
NY    2
TX    1
Name: state, dtype: int64

In [7]:
# dropna=False adds a NaN entry to the tally.
counts_with_nan = df['state'].value_counts(dropna=False)
print(counts_with_nan)


NY     2
CA     2
TX     1
NaN    1
Name: state, dtype: int64

In [8]:
# normalize=True reports each value's share of the total instead of a raw
# count; with dropna=False the NaN row is part of that total.
shares_with_nan = df['state'].value_counts(normalize=True, dropna=False)
print(shares_with_nan)


NY     0.333333
CA     0.333333
TX     0.166667
NaN    0.166667
Name: state, dtype: float64

In [9]:
# nunique() on a Series gives the number of distinct values as a plain int;
# NaN is ignored by default.
nu = df['state'].nunique()
print(nu, type(nu), sep='\n')


3
<class 'int'>

In [10]:
# With dropna=False the missing value is counted as one more distinct entry.
distinct_with_nan = df['state'].nunique(dropna=False)
print(distinct_with_nan)


4

In [11]:
# nunique() on a DataFrame yields a Series holding the number of distinct
# non-missing values in each column.
nu_col = df.nunique()
print(nu_col, type(nu_col), sep='\n')


name     5
age      4
state    3
point    4
dtype: int64
<class 'pandas.core.series.Series'>

In [12]:
# Per-column distinct counts, this time treating NaN as a value of its own.
distinct_per_column_with_nan = df.nunique(dropna=False)
print(distinct_per_column_with_nan)


name     6
age      5
state    4
point    5
dtype: int64

In [13]:
# Count distinct values across each row rather than down each column
# (axis=1 is the positional spelling of axis='columns'); NaN is counted too.
distinct_per_row_with_nan = df.nunique(axis=1, dropna=False)
print(distinct_per_row_with_nan)


0    4
1    1
2    4
3    4
4    4
5    4
dtype: int64

In [14]:
# Number of distinct non-missing values in the 'state' column.
state_distinct = df['state'].nunique()
print(state_distinct)


3

In [15]:
# Number of distinct non-missing values in every column.
distinct_per_column = df.nunique()
print(distinct_per_column)


name     5
age      4
state    3
point    4
dtype: int64

In [16]:
# tolist() turns the array of distinct values into a built-in list.
state_values = df['state'].unique().tolist()
print(state_values)
print(type(state_values))


['NY', nan, 'CA', 'TX']
<class 'list'>

In [17]:
# The index of the tally holds the distinct non-missing values; tolist()
# converts it to a built-in list.
state_labels = df['state'].value_counts().index.tolist()
print(state_labels)
print(type(state_labels))


['NY', 'CA', 'TX']
<class 'list'>

In [18]:
# Index.values exposes the tally's labels as a NumPy array. The expression
# is evaluated once so that the reported type belongs to the very array that
# is printed (both lines use the dropna=False tally, NaN label included).
labels_with_nan = df['state'].value_counts(dropna=False).index.values
print(labels_with_nan)
print(type(labels_with_nan))


['NY' 'CA' 'TX' nan]
<class 'numpy.ndarray'>

In [19]:
# Look up the count of a single value by its label with [].
state_tally = df['state'].value_counts()
print(state_tally['NY'])


2

In [20]:
# The same lookup written as attribute access on the tally.
state_tally = df['state'].value_counts()
print(state_tally.NY)


2

In [21]:
# Walk the tally as (label, count) pairs.
# Series.items() is used instead of Series.iteritems(): the latter was
# deprecated in pandas 1.5 and removed in pandas 2.0, where calling it raises
# AttributeError. items() yields the same pairs on old and new versions.
for index, value in df['state'].value_counts().items():
    print(index, ': ', value)


NY :  2
CA :  2
TX :  1

In [22]:
# to_dict() converts the tally into a plain dict mapping value -> count.
d = df['state'].value_counts().to_dict()
print(d, type(d), sep='\n')


{'NY': 2, 'CA': 2, 'TX': 1}
<class 'dict'>

In [23]:
# Read one count back out of the dict by key.
ny_count = d['NY']
print(ny_count)


2

In [24]:
# Print every value together with its count, one pair per line.
for key in d:
    value = d[key]
    print(key, ': ', value)


NY :  2
CA :  2
TX :  1

In [25]:
# Show the default tally again as the starting point for the mode lookups.
state_tally = df['state'].value_counts()
print(state_tally)


NY    2
CA    2
TX    1
Name: state, dtype: int64

In [26]:
# The first index label of the tally is a most frequent value.
state_tally = df['state'].value_counts()
print(state_tally.index[0])


NY

In [27]:
# The first entry of the tally (by position) is the highest count.
state_tally = df['state'].value_counts()
print(state_tally.iat[0])


2

In [28]:
def top_label(column):
    """Return the first index label of the column's value_counts() tally."""
    return column.value_counts().index[0]


# apply() runs the helper on each column, giving one top value per column.
print(df.apply(top_label))


name     Frank
age         24
state       NY
point       70
dtype: object

In [29]:
def top_count(column):
    """Return the first (highest) count of the column's value_counts() tally."""
    return column.value_counts().iat[0]


# apply() runs the helper on each column, giving one top count per column.
print(df.apply(top_count))


name     1
age      2
state    2
point    2
dtype: int64

In [30]:
# mode() returns a Series because several values can tie for most frequent.
state_modes = df['state'].mode()
print(state_modes)


0    CA
1    NY
dtype: object

In [31]:
# The tied modes of 'state' as a built-in list.
state_modes = df['state'].mode()
print(state_modes.tolist())


['CA', 'NY']

In [32]:
# A column with a single mode still produces a (one-element) list.
age_modes = df['age'].mode()
print(age_modes.tolist())


[24.0]

In [33]:
def modes_as_list(column):
    """Return every mode of the column as a built-in list."""
    return column.mode().tolist()


# One list of modes per column, collected into a Series keyed by column name.
s_mode = df.apply(modes_as_list)
print(s_mode)


name     [Alice, Charlie, Dave, Ellen, Frank]
age                                    [24.0]
state                                [CA, NY]
point                                  [70.0]
dtype: object

In [34]:
# The per-column result of apply() is a Series.
s_mode_type = type(s_mode)
print(s_mode_type)


<class 'pandas.core.series.Series'>

In [35]:
# Each element of s_mode is the list of modes for that column.
name_modes = s_mode['name']
print(name_modes)


['Alice', 'Charlie', 'Dave', 'Ellen', 'Frank']

In [36]:
# Confirm that an individual element is a built-in list.
name_modes = s_mode['name']
print(type(name_modes))


<class 'list'>

In [37]:
# DataFrame.mode() lists the modes of every column side by side; columns
# with fewer modes are padded with NaN.
mode_table = df.mode()
print(mode_table)


      name   age state  point
0    Alice  24.0    CA   70.0
1  Charlie   NaN    NY    NaN
2     Dave   NaN   NaN    NaN
3    Ellen   NaN   NaN    NaN
4    Frank   NaN   NaN    NaN

In [38]:
# count() skips the NaN padding, so this is the number of modes per column.
mode_table = df.mode()
print(mode_table.count())


name     5
age      1
state    2
point    1
dtype: int64

In [39]:
# Casting every column to str makes describe() report count / unique / top /
# freq for all of them; the NaN row is counted as an ordinary string value.
described = df.astype('str').describe()
print(described)


         name   age state point
count       6     6     6     6
unique      6     5     4     5
top     Frank  24.0    CA  70.0
freq        1     2     2     2

In [40]:
# The 'top' row of the summary holds a most frequent value of each column.
described = df.astype('str').describe()
print(described.loc['top'])


name     Frank
age       24.0
state       CA
point     70.0
Name: top, dtype: object