In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the training set from disk; the file is decoded as Latin-1.
data = pd.read_csv(
    filepath_or_buffer='./data/training_data.csv',
    encoding='latin1',
)

In [3]:
def is_not_text(text):
    """Return True when ``text`` is not usable text.

    A value is "not text" when it is not a ``str`` at all (for example the
    float NaN that pandas uses for a missing cell, or ``None``) or when it is
    an empty string.

    Parameters
    ----------
    text : object
        A single cell value, e.g. one element of a DataFrame column.

    Returns
    -------
    bool
        True if the value is a non-string or an empty string, False otherwise.
    """
    # Testing for ``str`` (instead of only rejecting ``float``) keeps ``len``
    # from raising TypeError on None or integer cells.
    if not isinstance(text, str):
        return True
    return len(text) == 0

# Apply is_not_text to every 'article_content' value and keep the index
# labels of the rows it flags.
index = data.index[data['article_content'].apply(is_not_text)]

In [4]:
# Display the index labels of the rows flagged by is_not_text.
index


Out[4]:
Int64Index([45090], dtype='int64')

In [5]:
# `index` holds index *labels*, so look the flagged row up by label with .loc
# rather than by position with .iloc. Taking the label from `index` also
# avoids hard-coding a row number that only holds for this particular file.
data.loc[index[0]]


Out[5]:
TIDM                             CTEC
company_name       ConvaTec Group PLC
article_id                   13061032
article_content                   NaN
cat_id                             59
open_1                          22500
high_1                          22700
low_1                           21800
close_1                         22500
volume_1                  19260480000
open_2                          22500
high_2                          23061
low_2                           22500
close_2                         22900
volume_2                   7.4403e+08
open_3                          22900
high_3                          24875
low_3                           22850
close_3                         24000
volume_3                   4.4098e+08
open_4                          24875
high_4                          25075
low_4                           23800
close_4                         25000
volume_4                   3.6777e+08
open_5                          25975
high_5                          25975
low_5                           22734
close_5                         24900
volume_5                   3.9152e+08
                          ...        
open_30                           NaN
high_30                           NaN
low_30                            NaN
close_30                          NaN
volume_30                         NaN
open_31                           NaN
high_31                           NaN
low_31                            NaN
close_31                          NaN
volume_31                         NaN
open_32                           NaN
high_32                           NaN
low_32                            NaN
close_32                          NaN
volume_32                         NaN
open_33                           NaN
high_33                           NaN
low_33                            NaN
close_33                          NaN
volume_33                         NaN
open_34                           NaN
high_34                           NaN
low_34                            NaN
close_34                          NaN
volume_34                         NaN
open_35                           NaN
high_35                           NaN
low_35                            NaN
close_35                          NaN
volume_35                         NaN
Name: 45090, dtype: object

In [6]:
# Drop every row that has a missing value in any column. The name is rebound
# instead of mutating the frame with inplace=True.
# NOTE(review): this removes rows with NaN in *any* column, not only
# 'article_content'. If only rows without article text should be removed,
# use dropna(subset=['article_content']) — confirm the intent.
data = data.dropna()

In [7]:
# Recompute the index labels of the rows whose 'article_content' is still
# flagged by is_not_text.
index = data.index[data['article_content'].apply(is_not_text)]

In [8]:
# Display the recomputed labels of the rows flagged by is_not_text.
index


Out[8]:
Int64Index([], dtype='int64')

In [9]:
# Number of rows currently in `data`.
data.shape[0]


Out[9]:
43636

In [13]:
# Keep only the rows whose cat_id is not one of the listed ids.
# NOTE(review): the meaning of these category ids is not documented in this
# notebook — consider naming them in a constant; confirm with the data owner.
data = data[~data['cat_id'].isin([40, 41, 42, 43, 98, 168])]

In [14]:
# Number of rows currently in `data`.
data.shape[0]


Out[14]:
41912