Cleaning data

  • Remove unwanted '\r' in 'message' column
  • Fill NAs with 0 and empty strings
  • Add label and sub-label column to group the same posts together
  • Organize data to reduce files' size

In [6]:
import pandas as pd
import re
import numpy as np

In [7]:
pages = ['DramaAdd', 'ejeab', 'cartooneggcat', 'BBCThai', 'khobsanam', '1447102878929950',
         'powerofhusbands', 'basementkaraoke', 'cartoon5natee', 'AjahnBuddhadasa', 'Toodsdiary', 'ceclip', 'beargirlfriend',
         'jaytherabbitofficial', 'Darlingboredom', 'v.vajiramedhi', '334236760084743', 'kingdomoftigers', 'underbedstar', 'pantipded',
         'Pantip.KratooDed', 'nut.ped', '9gaginthai', 'in.one.zaroop']

#exclude: 'HighlightsHD.tv'

In [8]:
for page in pages:
    df = pd.read_csv(page + '.csv', encoding='utf_8_sig')
    pattern = r'(\r)+'
    if 'message' in df.columns:
        # there are unwanted '\r' in 'message' column because of Windows-Python incompatibility 
        for index, row in df.iterrows():
            if pd.isnull(row['message']):
                pass
            else:
                df.loc[index, 'message'] = re.sub(pattern, '', df.loc[index, 'message'])
    elif 'link' in df.columns:
        print(page + ' Has no MSG COLUMN !!')
        df['message'] = ''
    else:
        raise Exception('What happened ??')
    
    # fill NA :
    df[['comment_count', 'like_count', 'reaction_count', 'share_count']] = \
    df[['comment_count', 'like_count', 'reaction_count', 'share_count']].fillna(0).astype(int)
    df[['message', 'link']] = df[['message', 'link']].fillna('').astype(str)
    for colname in ['created_time', 'from', 'id', 'time_checked', 'type', 'updated_time']:
        df[colname] = df[colname].apply(lambda x: str(x) if pd.notnull(x) else x)
    
    # create a new dataframe containing 'id', 'message' and 'link' column. Then, drop duplicates :
    newdf = df.drop_duplicates(subset=['id', 'message', 'link'])
    newdf = newdf.sort_values(by = ['created_time', 'id', 'time_checked'])
    newdf.reset_index(drop=True, inplace=True)
    # label each row by 'id' :
    mapdf = newdf.loc[:, ['id', 'message']].drop_duplicates(subset = 'id')
    mapdf.reset_index(drop=True, inplace=True)
    mapdf['label1'] = np.arange(len(mapdf)) + 1
    newdf = newdf.merge(mapdf[['id', 'label1']], on='id', how='left')
    # sub-label each row by 'message' and 'link' :
    newdf['label2'] = newdf.groupby('label1').cumcount() + 1
    newdf['label2'] = newdf['label2'].astype(int)
    # create new csv files containing 'id', 'message', 'link', 'label1' and 'label2' column :
    newdf.to_csv(page + 'ID.csv', index=False, encoding='utf_8_sig')
    # remove 'message' and 'link' column from original csv files to reduce the size : 
    df = df.merge(newdf[['id', 'message', 'link', 'label1', 'label2']], on=['id', 'message', 'link'], how='left')
    df = df.drop(['message', 'link'], axis=1)
    df.to_csv(page + '.csv', index = False, encoding='utf_8_sig')
    print(page + ' ......... DONE !!')


DramaAdd ......... DONE !!
ejeab ......... DONE !!
cartooneggcat Has no MSG COLUMN !!
cartooneggcat ......... DONE !!
BBCThai ......... DONE !!
khobsanam ......... DONE !!
1447102878929950 ......... DONE !!
powerofhusbands ......... DONE !!
basementkaraoke ......... DONE !!
cartoon5natee ......... DONE !!
AjahnBuddhadasa ......... DONE !!
Toodsdiary ......... DONE !!
ceclip ......... DONE !!
beargirlfriend ......... DONE !!
jaytherabbitofficial ......... DONE !!
Darlingboredom ......... DONE !!
v.vajiramedhi ......... DONE !!
334236760084743 ......... DONE !!
kingdomoftigers ......... DONE !!
underbedstar ......... DONE !!
pantipded ......... DONE !!
Pantip.KratooDed ......... DONE !!
nut.ped ......... DONE !!
9gaginthai ......... DONE !!
in.one.zaroop ......... DONE !!

In [ ]: