In [7]:

    
pages = ['DramaAdd', 'ejeab', 'cartooneggcat', 'BBCThai', 'khobsanam', '1447102878929950',
         'powerofhusbands', 'basementkaraoke', 'cartoon5natee', 'AjahnBuddhadasa', 'Toodsdiary', 'ceclip', 'beargirlfriend',
         'jaytherabbitofficial', 'Darlingboredom', 'v.vajiramedhi', '334236760084743', 'kingdomoftigers', 'underbedstar', 'pantipded',
         'Pantip.KratooDed', 'nut.ped', '9gaginthai', 'in.one.zaroop']

#exclude: 'HighlightsHD.tv'



In [8]:

    
for page in pages:
    df = pd.read_csv(page + '.csv', encoding='utf_8_sig')
    pattern = r'(\r)+'
    if 'message' in df.columns:
        # there are unwanted '\r' in 'message' column because of Windows-Python incompatibility 
        for index, row in df.iterrows():
            if pd.isnull(row['message']):
                pass
            else:
                df.loc[index, 'message'] = re.sub(pattern, '', df.loc[index, 'message'])
    elif 'link' in df.columns:
        print(page + ' Has no MSG COLUMN !!')
        df['message'] = ''
    else:
        raise Exception('What happened ??')
    
    # fill NA :
    df[['comment_count', 'like_count', 'reaction_count', 'share_count']] = \
    df[['comment_count', 'like_count', 'reaction_count', 'share_count']].fillna(0).astype(int)
    df[['message', 'link']] = df[['message', 'link']].fillna('').astype(str)
    for colname in ['created_time', 'from', 'id', 'time_checked', 'type', 'updated_time']:
        df[colname] = df[colname].apply(lambda x: str(x) if pd.notnull(x) else x)
    
    # create a new dataframe containing 'id', 'message' and 'link' column. Then, drop duplicates :
    newdf = df.drop_duplicates(subset=['id', 'message', 'link'])
    newdf = newdf.sort_values(by = ['created_time', 'id', 'time_checked'])
    newdf.reset_index(drop=True, inplace=True)
    # label each row by 'id' :
    mapdf = newdf.loc[:, ['id', 'message']].drop_duplicates(subset = 'id')
    mapdf.reset_index(drop=True, inplace=True)
    mapdf['label1'] = np.arange(len(mapdf)) + 1
    newdf = newdf.merge(mapdf[['id', 'label1']], on='id', how='left')
    # sub-label each row by 'message' and 'link' :
    newdf['label2'] = newdf.groupby('label1').cumcount() + 1
    newdf['label2'] = newdf['label2'].astype(int)
    # create new csv files containing 'id', 'message', 'link', 'label1' and 'label2' column :
    newdf.to_csv(page + 'ID.csv', index=False, encoding='utf_8_sig')
    # remove 'message' and 'link' column from original csv files to reduce the size : 
    df = df.merge(newdf[['id', 'message', 'link', 'label1', 'label2']], on=['id', 'message', 'link'], how='left')
    df = df.drop(['message', 'link'], axis=1)
    df.to_csv(page + '.csv', index = False, encoding='utf_8_sig')
    print(page + ' ......... DONE !!')









    



DramaAdd ......... DONE !!
ejeab ......... DONE !!
cartooneggcat Has no MSG COLUMN !!
cartooneggcat ......... DONE !!
BBCThai ......... DONE !!
khobsanam ......... DONE !!
1447102878929950 ......... DONE !!
powerofhusbands ......... DONE !!
basementkaraoke ......... DONE !!
cartoon5natee ......... DONE !!
AjahnBuddhadasa ......... DONE !!
Toodsdiary ......... DONE !!
ceclip ......... DONE !!
beargirlfriend ......... DONE !!
jaytherabbitofficial ......... DONE !!
Darlingboredom ......... DONE !!
v.vajiramedhi ......... DONE !!
334236760084743 ......... DONE !!
kingdomoftigers ......... DONE !!
underbedstar ......... DONE !!
pantipded ......... DONE !!
Pantip.KratooDed ......... DONE !!
nut.ped ......... DONE !!
9gaginthai ......... DONE !!
in.one.zaroop ......... DONE !!

Cleaning data