In [6]:
import pandas as pd
import re
import numpy as np
In [7]:
# Page identifiers to process. Each entry is used as the stem of a
# '<page>.csv' file by the processing loop in this file.
pages = [
    'DramaAdd',
    'ejeab',
    'cartooneggcat',
    'BBCThai',
    'khobsanam',
    '1447102878929950',
    'powerofhusbands',
    'basementkaraoke',
    'cartoon5natee',
    'AjahnBuddhadasa',
    'Toodsdiary',
    'ceclip',
    'beargirlfriend',
    'jaytherabbitofficial',
    'Darlingboredom',
    'v.vajiramedhi',
    '334236760084743',
    'kingdomoftigers',
    'underbedstar',
    'pantipded',
    'Pantip.KratooDed',
    'nut.ped',
    '9gaginthai',
    'in.one.zaroop',
]
# Excluded from the list above: 'HighlightsHD.tv'
In [8]:
# Regex matching runs of carriage returns. It is identical for every page, so
# it is defined (and compiled) once instead of on every loop iteration.
pattern = r'(\r)+'
_CR_RUNS = re.compile(pattern)


def _strip_carriage_returns(value):
    """Return ``value`` with all carriage returns removed.

    Anything that is not a string (missing values, numbers) is returned
    unchanged, so the helper can be mapped over a whole column regardless of
    its dtype.
    """
    if isinstance(value, str):
        return _CR_RUNS.sub('', value)
    return value


# For every page: clean '<page>.csv', label each distinct
# (id, message, link) combination, write the labelled rows to '<page>ID.csv',
# and rewrite '<page>.csv' with the labels in place of the text columns.
for page in pages:
    df = pd.read_csv(page + '.csv', encoding='utf_8_sig')

    if 'message' in df.columns:
        # there are unwanted '\r' in 'message' column because of
        # Windows-Python incompatibility.  One pass over the column replaces
        # the former row-by-row iterrows()/.loc loop.
        df['message'] = df['message'].map(_strip_carriage_returns)
    elif 'link' in df.columns:
        print(page + ' Has no MSG COLUMN !!')
        df['message'] = ''
    else:
        # Neither text column exists, so the labelling below cannot work.
        raise ValueError(
            'What happened ?? ' + page
            + ".csv has neither a 'message' nor a 'link' column"
        )

    # fill NA : counters become integers with 0 for missing values
    count_columns = ['comment_count', 'like_count', 'reaction_count', 'share_count']
    df[count_columns] = df[count_columns].fillna(0).astype(int)

    # text columns become plain strings with '' for missing values
    df[['message', 'link']] = df[['message', 'link']].fillna('').astype(str)

    # remaining columns are stringified, but missing values are kept missing
    for colname in ['created_time', 'from', 'id', 'time_checked', 'type', 'updated_time']:
        df[colname] = df[colname].apply(lambda x: str(x) if pd.notnull(x) else x)

    # keep the first row of every distinct (id, message, link) combination.
    # drop_duplicates(subset=...) retains ALL columns, not only those three.
    newdf = df.drop_duplicates(subset=['id', 'message', 'link'])
    newdf = newdf.sort_values(by=['created_time', 'id', 'time_checked'])
    newdf.reset_index(drop=True, inplace=True)

    # label each row by 'id' : label1 is 1, 2, 3, ... in order of first
    # appearance of the id in the sorted frame
    mapdf = newdf.loc[:, ['id', 'message']].drop_duplicates(subset='id')
    mapdf.reset_index(drop=True, inplace=True)
    mapdf['label1'] = np.arange(len(mapdf)) + 1
    newdf = newdf.merge(mapdf[['id', 'label1']], on='id', how='left')

    # sub-label each row by 'message' and 'link' : label2 counts 1, 2, 3, ...
    # within each label1 group
    newdf['label2'] = (newdf.groupby('label1').cumcount() + 1).astype(int)

    # write the labelled, de-duplicated rows to '<page>ID.csv'.
    # NOTE(review): this file carries every column of newdf, not just 'id',
    # 'message', 'link', 'label1' and 'label2' -- confirm that is intended.
    newdf.to_csv(page + 'ID.csv', index=False, encoding='utf_8_sig')

    # attach the labels to the full data, then remove 'message' and 'link'
    # column from original csv files to reduce the size.
    # NOTE: this overwrites '<page>.csv' without those two columns, so running
    # the loop again on the rewritten file ends in the ValueError branch above.
    df = df.merge(
        newdf[['id', 'message', 'link', 'label1', 'label2']],
        on=['id', 'message', 'link'],
        how='left',
    )
    df = df.drop(['message', 'link'], axis=1)
    df.to_csv(page + '.csv', index=False, encoding='utf_8_sig')
    print(page + ' ......... DONE !!')
In [ ]: