In [25]:
# List the contents of ../data/csv (long format, human-readable sizes).
%ls -lh ../data/csv
In [26]:
import pandas as pd
import os
In [27]:
# All data paths are resolved relative to the parent of the current working directory.
parent_path = os.path.dirname(os.getcwd())

# Batch identifier: names both the CSV file and the per-batch image folder.
csv_file = '97802012'
csv_file_name = '{}.csv'.format(csv_file)

# CSV location: <parent>/data/csv/<csv_file>.csv
csv_dir_path = os.path.join(parent_path, 'data', 'csv')
csv_file_path = os.path.join(csv_dir_path, csv_file_name)

# Image locations: "wrong" holds rejected images, "raw" holds the images to process,
# and raw/<csv_file> is where this batch's images end up.
img_wrong_dir_path = os.path.join(parent_path, 'data', 'img', 'wrong')
img_dir_path = os.path.join(parent_path, 'data', 'img', 'raw')
img_output_dir_path = os.path.join(img_dir_path, csv_file)
In [28]:
# Load the batch CSV (first line is the header) and remember the starting row
# count so the removal step can be verified later.
df = pd.read_csv(csv_file_path, header=0)
old_rows_count = len(df)
print("%d rows" % old_rows_count)
df.head(3)
Out[28]:
Get the list of wrong images
In [29]:
# Names of every entry in the "wrong" image directory (not yet filtered by batch).
wrong_list = os.listdir(img_wrong_dir_path)
In [30]:
# Keep only the entries whose file name contains this batch's identifier.
wrong_list = [name for name in wrong_list if csv_file in name]
In [31]:
# Number of wrong images that belong to this batch.
len(wrong_list)
Out[31]:
Get index of each wrong image
In [32]:
def get_index(i, frame=None):
    """Return the index label of the first row whose 'img' value equals ``i``.

    Parameters
    ----------
    i : str
        Image file name to look up in the 'img' column.
    frame : pandas.DataFrame, optional
        Frame to search. Defaults to the notebook-level ``df``.

    Returns
    -------
    The index label of the first matching row.

    Raises
    ------
    IndexError
        If no row has ``i`` in its 'img' column.
    """
    if frame is None:
        frame = df  # fall back to the notebook's global DataFrame
    matches = frame[frame['img'] == i].index.tolist()
    if not matches:
        # Same exception type as the bare [0] lookup, but names the offending file.
        raise IndexError("image %r not found in the 'img' column" % (i,))
    return matches[0]
In [33]:
# Resolve every wrong image name to the index of its row in df.
wrong_list_index = list(map(get_index, wrong_list))
Remove the corresponding rows and save the modified CSV file
In [34]:
# get_index() returns index *labels*, so drop by label directly. Wrapping them in
# df.index[...] would reinterpret the labels as positions, which only lines up on
# a fresh 0..n-1 index and would target different rows once df has shrunk.
# Dropping by label makes a second run of this cell fail loudly (KeyError) instead.
df = df.drop(wrong_list_index)
In [35]:
# Row count after removing the wrong images.
len(df)
Out[35]:
In [36]:
# Sanity check: each wrong image must have removed exactly one row.
# Written without call-style parentheses so the message cannot end up inside a
# (always-true) tuple.
assert df.shape[0] + len(wrong_list) == old_rows_count, (
    "expected %d rows after removing %d wrong images from %d, got %d"
    % (old_rows_count - len(wrong_list), len(wrong_list), old_rows_count, df.shape[0])
)
In [37]:
# Overwrite the CSV that was loaded above with the reduced table;
# index=False keeps the DataFrame index out of the file.
df.to_csv(csv_file_path, index=False)
Check that the file was saved correctly
In [38]:
# Reload from disk to confirm the file now holds the reduced table.
df = pd.read_csv(csv_file_path, header=0)
print("%d rows" % len(df))
df.head(3)
Out[38]:
In [ ]:
Move the remaining image files to the batch's output directory, to indicate that we have reviewed the images and removed the "wrong" ones.
In [39]:
# Create the per-batch output directory (including missing parents) if it is not there yet.
if not os.path.exists(img_output_dir_path):
    os.makedirs(img_output_dir_path)
In [40]:
# Move every image still listed in the CSV from the raw directory into the
# per-batch output directory.
for f in df['img']:
    old_path = os.path.join(img_dir_path, f)
    new_path = os.path.join(img_output_dir_path, f)
    # Already moved by an earlier (possibly interrupted) run of this cell: skip it
    # so the cell can be re-run. A file missing from both places still raises.
    if not os.path.exists(old_path) and os.path.exists(new_path):
        continue
    os.rename(old_path, new_path)
Delete the wrong images from the "wrong" directory
In [41]:
# Remove this batch's rejected images from the "wrong" directory.
for wrong_name in wrong_list:
    os.remove(os.path.join(img_wrong_dir_path, wrong_name))
In [ ]:
In [ ]:
In [ ]: