|
|
@@ -0,0 +1,216 @@ |
|
|
#=======================================================================# |
|
|
# extract_data.py # |
|
|
#=======================================================================# |
|
|
# usage: extract_data.py [-h] [-i INPUT_DIR] [-o OUTPUT_DIR] |
|
|
# |
|
|
# This program extracts provision numbers from a set of documents. |
|
|
# |
|
|
# optional arguments: |
|
|
# -h, --help show this help message and exit |
|
|
# -i INPUT_DIR, --input_dir INPUT_DIR |
|
|
# Input directory for the files to be modified |
|
|
# -o OUTPUT_DIR, --output_dir OUTPUT_DIR |
|
|
# Output directory for the files to be modified |
|
|
#=======================================================================# |
|
|
|
|
|
#=======================================================================# |
|
|
# Sample usage: # |
|
|
#=======================================================================# |
|
|
# python extract_data.py --input_dir ocr/data/ --output_dir ocr/results/ |
|
|
#=======================================================================# |
|
|
|
|
|
|
|
|
import numpy as np |
|
|
import os |
|
|
import cv2 |
|
|
import glob |
|
|
import shutil |
|
|
import pytesseract |
|
|
import re |
|
|
import time |
|
|
import argparse |
|
|
from statistics import mode |
|
|
|
|
|
# A provision number is the letter "P" followed by 17 digits.
regex = r"P\d{17}"

# Bookkeeping shared with the script section at the bottom of the file:
#   found   - document name -> every provision number matched for it
#   results - document name -> the provision number finally chosen
#   queue   - image paths still waiting to be processed
#   done    - document names for which a provision number was chosen
#   missing - document names for which no filter produced a match
found, results = {}, {}
queue, done, missing = [], [], []

# Region of the page that is cropped before OCR, in pixels:
# [start_x, start_y, end_x, end_y]
pnr_area = [150, 450, 1600, 1150]
|
|
|
|
|
|
|
|
# =============================================================================== # |
|
|
# To-do list # |
|
|
# =============================================================================== # |
|
|
# 0. Provision Number # |
|
|
# =============================================================================== # |
|
|
|
|
|
|
|
|
# =============================================================================== # |
|
|
# Threshold Methods # |
|
|
# =============================================================================== # |
|
|
# 1. Binary-Otsu w/ Gaussian Blur (kernel size = 9) # |
|
|
# 2. Binary-Otsu w/ Gaussian Blur (kernel size = 7) # |
|
|
# 3. Binary-Otsu w/ Gaussian Blur (kernel size = 5) # |
|
|
# 4. Binary-Otsu w/ Median Blur (kernel size = 5) # |
|
|
# 5. Binary-Otsu w/ Median Blur (kernel size = 3) # |
|
|
# 6. Adaptive Gaussian Threshold (31,2) w/ Gaussian Blur (kernel size = 5) # |
|
|
# 7. Adaptive Gaussian Threshold (31,2) w/ Median Blur (kernel size = 3) # |
|
|
# =============================================================================== # |
|
|
|
|
|
def apply_threshold(img, argument):
    """Blur and binarise ``img`` with the filter selected by ``argument``.

    Methods:
        1-3: Gaussian blur with a 9x9 / 7x7 / 5x5 kernel, then Binary+Otsu.
        4-5: Median blur with kernel size 5 / 3, then Binary+Otsu.
        6:   Gaussian blur 5x5, then adaptive Gaussian threshold (31, 2).
        7:   Median blur with kernel size 3, then adaptive Gaussian
             threshold (31, 2).

    Args:
        img: Image passed straight to the OpenCV blur functions.
        argument: Method number, 1 through 7.

    Returns:
        The thresholded image, or the string ``"Invalid method"`` when
        ``argument`` is not a known method number.
    """

    def otsu(blurred):
        # cv2.threshold returns (threshold_value, image); only the image is kept
        return cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    def adaptive(blurred):
        return cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 31, 2)

    # The values are callables so that only the requested pipeline runs;
    # storing the results directly would compute all seven on every call.
    switcher = {
        1: lambda: otsu(cv2.GaussianBlur(img, (9, 9), 0)),
        2: lambda: otsu(cv2.GaussianBlur(img, (7, 7), 0)),
        3: lambda: otsu(cv2.GaussianBlur(img, (5, 5), 0)),
        4: lambda: otsu(cv2.medianBlur(img, 5)),
        5: lambda: otsu(cv2.medianBlur(img, 3)),
        6: lambda: adaptive(cv2.GaussianBlur(img, (5, 5), 0)),
        7: lambda: adaptive(cv2.medianBlur(img, 3)),
    }

    selected = switcher.get(argument)
    if selected is None:
        return "Invalid method"
    return selected()
|
|
|
|
|
|
|
|
def crop_image(img, start_x, start_y, end_x, end_y):
    """Return the part of ``img`` inside the given pixel rectangle.

    The first axis of ``img`` is indexed with ``start_y:end_y`` and the
    second with ``start_x:end_x``.
    """
    rows = slice(start_y, end_y)
    cols = slice(start_x, end_x)
    return img[rows, cols]
|
|
|
|
|
|
|
|
def get_string(img_path, method):
    """Run OCR on the provision-number region of one image.

    The image is cropped to ``pnr_area``, converted to grayscale,
    thresholded with ``apply_threshold(img, method)``, written to
    ``<output_dir>/<name>/<name>_filter_<method>.jpg`` and then handed to
    Tesseract.  ``<name>`` is the basename of ``img_path`` up to its first
    dot, reduced to its first whitespace-separated token.

    Reads the module-level names ``output_dir`` and ``pnr_area``.

    Args:
        img_path: Path of the image file to read.
        method: Threshold method number forwarded to ``apply_threshold``.

    Returns:
        The text returned by ``pytesseract.image_to_string``.

    Raises:
        OSError: If OpenCV cannot read ``img_path``.
        ValueError: If ``method`` is not accepted by ``apply_threshold``.
    """
    # Read image using opencv; imread reports failure by returning None
    img = cv2.imread(img_path)
    if img is None:
        raise OSError("Could not read image: {}".format(img_path))

    stem = os.path.basename(img_path).split('.')[0]
    tokens = stem.split()
    # Fall back to the raw stem so an empty or blank stem does not raise IndexError
    file_name = tokens[0] if tokens else stem

    output_path = os.path.join(output_dir, file_name)
    os.makedirs(output_path, exist_ok=True)

    # Crop the area where the provision number is most likely present
    img = crop_image(img, pnr_area[0], pnr_area[1], pnr_area[2], pnr_area[3])

    # Convert to gray
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Apply dilation and erosion to remove some noise
    # NOTE(review): with a 1x1 kernel both operations look like identity
    # transforms - confirm whether a larger kernel was intended.
    kernel = np.ones((1, 1), np.uint8)
    img = cv2.dilate(img, kernel, iterations=1)
    img = cv2.erode(img, kernel, iterations=1)

    # Apply threshold to get image with only black and white
    img = apply_threshold(img, method)
    if isinstance(img, str):
        # apply_threshold signals an unknown method with a string sentinel
        raise ValueError("Unknown threshold method: {!r}".format(method))

    save_path = os.path.join(output_path, file_name + "_filter_" + str(method) + ".jpg")
    cv2.imwrite(save_path, img)

    # Recognize text with tesseract for python
    return pytesseract.image_to_string(img, lang="eng")
|
|
|
|
|
|
|
|
def find_match(regex, text):
    """Report every match of ``regex`` in ``text`` and return the last one.

    Each match is printed with its 1-based ordinal and its start/end
    offsets.  The pattern is applied with ``re.MULTILINE``.

    Args:
        regex: Regular expression pattern to search for.
        text: Text to search in.

    Returns:
        The text of the last match, or ``""`` when nothing matches.
    """
    target = ""
    for match_num, match in enumerate(re.finditer(regex, text, re.MULTILINE), start=1):
        print(" Match {matchNum} was found at {start}-{end}: {match}".format(
            matchNum=match_num, start=match.start(), end=match.end(), match=match.group()))
        target = match.group()

    return target
|
|
|
|
|
|
|
|
def pretty_print(result_dict):
    """Format a mapping as one ``# key: value`` line per entry.

    Args:
        result_dict: Mapping to render; entries appear in iteration order.

    Returns:
        The concatenated lines, each terminated by a newline, or ``""``
        for an empty mapping.
    """
    # join builds the string in one pass; format also accepts non-string values
    return ''.join('# {}: {}\n'.format(key, value) for key, value in result_dict.items())
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: run every threshold method on every image in the
    # input directory, keep the provision number seen most often per
    # document, and print a per-document report followed by a summary.
    parser = argparse.ArgumentParser(description="This program extracts provision numbers from a set of documents.")
    parser.add_argument("-i", "--input_dir", help="Input directory for the files to be modified")
    parser.add_argument("-o", "--output_dir", help="Output directory for the files to be modified")
    args = parser.parse_args()

    # Both options are declared optional, but nothing below works without them
    if not args.input_dir or not args.output_dir:
        parser.error("both --input_dir and --output_dir must be given")

    input_dir = args.input_dir
    output_dir = args.output_dir  # module-level name: get_string() reads it

    # The output directory is deleted and recreated below, so refuse to run
    # when that would also delete the input images.
    real_in = os.path.realpath(input_dir)
    real_out = os.path.realpath(output_dir)
    if real_in == real_out or real_in.startswith(real_out + os.sep):
        parser.error("--input_dir must not be the same as, or inside, --output_dir "
                     "because the output directory is deleted and recreated")

    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    # Sort once so the printed queue and the processing order agree
    im_names = sorted(glob.glob(os.path.join(input_dir, '*.png')) +
                      glob.glob(os.path.join(input_dir, '*.jpg')) +
                      glob.glob(os.path.join(input_dir, '*.jpeg')))

    overall_start_t = time.time()
    queue.extend(im_names)

    print("The following files will be processed and their provision numbers will be extracted: {}\n".format(queue))

    for im_name in im_names:
        start_time = time.time()
        print("*** The documents that are in the queue *** \n{}\n".format(queue))

        print('#=======================================================')
        print(('# Regex is being applied on {:s}'.format(im_name)))
        print('#=======================================================')
        queue.remove(im_name)

        # Derive the document name the same way get_string() names its
        # per-document directory: basename up to the first dot, first token.
        stem = os.path.basename(im_name).split('.')[0]
        tokens = stem.split()
        file_name = tokens[0] if tokens else stem

        for i in range(1, 8):
            print("> The filter method " + str(i) + " is now being applied.")
            result = get_string(im_name, i)
            match = find_match(regex, result)
            if match:
                found.setdefault(file_name, []).append(match)

            # Keep the raw OCR text of every filter next to its image
            txt_path = os.path.join(output_dir, file_name, file_name + "_filter_" + str(i) + ".txt")
            with open(txt_path, 'w', encoding='utf-8') as f:
                f.write(result)

        pnr = ''
        if file_name in found:
            candidates = found[file_name]
            # Most frequent candidate; on a tie the earliest one wins.
            # statistics.mode() raises on ties before Python 3.8.
            pnr = max(candidates, key=candidates.count)
            results[file_name] = pnr
            done.append(file_name)
        else:
            missing.append(file_name)
        end_time = time.time()

        print('#=======================================================\n'
              '# Results for: ' + file_name + '\n'
              '#=======================================================\n'
              '# The provision number: ' + pnr + '\n'
              '# It took ' + str(end_time-start_time) + ' seconds. \n'
              '#=======================================================\n')

    overall_end_t = time.time()

    print('#=======================================================\n'
          '# Summary \n'
          '#=======================================================\n'
          '# The documents that are successfully processed are: \n' + pretty_print(results) +
          '#=======================================================\n'
          '# The program failed to extract information from: \n'
          '# ' + str(missing) + '\n'
          '#=======================================================\n'
          '# It took ' + str(overall_end_t-overall_start_t) + ' seconds.\n'
          '#=======================================================\n')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|