Issue
I am trying to remove the horizontal and vertical lines from an image. The image is generated from a PDF using the pdf2jpg library. Once the lines are removed, the image will be fed to pytesseract to extract words and their individual coordinates; here I am just extracting the full text for testing purposes. I am new to OpenCV and wrote this code by combining snippets from different websites, including Stack Overflow. The code works almost perfectly, except that there are occasional remnants of vertical lines. These remnants confuse Tesseract and are sometimes read as I, 1 or |. The number of misreads by Tesseract (for example, s read as 5, or I read as 1 or |, and vice versa) also seems to be higher for the processed image than for the original image. I think the reason is that the font is less sharp than in the original image we started with. What changes can be made to this code to remove those remnants of vertical lines without affecting the font sharpness? Any suggestions or guidance in the right direction will be greatly appreciated. Thanks in advance.
from importlib import invalidate_caches
from pytesseract import image_to_string
#from pdf2image import convert_from_path
from pdf2jpg.pdf2jpg import convert_pdf2jpg
from PIL import Image
import sys
import cv2
import numpy
def pre_process(image, show=True):
    """Remove horizontal and vertical ruling lines from a page image.

    The page is binarised with Otsu's method, long vertical and horizontal
    runs are isolated with morphological opening, and every pixel belonging
    to such a run is painted white in the *grayscale* page. Text pixels are
    taken unchanged from the grayscale input, so glyph edges are not
    re-thresholded.

    Args:
        image: Either a path (``str``) that is loaded as grayscale, or an
            array-like image. A 3- or 4-channel array is converted to
            grayscale assuming OpenCV's BGR(A) channel order.
        show: When true (the default, matching the previous behaviour) the
            result is displayed in a window named "final" and the call
            blocks until a key is pressed.

    Returns:
        The 8-bit grayscale image with the detected line pixels set to 255.

    Raises:
        OSError: If ``image`` is a path that OpenCV cannot read.
        ValueError: If the input is not a non-empty 8-bit 2-D image (after
            the optional colour conversion).
    """
    if isinstance(image, str):
        path = image
        image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            # cv2.imread reports failure by returning None, not by raising.
            raise OSError("could not read image file: {!r}".format(path))
    else:
        image = numpy.asarray(image)
        if image.ndim == 3 and image.shape[2] == 3:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        elif image.ndim == 3 and image.shape[2] == 4:
            image = cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)

    # Otsu thresholding below only accepts single-channel 8-bit input, so
    # reject anything else here with a readable message.
    if image.ndim != 2 or image.size == 0 or image.dtype != numpy.uint8:
        raise ValueError(
            "pre_process expects a non-empty 8-bit grayscale image, got "
            "shape {} and dtype {}".format(image.shape, image.dtype)
        )

    # True black/white page; with THRESH_OTSU the 128 is ignored and the
    # threshold is chosen automatically.
    _, page_bw = cv2.threshold(
        image, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
    )
    # Ink becomes white (255) so that erode/dilate act on the ink.
    ink = cv2.bitwise_not(page_bw)

    # Structuring elements 1/100 of the page width long; never shorter than
    # one pixel so that narrow images do not produce an empty kernel.
    kernel_len = max(1, image.shape[1] // 100)
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_len))
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_len, 1))

    # Opening (erode then dilate) keeps only runs at least as long as the
    # kernel in the given direction, i.e. the ruling lines.
    vertical_lines = cv2.dilate(
        cv2.erode(ink, vertical_kernel, iterations=3),
        vertical_kernel,
        iterations=3,
    )
    horizontal_lines = cv2.dilate(
        cv2.erode(ink, horizontal_kernel, iterations=3),
        horizontal_kernel,
        iterations=3,
    )

    # Union of both masks. OR keeps the mask strictly binary, so no second
    # threshold is needed to separate "one line" from "crossing" pixels.
    line_mask = cv2.bitwise_or(vertical_lines, horizontal_lines)

    # Grow the mask by one pixel on every side to swallow the anti-aliased
    # fringe of each line. An odd-sized kernel is used on purpose: an
    # even-sized one has no centre pixel, so the growth is lopsided and one
    # edge of each line can survive.
    grow_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    line_mask = cv2.dilate(line_mask, grow_kernel, iterations=1)

    # OR-ing with 255 forces line pixels to white; OR-ing with 0 leaves the
    # original grayscale value untouched everywhere else.
    cleaned = cv2.bitwise_or(image, line_mask)

    if show:
        cv2.imshow("final", cleaned)
        cv2.waitKey(0)
    return cleaned
if __name__ == "__main__":
    # Usage: python read_pdf_ocr.py path_to_pdf_file
    if len(sys.argv) < 2:
        sys.exit("usage: python {} path_to_pdf_file".format(sys.argv[0]))
    pdf_path = sys.argv[1]

    # Render every page of the PDF to a JPEG under the "temp" directory.
    images = convert_pdf2jpg(pdf_path, "temp", dpi=100, pages="ALL")

    # OCR each rendered page after its ruling lines have been removed.
    page_texts = []
    for image_path in images[0]["output_jpgfiles"]:
        image = pre_process(image_path)
        page_texts.append(image_to_string(image))

    # Each page's text is preceded by a newline, which is the layout the
    # previous incremental "\n".join((result, text)) produced.
    result = "".join("\n" + text for text in page_texts)

    # Explicit UTF-8: OCR output may contain characters that the platform
    # default encoding cannot represent.
    with open("text.txt", "w", encoding="utf-8") as out:
        out.write(result)
Please find the attached pdf, which I am using as my input pdf for the code and a snip for processed image for reference. The code can be triggered from command prompt using
python .\read_pdf_ocr.py path_to_pdf_file
Environment details:
- Python: 3.7.9
- Libraries:
- opencv-python: 4.4.0.46
- pdf2jpg: 1.0
- pytesseract: 0.3.6
- Tesseract-OCR - open source OCR engine: v5.0.0-alpha.20200328
Solution
You can use a line detector to detect the lines in the given image.
After you convert the image using convert_pdf2jpg, find the edges of the image. You can use Canny.
import cv2
import pytesseract
# Load the page image in colour (OpenCV's default BGR order).
img = cv2.imread("ex.png")
if img is None:
    # cv2.imread returns None instead of raising when the file is missing
    # or cannot be decoded; fail here rather than inside cvtColor.
    raise OSError("could not read image file: 'ex.png'")
# Canny works on a single-channel image, so convert to grayscale first.
img_gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Edge map with hysteresis thresholds 50 (low) and 200 (high).
img_cny = cv2.Canny(img_gry, 50, 200)
Part of the canny image:
Now we can use the line detector to find the coordinates of the lines in the image.
# Detect straight segments in the edge map. cv2.ximgproc is part of the
# opencv-contrib build.
lns = cv2.ximgproc.createFastLineDetector().detect(img_cny)
# Draw on a copy so the original image stays clean for the removal step.
img_cpy = img.copy()
# detect() can hand back None when nothing is found; treat that as "no
# segments" instead of letting the loop raise a TypeError.
for ln in ([] if lns is None else lns):
    # Each entry wraps one (x1, y1, x2, y2) segment; the values are
    # truncated to int for drawing and printing.
    x1, y1, x2, y2 = (int(v) for v in ln[0])
    cv2.line(img_cpy, pt1=(x1, y1), pt2=(x2, y2),
             color=(0, 255, 0), thickness=5)
    print("Coords: ({}, {})->({}, {})".format(x1, y1, x2, y2))
When we run the code:
Output:
Coords: (8, 6)->(586, 6)
So the width of the table is about 580 pixels (586 - 6).
Output:
Coords: (589, 28)->(589, 6)
So the distance between two consecutive lines is nearly 22 pixels (28 - 6), and there are 37 lines.
How about we draw each line same as the background color?
We know the distance between two consecutive line, start and end of the horizontal lines.
# Start at the top border. Without this, y1 would still hold whatever the
# last detected segment left in it.
y1 = 6
# Paint 37 horizontal rules white, stepping 20 pixels down each time.
for _ in range(0, 37):
    cv2.line(img, pt1=(6, y1), pt2=(590, y1),
             color=(255, 255, 255), thickness=5)
    print("Coords: ({}, {})->({}, {})".format(6, y1, 590, y1))
    y1 += 20
Result: Horizontal lines removed
Sample:
If you look at the last sentence of the output:
Coords: (6, 726)->(590, 726)
So the ending coordinate is 726. To remove the vertical lines, we need to find the starting coordinates, since we already know the ending coordinate.
Output:
Coords: (8, 6)->(586, 6)
Coords: (589, 28)->(589, 6)
Coords: (69, 8)->(69, 24)
Coords: (337, 8)->(337, 24)
The first coordinates give the starting points: 589, 69, 337 and 6.
The first vertical line coordinates are: (6, 6)->(6, 726)
The second vertical line coordinates are: (69, 6)->(69, 726)
The third vertical line coordinates are: (337, 6)->(337, 726)
The fourth vertical line coordinates are: (589, 6)->(589, 726)
# Paint each vertical rule white from the top border (y=6) down to the
# bottom border (y=726); one x coordinate per rule.
for column_x in (6, 72, 337, 589):
    cv2.line(img, pt1=(column_x, 6), pt2=(column_x, 726),
             color=(255, 255, 255), thickness=5)
Result: Vertical-lines-removed
Sample:
Now, when you read the text from the output image:
LINE 1
LINE 2
LINE 3
.
.
.
SOME RANDOM TEXT FOR LINE 1 CELL 1.
SOME RANDOM TEXT FOR LINE 2 CELL 1.
SOME RANDOM TEXT FOR LINE 3 CELL 1.
SOME RANDOM TEXT FOR LINE 4 CELL 1.
.
.
.
1AM OTHER TEXT FOR LINE 1 CELL 2
1AM OTHER TEXT FOR LINE 2 CELL 2
1AM OTHER TEXT FOR LINE 3 CELL 2
1AM OTHER TEXT FOR LINE 4 CELL 2
Code:
import cv2
import pytesseract
# Load the page image in colour (OpenCV's default BGR order).
img = cv2.imread("ex.png")
if img is None:
    # cv2.imread returns None instead of raising when the file is missing
    # or cannot be decoded; fail here rather than inside cvtColor.
    raise OSError("could not read image file: 'ex.png'")
img_gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img_cny = cv2.Canny(img_gry, 50, 200)

# Detection step, kept for reference. It was used once to read off the
# line coordinates that are hard-coded below.
#
# lns = cv2.ximgproc.createFastLineDetector().detect(img_cny)
#
# img_cpy = img.copy()
#
# for ln in lns:
#     x1 = int(ln[0][0])
#     y1 = int(ln[0][1])
#     x2 = int(ln[0][2])
#     y2 = int(ln[0][3])
#
#     cv2.line(img_cpy, pt1=(x1, y1), pt2=(x2, y2),
#              color=(0, 255, 0), thickness=5)
#
#     print("Coords: ({}, {})->({}, {})".format(x1, y1, x2, y2))
#
# cv2.imshow("img_cpy", img_cpy)
# cv2.waitKey(0)

# Horizontal rules: 37 of them, starting at y=6 and spaced 20 pixels apart,
# each painted white from x=6 to x=590.
for y1 in range(6, 6 + 37 * 20, 20):
    cv2.line(img, pt1=(6, y1), pt2=(590, y1),
             color=(255, 255, 255), thickness=5)
    print("Coords: ({}, {})->({}, {})".format(6, y1, 590, y1))

# Vertical rules: one x coordinate per rule, painted white from y=6 to y=726.
for x1 in (6, 72, 337, 589):
    cv2.line(img, pt1=(x1, 6), pt2=(x1, 726),
             color=(255, 255, 255), thickness=5)

cv2.imshow("lns", img)
cv2.waitKey(0)

# OCR the cleaned image.
txt = pytesseract.image_to_string(img)
print(txt)
Answered By - Ahx
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.