Sunday, March 27, 2022

[FIXED] How to remove horizontal and vertical lines without degrading the image quality in python

March 27, 2022 opencv, python, python-3.x, python-tesseract No comments

Issue

I am trying to remove horizontal and vertical lines from an image. The image is generated from a PDF using the pdf2jpg library. After the horizontal and vertical lines are removed, the image will be fed to pytesseract to extract words and their individual coordinates; here I am just extracting the full text for testing purposes. I am new to OpenCV, and I wrote this code by combining snippets from different websites, including Stack Overflow. The code works almost perfectly, except that there are occasional remnants of vertical lines. These remnants confuse Tesseract and are sometimes read as I, 1 or |. It also seems that the number of misreads by Tesseract (e.g. s read as 5, I read as 1 or |, and vice versa) is higher for the processed image than for the original image. I think the reason is that the fonts are less sharp than in the original image we started with. What changes can be made to this code to remove those remnants of vertical lines without affecting font sharpness? Any suggestions or guidance in the right direction will be greatly appreciated. Thanks in advance.

from importlib import invalidate_caches
from pytesseract import image_to_string
#from pdf2image import convert_from_path
from pdf2jpg.pdf2jpg import convert_pdf2jpg
from PIL import Image
import sys
import cv2
import numpy

def pre_process(image):
    """Remove horizontal and vertical ruling lines from a page image.

    Long straight runs are isolated by eroding and then dilating the
    inverted binary page with a 1-pixel-wide vertical kernel and a
    1-pixel-high horizontal kernel. The two results are merged into a mask,
    and the masked-out pixels are turned white in the grayscale page.

    Args:
        image: Path to an image file (loaded as grayscale), or an already
            loaded image that is used as-is.
            NOTE(review): a pre-loaded image is presumably expected to be
            single-channel already, since the BGR->gray conversion is
            disabled -- confirm with callers.

    Returns:
        The grayscale image with the detected lines painted white.

    Raises:
        OSError: If ``image`` is a path that OpenCV could not read.
    """
    if isinstance(image, str):
        path = image
        image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            # cv2.imread reports failure by returning None rather than
            # raising, which would otherwise surface as an opaque error
            # inside cv2.threshold below.
            raise OSError("Could not read image: {}".format(path))

    # Convert the image to true black and white from grayscale.
    _, image_bin = cv2.threshold(
        image, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # Invert so that ink (text and lines) becomes white on black.
    image_inv = 255 - image_bin

    # Kernels for vertical and horizontal lines; their length is 1/100 of
    # the image width.
    # NOTE(review): for images narrower than 100 px this evaluates to 0 --
    # confirm such inputs never occur.
    kernel_len = image.shape[1] // 100
    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, kernel_len))
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (kernel_len, 1))
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))

    # Remove anything that is not a vertical line.
    image_inv1 = cv2.erode(image_inv, vertical_kernel, iterations=3)
    vertical_lines = cv2.dilate(image_inv1, vertical_kernel, iterations=3)

    # Remove anything that is not a horizontal line.
    image_inv2 = cv2.erode(image_inv, horizontal_kernel, iterations=3)
    horizontal_lines = cv2.dilate(image_inv2, horizontal_kernel, iterations=3)

    # Blend both line images, invert so the lines are dark, thicken the dark
    # lines slightly with a 2x2 erosion, then binarise into a mask that is 0
    # on the lines and 255 elsewhere.
    image_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
    image_vh = cv2.erode(~image_vh, kernel, iterations=2)
    _, image_vh = cv2.threshold(
        image_vh, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # Work on an inverted copy of the original grayscale image so that
    # masked-out pixels (value 0) become white after the final inversion.
    org_img_inv = cv2.bitwise_not(image)
    # Apply the mask of all lines.
    final_image_inv = cv2.bitwise_and(org_img_inv, org_img_inv, mask=image_vh)
    # Invert again to get the clean image without lines.
    image = cv2.bitwise_not(final_image_inv)

    # Debug preview; waitKey(0) blocks until a key is pressed.
    cv2.imshow("final", image)
    cv2.waitKey(0)
    return image

if __name__ == "__main__":
    # Command-line entry point: OCR every page of the PDF given as the first
    # argument and write the combined text to text.txt.
    if len(sys.argv) < 2:
        # Fail with a usage hint instead of a bare IndexError.
        sys.exit("usage: python {} path_to_pdf_file".format(sys.argv[0]))
    pdf_path = sys.argv[1]

    # Render each page of the PDF to a JPG file under the "temp" directory.
    images = convert_pdf2jpg(pdf_path, "temp", dpi=100, pages="ALL")

    # Strip the table lines from each page image, then OCR it.
    page_texts = [
        image_to_string(pre_process(image_path))
        for image_path in images[0]["output_jpgfiles"]
    ]
    # Every page's text is prefixed with a newline so pages stay separated.
    result = "".join("\n" + text for text in page_texts)

    # An explicit encoding keeps non-ASCII OCR output from failing on
    # platforms whose default text encoding cannot represent it.
    with open("text.txt", "w", encoding="utf-8") as out:
        out.write(result)

Please find attached the PDF that I am using as input for the code, and a snip of the processed image for reference. The code can be run from the command prompt using

python .\read_pdf_ocr.py path_to_pdf_file

Environment details:

Python: 3.7.9
Libraries:
- opencv-python: 4.4.0.46
- pdf2jpg: 1.0
- pytesseract: 0.3.6
Tesseract-OCR - open source OCR engine: v5.0.0-alpha.20200328

Snip of processed image

Test PDF with table

Solution

You can use line-detector to detect the lines in the given image.

After you convert the image using convert_pdf2jpg

Find the edges of the image. You can use Canny.

import cv2
import pytesseract

# Load the page image from disk.
img = cv2.imread("ex.png")
# Convert to grayscale before edge detection.
img_gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Canny edge map with hysteresis thresholds 50 and 200.
img_cny = cv2.Canny(img_gry, 50, 200)

Canny-applied-image

Part of the canny image:

Now we can use the line-detector to find the coordinates of the lines in the image.

# Detect straight line segments on the Canny edge map.
# NOTE(review): cv2.ximgproc is part of the opencv-contrib-python build, not
# plain opencv-python -- confirm which package is installed.
lns = cv2.ximgproc.createFastLineDetector().detect(img_cny)

# Draw on a copy so the loaded image itself is left unmodified.
img_cpy = img.copy()

for ln in lns:
    # Each detection is read as ln[0] = (x1, y1, x2, y2); cast to int so the
    # values can be used as pixel coordinates.
    x1 = int(ln[0][0])
    y1 = int(ln[0][1])
    x2 = int(ln[0][2])
    y2 = int(ln[0][3])

    # Overlay the segment in green so the detections can be checked visually.
    cv2.line(img_cpy, pt1=(x1, y1), pt2=(x2, y2),
             color=(0, 255, 0), thickness=5)

    # Print the end points; these are the values used to hard-code the
    # line positions later on.
    print("Coords: ({}, {})->({}, {})".format(x1, y1, x2, y2))

When we run the code:

Output:

Coords: (8, 6)->(586, 6)

So the width of the table is 580 pixels (586 - 6).

Output:

Coords: (589, 28)->(589, 6)

So, the distance between two consecutive lines is roughly 22 pixels (28 - 6); the loop below uses a step of 20,

and there are 37 lines.

How about we redraw each line in the same color as the background?

We know the distance between two consecutive lines, and the start and end of the horizontal lines.

# Paint over the 37 horizontal rules with white (the background colour).
# y1 must start at the first rule (y = 6); without this reset it would still
# hold the value left over from the detection loop.
y1 = 6

for _ in range(0, 37):
    # Each rule spans x = 6 .. 590; thickness 5 covers the whole stroke.
    cv2.line(img, pt1=(6, y1), pt2=(590, y1),
             color=(255, 255, 255), thickness=5)
    print("Coords: ({}, {})->({}, {})".format(6, y1, 590, y1))
    # Advance to the next rule, 20 px further down.
    y1 += 20

Result: Horizontal lines removed

Sample:

If you look at the last line of the output:

Coords: (6, 726)->(590, 726)

So the ending coordinate is 726. To remove the vertical lines we need to find their starting coordinates, since we already know the ending coordinate.

Output:

Coords: (8, 6)->(586, 6)
Coords: (589, 28)->(589, 6)
Coords: (69, 8)->(69, 24)
Coords: (337, 8)->(337, 24)

The first coordinates give the starting points: 589, 69, 337 and 6.

The first vertical line coordinates are: (6, 6)->(6, 726). The second vertical line coordinates are: (69, 6)->(69, 726). The third vertical line coordinates are: (337, 6)->(337, 726). The fourth vertical line coordinates are: (589, 6)->(589, 726).

# Paint over the four vertical rules in white, each from y = 6 down to y = 726.
# First rule, x = 6.
cv2.line(img, pt1=(6, 6), pt2=(6, 726),
         color=(255, 255, 255), thickness=5)

# Second rule.
# NOTE(review): the text above lists x = 69 for this line but x = 72 is drawn
# here -- confirm which value is intended.
cv2.line(img, pt1=(72, 6), pt2=(72, 726),
         color=(255, 255, 255), thickness=5)

# Third rule, x = 337.
cv2.line(img, pt1=(337, 6), pt2=(337, 726),
         color=(255, 255, 255), thickness=5)

# Fourth rule, x = 589.
cv2.line(img, pt1=(589, 6), pt2=(589, 726),
         color=(255, 255, 255), thickness=5)

Result: Vertical-lines-removed

Sample:

Now, when you read the text from the output image:

LINE 1
LINE 2
LINE 3
.
.
.
SOME RANDOM TEXT FOR LINE 1 CELL 1.
SOME RANDOM TEXT FOR LINE 2 CELL 1.
SOME RANDOM TEXT FOR LINE 3 CELL 1.
SOME RANDOM TEXT FOR LINE 4 CELL 1.
.
.
.
1AM OTHER TEXT FOR LINE 1 CELL 2
1AM OTHER TEXT FOR LINE 2 CELL 2
1AM OTHER TEXT FOR LINE 3 CELL 2
1AM OTHER TEXT FOR LINE 4 CELL 2

Code:

import cv2
import pytesseract

# Load the page and build the Canny edge map that the (disabled) detection
# pass below works on.
img = cv2.imread("ex.png")
img_gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img_cny = cv2.Canny(img_gry, 50, 200)

# One-off detection pass that was used to read off the coordinates hard-coded
# further down. Uncomment to re-run it.
#
# lns = cv2.ximgproc.createFastLineDetector().detect(img_cny)
#
# img_cpy = img.copy()
#
# for ln in lns:
#     x1 = int(ln[0][0])
#     y1 = int(ln[0][1])
#     x2 = int(ln[0][2])
#     y2 = int(ln[0][3])
#
#     cv2.line(img_cpy, pt1=(x1, y1), pt2=(x2, y2),
#              color=(0, 255, 0), thickness=5)
#
#     print("Coords: ({}, {})->({}, {})".format(x1, y1, x2, y2))
#
#     cv2.imshow("img_cpy", img_cpy)
#     cv2.waitKey(0)

WHITE = (255, 255, 255)   # colour used to paint the lines over
LEFT, RIGHT = 6, 590      # horizontal extent of the horizontal rules
TOP, BOTTOM = 6, 726      # vertical extent of the vertical rules

# Horizontal rules: 37 of them, 20 px apart, the first one at y = TOP.
for row_y in range(TOP, TOP + 37 * 20, 20):
    cv2.line(img, pt1=(LEFT, row_y), pt2=(RIGHT, row_y),
             color=WHITE, thickness=5)
    print("Coords: ({}, {})->({}, {})".format(LEFT, row_y, RIGHT, row_y))

# Vertical rules: one stroke per column position.
for col_x in (6, 72, 337, 589):
    cv2.line(img, pt1=(col_x, TOP), pt2=(col_x, BOTTOM),
             color=WHITE, thickness=5)

# Show the cleaned image; waitKey(0) blocks until a key is pressed.
cv2.imshow("lns", img)
cv2.waitKey(0)

# OCR the cleaned image and print the recognised text.
txt = pytesseract.image_to_string(img)
print(txt)

Answered By - Ahx

This answer was collected from Stack Overflow and tested by the PythonFixing community admins; it is licensed under CC BY-SA 2.5, CC BY-SA 3.0 and CC BY-SA 4.0.

Sunday, March 27, 2022

[FIXED] How to remove horizontal and vertical lines without degrading the image quality in python

Issue

Solution

0 comments:

Post a Comment

Popular Posts

Labels