Wednesday, April 6, 2022

[FIXED] Use Tesseract OCR to extract text from a scanned pdf folders

April 06, 2022 pdf, python, python-tesseract, tesseract, text No comments

Issue

I have code that extracts text from scanned (or normal) PDF files using Tesseract OCR. However, I want it to convert a whole folder of PDFs rather than a single PDF file, and to store the extracted text files in a folder of my choosing.

See my code below:

filePath = '/Users/CodingStark/scanned/scanned-file.pdf'
# Render every page of the PDF to an image (second argument: 500 DPI).
pages = convert_from_path(filePath, 500)

# Save each page in the current directory as page_1.jpg, page_2.jpg, ...
for image_counter, page in enumerate(pages, start=1):
    filename = "page_" + str(image_counter) + ".jpg"
    page.save(filename, 'JPEG')

# Total number of page images written above.
filelimit = len(pages)

# Text file that receives the OCR output (opened in append mode).
outfile = "scanned-file.txt"

# The context manager closes the output file even if OCR fails part-way.
with open(outfile, "a") as f:
    # Iterate from 1 to the total number of pages.
    for i in range(1, filelimit + 1):
        filename = "page_" + str(i) + ".jpg"

        # Recognize the text in the image as a string using pytesseract;
        # the image handle is released as soon as the page has been read.
        with Image.open(filename) as page_image:
            text = str(pytesseract.image_to_string(page_image))

        # Re-join words that were hyphenated across a line break.
        text = text.replace('-\n', '')

        f.write(text)

I want to automate my code so that it converts all the PDF files in the scanned folder and saves the extracted text files in a folder of my choosing. Also, is there any way to delete all the JPG files after the code runs, since they take up a lot of disk space? Thank you so much!!

Updated with Answer

def tesseractOCR_pdf(pdf):
    """Return the OCR text of every page of a PDF file.

    Each page is rendered at 500 DPI, saved as a JPEG and read back through
    pytesseract. The page texts are concatenated in page order and every
    '-' that is immediately followed by a newline is removed together with
    that newline, so words hyphenated across lines are re-joined.

    Args:
        pdf: Path of the PDF file to convert.

    Returns:
        The recognized text of the whole document as a single string.
    """
    # Local import: keeps the function self-contained without touching the
    # module's import block.
    import tempfile

    pages = convert_from_path(pdf, 500)

    text = ""
    # The page images live in a private temporary directory. This avoids
    # overwriting or deleting unrelated *.jpg files in the current working
    # directory, and the images are removed even when OCR raises.
    with tempfile.TemporaryDirectory() as workDir:
        for pageNumber, page in enumerate(pages, start=1):
            # PDF page n -> <workDir>/page_n.jpg
            filename = os.path.join(workDir, "page_" + str(pageNumber) + ".jpg")
            page.save(filename, 'JPEG')

            # Recognize the text in the image as a string using pytesseract;
            # the image handle is closed before the directory is cleaned up.
            with Image.open(filename) as pageImage:
                text += str(pytesseract.image_to_string(pageImage))

            # Applied to the accumulated text so that a hyphenated word
            # split across a page boundary is re-joined as well.
            text = text.replace('-\n', '')

    return text

def tesseractOCR_img(img):
    """Return the OCR text of a single image.

    The image is handed to pytesseract with the English language model and
    the '--psm 6' option; every '-' directly followed by a newline is then
    removed together with that newline.

    Args:
        img: The image to read, passed straight through to
            pytesseract.image_to_string (callers in this file pass a path).

    Returns:
        The recognized text as a string.
    """
    recognized = pytesseract.image_to_string(img, lang='eng', config='--psm 6')
    return str(recognized).replace('-\n', '')

def Tesseract_ALL(docDir, txtDir):
    """OCR every document in docDir and write the text files into txtDir.

    Files whose extension is exactly "pdf" are processed with
    tesseractOCR_pdf; every other entry is handed to tesseractOCR_img.
    The text of "<name>.<ext>" is first written to "<name>.<ext>.txt".
    Afterwards every file in txtDir whose second-to-last extension is pdf,
    tif, tiff or jpg has that extension dropped ("scan.pdf.txt" ->
    "scan.txt").

    Args:
        docDir: Directory containing the documents; "" means the current
            working directory. A trailing path separator is optional.
        txtDir: Existing directory that receives the text files. A trailing
            path separator is optional.

    A document that fails to convert is reported on stdout and skipped, so
    one bad file does not abort the whole batch.
    """
    if docDir == "":
        docDir = os.getcwd()  # if no docDir passed in

    for doc in os.listdir(docDir):  # iterate through docs in doc directory
        try:
            fileExtension = doc.split(".")[-1]
            # os.path.join inserts the separator only when it is missing, so
            # both "pdf_folder" and "pdf_folder/" work.
            docFilename = os.path.join(docDir, doc)

            if fileExtension == "pdf":
                text = tesseractOCR_pdf(docFilename)  # text content of pdf
            else:
                # Anything that is not a PDF is treated as an image.
                text = tesseractOCR_img(docFilename)  # text content of img

            textFilename = os.path.join(txtDir, doc + ".txt")
            # The context manager flushes and closes the file.
            with open(textFilename, "w") as textFile:
                textFile.write(text)
        except Exception as err:
            # Best effort per document, but say why it failed. Unlike a bare
            # "except:", this lets KeyboardInterrupt/SystemExit propagate.
            print("Error in file: " + str(doc) + " (" + str(err) + ")")

    sourceExtensions = ("pdf", "tif", "tiff", "jpg")
    for filename in os.listdir(txtDir):
        # "scan.pdf.txt" -> ["scan", "pdf", "txt"]. Names with fewer than two
        # dots have no inner extension to strip and are left untouched.
        parts = filename.rsplit(".", 2)
        if len(parts) == 3 and parts[1] in sourceExtensions:
            # Drop only the inner extension, not every occurrence of it
            # elsewhere in the name.
            os.rename(
                os.path.join(txtDir, filename),
                os.path.join(txtDir, parts[0] + "." + parts[2]),
            )

# Below is the code that runs the functions.
# Tell Tesseract_ALL where the documents are located and where the .txt
# files should be written. os.path.join(name, "") appends the platform's
# path separator, so the directories stay valid even when a file name is
# attached to them by plain string concatenation.
docDir = os.path.join("pdf_folder", "")
txtDir = os.path.join("text_folder", "")

Tesseract_ALL(docDir, txtDir)

Solution

Here is a loop that iterates over the PDF files in a directory:

import glob,os
import os, subprocess

# Resolve the directory to an absolute path before changing into it;
# otherwise the pattern below would be looked up relative to the new
# working directory (i.e. in "dir/dir").
pdf_dir = os.path.abspath("dir")
os.chdir(pdf_dir)
# The character classes match the extension in any letter case
# (.pdf, .PDF, .Pdf, ...), also on case-sensitive filesystems.
for pdf_file in glob.glob(os.path.join(pdf_dir, "*.[pP][dD][fF]")):
    # Put here what you want to do for each PDF file,
    # e.g. text = tesseractOCR_pdf(pdf_file)
    pass

Answered By - Mustafa Azzurri

This answer, collected from Stack Overflow and tested by PythonFixing community admins, is licensed under CC BY-SA 2.5, CC BY-SA 3.0 and CC BY-SA 4.0.

Wednesday, April 6, 2022

[FIXED] Use Tesseract OCR to extract text from a scanned pdf folders

Issue

Updated with Answer

Solution

0 comments:

Post a Comment

Popular Posts

Labels