Quantcast
Channel: MobileRead Forums - Reading and Management
Viewing all articles
Browse latest Browse all 24044

Scanning OCR Errors

$
0
0
Errors produced by scanning text seem to follow a predictable pattern, such as seU for sell, iUness for illness, or bom for born, but nevertheless aren't corrected by the automatic scanning software. So I created a function for the calibre editor to fix those I most commonly found. You'll also find I've corrected some American spellings; depending upon your dictionary these won't actually be wrong. :)

The code is based on the Calibre example that tidies up hyphens.

You'll need to enter the following find expression: >.*?<

Here's the function. Because Python uses significant (intelligent, or not so) indentation, you may need to play with it some to get Python to swallow the code:

Code:

import regex
from calibre import replace_entities
from calibre import prepare_string_for_xml

def _swap_all(old, new):
    """Build a fixer that replaces every occurrence of *old* with *new*."""
    return lambda word: word.replace(old, new)


def _swap_first(old, new):
    """Build a fixer that replaces only the first occurrence of *old*."""
    return lambda word: word.replace(old, new, 1)


def _swap_second(old, new):
    """Build a fixer that replaces only the second occurrence of *old*.

    Occurrences are counted left to right without overlap, the same way
    ``str.replace`` counts them.  A word with fewer than two occurrences is
    returned unchanged.
    """
    def fix(word):
        first = word.find(old)
        if first < 0:
            return word
        second = word.find(old, first + len(old))
        if second < 0:
            return word
        return word[:second] + new + word[second + len(old):]
    return fix


def _swap_suffix(old, new):
    """Build a fixer that replaces *old* with *new* only at the end of a word.

    A word that does not end with *old* is returned unchanged, so no marker
    character ever has to be appended to the word to anchor the replacement.
    """
    def fix(word):
        if word.endswith(old):
            return word[:len(word) - len(old)] + new
        return word
    return fix


def _chain(*fixers):
    """Build a fixer that applies *fixers* one after another."""
    def fix(word):
        for fixer in fixers:
            word = fixer(word)
        return word
    return fix


# Candidate corrections tried, in this order, for a word the dictionary does
# not recognise.  The first candidate the dictionary accepts wins, so the
# order is significant.
_PRIMARY_FIXERS = (
    # American -> British spelling: "or" -> "our".
    _swap_all('or', 'our'),
    _swap_suffix('or', 'our'),
    _swap_suffix('ors', 'ours'),  # keeps the plural "s"
    # American -> British spelling: "er" -> "re", "nse" -> "nce".
    _swap_suffix('er', 're'),
    _swap_all('er', 're'),
    # "er" -> "re" can leave a doubled "e" behind (centered -> centreed).
    _chain(_swap_all('er', 're'), _swap_all('ree', 're')),
    _swap_suffix('ers', 'res'),
    _swap_suffix('nse', 'nce'),
    # Single "l" that should be doubled: all, first only, second only.
    _swap_all('l', 'll'),
    _swap_first('l', 'll'),
    _swap_second('l', 'll'),
    # Double "l" that should be single: all, first only, second only.
    _swap_all('ll', 'l'),
    _swap_first('ll', 'l'),
    _swap_second('ll', 'l'),
    # OCR misreads: "U" scanned in place of "li" / "ll".
    _swap_all('U', 'li'),
    _swap_all('U', 'll'),
    # OCR misreads: "h" / "H" scanned in place of "li".
    _swap_all('h', 'li'),
    _swap_all('H', 'li'),
    _swap_first('h', 'li'),
    _swap_first('H', 'li'),
    _swap_second('h', 'li'),
    _swap_second('H', 'li'),
    # OCR misreads involving "m", "n", "i" and "u" strokes.
    _swap_all('im', 'un'),
    _swap_all('imi', 'um'),
    _swap_all('m', 'rn'),
    _swap_all('m', 'in'),
    _swap_all('m', 'hi'),
    _swap_all('nm', 'run'),
    _swap_all('nmi', 'rum'),
    _swap_all('bn', 'lm'),
    _swap_all('ii', 'h'),
    _swap_all('ii', 'u'),
)

# Whole-word corrections, consulted only after every primary fixer failed.
_FIXED_WORDS = {
    'Fd': "I'd",
    'Fve': "I've",
    'Fm': "I'm",
    'Fll': "I'll",
    'youVe': "you've",
    'YouVe': "You've",
    "wren't": "weren't",
}

# Last-resort fixers for words that picked up ">" or "&" during scanning.
_SYMBOL_FIXERS = (
    _swap_all('>', 'y'),
    _swap_all('j&', 'fi'),
    _swap_all('i&', 'fi'),
    # Applied on top of the "i&" fix so a word containing both is covered.
    _chain(_swap_all('i&', 'fi'), _swap_all('l&', 'fi')),
)


def replace(match, number, file_name, metadata, dictionaries, data, functions, *args, **kwargs):
    """Correct common OCR misreads and American spellings in one text run.

    Intended for use with the find expression ``>.*?<``, so ``match`` covers
    the text between two tags including the surrounding ``>`` and ``<``.
    Each word the dictionary does not recognise is run through an ordered
    list of candidate corrections; the first candidate the dictionary accepts
    replaces the word.  Words with no accepted candidate are left untouched.

    Parameters:
        match: match object whose ``group()`` is ``>`` + text + ``<``.
        dictionaries: object providing ``recognized(word)``, used to decide
            whether a word or a candidate correction is valid.
        number, file_name, metadata, data, functions, *args, **kwargs:
            accepted for interface compatibility; not used here.

    Returns:
        The corrected text, XML-escaped again and wrapped in ``>`` and ``<``.
    """

    def replace_word(wmatch):
        word = wmatch.group(1)
        if dictionaries.recognized(word):
            return wmatch.group()

        # The pattern's leading \s* belongs to the match, so it has to be
        # re-emitted with a correction; otherwise the corrected word would be
        # glued to whatever precedes it (e.g. "Hello. seU" -> "Hello.sell").
        lead = wmatch.group()[:wmatch.start(1) - wmatch.start()]
        tail = wmatch.group(2)

        # Spellings already rejected.  Most fixers leave a given word
        # unchanged, and asking the dictionary about them again is wasted work.
        tried = {word}

        def first_recognized(fixers):
            for fixer in fixers:
                candidate = fixer(word)
                if candidate in tried:
                    continue
                tried.add(candidate)
                if dictionaries.recognized(candidate):
                    return candidate
            return None

        candidate = first_recognized(_PRIMARY_FIXERS)
        if candidate is not None:
            return lead + candidate + tail

        if word in _FIXED_WORDS:
            return lead + _FIXED_WORDS[word] + tail

        candidate = first_recognized(_SYMBOL_FIXERS)
        if candidate is not None:
            return lead + candidate + tail

        # Nothing helped (this includes a lone "&"): keep the text as it was.
        return wmatch.group()

    # Decode HTML entities such as &amp; so words are checked as plain text.
    text = replace_entities(match.group()[1:-1])
    # Group 1: a (possibly empty) run of word characters, ">", "&" and
    # apostrophes.  Group 2: the single separator that must follow it
    # (whitespace, "*", ".", "?", ",", '"' or ";").  A word that is not
    # followed by one of those separators is never examined.
    corrected = regex.sub(r'\s*([\w\>\&[[a-z]\'[a-z]]]*)([\s*\.\?\,\"\;])', replace_word, text, flags=regex.VERSION1 | regex.UNICODE)
    # Put back the entities XML requires.
    return '>%s<' % prepare_string_for_xml(corrected)

GOOD LUCK & HOPE IT'S OF SOME USE

Viewing all articles
Browse latest Browse all 24044

Trending Articles