Errors produced by scanning text seem to follow a predictable pattern, such as seU for sell, iUness for illness or bom for born, but nevertheless aren't corrected by the automatic scanning software. So I created a function for the calibre editor to fix the ones I most commonly found. You'll also find I've corrected some American spellings; depending upon your dictionary, these won't actually be wrong. :)
The code is based on the Calibre example that tidies up hyphens.
You'll need to enter the following expression in the Find field: >.*?<
Here's the function. Because Python relies on (intelligent, or not so intelligent) indenting, you may need to play with the indentation a little to get Python to swallow the code:
GOOD LUCK & HOPE IT'S OF SOME USE
The code is based on the Calibre example that tidies up hyphens.
You'll need to enter the following expression in the Find field: >.*?<
Here's the function. Because Python relies on (intelligent, or not so intelligent) indenting, you may need to play with the indentation a little to get Python to swallow the code:
Code:
import regex
from calibre import replace_entities
from calibre import prepare_string_for_xml
# Placeholder character used by the respelling rules.  Appended to a word it
# lets a rule target the word's ending only ("or~" matches a trailing "or");
# inside a multi-step rule it temporarily stands in for the occurrences being
# rewritten.  The word pattern in replace() never admits "~", so it cannot
# clash with real text.
_MARK = '~'

# Whole-word scanning errors with one known correction.  These are tried
# before the generic rules so that a heuristic respelling (e.g. "Fm" -> "Fin"
# via m -> in) cannot pre-empt the intended fix.
_EXACT_FIXES = {
    'Fd': "I'd",
    'Fve': "I've",
    'Fm': "I'm",
    'Fll': "I'll",
    'youVe': "you've",
    'YouVe': "You've",
    "wren't": "weren't",
}

# Ordered respelling rules: (anchor_at_end, steps).
#   anchor_at_end -- apply the steps to word + _MARK instead of the bare word.
#   steps         -- argument tuples for str.replace, applied one after the
#                    other: (old, new) or (old, new, count).
# The first rule that produces a word the dictionary recognises wins.
_RESPELLING_RULES = (
    # American -> British: -or / -er / -nse.
    (False, (('or', 'our'),)),
    (True, (('or~', 'our'),)),
    # The original mapped "ors~" to "our", which dropped the plural "s".
    (True, (('ors~', 'ours'),)),
    (True, (('er~', 're'),)),
    (False, (('er', 're'),)),
    # "centered" -> "centreed" -> "centred".
    (False, (('er', 're'), ('ree', 're'))),
    (True, (('ers~', 'res'),)),
    (True, (('nse~', 'nce'),)),
    # Single l -> double l: every l, the first l only, the second l only.
    (False, (('l', 'll'),)),
    (False, (('l', 'll', 1),)),
    (False, (('l', '~', 2), ('~', 'l', 1), ('~', 'll', 1))),
    # Double l -> single l: every ll, the first only, the second only.
    (False, (('ll', 'l'),)),
    (False, (('ll', 'l', 1),)),
    (False, (('ll', '~', 2), ('~', 'll', 1), ('~', 'l', 1))),
    # Typical scanning mix-ups of look-alike letter groups.
    (False, (('U', 'li'),)),
    (False, (('U', 'll'),)),
    (False, (('h', 'li'),)),
    (False, (('H', 'li'),)),
    (False, (('h', 'li', 1),)),
    (False, (('H', 'li', 1),)),
    (False, (('h', '~', 2), ('~', 'h', 1), ('~', 'li', 1))),
    (False, (('H', '~', 2), ('~', 'H', 1), ('~', 'li', 1))),
    (False, (('im', 'un'),)),
    (False, (('imi', 'um'),)),
    (False, (('m', 'rn'),)),
    (False, (('m', 'in'),)),
    (False, (('m', 'hi'),)),
    (False, (('nm', 'run'),)),
    (False, (('nmi', 'rum'),)),
    (False, (('bn', 'lm'),)),
    (False, (('ii', 'h'),)),
    (False, (('ii', 'u'),)),
    (False, (('>', 'y'),)),
    (False, (('j&', 'fi'),)),
    (False, (('i&', 'fi'),)),
    # As in the original, "l&" is rewritten on top of the "i&" substitution.
    (False, (('i&', 'fi'), ('l&', 'fi'))),
)


def _respell(word, recognized):
    """Return the first respelling of *word* accepted by *recognized*, or None.

    *recognized* is a callable taking a string and returning a truthy value
    when the string is a known word.  Rules that leave the word unchanged, or
    that yield a candidate already rejected, are skipped without asking the
    dictionary again.
    """
    rejected = {word}
    for anchor_at_end, steps in _RESPELLING_RULES:
        start = word + _MARK if anchor_at_end else word
        candidate = start
        for step in steps:
            candidate = candidate.replace(*step)
        # candidate == start means the rule did not apply; for anchored rules
        # this also keeps a stray end marker from ever reaching the output.
        if candidate == start or candidate in rejected:
            continue
        if recognized(candidate):
            return candidate
        rejected.add(candidate)
    return None


def replace(match, number, file_name, metadata, dictionaries, data, functions, *args, **kwargs):
    """Correct common scanning errors and American spellings in one text run.

    Written for the calibre editor's function mode with a find expression
    such as ``>.*?<``: *match* is expected to cover ``>text<``; the text
    between the delimiters is corrected and returned re-wrapped in ``>`` and
    ``<``.

    Only *match* and *dictionaries* are used; *dictionaries.recognized(word)*
    decides whether a word (or a proposed respelling) is acceptable.  The
    remaining parameters are accepted but ignored.

    A word is changed only when it is not recognised as it stands and either
    has an entry in ``_EXACT_FIXES`` or one of ``_RESPELLING_RULES`` turns it
    into a recognised word.  Anything else is returned untouched.
    """
    def replace_word(wmatch):
        word = wmatch.group(1)
        # Nothing to correct: bare punctuation, a lone ampersand, or a word
        # the dictionary already knows.
        if not word or word == '&' or dictionaries.recognized(word):
            return wmatch.group()

        fixed = _EXACT_FIXES.get(word)
        if fixed is None:
            fixed = _respell(word, dictionaries.recognized)
        if fixed is None:
            return wmatch.group()

        # The pattern's leading \s* is part of the match; put it back so a
        # correction does not glue the word to whatever precedes it.
        lead = wmatch.group()[:wmatch.start(1) - wmatch.start()]
        return lead + fixed + wmatch.group(2)

    # Search for words
    text = replace_entities(match.group()[1:-1])  # Handle HTML entities like &amp;
    corrected = regex.sub(
        r'\s*([\w\>\&[[a-z]\'[a-z]]]*)([\s*\.\?\,\"\;])',
        replace_word, text, flags=regex.VERSION1 | regex.UNICODE)
    return '>%s<' % prepare_string_for_xml(corrected)  # Put back required entities