from wxPython.wx import *
from wxPython.html import *
import math
import sys
import re
import time
import operator
# Word parts that almost always keep their hyphen.
# NOT USED YET -- extend as needed (maybe move next to the hyphen methods?).
hyphenPrefixes = {'dis': '', 're': ''}
hyphenSuffixes = {'ed': '', 'es': ''}

# Version string shown in the About dialog.
appVersion = '.3.2'

# Command identifiers: File menu.
ID_STUB = 100
ID_OPEN = 101
ID_SAVE = 102
# Command identifiers: Parse menu.
ID_PARSE = 103
ID_HYPHEN = 104
ID_HEADERS = 105
ID_NUMBERS = 106
ID_CAPS = 107
ID_SPACES = 108
ID_EXIT = 109
# Command identifiers: Help menu.
ID_HELP = 140
ID_LICENSE = 141
ID_ABOUT = 142
############################# Class defs #############################
class MainFrame(wxFrame):
def __init__(self, parent, ID, title):
    """Build the main window: status bar, menus, toolbar and notebook panes.

    parent, ID, title -- forwarded unchanged to wxFrame.__init__.

    Image files are looked up in the 'imagery' directory relative to the
    current working directory, and 'test.txt' is loaded into the 'Before'
    pane at start-up.
    """
    # Function-scope import: os is not imported at the top of this file.
    import os

    wxFrame.__init__(self, parent, ID, title,
                     wxDefaultPosition, wxSize(750, 750))
    if wxPlatform == '__WXMSW__':
        self.icon = wxIcon(os.path.join('imagery', 'book.ico'), wxBITMAP_TYPE_ICO)
        self.SetIcon(self.icon)
    self.CreateStatusBar()
    self.SetStatusText("Tip: All modifications performed on the 'Before' pane will be displayed in the 'After' pane.")

    # ---- menu bar ----
    fileMenu = wxMenu()
    fileMenu.Append(ID_OPEN, "&Open", "Open a text file")
    fileMenu.Append(ID_SAVE, "&Save", "Save the new file")
    fileMenu.AppendSeparator()
    fileMenu.Append(ID_EXIT, "E&xit", "Terminate the program")

    parseMenu = wxMenu()
    parseMenu.Append(ID_PARSE, "&Go", "Do it all")
    parseMenu.Append(ID_HEADERS, "&Headers only", "Remove all page headers")
    # "H&yphens": the &H mnemonic is already taken by "Headers only".
    parseMenu.Append(ID_HYPHEN, "H&yphens only", "Change only hyphens")
    parseMenu.Append(ID_NUMBERS, "&Numbers only", "Remove all page numbers")
    parseMenu.Append(ID_CAPS, "&Caps only", "Fix CAPITALIZED words")
    parseMenu.Append(ID_SPACES, "&Space sentences only", "Double-space after sentences")

    helpMenu = wxMenu()
    helpMenu.Append(ID_HELP, "&Help", "Index and glossary")
    helpMenu.Append(ID_LICENSE, "&License info", "A limited freeware license")
    helpMenu.AppendSeparator()
    helpMenu.Append(ID_ABOUT, "&About", "More information about this program")

    self.menuBar = wxMenuBar()
    self.menuBar.Append(fileMenu, "&File")
    self.menuBar.Append(parseMenu, "&Parse")
    self.menuBar.Append(helpMenu, "&Help")
    self.SetMenuBar(self.menuBar)

    # ---- toolbar ----
    # (tool id, bitmap file, short help, long help, handler); None = separator.
    toolSpecs = [
        (10, 'go.bmp', "Apply all",
         "Apply all of the methods except spell-check.", self.OnParse),
        None,
        (50, 'caps.bmp', "CAPITALS",
         "Fix all CAPITALIZED words at the beginning of sentences.", self.OnCaps),
        (60, 'spaces.bmp', "Spaces",
         "Insert double-space after sentences, where needed.", self.OnSpaces),
        (65, 'paragraph.bmp', "Space paragraphs",
         "Space paragraphs with one blank line.", self.OnParagraphs),
        (40, 'headers.bmp', "Remove headers",
         "Remove chapter headers.", self.OnHeaders),
        (20, 'hyphen.bmp', "Remove hyphens",
         "Remove hyphens at ends of lines, with prompting.", self.OnHyphen),
        (30, 'number.bmp', "Page numbers",
         "Remove page numbers, if alone on a line.", self.OnNumbers),
        (85, 'length.bmp', "Line length",
         "Adjust line lengths.", self.OnLineLength),
        (75, 'check.bmp', "Spell check",
         "Spell check the 'Before' pane, with prompts", self.OnSpell),
        None,
        (70, 'after2before.bmp', "Switch results",
         "Replace 'Before' tab-pane text with 'After' tab-pane text", self.OnReplacePane),
    ]
    self.tb = self.CreateToolBar(wxTB_HORIZONTAL|wxNO_BORDER|wxTB_3DBUTTONS)
    for spec in toolSpecs:
        if spec is None:
            self.tb.AddSeparator()
            continue
        toolId, bitmapFile, shortHelp, longHelp, handler = spec
        # os.path.join keeps the path valid on platforms other than Windows.
        bitmap = wxBitmap(os.path.join('imagery', bitmapFile), wxBITMAP_TYPE_BMP)
        self.tb.AddSimpleTool(toolId, bitmap, shortHelp, longHelp)
        EVT_TOOL(self, toolId, handler)
    self.tb.Realize()

    # ---- menu event bindings ----
    menuHandlers = [
        (ID_OPEN, self.OnOpen),
        (ID_SAVE, self.OnFileSave),
        (ID_PARSE, self.OnParse),
        (ID_HYPHEN, self.OnHyphen),
        (ID_NUMBERS, self.OnNumbers),
        (ID_HEADERS, self.OnHeaders),
        (ID_CAPS, self.OnCaps),
        (ID_SPACES, self.OnSpaces),
        (ID_HELP, self.OnHelp),
        (ID_LICENSE, self.OnLicense),
        (ID_ABOUT, self.OnAbout),
        (ID_EXIT, self.ExitFrame),
    ]
    for menuId, handler in menuHandlers:
        EVT_MENU(self, menuId, handler)

    # ---- notebook with the 'Before' and 'After' text panes ----
    self.nb = wxNotebook(self, -1)
    self.fontObj = wxFont(11, wxMODERN, wxNORMAL, wxNORMAL, false)

    self.nb.txtBefore = wxTextCtrl(self.nb, -1, "", wxPoint(0, 0), wxSize(75, 20),
                                   wxTE_MULTILINE|wxTE_RICH)
    # NOTE(review): LoadFile presumably reports failure through its return
    # value rather than IOError -- confirm before relying on this handler.
    try:
        self.nb.txtBefore.LoadFile('test.txt')
    except IOError:
        dlg_m = wxMessageDialog(self,
                                'There was an error opening the new file.',
                                'Error!', wxOK)
        dlg_m.ShowModal()
        dlg_m.Destroy()
    self.nb.txtBefore.SetStyle(0, self.nb.txtBefore.GetLastPosition(),
                               wxTextAttr("BLACK", wxNullColour, self.fontObj))
    self.nb.AddPage(self.nb.txtBefore, "Before", TRUE)

    self.nb.txtAfter = wxTextCtrl(self.nb, -1, "", wxPoint(0, 0), wxSize(75, 20),
                                  wxTE_MULTILINE|wxTE_RICH)
    self.nb.txtAfter.SetStyle(0, self.nb.txtAfter.GetLastPosition(),
                              wxTextAttr("BLACK", wxNullColour, self.fontObj))
    self.nb.AddPage(self.nb.txtAfter, "After")
def OnHelp(self, event):
    """Menu handler: open the help window as a new top-level frame."""
    helpWindow = HelpFrame(NULL, -1, "Help for OCR to Gutenberg text")
    helpWindow.Show(true)
def OnLicense(self, event):
    """Menu handler: show the license text in a modal information dialog."""
    licMessage = """
OCR to Gutenberg text is freeware for the preparation of e-texts for the Gutenberg Project.
Any other use of the code, in whole or in part, is by permission only.
Author: Ray Schumacher
WWW: http://rjs.org
Email: rays@rjs.org
"""
    dialogStyle = wxOK | wxICON_INFORMATION
    licenseDlg = wxMessageDialog(self, licMessage, "License", dialogStyle)
    licenseDlg.ShowModal()
    licenseDlg.Destroy()
def OnAbout(self, event):
    """Menu handler: show the program name and version in a modal dialog."""
    aboutMessage = "Welcome to\nOCR to Gutenberg text v" + appVersion + "\n\n"
    aboutDlg = wxMessageDialog(self, aboutMessage, "About", wxOK | wxICON_INFORMATION)
    aboutDlg.ShowModal()
    aboutDlg.Destroy()
def OnParse(self, event):
    """Run every clean-up step on the 'Before' text and show the result.

    The steps are applied in a fixed order; the final text is written to
    the 'After' pane, which is then brought to the front.
    """
    steps = (
        self.FixCaps,
        self.spaceSentences,
        self.RemovePageHeaders,
        self.removePageNumbers,
        self.removeVolumeNumbers,
        self.RemoveHyphens,
        self.spaceParagraphs,
        self.formatHeaders,
        self.fixLineLength,
    )
    wxBeginBusyCursor()
    try:
        txt = self.nb.txtBefore.GetValue()
        for step in steps:
            txt = step(txt)
        after = self.nb.txtAfter
        after.SetValue(txt)
        after.SetStyle(0, after.GetLastPosition(),
                       wxTextAttr("BLACK", wxNullColour, self.fontObj))
        self.nb.SetSelection(1)
    finally:
        # Restore the cursor even when a step raises (e.g. RemoveHyphens
        # cannot open its word files); otherwise it stays busy forever.
        wxEndBusyCursor()
def OnSpell(self, event):
    """Toolbar handler: spell-check the 'Before' text and show the result in 'After'."""
    wxBeginBusyCursor()
    try:
        checked = self.spellCheck(self.nb.txtBefore.GetValue())
        after = self.nb.txtAfter
        after.SetValue(checked)
        after.SetStyle(0, after.GetLastPosition(),
                       wxTextAttr("BLACK", wxNullColour, self.fontObj))
        self.nb.SetSelection(1)
    finally:
        # spellCheck opens word files and can raise; always restore the cursor.
        wxEndBusyCursor()
def OnHyphen(self, event):
    """Handler: de-hyphenate the 'Before' text and put the result in 'After'.

    Unlike the other handlers this one does not switch to the 'After' tab.
    """
    wxBeginBusyCursor()
    try:
        dehyphenated = self.RemoveHyphens(self.nb.txtBefore.GetValue())
        after = self.nb.txtAfter
        after.SetValue(dehyphenated)
        after.SetStyle(0, after.GetLastPosition(),
                       wxTextAttr("BLACK", wxNullColour, self.fontObj))
    finally:
        # RemoveHyphens opens word files and can raise; always restore the cursor.
        wxEndBusyCursor()
def OnCaps(self, event):
    """Handler: run FixCaps on the 'Before' text and show the result in 'After'."""
    wxBeginBusyCursor()
    try:
        fixed = self.FixCaps(self.nb.txtBefore.GetValue())
        after = self.nb.txtAfter
        after.SetValue(fixed)
        after.SetStyle(0, after.GetLastPosition(),
                       wxTextAttr("BLACK", wxNullColour, self.fontObj))
        self.nb.SetSelection(1)
    finally:
        # Always restore the cursor, even if the transformation raises.
        wxEndBusyCursor()
def OnHeaders(self, event):
    """Handler: strip page headers, then space the chapter title, into 'After'.

    The intermediate text is written to the 'After' pane and read back
    from it before formatHeaders runs, as in the original flow.
    """
    wxBeginBusyCursor()
    try:
        after = self.nb.txtAfter
        after.SetValue(self.RemovePageHeaders(self.nb.txtBefore.GetValue()))
        after.SetValue(self.formatHeaders(after.GetValue()))
        after.SetStyle(0, after.GetLastPosition(),
                       wxTextAttr("BLACK", wxNullColour, self.fontObj))
        self.nb.SetSelection(1)
    finally:
        # Always restore the cursor, even if a transformation raises.
        wxEndBusyCursor()
def OnNumbers(self, event):
    """Handler: remove stand-alone page numbers and show the result in 'After'."""
    wxBeginBusyCursor()
    try:
        cleaned = self.removePageNumbers(self.nb.txtBefore.GetValue())
        after = self.nb.txtAfter
        after.SetValue(cleaned)
        after.SetStyle(0, after.GetLastPosition(),
                       wxTextAttr("BLACK", wxNullColour, self.fontObj))
        self.nb.SetSelection(1)
    finally:
        # Always restore the cursor, even if the transformation raises.
        wxEndBusyCursor()
def OnSpaces(self, event):
    """Handler: run spaceSentences on the 'Before' text and show the result in 'After'."""
    wxBeginBusyCursor()
    try:
        spaced = self.spaceSentences(self.nb.txtBefore.GetValue())
        after = self.nb.txtAfter
        after.SetValue(spaced)
        after.SetStyle(0, after.GetLastPosition(),
                       wxTextAttr("BLACK", wxNullColour, self.fontObj))
        self.nb.SetSelection(1)
    finally:
        # Always restore the cursor, even if the transformation raises.
        wxEndBusyCursor()
def OnParagraphs(self, event):
    """Handler: run spaceParagraphs on the 'Before' text and show the result in 'After'."""
    wxBeginBusyCursor()
    try:
        spaced = self.spaceParagraphs(self.nb.txtBefore.GetValue())
        after = self.nb.txtAfter
        after.SetValue(spaced)
        after.SetStyle(0, after.GetLastPosition(),
                       wxTextAttr("BLACK", wxNullColour, self.fontObj))
        self.nb.SetSelection(1)
    finally:
        # Always restore the cursor, even if the transformation raises.
        wxEndBusyCursor()
def OnLineLength(self, event):
    """Handler: run fixLineLength on the 'Before' text and show the result in 'After'."""
    wxBeginBusyCursor()
    try:
        reflowed = self.fixLineLength(self.nb.txtBefore.GetValue())
        after = self.nb.txtAfter
        after.SetValue(reflowed)
        after.SetStyle(0, after.GetLastPosition(),
                       wxTextAttr("BLACK", wxNullColour, self.fontObj))
        self.nb.SetSelection(1)
    finally:
        # Always restore the cursor, even if the transformation raises.
        wxEndBusyCursor()
def OnReplacePane(self, event):
    """Toolbar handler: move the 'After' text into 'Before' and clear 'After'."""
    before = self.nb.txtBefore
    after = self.nb.txtAfter
    before.SetValue(after.GetValue())
    after.SetValue('')
    # Re-apply the standard font to the freshly replaced text.
    plainStyle = wxTextAttr("BLACK", wxNullColour, self.fontObj)
    before.SetStyle(0, before.GetLastPosition(), plainStyle)
    self.nb.SetSelection(0)
def formatHeaders(self, txt):
    """Pad the first 'CHAPTER <number>' line of txt with blank lines.

    Only the first matching line is handled.  It ends up with three blank
    lines before it and four after it, and two more blank lines are placed
    after the line that followed the title.  Returns the new text and
    updates the status bar.
    """
    lines = txt.split("\n")
    titlePattern = re.compile('^CHAPTER\s+[IVXC\d]+\.?\s*$')
    for index in range(len(lines)):
        if not titlePattern.search(lines[index]):
            continue
        # Work from the bottom up so the earlier indexes stay valid.
        lines[index + 2:index + 2] = [''] * 2   # after the line below the title
        lines[index + 1:index + 1] = [''] * 4   # directly after the title
        lines[index:index] = [''] * 3           # directly before the title
        break
    self.SetStatusText("Status: Title headers are spaced properly.")
    return "\n".join(lines)
def RemovePageHeaders(self, txt):
    """Strip running page headers from txt, keeping any page number.

    Per line:
      * '<number> ALL CAPS TITLE'  -> replaced by the number alone;
      * 'ALL CAPS TITLE <number>'  -> replaced by the number alone;
      * 'ALL CAPS TITLE.'          -> blanked, but only when a line holding
        just a number lies within two lines above or below it.
    The number of lines is unchanged.  Returns the new text and updates
    the status bar.
    """
    lineList = txt.split("\n")
    # Matches the same lines as '^\s*(\d+)\s+([A-Z\.]+\s*)+$' but without the
    # nested quantifier, which backtracks exponentially on near-misses.
    headerPattern1 = re.compile(r'^\s*(\d+)\s+[A-Z.][A-Z.\s]*$')
    headerPattern2 = re.compile(r"^\s*([A-Z\-\,']+\.?\s+)+(\d+)$")
    headerPattern3 = re.compile(r"^\s*([A-Z\-\,']+\s+)*([A-Z\-]+\.)\s?$")
    numPattern = re.compile(r'^\s*\d+\s*$')
    lineCount = len(lineList)
    progressdlg = wxProgressDialog("Removing headers", "Please wait...", lineCount,
        self, wxPD_AUTO_HIDE | wxPD_CAN_ABORT | wxPD_APP_MODAL | wxPD_ESTIMATED_TIME | wxPD_ELAPSED_TIME )
    try:
        # Every line is examined; neighbour look-ups are bounds-checked, so
        # the last two lines are no longer skipped.
        for thisLine in range(lineCount):
            line = lineList[thisLine]
            m = headerPattern1.search(line)
            if m:
                lineList[thisLine] = m.group(1)
            else:
                m = headerPattern2.search(line)
                if m:
                    lineList[thisLine] = m.group(2)
                elif headerPattern3.search(line) and \
                        self._pageNumberNearby(lineList, thisLine, numPattern):
                    lineList[thisLine] = ''
            if thisLine % 10 == 0:
                progressdlg.Update(thisLine)
    finally:
        # The dialog is application-modal; never leave it behind on error.
        progressdlg.Destroy()
    self.SetStatusText("Status: Page headers have been removed.")
    return "\n".join(lineList)

def _pageNumberNearby(self, lineList, index, numPattern):
    """Return 1 if a number-only line lies within two lines of index, else 0."""
    for offset in (-1, -2, 1, 2):
        neighbour = index + offset
        # Bounds check: a negative index must not wrap to the end of the list.
        if 0 <= neighbour < len(lineList) and numPattern.search(lineList[neighbour]):
            return 1
    return 0
def removeVolumeNumbers(self, txt):
    """Replace every volume note such as 'vol. 11-23.' with a newline.

    Returns the new text and updates the status bar.
    """
    cleaned = re.sub('vol\.? \d{1,3}-\d{1,3}\.?', '\n', txt)
    self.SetStatusText("Status: All volume notes removed..")
    return cleaned
def spaceParagraphs(self, txt, startLine=21):
    """Separate paragraphs in txt with a blank line.

    txt       -- the text to process.
    startLine -- index of the first line to examine; earlier lines are left
                 untouched.  Defaults to 21, the value that used to be
                 hard-coded here.

    A line of 3 to 55 characters is taken to end a paragraph and gets a
    blank line after it.  Any other line that starts with a space is taken
    to start a paragraph and gets a blank line before it.  No blank line is
    added where one is already present.  Returns the new text and updates
    the status bar.
    """
    lineList = txt.split("\n")
    paraPattern = re.compile('^.{3,55}$')
    indentPattern = re.compile('^ ')
    thisLine = startLine
    while thisLine < len(lineList):
        line = lineList[thisLine]
        if paraPattern.search(line):
            following = thisLine + 1
            if following >= len(lineList) or lineList[following].strip() != '':
                lineList.insert(following, '')
        elif indentPattern.search(line):
            if thisLine == 0 or lineList[thisLine - 1].strip() != '':
                # Insert a genuinely blank line (this used to insert the
                # line number as text).
                lineList.insert(thisLine, '')
                thisLine = thisLine + 1
        thisLine = thisLine + 1
    self.SetStatusText("Status: All paragraphs are spaced with one blank line.")
    return "\n".join(lineList)
"""
E.Two spaces after each sentence [watch for ! or ? that do NOT end sentences,
then use only one space].
J. Ellipses [word. . .] have no spaces before or after ".'s" unless they end a sentence with
four [. . . . ] then it is a sentence ending. . .with two spaces. . . . Next is a new sentence.
K. Dashes will be--dashes--with no extra spaces around them
"""
def spaceSentences(self, txt):
    """Put two spaces after sentence-ending punctuation in txt.

    A '.', '?' or '!' followed by exactly one whitespace character and then
    a capital letter or a double quote is treated as a sentence end.  The
    punctuation mark is kept as it is.  Breaks that already have two spaces
    are left alone, so the method can be run repeatedly.  Returns the new
    text and reports the number of changed breaks in the status bar.
    """
    lineList = txt.split("\n")
    endPattern = re.compile(r'([?!.])\s(["A-Z])')
    # Spelled out so the gap stays two spaces even if whitespace is squeezed.
    replacement = r'\1' + ' ' * 2 + r'\2'
    spaced = 0
    progressdlg = wxProgressDialog("Double spacing", "Please wait...", len(lineList),
        self, wxPD_AUTO_HIDE | wxPD_CAN_ABORT | wxPD_APP_MODAL | wxPD_ESTIMATED_TIME | wxPD_ELAPSED_TIME )
    try:
        for thisLine in range(len(lineList)):
            lineList[thisLine], number = endPattern.subn(replacement, lineList[thisLine])
            # Count every substitution, including a single one on a line.
            spaced = spaced + number
            progressdlg.Update(thisLine)
    finally:
        # The dialog is application-modal; never leave it behind on error.
        progressdlg.Destroy()
    self.SetStatusText("Status: %d sentences were spaced." % spaced)
    return "\n".join(lineList)
def FixCaps(self, txt):
    """Lower-case an ALL-CAPS first word on lines that continue in lower case.

    A line is changed only when it starts with a word of two or more
    capitals followed by whitespace and a lower-case letter; the first
    letter of that word stays capital.  Returns the new text and reports
    the number of changed lines in the status bar.
    """
    leadingCaps = re.compile('^([A-Z])([A-Z]+)(\s+[a-z].*)$')
    fixedLines = []
    capsFixed = 0
    for line in txt.split('\n'):
        found = leadingCaps.search(line)
        if found:
            initial, restOfWord, remainder = found.groups()
            line = initial + restOfWord.lower() + remainder
            capsFixed = capsFixed + 1
        fixedLines.append(line)
    self.SetStatusText("Status: " + str(capsFixed) + " words have had case adjusted.")
    return '\n'.join(fixedLines)
def RemoveBrackets(self, txt):
    """Return txt with each '<...>' span removed.

    The match is greedy within a line: everything from the first '<' to
    the last '>' on that line is dropped.
    """
    return re.compile('<.+>', re.M).sub('', txt)
def RemoveBlankLines(self, txt):
    """Return txt with runs of blank or whitespace-only lines collapsed away."""
    return re.compile('\n[\s]*\n').sub('\n', txt)
def removePageNumbers(self, txt):
    """Return txt without the lines that hold nothing but a number.

    A number on the very first or very last line is not removed, because
    the pattern needs a newline on both sides of it.
    """
    # The closing newline is only looked at, not consumed, so it can also
    # open the next match; consecutive number lines are then all removed.
    numPattern = re.compile(r'\n\s*\d+\s*(?=\n)')
    return numPattern.sub('', txt)
def spellCheck(self, txt):
    """Interactively spell-check txt and return the possibly corrected text.

    Words are read from the files 'english-words' and 'skip_words' in the
    current directory.  Each purely alphabetic word found in neither list
    (as written or lower-cased) is highlighted in the 'Before' pane and the
    user chooses one of:
      Add    -- treat the word as known and append it to 'english-words';
      Skip   -- ignore the word and append it to 'skip_words';
      Change -- enter a replacement, applied to whole-word occurrences on
                the current line of the returned text.
    Cancelling the choice dialog stops the check.  Both word files are
    updated with the new entries at the end.

    Raises IOError if either word file cannot be opened.
    """
    knownWords = {}
    skipWords = {}
    for fileName, wordSet in (("english-words", knownWords), ("skip_words", skipWords)):
        wordFile = open(fileName)
        try:
            for line in wordFile.readlines():
                # Drop only the line terminator; the last line may lack one.
                if line.endswith('\n'):
                    line = line[:-1]
                if line:
                    wordSet[line] = 1
        finally:
            wordFile.close()
    newWords = {}
    newSkipWords = {}
    lineList = txt.split("\n")
    splitPattern = re.compile(r'(\W+)')
    wordPattern = re.compile(r'^[a-zA-Z]+$')
    before = self.nb.txtBefore
    stopped = 0
    for thisLine in range(len(lineList)):
        # The capturing group keeps the separators, so the piece lengths add
        # up to the column of each word.
        column = 0
        for piece in splitPattern.split(lineList[thisLine]):
            lowered = piece.lower()
            isUnknown = (wordPattern.search(piece)
                         and piece not in knownWords
                         and lowered not in knownWords
                         and piece not in skipWords
                         and lowered not in skipWords)
            if isUnknown:
                wordStart = before.XYToPosition(column, thisLine)
                wordEnd = wordStart + len(piece)
                before.SetStyle(wordStart, wordEnd, wxTextAttr("BLACK", "YELLOW"))
                choiceDlg = wxSingleChoiceDialog(self, piece, 'Unknown word', ['Add', 'Skip', 'Change'])
                # Scroll a little ahead of the word, never to a negative position.
                before.ShowPosition(max(wordStart - 50, 0))
                try:
                    if choiceDlg.ShowModal() != wxID_OK:
                        stopped = 1
                        break
                    choice = choiceDlg.GetStringSelection()
                finally:
                    # Runs on the cancel path too, so the dialog never leaks.
                    choiceDlg.Destroy()
                if choice == 'Add':
                    knownWords[lowered] = 1
                    newWords[lowered] = 1
                    before.SetStyle(wordStart, wordEnd, wxTextAttr("BLACK", "WHITE"))
                elif choice == 'Skip':
                    # Remember it for this run too, so it is not asked again.
                    skipWords[lowered] = 1
                    newSkipWords[lowered] = 1
                    before.SetStyle(wordStart, wordEnd, wxTextAttr("BLACK", "LIGHT GREY"))
                else:
                    entryDlg = wxTextEntryDialog(self, 'Enter the new spelling for: ' + piece, 'Change', '')
                    try:
                        if entryDlg.ShowModal() == wxID_OK:
                            newSpelling = entryDlg.GetValue()
                            # Whole words only; doubling the backslashes makes
                            # the replacement literal text, not a template.
                            changePattern = re.compile(r'\b' + re.escape(piece) + r'\b')
                            lineList[thisLine] = changePattern.sub(
                                newSpelling.replace('\\', '\\\\'), lineList[thisLine])
                    finally:
                        entryDlg.Destroy()
            column = column + len(piece)
        if stopped:
            break
    txt = "\n".join(lineList)
    summary = "Status: %d words added.\n %d words skipped." % (len(newWords), len(newSkipWords))
    dlg = wxMessageDialog(self, summary, "Complete", wxOK | wxICON_INFORMATION)
    dlg.ShowModal()
    dlg.Destroy()
    self.SetStatusText('')
    self.updateDictionary(newWords, 'english-words', 2)
    self.updateDictionary(newSkipWords, 'skip_words', 1)
    return txt
def RemoveHyphens(self, txt):
    """Join words that were hyphenated across a line break and return the text.

    Words are read from 'english-words' and 'skip_words' in the current
    directory.  Every line is stripped of surrounding whitespace.  When a
    line ends in '<word>-', the next line that is not empty, a bare number,
    an all-caps header or a '<...>' comment is taken as the continuation:

      * hyphenated form listed in 'skip_words'  -> left alone;
      * neither half is a known word            -> joined automatically;
      * a half is known and so is the joined word -> joined automatically;
      * a half is known, the joined word is not -> the user picks
        'Concatenate' (join and remember the word) or 'Hyphenate' (leave it
        and remember the hyphenated form); cancelling stops processing.

    The status bar reports the number of automatic joins plus the words
    added and skipped, and both word files are updated at the end.

    Raises IOError if either word file cannot be opened.
    """
    self.SetStatusText("Status: Starting hyphen processing.")
    knownWords = {}
    skipWords = {}
    for fileName, wordSet in (("english-words", knownWords), ("skip_words", skipWords)):
        wordFile = open(fileName)
        try:
            for line in wordFile.readlines():
                # Drop only the line terminator; the last line may lack one.
                if line.endswith('\n'):
                    line = line[:-1]
                if line:
                    wordSet[line] = 1
        finally:
            wordFile.close()
    newWords = {}
    newSkipWords = {}
    lineList = txt.split("\n")
    capsWordsPattern = re.compile(r'([A-Z]{2,}\s+)+[A-Z]{2,}')  # 2+ all-caps words
    bracketsPattern = re.compile(r'^<.+>$')                     # only a <...> comment
    preHyphenPattern = re.compile(r'(.+ ([a-zA-Z]+))-$')        # first half of the word
    postHyphenPattern = re.compile(r"^([a-zA-Z]+)([,.:;'])?\s?")  # second half, next line
    numPattern = re.compile(r'^\d+$')
    lineCount = len(lineList)
    thisPosition = 0
    numberRemoved = 0
    skipped = 0
    for thisLine in range(lineCount):
        lineList[thisLine] = lineList[thisLine].strip()
        m1 = preHyphenPattern.search(lineList[thisLine])
        if m1:
            # Skip blank lines, page numbers, headers and comments.  The
            # bounds check stops at the end of the text instead of raising
            # IndexError.
            nextLine = thisLine + 1
            while nextLine < lineCount and (
                    lineList[nextLine] == ''
                    or numPattern.search(lineList[nextLine])
                    or capsWordsPattern.search(lineList[nextLine])
                    or bracketsPattern.search(lineList[nextLine])):
                nextLine = nextLine + 1
            m2 = None
            if nextLine < lineCount:
                m2 = postHyphenPattern.search(lineList[nextLine])
            if m2:
                firstHalf = m1.group(2)
                secondHalf = m2.group(1)
                testWord = firstHalf + secondHalf
                testWordHyphenated = firstHalf + '-' + secondHalf
                firstKnown = firstHalf in knownWords
                if testWordHyphenated in skipWords:
                    skipped = skipped + 1
                elif not (firstKnown or secondHalf in knownWords):
                    # Neither half is a word on its own, so it must be one word.
                    thisPosition += self._joinHyphenatedPair(
                        lineList, thisLine, nextLine, m1, m2, postHyphenPattern)
                    numberRemoved = numberRemoved + 1
                elif testWord in knownWords:
                    thisPosition += self._joinHyphenatedPair(
                        lineList, thisLine, nextLine, m1, m2, postHyphenPattern)
                    numberRemoved = numberRemoved + 1
                else:
                    # The title still shows which half was recognised.
                    if firstKnown:
                        title = 'Unknown word.'
                    else:
                        title = 'Unknown word..'
                    message = "%s\nline: %d\n%s\n%s" % (
                        testWordHyphenated, thisLine, lineList[thisLine], lineList[nextLine])
                    choiceDlg = wxSingleChoiceDialog(self, message, title, ['Concatenate', 'Hyphenate'])
                    try:
                        if choiceDlg.ShowModal() != wxID_OK:
                            break   # cancelled: stop processing further lines
                        choice = choiceDlg.GetStringSelection()
                    finally:
                        # Runs on the cancel path too, so the dialog never leaks.
                        choiceDlg.Destroy()
                    if choice == 'Concatenate':
                        knownWords[testWord] = 1
                        newWords[testWord] = 1
                        thisPosition += self._joinHyphenatedPair(
                            lineList, thisLine, nextLine, m1, m2, postHyphenPattern)
                    elif choice == 'Hyphenate':
                        # Remember it for this run too, so it is not asked again.
                        skipWords[testWordHyphenated] = 1
                        newSkipWords[testWordHyphenated.lower()] = 1
            else:
                sys.stdout.write('Error: No next line match for line %d\n' % thisLine)
        thisPosition += len(lineList[thisLine]) + 1
    txt = "\n".join(lineList)
    self.SetStatusText("Status: %d hyphens auto-removed. %d words added. %d words skipped."
                       % (numberRemoved, len(newWords), len(newSkipWords) + skipped))
    # Scroll the 'Before' pane to roughly where processing ended and mark it.
    thisPosition += len(newSkipWords) + skipped
    self.nb.txtBefore.ShowPosition(thisPosition)
    self.nb.txtBefore.SetStyle(max(thisPosition - 10, 0), thisPosition, wxTextAttr("BLACK", "YELLOW"))
    self.updateDictionary(newWords, 'english-words', 2)
    self.updateDictionary(newSkipWords, 'skip_words', 1)
    return txt

def _joinHyphenatedPair(self, lineList, thisLine, nextLine, m1, m2, postHyphenPattern):
    """Move the word tail (and its punctuation) from nextLine up to thisLine.

    Returns len(tail) + 1, which the caller adds to its running position.
    """
    joined = m1.group(1) + m2.group(1)
    if m2.group(2):
        joined = joined + m2.group(2)
    lineList[thisLine] = joined
    # Remove the tail, its punctuation and one following space from the
    # continuation line.
    lineList[nextLine] = postHyphenPattern.sub('', lineList[nextLine], 1)
    return len(m2.group(1)) + 1
def fixLineLength(self, txt):
    """Re-wrap the paragraphs of txt to lines of about 70 characters.

    Assumes paragraphs and headers have already been separated by blank
    lines.  Lines that are blank, hold only digits, or contain two or more
    all-caps words are copied through unchanged.  Any other line starts a
    paragraph that runs until a line of at most one character; the words of
    the paragraph are then re-flowed.  Returns the new text.

    Known limitation (unchanged): a long line directly after a blank line
    may not be fixed.
    """
    lineList = txt.split("\n")
    newLineList = []
    desiredLength = 70
    blankPattern = re.compile(r'^\d*\s*$')
    capsWordsPattern = re.compile(r'([A-Z]{2,}\s+)+[A-Z\.]{2,}')  # 2+ all-caps words
    lineCount = len(lineList)
    progressdlg = wxProgressDialog("Setting line lengths", "Please wait...", lineCount,
        self, wxPD_AUTO_HIDE | wxPD_CAN_ABORT | wxPD_APP_MODAL | wxPD_ESTIMATED_TIME | wxPD_ELAPSED_TIME )
    try:
        thisLine = 0
        while thisLine < lineCount:
            current = lineList[thisLine]
            if capsWordsPattern.search(current) or blankPattern.search(current):
                newLineList.append(current)
                progressdlg.Update(thisLine)
                thisLine += 1
                continue
            # Collect the paragraph.  The bounds checks replace the old
            # '\n' sentinel line, which ended up in the output.
            paragraphLines = [current]
            thisLine += 1
            while thisLine < lineCount and len(lineList[thisLine]) > 1:
                paragraphLines.append(lineList[thisLine])
                thisLine += 1
            if thisLine < lineCount and not blankPattern.search(lineList[thisLine]):
                # A one-character closing line belongs to this paragraph;
                # consume it so it is not emitted a second time.
                paragraphLines.append(lineList[thisLine])
                thisLine += 1
            words = " ".join(paragraphLines).split()
            newLineList.extend(self._reflowWords(words, desiredLength))
    finally:
        # The dialog is application-modal; never leave it behind on error.
        progressdlg.Destroy()
    return "\n".join(newLineList)

def _reflowWords(self, words, desiredLength, maxLength=79):
    """Pack words (a non-empty list) into lines near desiredLength.

    A word is appended while that leaves the line closer to desiredLength
    than stopping would, and the line stays under maxLength.  A hyphenated
    word that does not fit may be broken after its first hyphen.  Returns
    the list of lines.
    """
    hyphenPattern = re.compile('[a-zA-Z]-[a-zA-Z]')
    lines = []
    pending = list(words)
    tempLine = pending.pop(0)
    while pending:
        if not tempLine:
            # Can be empty after a split such as 'a--b-c'; start afresh.
            tempLine = pending.pop(0)
            continue
        word = pending[0]
        used = len(tempLine)
        if (used + len(word) - desiredLength < desiredLength - used) and (used + len(word) < maxLength):
            pending.pop(0)
            if hyphenPattern.search(word) and tempLine.endswith('-'):
                # The line ends in a hyphen: glue the word on without a space.
                tempLine = tempLine + word
            else:
                tempLine = tempLine + ' ' + word
            continue
        if hyphenPattern.search(word):
            # Split at the first hyphen only, so 'mother-in-law' keeps its tail.
            head, tail = word.split('-', 1)
            if (len(head) + used + 1 - desiredLength < desiredLength - used) and (used + len(head) < maxLength):
                if tempLine.endswith('-'):
                    tempLine = tempLine + head + '-'
                else:
                    tempLine = tempLine + ' ' + head + '-'
                lines.append(tempLine)
                pending.pop(0)
                tempLine = tail
                continue
        # Nothing more fits: write the line and start the next with this word.
        lines.append(tempLine)
        tempLine = pending.pop(0)
    if tempLine:
        lines.append(tempLine)
    return lines
def updateDictionary(self, hash, fileName, sleepTime):
    """Append the keys of hash to the word file fileName, one per line.

    hash      -- dictionary whose keys are the words to add; nothing at all
                 happens when it is empty.
    fileName  -- path of the word file, opened in append mode.
    sleepTime -- seconds to pause after writing, before the status bar is
                 updated.

    Raises IOError if the file cannot be opened or written.
    """
    if not hash:
        return
    wordFile = open(fileName, 'a+')
    try:
        for word in hash.keys():
            wordFile.write(word + '\n')
    finally:
        # Actually close (and flush) the file; the old code referenced
        # f.close without calling it.
        wordFile.close()
    time.sleep(sleepTime)
    self.SetStatusText("Status: The %s dictionary has been updated with %d words."
                       % (fileName, len(hash)))
def ExitFrame(self, event):
    # Handler bound to the File > Exit menu item (ID_EXIT): closes this frame.
    # event is not used.
    self.Close(true)
# ----------------------------------------------------------------------------------------
# Some handlers.
# ----------------------------------------------------------------------------------------
def OnOpen(self, event):
    """Menu handler: let the user pick a text file and load it into 'Before'.

    An error dialog is shown when the file cannot be loaded.  The 'Before'
    pane is restyled and brought to the front afterwards, whether or not a
    file was chosen.
    """
    dlg = wxFileDialog(self, "Select a text file to import", ".", "", "*.*", wxOPEN)
    try:
        if dlg.ShowModal() == wxID_OK:
            # LoadFile reports failure through its return value; IOError is
            # still handled in case a binding raises instead.
            try:
                loaded = self.nb.txtBefore.LoadFile(dlg.GetPath())
            except IOError:
                loaded = 0
            if not loaded:
                dlg_m = wxMessageDialog(self,
                                        'There was an error opening the new file.',
                                        'Error!', wxOK)
                dlg_m.ShowModal()
                dlg_m.Destroy()
    finally:
        dlg.Destroy()
    self.nb.txtBefore.SetStyle(0, self.nb.txtBefore.GetLastPosition(),
                               wxTextAttr("BLACK", wxNullColour, self.fontObj))
    self.nb.SetSelection(0)
def OnFileSave(self, event):
    """Menu handler: write the 'After' text to the file chosen by the user.

    An error dialog is shown for each path that cannot be written.
    """
    dlg = wxFileDialog(self, "Save to file", ".", "", "*.txt", wxSAVE)
    try:
        if dlg.ShowModal() == wxID_OK:
            for path in dlg.GetPaths():
                try:
                    outFile = open(path, 'w')
                    try:
                        outFile.write(self.nb.txtAfter.GetValue())
                    finally:
                        # Actually close (and flush) the file; the old code
                        # referenced f.close without calling it.
                        outFile.close()
                except IOError:
                    dlg_m = wxMessageDialog(self,
                                            'There was an error writing.',
                                            'Error!', wxOK)
                    dlg_m.ShowModal()
                    dlg_m.Destroy()
    finally:
        dlg.Destroy()
def OnCloseMe(self, event):
    # Closes this frame; event is not used.
    # No event is bound to this method in the visible code.
    self.Close(true)
def OnCloseWindow(self, event):
    # Destroys this frame; event is not used.
    # No event is bound to this method in the visible code.
    self.Destroy()
class HelpFrame(wxFrame):
    """Window that displays the HTML help pages."""

    def __init__(self, parent, ID, title):
        """Create the frame and load 'help/index.htm' into an HTML window.

        parent, ID, title -- forwarded unchanged to wxFrame.__init__.
        """
        frameSize = wxSize(600, 550)
        wxFrame.__init__(self, parent, ID, title, wxDefaultPosition, frameSize)
        if wxPlatform == '__WXMSW__':
            self.icon = wxIcon('imagery\\book.ico', wxBITMAP_TYPE_ICO)
            self.SetIcon(self.icon)
        # The HTML window is the only child and shows the help index.
        self.htmlHelp = wxHtmlWindow(self, -1)
        try:
            self.htmlHelp.LoadPage('help/index.htm')
        except IOError:
            errorDlg = wxMessageDialog(self, 'There was an error opening the file.', 'Error!', wxOK)
            errorDlg.ShowModal()
            errorDlg.Destroy()
############### Main application class ###########
class OCRApp(wxApp):
    """Application object: creates and shows the main window."""

    def OnInit(self):
        """Create the main frame, make it the top window and report success."""
        mainWindow = MainFrame(NULL, -1, "OCR to Gutenberg text")
        mainWindow.Show(true)
        self.SetTopWindow(mainWindow)
        return true
# Script entry point: create the application object and run its event loop.
# These statements execute whenever the module is run or imported (there is
# no __main__ guard).
app = OCRApp(0)
app.MainLoop()