better word replacement
This commit is contained in:
@ -79,8 +79,20 @@ def load_and_split(source: Path, books: list) -> dict[str, str]:
|
|||||||
Read the source file and split it into sections keyed by label.
|
Read the source file and split it into sections keyed by label.
|
||||||
Each section starts at its (start_line1, start_line2) marker pair and
|
Each section starts at its (start_line1, start_line2) marker pair and
|
||||||
ends just before the next section's marker.
|
ends just before the next section's marker.
|
||||||
|
|
||||||
|
Marker positions are always detected from the *original* unmodified file
|
||||||
|
(_ORIG_FILE) when it exists, so that phonetic fixes applied to section
|
||||||
|
headings in the TTS-fixed file can never break section detection. The
|
||||||
|
line numbers are identical in both files because word-level replacements
|
||||||
|
never add or remove lines.
|
||||||
"""
|
"""
|
||||||
raw_lines = source.read_text(encoding="utf-8").splitlines()
|
# Use the original (un-fixed) file for marker detection so phonetic
|
||||||
|
# changes to heading lines don't break matching.
|
||||||
|
marker_source = _ORIG_FILE if _ORIG_FILE.exists() else source
|
||||||
|
marker_lines = marker_source.read_text(encoding="utf-8").splitlines()
|
||||||
|
|
||||||
|
# The content to actually return comes from `source` (may be fixed file).
|
||||||
|
content_lines = source.read_text(encoding="utf-8").splitlines()
|
||||||
|
|
||||||
# Build a mapping: (label, line1, line2) for each book
|
# Build a mapping: (label, line1, line2) for each book
|
||||||
markers = [(label, m[0].strip(), m[1].strip()) for label, m, _, _ in books]
|
markers = [(label, m[0].strip(), m[1].strip()) for label, m, _, _ in books]
|
||||||
@ -88,9 +100,9 @@ def load_and_split(source: Path, books: list) -> dict[str, str]:
|
|||||||
# Find the line index of each marker's first occurrence (two-line match)
|
# Find the line index of each marker's first occurrence (two-line match)
|
||||||
marker_positions: list[tuple[int, int]] = [] # (line_idx, books_idx)
|
marker_positions: list[tuple[int, int]] = [] # (line_idx, books_idx)
|
||||||
for book_idx, (label, m1, m2) in enumerate(markers):
|
for book_idx, (label, m1, m2) in enumerate(markers):
|
||||||
for line_idx, line in enumerate(raw_lines[:-1]):
|
for line_idx, line in enumerate(marker_lines[:-1]):
|
||||||
if (line.strip() == m1 and
|
if (line.strip().upper() == m1.upper() and
|
||||||
raw_lines[line_idx + 1].strip().startswith(m2)):
|
marker_lines[line_idx + 1].strip().upper().startswith(m2.upper())):
|
||||||
marker_positions.append((line_idx, book_idx))
|
marker_positions.append((line_idx, book_idx))
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
@ -104,8 +116,8 @@ def load_and_split(source: Path, books: list) -> dict[str, str]:
|
|||||||
if rank + 1 < len(marker_positions):
|
if rank + 1 < len(marker_positions):
|
||||||
end_line = marker_positions[rank + 1][0]
|
end_line = marker_positions[rank + 1][0]
|
||||||
else:
|
else:
|
||||||
end_line = len(raw_lines)
|
end_line = len(content_lines)
|
||||||
text = "\n".join(raw_lines[line_idx:end_line]).strip()
|
text = "\n".join(content_lines[line_idx:end_line]).strip()
|
||||||
sections[label] = text
|
sections[label] = text
|
||||||
|
|
||||||
return sections
|
return sections
|
||||||
|
|||||||
@ -736,14 +736,24 @@ class ProperNounAuditor(tk.Tk):
|
|||||||
count_total = 0
|
count_total = 0
|
||||||
for original, replacement in self.fixes.items():
|
for original, replacement in self.fixes.items():
|
||||||
pattern = r'\b' + re.escape(original) + r'\b'
|
pattern = r'\b' + re.escape(original) + r'\b'
|
||||||
new_text, n = re.subn(pattern, replacement, text)
|
new_text, n = re.subn(pattern, replacement, text, flags=re.IGNORECASE)
|
||||||
if n:
|
if n:
|
||||||
text = new_text
|
text = new_text
|
||||||
count_total += n
|
count_total += n
|
||||||
|
|
||||||
|
# Convert ALL-CAPS words (2+ letters) to Title Case: HAGOTH → Hagoth
|
||||||
|
# Handles hyphenated names like ANTI-NEPHI-LEHI → Anti-Nephi-Lehi
|
||||||
|
text, n_caps = re.subn(
|
||||||
|
r'\b[A-Z]{2,}(?:-[A-Z]{2,})*\b',
|
||||||
|
lambda m: m.group(0).title(),
|
||||||
|
text,
|
||||||
|
)
|
||||||
|
|
||||||
FIXED_TEXT_OUT.write_text(text, encoding="utf-8")
|
FIXED_TEXT_OUT.write_text(text, encoding="utf-8")
|
||||||
messagebox.showinfo(
|
messagebox.showinfo(
|
||||||
"Done",
|
"Done",
|
||||||
f"Applied {len(self.fixes)} fix rules ({count_total} replacements).\n\n"
|
f"Applied {len(self.fixes)} fix rules ({count_total} replacements).\n"
|
||||||
|
f"Converted {n_caps} ALL-CAPS words to Title Case.\n\n"
|
||||||
f"Saved to:\n{FIXED_TEXT_OUT}"
|
f"Saved to:\n{FIXED_TEXT_OUT}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -19,6 +19,6 @@
|
|||||||
"Nephi-Im": "Kneefi-Im",
|
"Nephi-Im": "Kneefi-Im",
|
||||||
"Nephitish": "Kneefitish",
|
"Nephitish": "Kneefitish",
|
||||||
"Zenephi": "Zekneefi",
|
"Zenephi": "Zekneefi",
|
||||||
"Nephi": "Kneefi",
|
"Moroni": "Mor-oh-nye",
|
||||||
"Moroni": "Mor-oh-nye"
|
"Nephi": "Knee-fye"
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user