mirror of
https://github.com/ashim-hq/ashim
synced 2026-04-21 13:37:52 +00:00
- Replace [object Object] errors with readable messages across all 20+ API routes by normalizing Zod validation errors to strings (formatZodErrors) - Add parseApiError() on frontend to defensively handle any details type - Add global Fastify error handler with full stack traces in logs - Fix image-to-pdf auth: Object.entries(headers) → headers.forEach() - Fix passport-photo: safeParse + formatZodErrors, safe error extraction - Fix OCR silent fallbacks: log exception type/message when falling back, include actual engine used in API response and Docker logs - Fix split tool: process all uploaded images, combine into ZIP with subfolders per image - Fix batch support for blur-faces, strip-metadata, edit-metadata, vectorize: add processAllFiles branch for multi-file uploads - Docker: LOG_LEVEL=debug, PYTHONWARNINGS=default for visibility - Add Playwright e2e tests verifying all fixes against Docker container
317 lines
11 KiB
Python
317 lines
11 KiB
Python
"""Text extraction from images using Tesseract, PaddleOCR PP-OCRv5, or PaddleOCR-VL 1.5."""
|
|
import sys
|
|
import json
|
|
import os
|
|
|
|
# Module-level cache for the PaddleOCR-VL model. It starts empty and is
# filled by run_paddleocr_vl() on first use, so the model stays resident
# in the dispatcher process between requests.
_paddleocr_vl_instance = None
|
|
|
|
|
|
def emit_progress(percent, stage):
    """Write one JSON progress line to stderr for bridge.ts to capture.

    The line has the shape ``{"progress": <percent>, "stage": <stage>}`` and
    is flushed immediately.
    """
    payload = {"progress": percent, "stage": stage}
    sys.stderr.write(json.dumps(payload) + "\n")
    sys.stderr.flush()
|
|
|
|
|
|
# Two-letter language codes accepted in the settings -> Tesseract ``-l``
# language names. Codes missing here fall back to "eng" in run_tesseract().
TESSERACT_LANG_MAP = {
    "en": "eng",
    "de": "deu",
    "fr": "fra",
    "es": "spa",
    "zh": "chi_sim",
    "ja": "jpn",
    "ko": "kor",
}
|
|
|
|
# Two-letter language codes accepted in the settings -> PaddleOCR ``lang``
# values. German, French and Spanish all map to the "latin" value; codes
# missing here fall back to "en" in run_paddleocr_v5().
PADDLE_LANG_MAP = {
    "en": "en",
    "de": "latin",
    "fr": "latin",
    "es": "latin",
    "zh": "ch",
    "ja": "japan",
    "ko": "korean",
}
|
|
|
|
|
|
def _dominant_script_language(text):
    """Classify *text* as "ko", "ja", "zh" or "en" from its character mix.

    Only Hangul, CJK ideographs, kana and ASCII letters are counted; every
    other character (digits, punctuation, accented letters, ...) is ignored.
    The checks run in a fixed order: Korean, then Japanese, then Chinese.
    """
    hangul = cjk = kana = latin = 0
    for ch in text:
        # The ranges below are disjoint, so each character counts at most once.
        if "\uAC00" <= ch <= "\uD7AF" or "\u1100" <= ch <= "\u11FF":
            hangul += 1  # Hangul syllables + Hangul Jamo
        elif "\u4E00" <= ch <= "\u9FFF":
            cjk += 1  # CJK unified ideographs
        elif "\u3040" <= ch <= "\u30FF":
            kana += 1  # Hiragana (3040-309F) + Katakana (30A0-30FF)
        elif ch.isascii() and ch.isalpha():
            latin += 1

    total = hangul + cjk + kana + latin
    if total == 0:
        return "en"
    if hangul / total > 0.3:
        return "ko"
    # Kana is tested before ideographs because Japanese text mixes both.
    if kana / total > 0.2:
        return "ja"
    if cjk / total > 0.3:
        return "zh"
    return "en"


def auto_detect_language(input_path):
    """Detect the predominant script in the image using Tesseract multi-lang.

    Runs a quick Tesseract pass with the eng+kor+chi_sim+jpn language packs,
    then analyzes the Unicode character ranges in the output to determine
    which language code ("ko", "ja", "zh" or "en") to use.

    Detection is best-effort: if Tesseract cannot be run (missing binary,
    timeout, undecodable output, ...) or produces no text, "en" is returned.
    A failure to run Tesseract is reported as a JSON warning on stderr so the
    fallback is visible in the logs instead of being silent.
    """
    import subprocess

    try:
        result = subprocess.run(
            ["tesseract", input_path, "stdout", "-l", "eng+kor+chi_sim+jpn"],
            capture_output=True, text=True, timeout=30,
        )
    except Exception as e:
        # Never let detection abort the OCR run, but say why we fell back.
        print(
            json.dumps({
                "warning": f"Language auto-detection failed ({type(e).__name__}: {e}), assuming English"
            }),
            file=sys.stderr,
            flush=True,
        )
        return "en"

    text = result.stdout.strip()
    if not text:
        return "en"
    return _dominant_script_language(text)
|
|
|
|
|
|
def run_tesseract(input_path, language, is_auto=False):
    """Run Tesseract OCR (Fast tier) and return the recognized text.

    Args:
        input_path: Path of the image handed to the ``tesseract`` CLI.
        language: Two-letter code looked up in TESSERACT_LANG_MAP
            (unknown codes use "eng"). Ignored when *is_auto* is true.
        is_auto: When true, request every language pack this script knows
            about instead of a single one, for best coverage.

    Returns:
        Tesseract's stdout with surrounding whitespace stripped.

    Raises:
        RuntimeError: Tesseract exited non-zero and produced no text.
    """
    import subprocess

    tess_lang = (
        "eng+kor+chi_sim+jpn+deu+fra+spa"
        if is_auto
        else TESSERACT_LANG_MAP.get(language, "eng")
    )
    command = ["tesseract", input_path, "stdout", "-l", tess_lang]

    emit_progress(30, "Scanning")
    completed = subprocess.run(command, capture_output=True, text=True, timeout=120)
    emit_progress(70, "Extracting text")

    recognized = completed.stdout.strip()
    # A non-zero exit is tolerated as long as some text came back.
    if completed.returncode == 0 or recognized:
        return recognized
    raise RuntimeError(completed.stderr.strip() or "Tesseract failed")
|
|
|
|
|
|
def _extract_ocr_texts(results):
    """Collect recognized lines from PaddleOCR result objects.

    Two result layouts are understood:
    - 3.4.x: an object whose ``.json`` is a dict holding
      ``["res"]["rec_texts"]``.
    - Earlier: an object whose ``.res`` is a dict holding a ``"text"`` list.

    The ``.json`` layout wins when it yields any text; otherwise the ``.res``
    layout is tried. Results matching neither layout contribute nothing.

    Returns:
        All collected lines joined with newlines ("" when nothing was found).
    """
    collected = []
    for item in results:
        payload = getattr(item, "json", None)
        if isinstance(payload, dict):
            lines = payload.get("res", {}).get("rec_texts", [])
            if lines:
                collected += lines
                continue

        legacy = getattr(item, "res", None)
        if isinstance(legacy, dict):
            collected += legacy.get("text", [])

    return "\n".join(collected)
|
|
|
|
|
|
def run_paddleocr_v5(input_path, language):
    """Run PaddleOCR PP-OCRv5 server models (Balanced tier).

    While the model is loaded and run, file descriptor 1 is pointed at
    stderr so that anything the library writes to stdout does not mix with
    the JSON result that main() prints on stdout. The original descriptor
    is always restored, even when loading or inference raises.

    Args:
        input_path: Path of the image to recognize.
        language: Two-letter code looked up in PADDLE_LANG_MAP
            (unknown codes use "en").

    Returns:
        The recognized lines joined with newlines.

    Raises:
        ImportError: ``paddleocr`` or the local ``gpu`` helper is missing.
        Exception: whatever PaddleOCR raises while loading or predicting.
    """
    os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"

    # Flush Python's own stdout buffer first so text queued before the
    # redirect is not written to stderr by a later flush.
    sys.stdout.flush()
    stdout_fd = os.dup(1)

    try:
        # Inside the try so the saved descriptor is released if this fails.
        os.dup2(2, 1)

        import logging
        from paddleocr import PaddleOCR
        from gpu import gpu_available

        # Suppress PaddleOCR internal logging (replaces removed show_log param)
        for name in ("ppocr", "paddleocr", "paddle"):
            logging.getLogger(name).setLevel(logging.ERROR)

        paddle_lang = PADDLE_LANG_MAP.get(language, "en")
        device = "gpu:0" if gpu_available() else "cpu"

        emit_progress(20, "Loading")
        ocr = PaddleOCR(
            lang=paddle_lang,
            device=device,
            ocr_version="PP-OCRv5",
        )
        emit_progress(30, "Scanning")
        results = ocr.predict(input=input_path)
        emit_progress(70, "Extracting text")

        text = _extract_ocr_texts(results)
    finally:
        try:
            # Push out anything Python buffered during the redirect while
            # fd 1 still points at stderr; otherwise it would be flushed
            # after the restore and leak into the real stdout.
            sys.stdout.flush()
        finally:
            os.dup2(stdout_fd, 1)
            os.close(stdout_fd)

    return text
|
|
|
|
|
|
def run_paddleocr_vl(input_path):
    """Run PaddleOCR-VL 1.5 vision-language model (Best tier).

    The VLM is lazy-loaded on first call and stays resident in the
    dispatcher process for subsequent requests (cached in the module-level
    ``_paddleocr_vl_instance``).
    Requires PaddlePaddle >= 3.2 for fused_rms_norm_ext.

    While the model is loaded and run, file descriptor 1 is pointed at
    stderr so that anything the library writes to stdout does not mix with
    the JSON result that main() prints on stdout. The original descriptor
    is always restored, even when loading or inference raises.

    Args:
        input_path: Path of the image to recognize.

    Returns:
        The recognized text blocks joined with newlines.

    Raises:
        ImportError: ``paddleocr`` (with PaddleOCRVL) or the local ``gpu``
            helper is missing.
        Exception: whatever the model raises while loading or predicting.
    """
    global _paddleocr_vl_instance
    os.environ["PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK"] = "True"

    # Flush Python's own stdout buffer first so text queued before the
    # redirect is not written to stderr by a later flush.
    sys.stdout.flush()
    stdout_fd = os.dup(1)

    try:
        # Inside the try so the saved descriptor is released if this fails.
        os.dup2(2, 1)

        if _paddleocr_vl_instance is None:
            emit_progress(15, "Loading model")
            from paddleocr import PaddleOCRVL
            from gpu import gpu_available

            device = "gpu" if gpu_available() else "cpu"
            _paddleocr_vl_instance = PaddleOCRVL(device=device)

        emit_progress(30, "Scanning")
        output = _paddleocr_vl_instance.predict(input_path)
        emit_progress(70, "Extracting text")

        text_parts = []
        for res in output:
            if hasattr(res, "parsing_res_list"):
                # NOTE(review): assumes each block is dict-like with a
                # "block_content" key -- confirm against the installed
                # PaddleOCR-VL result type.
                for block in res.parsing_res_list:
                    content = block.get("block_content", "")
                    if content:
                        text_parts.append(content)
            elif hasattr(res, "rec_text"):
                text_parts.append(res.rec_text)
            # Also try the json-based extraction as fallback
            elif hasattr(res, "json") and isinstance(res.json, dict):
                inner = res.json.get("res", {})
                rec_texts = inner.get("rec_texts", [])
                text_parts.extend(rec_texts)

        text = "\n".join(text_parts)
    finally:
        try:
            # Push out anything Python buffered during the redirect while
            # fd 1 still points at stderr; otherwise it would be flushed
            # after the restore and leak into the real stdout.
            sys.stdout.flush()
        finally:
            os.dup2(stdout_fd, 1)
            os.close(stdout_fd)

    return text
|
|
|
|
|
|
def _emit_warning(message):
    """Write a ``{"warning": ...}`` JSON line to stderr (stdout carries the result)."""
    print(json.dumps({"warning": message}), file=sys.stderr, flush=True)


def _exit_with_error(message):
    """Print a ``{"success": false, "error": ...}`` result on stdout and exit with status 1."""
    print(json.dumps({"success": False, "error": message}))
    sys.exit(1)


def _run_tesseract_or_exit(input_path, language, was_auto, missing_message):
    """Run Tesseract; if the binary is missing, fail the run with *missing_message*."""
    try:
        return run_tesseract(input_path, language, is_auto=was_auto)
    except FileNotFoundError:
        _exit_with_error(missing_message)


def main():
    """CLI entry point: ``<script> <image_path> [settings_json]``.

    Recognized settings keys:
        quality:  "fast" (Tesseract), "balanced" (PP-OCRv5) or "best"
                  (PaddleOCR-VL). When absent, the legacy "engine" key is
                  used: "tesseract" -> "fast", anything else -> "balanced".
        language: two-letter code, or "auto" (default) to detect it.
        enhance:  run ocr_preprocess.preprocess first (default True).

    Output: exactly one JSON object on stdout, either
    ``{"success": true, "text": ..., "engine": ...}`` or
    ``{"success": false, "error": ...}`` followed by exit status 1.
    Progress and warnings go to stderr as JSON lines.

    Fallbacks: "balanced" falls back to Tesseract when PP-OCRv5 fails for a
    reason other than a missing install; "best" falls back to PP-OCRv5 and
    then to Tesseract. The engine actually used is reported in "engine".
    """
    # Argument problems are reported through the same JSON channel as every
    # other failure instead of an uncaught traceback.
    if len(sys.argv) < 2:
        _exit_with_error("Missing input image path")
    input_path = sys.argv[1]

    try:
        settings = json.loads(sys.argv[2]) if len(sys.argv) > 2 else {}
    except ValueError as e:
        _exit_with_error(f"Invalid settings JSON: {e}")
    if not isinstance(settings, dict):
        _exit_with_error("Invalid settings JSON: expected an object")

    quality = settings.get("quality", None)
    language = settings.get("language", "auto")
    enhance = settings.get("enhance", True)

    # Backward compat: old "engine" param maps to quality
    if quality is None:
        engine = settings.get("engine", "tesseract")
        quality = "fast" if engine == "tesseract" else "balanced"

    preprocessed_path = None
    try:
        emit_progress(5, "Preparing")

        # Preprocessing (if enabled)
        if enhance:
            emit_progress(8, "Enhancing image")
            try:
                from ocr_preprocess import preprocess
                preprocessed_path = input_path + "_enhanced.png"
                preprocess(input_path, preprocessed_path)
                input_path = preprocessed_path
            except Exception as e:
                # OCR continues on the original image. preprocessed_path is
                # deliberately left set when preprocess() itself failed, so
                # the finally block below removes any partial output file.
                _emit_warning(f"Enhancement skipped: {e}")

        # Language auto-detection
        was_auto = language == "auto"
        if was_auto:
            emit_progress(10, "Detecting language")
            language = auto_detect_language(input_path)

        engines_unavailable = (
            "OCR engines unavailable: PaddleOCR failed and Tesseract is not installed"
        )

        # Route to engine based on quality tier
        if quality == "fast":
            text = _run_tesseract_or_exit(
                input_path, language, was_auto, "Tesseract is not installed"
            )
            engine_used = "tesseract"

        elif quality == "balanced":
            try:
                text = run_paddleocr_v5(input_path, language)
                engine_used = "paddleocr-v5"
            except ImportError as e:
                _exit_with_error(f"PaddleOCR is not installed: {e}")
            except Exception as e:
                _emit_warning(
                    f"PaddleOCR PP-OCRv5 failed ({type(e).__name__}: {e}), falling back to Tesseract"
                )
                emit_progress(25, "PaddleOCR failed, falling back to Tesseract")
                text = _run_tesseract_or_exit(
                    input_path, language, was_auto, engines_unavailable
                )
                engine_used = "tesseract (fallback from balanced)"

        elif quality == "best":
            try:
                text = run_paddleocr_vl(input_path)
                engine_used = "paddleocr-vl"
            except Exception as e:
                # A missing install and a runtime failure share the same
                # fallback chain; only the wording of the report differs.
                if isinstance(e, ImportError):
                    _emit_warning(f"PaddleOCR-VL not available ({e}), trying PP-OCRv5")
                    emit_progress(20, "VL model unavailable, trying PP-OCRv5")
                else:
                    _emit_warning(
                        f"PaddleOCR-VL failed ({type(e).__name__}: {e}), trying PP-OCRv5"
                    )
                    emit_progress(20, "VL model failed, trying PP-OCRv5")
                try:
                    text = run_paddleocr_v5(input_path, language)
                    engine_used = "paddleocr-v5 (fallback from best)"
                except Exception as e2:
                    _emit_warning(
                        f"PP-OCRv5 also failed ({type(e2).__name__}: {e2}), falling back to Tesseract"
                    )
                    emit_progress(25, "PP-OCRv5 failed, falling back to Tesseract")
                    text = _run_tesseract_or_exit(
                        input_path, language, was_auto, engines_unavailable
                    )
                    engine_used = "tesseract (fallback from best)"

        else:
            _exit_with_error(f"Unknown quality: {quality}")

        emit_progress(95, "Done")
        print(json.dumps({"success": True, "text": text, "engine": engine_used}))

    except Exception as e:
        # SystemExit raised by _exit_with_error is not an Exception subclass,
        # so it passes straight through to the finally block.
        print(json.dumps({"success": False, "error": str(e)}))
        sys.exit(1)
    finally:
        # Clean up preprocessed temp file
        if preprocessed_path:
            try:
                os.remove(preprocessed_path)
            except OSError:
                pass
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|