NER: Scan JPG NER JSON


Here is an **all-in-one Python script** that will:

1. read a scanned image (JPG/PNG),
2. run OCR (Tesseract via `pytesseract`),
3. run **NER via Ollama** (local model, e.g. `llama3`/`llama3.1`/`mistral`),
4. produce clean **JSON** output.

It also includes light **pre-processing** (grayscale, denoise, threshold, mild auto-deskew) so OCR is more stable on scanned documents.

---

# Prerequisites

* **Tesseract OCR** installed:
  * Ubuntu/Debian:

    ```bash
    sudo apt-get update
    sudo apt-get install -y tesseract-ocr tesseract-ocr-ind tesseract-ocr-eng
    ```
* **Ollama** installed and running, with at least one local model pulled, e.g.:

  ```bash
  ollama pull llama3.1
  ```
* Python dependencies:

  ```bash
  pip install pillow pytesseract opencv-python requests python-dotenv
  ```

Optional: create a `.env` file to override the endpoint/model:

```
OLLAMA_BASE=http://localhost:11434
OLLAMA_MODEL=llama3.1
OCR_LANG=ind+eng
```
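Before running anything, a quick sanity check of the toolchain can save debugging time. This is only an illustrative sketch (the file name `sanity_check.py` is made up); it assumes Ollama's standard `/api/tags` endpoint for listing local models and reads the same `.env` overrides as the main script:

```python
# sanity_check.py -- verify Tesseract and Ollama are reachable (illustrative)
import os

import pytesseract
import requests
from dotenv import load_dotenv

load_dotenv()  # picks up OLLAMA_BASE / OLLAMA_MODEL / OCR_LANG if a .env file exists
base = os.getenv("OLLAMA_BASE", "http://localhost:11434")

# Is the tesseract binary visible to pytesseract?
print("Tesseract version:", pytesseract.get_tesseract_version())

# Is the Ollama server up? GET /api/tags lists locally installed models.
tags = requests.get(f"{base}/api/tags", timeout=10).json()
print("Ollama models:", [m["name"] for m in tags.get("models", [])])
```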

---

# Script: `ocr_ner_ollama.py`

> Run:
> `python ocr_ner_ollama.py input1.jpg input2.png --lang ind+eng --model llama3.1 --out result.json`

```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import base64
import io
import json
import os
import re
import sys
from dataclasses import asdict, dataclass
from typing import List, Dict, Any, Tuple

import cv2
import numpy as np
import pytesseract
import requests
from PIL import Image
from dotenv import load_dotenv

# =========================
# Data structures
# =========================

@dataclass
class OCRWord:
   text: str
   conf: float
   left: int
   top: int
   width: int
   height: int
   line_num: int
   par_num: int

@dataclass
class OCROutput:
   text: str
   words: List[OCRWord]
   image_size: Tuple[int, int]  # (width, height)

@dataclass
class Entity:
   text: str
   label: str
   start: int
   end: int
   confidence: float = None  # model may not provide; keep optional

@dataclass
class NEROutput:
   entities: List[Entity]
   notes: str = ""


# =========================
# Image preprocessing
# =========================

def auto_deskew(image: np.ndarray) -> np.ndarray:
   """
   Estimate skew via minAreaRect on edges and rotate to correct.
   Returns rotated image.
   """
   gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
   # Canny edges for orientation
   edges = cv2.Canny(gray, 50, 150)
   coords = np.column_stack(np.where(edges > 0))
   if coords.size == 0:
       return image
   rect = cv2.minAreaRect(coords.astype(np.float32))
   angle = rect[-1]
   # OpenCV returns angle in [-90, 0); convert
   if angle < -45:
       angle = -(90 + angle)
   else:
       angle = -angle
   (h, w) = image.shape[:2]
   center = (w // 2, h // 2)
   M = cv2.getRotationMatrix2D(center, angle, 1.0)
   rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
   return rotated


def preprocess_for_ocr(image_path: str) -> np.ndarray:
   img = cv2.imread(image_path)
   if img is None:
       raise FileNotFoundError(f"Cannot open image: {image_path}")
   # Resize up (help small scans)
   scale = 1.5 if max(img.shape[:2]) < 1500 else 1.0
   if scale != 1.0:
       img = cv2.resize(img, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
   # Deskew
   img = auto_deskew(img)
   # Grayscale + denoise + contrast (CLAHE) + adaptive threshold
   gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
   gray = cv2.fastNlMeansDenoising(gray, h=10)
   clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
   gray = clahe.apply(gray)
   thr = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
                               cv2.THRESH_BINARY, 31, 10)
   # Morph open to remove small noise
   kernel = np.ones((1, 1), np.uint8)
   thr = cv2.morphologyEx(thr, cv2.MORPH_OPEN, kernel)
   return thr


# =========================
# OCR via Tesseract
# =========================

def run_ocr(processed_img: np.ndarray, lang: str = "ind+eng") -> OCROutput:
   pil = Image.fromarray(processed_img)
   # Use detailed data for bounding boxes
   data = pytesseract.image_to_data(pil, lang=lang, output_type=pytesseract.Output.DICT)
   words: List[OCRWord] = []
   text_parts = []
   n = len(data["text"])
   for i in range(n):
       txt = data["text"][i].strip()
       if not txt:
           continue
       conf = float(data["conf"][i]) if data["conf"][i] != '-1' else -1.0
       left, top, w, h = data["left"][i], data["top"][i], data["width"][i], data["height"][i]
       line_num = int(data.get("line_num", [0]*n)[i]) if "line_num" in data else 0
       par_num = int(data.get("par_num", [0]*n)[i]) if "par_num" in data else 0
       words.append(OCRWord(txt, conf, left, top, w, h, line_num, par_num))
       text_parts.append(txt)
   full_text = " ".join(text_parts)
   h, w = processed_img.shape[:2]
   return OCROutput(text=full_text, words=words, image_size=(w, h))


# =========================
# NER via Ollama
# =========================

DEFAULT_ENTITY_SCHEMA = [
    "PERSON", "ORG", "LOC", "GPE", "DATE", "TIME", "MONEY",
    "PERCENT", "EMAIL", "PHONE", "URL", "EVENT", "LAW", "PRODUCT", "NORP",
]

SYSTEM_PROMPT = (
    "You are an information extraction engine. "
    "Extract named entities from the provided text and return ONLY valid JSON. "
    "Use the requested schema. Do not include explanations."
)

def build_ner_prompt(doc_text: str, labels: List[str]) -> str:
    labels_str = ", ".join(labels)
    # Keep prompt concise; doc may be long
    return (
        f"Text (Indonesian/English mix possible):\n\"\"\"\n{doc_text}\n\"\"\"\n\n"
        f"Extract entities with labels in this set: {labels_str}.\n"
        f"Rules:\n"
        f"- Output JSON ONLY with keys: entities, notes.\n"
        f"- Each entity: {{\"text\", \"label\", \"start\", \"end\", \"confidence\"}}.\n"
        f"- Use character offsets on the given Text (0-based, inclusive start, exclusive end).\n"
        f"- If unsure, set confidence conservatively (0.0–1.0). If not provided by reasoning, use 0.5.\n"
        f"- Keep notes short (e.g., detection caveats).\n"
    )

def call_ollama_ner(text: str,
                   model: str = "llama3.1",
                   base_url: str = "http://localhost:11434",
                   labels: List[str] = None,
                   max_chars: int = 8000) -> NEROutput:
   if labels is None:
       labels = DEFAULT_ENTITY_SCHEMA
   # Truncate overly long text to fit context
   doc = text[:max_chars]
   payload = {
       "model": model,
       "options": {
           "temperature": 0.1
       },
       # If your Ollama supports JSON mode, include format:"json"
       "format": "json",
       "system": SYSTEM_PROMPT,
       "prompt": build_ner_prompt(doc, labels),
       "stream": False
   }
   resp = requests.post(f"{base_url}/api/generate", json=payload, timeout=120)
   resp.raise_for_status()
   # Ollama returns {"response": "...json..."}; parse that JSON string
   content = resp.json().get("response", "").strip()
   # Be robust to accidental leading/trailing text
   first_brace = content.find("{")
   last_brace = content.rfind("}")
   if first_brace == -1 or last_brace == -1:
       raise ValueError(f"Ollama did not return JSON: {content[:200]}")
   json_str = content[first_brace:last_brace+1]
   parsed = json.loads(json_str)
   ents: List[Entity] = []
   for e in parsed.get("entities", []):
       ents.append(
           Entity(
               text=e.get("text", ""),
               label=e.get("label", ""),
               start=int(e.get("start", -1)),
               end=int(e.get("end", -1)),
               confidence=float(e.get("confidence", 0.5)) if e.get("confidence", None) is not None else 0.5
           )
       )
   notes = parsed.get("notes", "")
   return NEROutput(entities=ents, notes=notes)


# =========================
# Utility: map entities to approximate boxes (optional)
# =========================

def map_entities_to_boxes(ocr: OCROutput, entities: List[Entity]) -> List[Dict[str, Any]]:
    """
    Approximate bounding boxes per entity by matching entity text to concatenated OCR text.
    This is heuristic: we align by char offsets then collect words overlapping that span.
    """
    # Build cumulative char spans for each word in the concatenated text
    words = ocr.words
    joined = " ".join([w.text for w in words])
    # Precompute positions of each word in 'joined'
    idx = 0
    word_spans = []
    for w in words:
        # find w.text starting at idx or later
        m = re.search(r'\b' + re.escape(w.text) + r'\b', joined[idx:])
        if not m:
            # fallback: raw find
            m2 = joined.find(w.text, idx)
            if m2 == -1:
                # skip if cannot map
                word_spans.append((None, None))
                continue
            start = m2
            end = m2 + len(w.text)
        else:
            start = idx + m.start()
            end = idx + m.end()
        word_spans.append((start, end))
        idx = end + 1  # account for space
    mapped = []
    for ent in entities:
        if ent.start < 0 or ent.end <= ent.start:
            mapped.append({"text": ent.text, "label": ent.label, "bbox": None, "page": 1, "confidence": ent.confidence})
            continue
        # collect words whose spans overlap [start, end)
        boxes = []
        for w, ws in zip(words, word_spans):
            if ws[0] is None:
                continue
            s, e = ws
            if not (e <= ent.start or s >= ent.end):  # overlap
                boxes.append((w.left, w.top, w.width, w.height))
        if not boxes:
            mapped.append({"text": ent.text, "label": ent.label, "bbox": None, "page": 1, "confidence": ent.confidence})
        else:
            # merge to one bbox
            xs = [b[0] for b in boxes]
            ys = [b[1] for b in boxes]
            xe = [b[0] + b[2] for b in boxes]
            ye = [b[1] + b[3] for b in boxes]
            bbox = [int(min(xs)), int(min(ys)), int(max(xe) - min(xs)), int(max(ye) - min(ys))]
            mapped.append({"text": ent.text, "label": ent.label, "bbox": bbox, "page": 1, "confidence": ent.confidence})
    return mapped


# =========================
# Main
# =========================

def main():
   load_dotenv()
   parser = argparse.ArgumentParser(description="OCR (Tesseract) + NER (Ollama) pipeline with JSON output.")
   parser.add_argument("images", nargs="+", help="Path(s) to input JPG/PNG scans.")
   parser.add_argument("--lang", default=os.getenv("OCR_LANG", "ind+eng"), help="Tesseract lang (e.g., ind, eng, ind+eng).")
   parser.add_argument("--model", default=os.getenv("OLLAMA_MODEL", "llama3.1"), help="Ollama model name.")
   parser.add_argument("--ollama", default=os.getenv("OLLAMA_BASE", "http://localhost:11434"), help="Ollama base URL.")
   parser.add_argument("--labels", default=",".join(DEFAULT_ENTITY_SCHEMA),
                       help="Comma-separated labels to extract (override default).")
   parser.add_argument("--out", default="ocr_ner_output.json", help="Output JSON file.")
   parser.add_argument("--max-chars", type=int, default=8000, help="Max characters from OCR text to send to NER.")
   parser.add_argument("--map-bbox", action="store_true", help="Approximate entity bounding boxes from OCR words.")
   args = parser.parse_args()
   results = []
   labels = [s.strip() for s in args.labels.split(",") if s.strip()]
   for path in args.images:
       processed = preprocess_for_ocr(path)
       ocr = run_ocr(processed, lang=args.lang)
       try:
           ner = call_ollama_ner(ocr.text, model=args.model, base_url=args.ollama,
                                 labels=labels, max_chars=args.max_chars)
       except Exception as e:
           # If NER fails, still return OCR
           ner = NEROutput(entities=[], notes=f"NER error: {e}")
       item = {
           "file": os.path.basename(path),
           "image_size": {"width": ocr.image_size[0], "height": ocr.image_size[1]},
           "ocr": {
               "text": ocr.text,
               "num_words": len(ocr.words),
               "avg_conf": float(np.mean([w.conf for w in ocr.words if w.conf >= 0])) if ocr.words else None,
           },
           "ner": {
               "entities": [asdict(e) for e in ner.entities],
               "notes": ner.notes
           }
       }
       if args.map_bbox and ner.entities:
           mapped = map_entities_to_boxes(ocr, ner.entities)
           item["ner"]["entities_with_bbox"] = mapped
       results.append(item)
   with open(args.out, "w", encoding="utf-8") as f:
       json.dump({"results": results}, f, ensure_ascii=False, indent=2)
   print(json.dumps({"results": results}, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()

```
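If you would rather call the building blocks from your own code than use the CLI, a minimal sketch (it assumes `ocr_ner_ollama.py` is importable from the current directory; the file names below are placeholders):

```python
# Reuse the pipeline functions directly instead of running the CLI (illustrative).
from ocr_ner_ollama import preprocess_for_ocr, run_ocr, call_ollama_ner

processed = preprocess_for_ocr("scan_ktp.jpg")     # OpenCV preprocessing
ocr = run_ocr(processed, lang="ind+eng")           # Tesseract OCR with word boxes
ner = call_ollama_ner(ocr.text, model="llama3.1")  # entity extraction via Ollama

for ent in ner.entities:
    print(ent.label, ent.text, ent.start, ent.end, ent.confidence)
```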

---

# Usage Examples

```bash
# 1) Single file, default language ind+eng, model llama3.1
python ocr_ner_ollama.py scan_ktp.jpg

# 2) Multiple files, with bbox mapping, custom output file
python ocr_ner_ollama.py doc1.png doc2.jpg --map-bbox --out hasil.json

# 3) Different model + custom endpoint
python ocr_ner_ollama.py nota.png --model mistral --ollama http://127.0.0.1:11434
```

# Example JSON Output (abridged)

```json
{

 "results": [
   {
     "file": "nota.png",
     "image_size": { "width": 1754, "height": 1240 },
     "ocr": {
       "text": "TOKO MAJU JAYA ... Total: Rp 125.000 ...",
       "num_words": 47,
       "avg_conf": 86.2
     },
     "ner": {
       "entities": [
         { "text": "TOKO MAJU JAYA", "label": "ORG", "start": 0, "end": 14, "confidence": 0.86 },
         { "text": "Rp 125.000", "label": "MONEY", "start": 35, "end": 45, "confidence": 0.78 }
       ],
       "notes": "Currency inferred from 'Rp' token."
     }
   }
 ]

}
```
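Consuming the JSON afterwards is straightforward. A small sketch, assuming the output was written to `result.json` with the structure shown above:

```python
import json

# Print every extracted entity per input file.
with open("result.json", encoding="utf-8") as f:
    data = json.load(f)

for item in data["results"]:
    print(item["file"], "-", item["ocr"]["num_words"], "words")
    for ent in item["ner"]["entities"]:
        print(f'  {ent["label"]:<8} {ent["text"]!r} (conf={ent.get("confidence")})')
```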

---

# Notes & Tips

* If the OCR output is messy, try:
  * Re-scanning at ≥300 DPI.
  * Different language settings: `--lang ind`, `--lang ind+eng`.
  * Switching the thresholding to Otsu: `cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)` (see the sketch after this list).
* For **long documents**, limit `--max-chars` so the prompt does not get too long.
* You can adjust the label schema via `--labels`, e.g. `--labels "PERSON,ORG,LOC,DATE,EMAIL,PHONE"`.
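For reference, a minimal sketch of the Otsu variant mentioned above; it would replace the `cv2.adaptiveThreshold` call inside `preprocess_for_ocr` (the denoise/CLAHE steps stay as they are; the helper name `binarize_otsu` is just illustrative):

```python
import cv2

def binarize_otsu(gray):
    """Binarize a grayscale scan with Otsu's global threshold instead of adaptive mean."""
    # cv2.threshold returns (chosen_threshold, binary_image); only the image is needed here.
    _, thr = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return thr
```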

Need a version that also writes **per-word results (bbox)** to the JSON, or direct integration with **BPMN/PM4Py**? Let me know and I'll tweak it to fit your pipeline.