NER: Scan JPG NER JSON
Here is an **all-in-one Python script** that:
1. reads scanned images (JPG/PNG),
2. runs OCR (Tesseract via `pytesseract`),
3. runs **NER via Ollama** (a local model, e.g. `llama3`/`llama3.1`/`mistral`),
4. emits clean **JSON** output.
It includes light **pre-processing** (grayscale, denoising, thresholding, light auto-deskew) so OCR is more stable on scanned documents.
---
## Prerequisites
- **Tesseract OCR** installed:
  * Ubuntu/Debian:
    ```bash
    sudo apt-get update
    sudo apt-get install -y tesseract-ocr tesseract-ocr-ind tesseract-ocr-eng
    ```
- **Ollama** running locally (default `http://localhost:11434`) with the model already pulled:
  ```bash
  ollama pull llama3.1
  ```
- Python deps:
  ```bash
  pip install pillow pytesseract opencv-python requests python-dotenv
  ```
Optional: create a `.env` file to override the endpoint/model:
```
OLLAMA_BASE=http://localhost:11434
OLLAMA_MODEL=llama3.1
OCR_LANG=ind+eng
```
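If you want to verify that `pytesseract` can see the Tesseract binary and the language packs before running the pipeline, a quick check like this (assuming the installs above) can help:

```python
import pytesseract

# Quick sanity check: the language packs installed above should appear here
print(pytesseract.get_languages(config=""))   # expect 'ind' and 'eng' in the list
print(pytesseract.get_tesseract_version())
```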
---
## Script: `ocr_ner_ollama.py`
> Run:
> `python ocr_ner_ollama.py input1.jpg input2.png --lang ind+eng --model llama3.1 --out result.json`
```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import base64
import io
import json
import os
import re
import sys
from dataclasses import asdict, dataclass
from typing import Any, Dict, List, Optional, Tuple

import cv2
import numpy as np
import pytesseract
import requests
from PIL import Image
from dotenv import load_dotenv
# =========================
# Data structures
# =========================
@dataclass
class OCRWord:
    text: str
    conf: float
    left: int
    top: int
    width: int
    height: int
    line_num: int
    par_num: int


@dataclass
class OCROutput:
    text: str
    words: List[OCRWord]
    image_size: Tuple[int, int]  # (width, height)


@dataclass
class Entity:
    text: str
    label: str
    start: int
    end: int
    confidence: Optional[float] = None  # model may not provide it; keep optional


@dataclass
class NEROutput:
    entities: List[Entity]
    notes: str = ""
# =========================
# Image preprocessing
# =========================
def auto_deskew(image: np.ndarray) -> np.ndarray:
    """
    Estimate skew via minAreaRect on edge pixels and rotate to correct it.
    Returns the rotated image.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Canny edges for orientation
    edges = cv2.Canny(gray, 50, 150)
    coords = np.column_stack(np.where(edges > 0))
    if coords.size == 0:
        return image
    rect = cv2.minAreaRect(coords.astype(np.float32))
    angle = rect[-1]
    # OpenCV returns the angle in [-90, 0); convert
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC,
                             borderMode=cv2.BORDER_REPLICATE)
    return rotated


def preprocess_for_ocr(image_path: str) -> np.ndarray:
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Cannot open image: {image_path}")

    # Resize up (helps small scans)
    scale = 1.5 if max(img.shape[:2]) < 1500 else 1.0
    if scale != 1.0:
        img = cv2.resize(img, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)

    # Deskew
    img = auto_deskew(img)

    # Grayscale + denoise + contrast (CLAHE) + adaptive threshold
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.fastNlMeansDenoising(gray, h=10)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    gray = clahe.apply(gray)
    thr = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
                                cv2.THRESH_BINARY, 31, 10)

    # Morphological opening to remove small noise
    kernel = np.ones((1, 1), np.uint8)
    thr = cv2.morphologyEx(thr, cv2.MORPH_OPEN, kernel)
    return thr
# =========================
# OCR via Tesseract
# =========================
def run_ocr(processed_img: np.ndarray, lang: str = "ind+eng") -> OCROutput:
    pil = Image.fromarray(processed_img)
    # Use detailed data for bounding boxes
    data = pytesseract.image_to_data(pil, lang=lang, output_type=pytesseract.Output.DICT)
    words: List[OCRWord] = []
    text_parts = []

    n = len(data["text"])
    for i in range(n):
        txt = data["text"][i].strip()
        if not txt:
            continue
        conf = float(data["conf"][i]) if str(data["conf"][i]) != "-1" else -1.0
        left, top, w, h = data["left"][i], data["top"][i], data["width"][i], data["height"][i]
        line_num = int(data["line_num"][i]) if "line_num" in data else 0
        par_num = int(data["par_num"][i]) if "par_num" in data else 0
        words.append(OCRWord(txt, conf, left, top, w, h, line_num, par_num))
        text_parts.append(txt)

    full_text = " ".join(text_parts)
    h, w = processed_img.shape[:2]
    return OCROutput(text=full_text, words=words, image_size=(w, h))
# =========================
# NER via Ollama
# =========================
DEFAULT_ENTITY_SCHEMA = [
    "PERSON", "ORG", "LOC", "GPE", "DATE", "TIME", "MONEY", "PERCENT",
    "EMAIL", "PHONE", "URL", "EVENT", "LAW", "PRODUCT", "NORP"
]

SYSTEM_PROMPT = (
    "You are an information extraction engine. "
    "Extract named entities from the provided text and return ONLY valid JSON. "
    "Use the requested schema. Do not include explanations."
)


def build_ner_prompt(doc_text: str, labels: List[str]) -> str:
    labels_str = ", ".join(labels)
    # Keep the prompt concise; the document may be long
    return (
        f"Text (Indonesian/English mix possible):\n\"\"\"\n{doc_text}\n\"\"\"\n\n"
        f"Extract entities with labels in this set: {labels_str}.\n"
        f"Rules:\n"
        f"- Output JSON ONLY with keys: entities, notes.\n"
        f"- Each entity is an object with keys: \"text\", \"label\", \"start\", \"end\", \"confidence\".\n"
        f"- Use character offsets on the given Text (0-based, inclusive start, exclusive end).\n"
        f"- If unsure, set confidence conservatively (0.0–1.0). If not provided by reasoning, use 0.5.\n"
        f"- Keep notes short (e.g., detection caveats).\n"
    )


def call_ollama_ner(text: str,
                    model: str = "llama3.1",
                    base_url: str = "http://localhost:11434",
                    labels: Optional[List[str]] = None,
                    max_chars: int = 8000) -> NEROutput:
    if labels is None:
        labels = DEFAULT_ENTITY_SCHEMA

    # Truncate overly long text to fit the context window
    doc = text[:max_chars]

    payload = {
        "model": model,
        "options": {"temperature": 0.1},
        # If your Ollama supports JSON mode, include format: "json"
        "format": "json",
        "system": SYSTEM_PROMPT,
        "prompt": build_ner_prompt(doc, labels),
        "stream": False
    }

    resp = requests.post(f"{base_url}/api/generate", json=payload, timeout=120)
    resp.raise_for_status()

    # Ollama returns {"response": "...json..."}; parse that JSON string
    content = resp.json().get("response", "").strip()

    # Be robust to accidental leading/trailing text
    first_brace = content.find("{")
    last_brace = content.rfind("}")
    if first_brace == -1 or last_brace == -1:
        raise ValueError(f"Ollama did not return JSON: {content[:200]}")

    json_str = content[first_brace:last_brace + 1]
    parsed = json.loads(json_str)

    ents: List[Entity] = []
    for e in parsed.get("entities", []):
        ents.append(
            Entity(
                text=e.get("text", ""),
                label=e.get("label", ""),
                start=int(e.get("start", -1)),
                end=int(e.get("end", -1)),
                confidence=float(e.get("confidence", 0.5)) if e.get("confidence", None) is not None else 0.5
            )
        )
    notes = parsed.get("notes", "")
    return NEROutput(entities=ents, notes=notes)
# =========================
# Utility: map entities to approximate boxes (optional)
# =========================
def map_entities_to_boxes(ocr: OCROutput, entities: List[Entity]) -> List[Dict[str, Any]]:
    """
    Approximate bounding boxes per entity by matching entity text to the concatenated OCR text.
    This is heuristic: we align by char offsets, then collect words overlapping that span.
    """
    # Build cumulative char spans for each word in the concatenated text
    words = ocr.words
    joined = " ".join([w.text for w in words])
    # Precompute positions of each word in 'joined'
    idx = 0
    word_spans = []
    for w in words:
        # Find w.text starting at idx or later
        m = re.search(r'\b' + re.escape(w.text) + r'\b', joined[idx:])
        if not m:
            # Fallback: raw find
            m2 = joined.find(w.text, idx)
            if m2 == -1:
                # Skip if the word cannot be mapped
                word_spans.append((None, None))
                continue
            start = m2
            end = m2 + len(w.text)
        else:
            start = idx + m.start()
            end = idx + m.end()
        word_spans.append((start, end))
        idx = end + 1  # account for the space separator

    mapped = []
    for ent in entities:
        if ent.start < 0 or ent.end <= ent.start:
            mapped.append({"text": ent.text, "label": ent.label, "bbox": None,
                           "page": 1, "confidence": ent.confidence})
            continue
        # Collect words whose spans overlap [start, end)
        boxes = []
        for w, ws in zip(words, word_spans):
            if ws[0] is None:
                continue
            s, e = ws
            if not (e <= ent.start or s >= ent.end):  # overlap
                boxes.append((w.left, w.top, w.width, w.height))
        if not boxes:
            mapped.append({"text": ent.text, "label": ent.label, "bbox": None,
                           "page": 1, "confidence": ent.confidence})
        else:
            # Merge into one bounding box
            xs = [b[0] for b in boxes]
            ys = [b[1] for b in boxes]
            xe = [b[0] + b[2] for b in boxes]
            ye = [b[1] + b[3] for b in boxes]
            bbox = [int(min(xs)), int(min(ys)),
                    int(max(xe) - min(xs)), int(max(ye) - min(ys))]
            mapped.append({"text": ent.text, "label": ent.label, "bbox": bbox,
                           "page": 1, "confidence": ent.confidence})
    return mapped
# =========================
# Main
# =========================
def main():
    load_dotenv()

    parser = argparse.ArgumentParser(description="OCR (Tesseract) + NER (Ollama) pipeline with JSON output.")
    parser.add_argument("images", nargs="+", help="Path(s) to input JPG/PNG scans.")
    parser.add_argument("--lang", default=os.getenv("OCR_LANG", "ind+eng"),
                        help="Tesseract lang (e.g., ind, eng, ind+eng).")
    parser.add_argument("--model", default=os.getenv("OLLAMA_MODEL", "llama3.1"),
                        help="Ollama model name.")
    parser.add_argument("--ollama", default=os.getenv("OLLAMA_BASE", "http://localhost:11434"),
                        help="Ollama base URL.")
    parser.add_argument("--labels", default=",".join(DEFAULT_ENTITY_SCHEMA),
                        help="Comma-separated labels to extract (override default).")
    parser.add_argument("--out", default="ocr_ner_output.json", help="Output JSON file.")
    parser.add_argument("--max-chars", type=int, default=8000,
                        help="Max characters from OCR text to send to NER.")
    parser.add_argument("--map-bbox", action="store_true",
                        help="Approximate entity bounding boxes from OCR words.")
    args = parser.parse_args()

    results = []
    labels = [s.strip() for s in args.labels.split(",") if s.strip()]

    for path in args.images:
        processed = preprocess_for_ocr(path)
        ocr = run_ocr(processed, lang=args.lang)

        try:
            ner = call_ollama_ner(ocr.text, model=args.model, base_url=args.ollama,
                                  labels=labels, max_chars=args.max_chars)
        except Exception as e:
            # If NER fails, still return the OCR result
            ner = NEROutput(entities=[], notes=f"NER error: {e}")

        item = {
            "file": os.path.basename(path),
            "image_size": {"width": ocr.image_size[0], "height": ocr.image_size[1]},
            "ocr": {
                "text": ocr.text,
                "num_words": len(ocr.words),
                "avg_conf": float(np.mean([w.conf for w in ocr.words if w.conf >= 0])) if ocr.words else None,
            },
            "ner": {
                "entities": [asdict(e) for e in ner.entities],
                "notes": ner.notes
            }
        }

        if args.map_bbox and ner.entities:
            mapped = map_entities_to_boxes(ocr, ner.entities)
            item["ner"]["entities_with_bbox"] = mapped

        results.append(item)

    with open(args.out, "w", encoding="utf-8") as f:
        json.dump({"results": results}, f, ensure_ascii=False, indent=2)

    print(json.dumps({"results": results}, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
```
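Before wiring the NER step into the full pipeline, you can sanity-check Ollama's `/api/generate` endpoint and its JSON mode with a minimal request. The sketch below assumes Ollama is running on `localhost:11434` and that `llama3.1` has been pulled; the prompt text and label set are illustrative only.

```python
import json
import requests

OLLAMA_BASE = "http://localhost:11434"  # assumed default endpoint
MODEL = "llama3.1"                      # assumed to be pulled already

payload = {
    "model": MODEL,
    "format": "json",          # ask Ollama to constrain the output to JSON
    "stream": False,
    "options": {"temperature": 0.1},
    "system": "Return ONLY valid JSON with a key 'entities'.",
    "prompt": (
        "Extract PERSON, ORG and MONEY entities from: "
        "'Budi membeli laptop di Toko Maju Jaya seharga Rp 5.000.000.' "
        "Respond as {\"entities\": [{\"text\": ..., \"label\": ...}]}"
    ),
}

resp = requests.post(f"{OLLAMA_BASE}/api/generate", json=payload, timeout=120)
resp.raise_for_status()

# Ollama wraps the model output in a "response" field; here that field is itself a JSON string
raw = resp.json()["response"]
print(json.dumps(json.loads(raw), ensure_ascii=False, indent=2))
```

If this prints a well-formed `entities` list, `call_ollama_ner` in the script should work against the same endpoint and model.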
---
## Usage Examples
```bash
# 1) Single file, default language ind+eng, model llama3.1
python ocr_ner_ollama.py scan_ktp.jpg

# 2) Multiple files, with bbox mapping, custom output file
python ocr_ner_ollama.py doc1.png doc2.jpg --map-bbox --out hasil.json

# 3) Different model + custom endpoint
python ocr_ner_ollama.py nota.png --model mistral --ollama http://127.0.0.1:11434
```
## Sample JSON Output (abridged)
```json
{
  "results": [
    {
      "file": "nota.png",
      "image_size": { "width": 1754, "height": 1240 },
      "ocr": {
        "text": "TOKO MAJU JAYA ... Total: Rp 125.000 ...",
        "num_words": 47,
        "avg_conf": 86.2
      },
      "ner": {
        "entities": [
          { "text": "TOKO MAJU JAYA", "label": "ORG", "start": 0, "end": 14, "confidence": 0.86 },
          { "text": "Rp 125.000", "label": "MONEY", "start": 35, "end": 45, "confidence": 0.78 }
        ],
        "notes": "Currency inferred from 'Rp' token."
      }
    }
  ]
}
```
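If you want to consume this output downstream (for example, to push the entities into another system), the file can be read back directly. A minimal sketch, assuming the pipeline was run with `--out result.json`:

```python
import json

# Assumes the pipeline was run with: python ocr_ner_ollama.py ... --out result.json
with open("result.json", encoding="utf-8") as f:
    data = json.load(f)

for doc in data["results"]:
    print(f"{doc['file']}: {doc['ocr']['num_words']} words, avg conf {doc['ocr']['avg_conf']}")
    for ent in doc["ner"]["entities"]:
        print(f"  [{ent['label']}] {ent['text']} ({ent['start']}:{ent['end']})")
```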
---
## Notes & Tips
- If the OCR output is messy, try:
  * Rescanning at ≥300 DPI.
  * Varying the language: `--lang ind`, `--lang ind+eng`.
  * Switching thresholding to Otsu: `cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)` (see the sketch after this list).
- For **long documents**, limit `--max-chars` so the prompt does not get too long.
- You can adjust the label schema via `--labels`, e.g. `--labels "PERSON,ORG,LOC,DATE,EMAIL,PHONE"`.
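For the Otsu tip above, the swap in `preprocess_for_ocr` is roughly the following (a sketch; `gray` refers to the denoised, CLAHE-adjusted image from the script):

```python
import cv2

def binarize_otsu(gray):
    """Global Otsu threshold as an alternative to cv2.adaptiveThreshold."""
    # The threshold value 0 is ignored when THRESH_OTSU is set; OpenCV picks it automatically.
    _, thr = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return thr
```

Otsu generally works well on evenly lit scans, while the adaptive threshold in the script copes better with uneven lighting or shadows.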
Need a version that also saves **per-word results (bbox)** to the JSON, or direct integration with **BPMN/PM4Py**? Let me know and I can tweak it to fit your pipeline.