OpenWebUI: python knowledge PDF CLI API upload

From OnnoWiki
Jump to navigation Jump to search

owui_upload_kb.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Upload PDF ke Open WebUI Knowledge via API (tanpa GUI).

Fitur:
- Buat knowledge collection (jika belum ada)
- Upload banyak PDF (folder / list path)
- Tambah setiap file ke collection
- Anti-duplikat sederhana (skip jika server balas "Duplicate content")
- Uji query RAG terhadap collection (opsional)

Dok:
- Upload file: POST /api/v1/files/
- Tambah ke knowledge: POST /api/v1/knowledge/{id}/file/add
- Buat knowledge: POST /api/v1/knowledge/create
"""

import os
import sys
import time
import json
import glob
import argparse
import requests
from typing import List, Optional 

def api_headers(api_key: str, accept_json: bool = True):
    """Build the HTTP headers shared by every API request in this script.

    Args:
        api_key: API key sent as a ``Bearer`` token in ``Authorization``.
        accept_json: When truthy, also add ``Accept: application/json``.

    Returns:
        A new dict with the ``Authorization`` header and, optionally, the
        ``Accept`` header.
    """
    optional_headers = {"Accept": "application/json"} if accept_json else {}
    return {"Authorization": f"Bearer {api_key}", **optional_headers}

def create_knowledge(base_url: str, api_key: str, name: str, description: str = "") -> str:
    """Create a knowledge collection and return its id.

    Sends a JSON POST to ``{base_url}/api/v1/knowledge/create`` carrying the
    collection name and description.

    Args:
        base_url: Server base URL, without a trailing slash.
        api_key: API key used for the ``Authorization`` header.
        name: Name of the collection to create.
        description: Optional description of the collection.

    Returns:
        The value of the ``id`` field in the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If the response carries no (or an empty) ``id``.
    """
    request_headers = {**api_headers(api_key), "Content-Type": "application/json"}
    response = requests.post(
        f"{base_url}/api/v1/knowledge/create",
        headers=request_headers,
        json={"name": name, "description": description},
        timeout=120,
    )
    response.raise_for_status()

    body = response.json()
    knowledge_id = body.get("id")
    if knowledge_id:
        return knowledge_id
    raise RuntimeError(f"Gagal membuat knowledge: {body}")

def upload_file(base_url: str, api_key: str, file_path: str) -> Optional[str]:
    """Upload one file to ``{base_url}/api/v1/files/`` as multipart form data.

    The file is always sent with the ``application/pdf`` content type.

    Args:
        base_url: Server base URL, without a trailing slash.
        api_key: API key used for the ``Authorization`` header.
        file_path: Path of the file to upload.

    Returns:
        The ``id`` field of the JSON response, or ``None`` when the server
        answers with a status code of 400 or above (a warning is printed
        in that case instead of raising).
    """
    endpoint = f"{base_url}/api/v1/files/"
    with open(file_path, "rb") as pdf_stream:
        multipart = {"file": (os.path.basename(file_path), pdf_stream, "application/pdf")}
        response = requests.post(endpoint, headers=api_headers(api_key), files=multipart, timeout=600)

    if response.status_code < 400:
        # The returned id is the file_id later attached to the knowledge collection.
        return response.json().get("id")

    # Uploads commonly fail for unusual files, corrupted PDFs, and the like.
    print(f"[WARN] Upload gagal: {file_path} -> {response.status_code} {response.text}")
    return None

def add_file_to_knowledge(base_url: str, api_key: str, knowledge_id: str, file_id: str) -> bool:
    """Attach an already-uploaded file to a knowledge collection.

    Sends a JSON POST to ``{base_url}/api/v1/knowledge/{knowledge_id}/file/add``.

    Args:
        base_url: Server base URL, without a trailing slash.
        api_key: API key used for the ``Authorization`` header.
        knowledge_id: Id of the target knowledge collection.
        file_id: Id of the uploaded file.

    Returns:
        ``True`` when the request succeeded, ``False`` when the server
        replied 400 with "Duplicate content" in the response body.

    Raises:
        requests.HTTPError: For any other error status.
    """
    response = requests.post(
        f"{base_url}/api/v1/knowledge/{knowledge_id}/file/add",
        headers={**api_headers(api_key), "Content-Type": "application/json"},
        json={"file_id": file_id},
        timeout=120,
    )

    is_duplicate = response.status_code == 400 and "Duplicate content" in response.text
    if not is_duplicate:
        response.raise_for_status()
        return True

    # A 400 "Duplicate content" reply is treated as a skip rather than an error.
    print(f"[INFO] Duplikat terdeteksi, skip file_id={file_id}")
    return False

def find_or_create_kb(base_url: str, api_key: str, kb_name: str, kb_description: str = "") -> str:
    """Create a knowledge collection named ``kb_name`` and return its id.

    Despite the name, no lookup of existing collections happens here: the
    function always delegates to ``create_knowledge``. If you already know
    the UUID of an existing collection, use that UUID directly instead of
    calling this function.

    Args:
        base_url: Server base URL, without a trailing slash.
        api_key: API key used for the ``Authorization`` header.
        kb_name: Name of the collection to create.
        kb_description: Optional description of the collection.

    Returns:
        The id returned by ``create_knowledge``.
    """
    print(f"[STEP] Membuat knowledge '{kb_name}'")
    new_kb_id = create_knowledge(base_url, api_key, name=kb_name, description=kb_description)
    return new_kb_id

def collect_pdf_paths(input_path: str) -> List[str]:
    """Resolve ``input_path`` into a sorted list of PDF file paths.

    Args:
        input_path: Either a directory containing PDFs or a single file.

    Returns:
        For a directory: the sorted paths of the regular files directly
        inside it (non-recursive) whose extension is ``.pdf`` in any letter
        case. For a file: a one-element list holding ``input_path`` itself,
        whatever its extension.

    Raises:
        FileNotFoundError: If ``input_path`` is neither a directory nor a file.
    """
    if os.path.isdir(input_path):
        # Escape the directory part so names containing glob metacharacters
        # (e.g. "docs[2024]") are taken literally, and spell out the extension
        # per character because glob is case-sensitive on POSIX and would
        # otherwise miss "*.PDF". Non-recursive; use "**/*" plus
        # recursive=True if sub-folders should be included.
        pattern = os.path.join(glob.escape(input_path), "*.[pP][dD][fF]")
        # Drop directories that merely look like PDFs; opening them would fail.
        return sorted(path for path in glob.glob(pattern) if os.path.isfile(path))
    if os.path.isfile(input_path):
        return [input_path]
    raise FileNotFoundError(f"Path tidak ditemukan: {input_path}")

def rag_test_query(base_url: str, api_key: str, model: str, kb_id: str, user_query: str) -> str:
    """Run a single chat completion against a knowledge collection to test RAG.

    Sends a JSON POST to ``{base_url}/api/chat/completions`` with one user
    message and a ``files`` entry of type ``collection`` pointing at ``kb_id``.

    Args:
        base_url: Server base URL, without a trailing slash.
        api_key: API key used for the ``Authorization`` header.
        model: Name of the model that should answer.
        kb_id: Id of the knowledge collection to attach.
        user_query: The question to ask.

    Returns:
        The content of the first choice's message. If the response does not
        have that shape, the whole response is returned as pretty-printed
        JSON so the caller can still inspect it.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = f"{base_url}/api/chat/completions"
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": user_query}],
        "files": [{"type": "collection", "id": kb_id}],
    }
    r = requests.post(url, headers={**api_headers(api_key), "Content-Type": "application/json"}, json=payload, timeout=600)
    r.raise_for_status()
    data = r.json()
    # The response is expected to follow the OpenAI-compatible schema.
    try:
        return data["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        # Only shape mismatches are expected here: a missing key, an empty
        # choices list, or a non-dict/list node. Anything else should surface.
        return json.dumps(data, ensure_ascii=False, indent=2)

def main():
    """Command-line entry point.

    Steps:
        1. Create the knowledge collection.
        2. Collect the PDF paths from ``--input``.
        3. Upload each PDF and attach it to the collection.
        4. Optionally run a RAG test query against the collection.

    Exits with status 0 when no PDF is found. Files the server rejects as
    duplicates are reported separately and not counted as successfully added.
    """
    p = argparse.ArgumentParser(description="Upload PDF ke Open WebUI Knowledge (tanpa GUI).")
    p.add_argument("--base-url", required=True, help="Contoh: http://localhost:3000")
    p.add_argument("--api-key", required=True, help="API Key dari Settings > Account")
    p.add_argument("--kb-name", required=True, help="Nama knowledge (collection) yang akan dibuat")
    p.add_argument("--kb-desc", default="", help="Deskripsi knowledge")
    p.add_argument("--input", required=True, help="Path ke file PDF atau folder berisi PDF")
    p.add_argument("--model", default="llama3.1", help="Nama model untuk uji RAG (opsional)")
    p.add_argument("--test-query", default="", help="Jika diisi, lakukan uji query RAG ke collection")
    p.add_argument("--sleep-after-upload", type=int, default=3, help="Delay (detik) antar upload untuk memberi waktu proses  embedding")
    args = p.parse_args()

    # Normalise the base URL so the endpoint f-strings never produce "//".
    base_url = args.base_url.rstrip("/")
    api_key = args.api_key
    # time.sleep() raises ValueError on negative values; treat them as "no delay".
    delay = max(0, args.sleep_after_upload)

    # 1) Create the knowledge collection
    kb_id = find_or_create_kb(base_url, api_key, args.kb_name, args.kb_desc)
    print(f"[OK] Knowledge dibuat: {kb_id}")

    # 2) Collect the PDFs
    pdfs = collect_pdf_paths(args.input)
    if not pdfs:
        print("[WARN] Tidak ada PDF ditemukan.")
        sys.exit(0)
    print(f"[STEP] Menemukan {len(pdfs)} file PDF")

    # 3) Upload each file and attach it to the knowledge collection
    uploaded = 0
    duplicates = 0
    for path in pdfs:
        print(f"[STEP] Upload: {path}")
        file_id = upload_file(base_url, api_key, path)
        if not file_id:
            # upload_file already printed a warning; move on to the next file.
            continue
        print(f"[OK] File terupload, file_id={file_id} -> tambah ke knowledge")
        try:
            added = add_file_to_knowledge(base_url, api_key, kb_id, file_id)
        except requests.HTTPError as e:
            print(f"[ERR] Gagal tambah ke knowledge: {e.response.status_code} {e.response.text}")
        else:
            # add_file_to_knowledge returns False for a duplicate the server
            # skipped; do not count that as a successfully added file.
            if added:
                uploaded += 1
            else:
                duplicates += 1
        # Short pause so embedding jobs do not pile up (practical on CPU-only hosts).
        time.sleep(delay)

    print(f"[DONE] Selesai. Total file berhasil diproses: {uploaded}/{len(pdfs)}")
    if duplicates:
        print(f"[INFO] Dilewati karena duplikat: {duplicates}")

    # 4) RAG test (optional)
    if args.test_query:
        print(f"[TEST] Jalankan uji RAG model={args.model}")
        answer = rag_test_query(base_url, api_key, args.model, kb_id, args.test_query)
        print("\n=== JAWABAN RAG ===\n")
        print(answer)
        print("\n===================\n")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()




owui-pakai

# 0) Set variables (optional)
export OWUI_URL="http://localhost:3000"
export OWUI_KEY="sk-xxxxxx"

# 1) Run: create the KB + upload every PDF in the ./dokumen folder
python3 owui_upload_kb.py \
  --base-url "$OWUI_URL" \
  --api-key "$OWUI_KEY" \
  --kb-name "Dokumen Kampus" \
  --kb-desc "Koleksi PDF Peraturan & Panduan" \
  --input "./dokumen" \
  --model "llama3.1" \
  --test-query "Ringkas poin penting dari semua dokumen tentang akreditasi BAN-PT." 

# Or for a single file only:
python3 owui_upload_kb.py \
  --base-url "$OWUI_URL" \
  --api-key "$OWUI_KEY" \
  --kb-name "RIP ITTS 2025-2050" \
  --input "./RIP-ITTS-2025-2050.pdf" \
  --test-query "Apa visi utama dokumen ini?"