"""API routes for permit upload, metadata extraction and archive classification.

Routes registered on the ``api`` blueprint:

    GET     /info                  -- application name
    POST    /permits/upload        -- upload a document: OCR, metadata, archive status
    GET     /permits               -- list all permits (newest first)
    GET     /permits/search?q=...  -- substring search across metadata fields
    GET     /permits/stats         -- counts per permit type and archive status
    GET     /permits/<permit_id>   -- full detail of one permit
    DELETE  /permits/<permit_id>   -- delete one permit

OCR and metadata extraction go through the Druppie SDK (``vision``/``ocr``
and ``llm``/``chat``).
"""

import base64
import functools
import json
import logging
import operator
from collections import Counter
from contextlib import contextmanager
from datetime import date

from flask import Blueprint, jsonify, request
from druppie_sdk import DruppieClient

from app.database import get_db
from app.models import Permit

logger = logging.getLogger(__name__)

api = Blueprint("api", __name__)
druppie = DruppieClient()

ALLOWED_EXTENSIONS = {"png", "jpg", "jpeg", "gif", "bmp", "tiff", "webp", "pdf"}
MAX_FILE_SIZE = 25 * 1024 * 1024

# MIME type per allowed extension, used to build the data URI sent to OCR.
_MIME_TYPES = {
    "jpg": "image/jpeg",
    "jpeg": "image/jpeg",
    "png": "image/png",
    "gif": "image/gif",
    "bmp": "image/bmp",
    "tiff": "image/tiff",
    "webp": "image/webp",
    "pdf": "application/pdf",
}

# Selectielijst waterschappen 2012 — simplified lookup
SELECTIELIJST = {
    "watervergunning_lozing": {"nominatie": "vernietigen", "jaren": 20},
    "watervergunning_onttrekking": {"nominatie": "vernietigen", "jaren": 20},
    "keurvergunning": {"nominatie": "vernietigen", "jaren": 10},
    "omgevingsvergunning": {"nominatie": "bewaren", "jaren": None},
    "lozingsvergunning": {"nominatie": "vernietigen", "jaren": 20},
    "onttrekkingsvergunning": {"nominatie": "vernietigen", "jaren": 15},
    "projectplan": {"nominatie": "bewaren", "jaren": None},
    "peilbesluit": {"nominatie": "bewaren", "jaren": None},
    "leggerwijziging": {"nominatie": "bewaren", "jaren": None},
}

METADATA_EXTRACTION_PROMPT = """Analyseer het volgende vergunningdocument en extraheer de metadata.
Antwoord ALLEEN met een JSON object met deze velden (laat leeg als niet gevonden):

{
  "permit_number": "vergunningnummer (K-xxxx, L-xxxx, etc.)",
  "applicant_name": "naam aanvrager",
  "permit_holder_name": "naam vergunninghouder",
  "issuer_name": "naam uitgever/verlener",
  "location": "locatie/adres",
  "issue_date": "uitgiftedatum (YYYY-MM-DD)",
  "expiry_date": "geldigheidsdatum (YYYY-MM-DD)",
  "applicable_law": "toepasselijke wet of regeling",
  "work_type": "type werk/activiteit",
  "water_type": "type oppervlaktewater",
  "embankment_type": "type waterkering",
  "permit_type": "type vergunning (watervergunning_lozing, keurvergunning, etc.)",
  "source_system": "bronsysteem indien herkenbaar"
}

Document tekst:
"""


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


@contextmanager
def _db_session():
    """Yield a session obtained from ``get_db`` and finalise it afterwards.

    A bare ``next(get_db())`` drops the generator straight away, so whatever
    cleanup it performs cannot run once the request's work is done. Keeping
    the generator alive and closing it on exit makes the cleanup
    deterministic.

    NOTE(review): assumes ``get_db`` is a generator that releases the session
    when it is closed -- confirm against ``app.database``.
    """
    session_gen = get_db()
    try:
        yield next(session_gen)
    finally:
        close = getattr(session_gen, "close", None)
        if close is not None:
            close()


def _allowed_file(fn):
    """Return True when *fn* has an extension listed in ALLOWED_EXTENSIONS."""
    return "." in fn and fn.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS


def _add_years(day, years):
    """Return *day* shifted forward by *years* calendar years.

    A 29 February start date has no counterpart in a non-leap target year;
    it is moved to 1 March so a retention period is never cut short.

    Raises:
        ValueError: if the resulting year is outside the range ``date`` supports.
    """
    target_year = day.year + years
    try:
        return day.replace(year=target_year)
    except ValueError:
        if day.month == 2 and day.day == 29:
            return day.replace(year=target_year, month=3, day=1)
        raise


def _compute_archive_status(permit_type, issue_date_str):
    """Compute archive nomination, retention, and status per selectielijst.

    Args:
        permit_type: Permit type label; it is trimmed, lower-cased and spaces
            become underscores before the SELECTIELIJST lookup. May be None.
        issue_date_str: Issue date as ``YYYY-MM-DD`` string, or None.

    Returns:
        Tuple ``(nominatie, jaren, status)``. ``status`` is a display string:
        permanent retention, "te vernietigen", retention until a concrete
        date, a generic "N jaar na uitgifte" text when the date is missing or
        unusable, or "onbekend" for unknown types.
    """
    key = (permit_type or "").strip().lower().replace(" ", "_")
    info = SELECTIELIJST.get(key, {"nominatie": "onbekend", "jaren": None})
    nominatie = info["nominatie"]
    jaren = info["jaren"]

    if nominatie == "bewaren":
        return nominatie, jaren, "te bewaren (oneindig)"
    if nominatie != "vernietigen" or not jaren:
        return nominatie, jaren, "onbekend"

    # Used whenever no concrete destroy date can be derived.
    generic_status = f"te bewaren ({jaren} jaar na uitgifte)"
    if not issue_date_str:
        return nominatie, jaren, generic_status

    try:
        destroy_date = _add_years(date.fromisoformat(issue_date_str), jaren)
    except (ValueError, OverflowError, TypeError):
        return nominatie, jaren, generic_status

    if date.today() >= destroy_date:
        return nominatie, jaren, "te vernietigen"
    return nominatie, jaren, f"te bewaren tot {destroy_date.isoformat()}"


def _parse_llm_json(raw_answer):
    """Extract a JSON object from an LLM answer; return ``{}`` on failure.

    Handles a plain JSON answer, an answer wrapped in a Markdown code fence
    (optionally tagged ``json``), and JSON surrounded by prose (outermost
    ``{...}`` span). Anything that does not decode to a dict is rejected so
    callers can safely use ``.get``.
    """
    candidate = raw_answer
    if "```" in candidate:
        candidate = candidate.split("```")[1]
        if candidate.startswith("json"):
            candidate = candidate[4:]
    candidate = candidate.strip()

    parsed = None
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        start, end = candidate.find("{"), candidate.rfind("}")
        if 0 <= start < end:
            try:
                parsed = json.loads(candidate[start:end + 1])
            except json.JSONDecodeError:
                parsed = None

    if not isinstance(parsed, dict):
        logger.warning("Failed to parse metadata JSON: %s", raw_answer[:200])
        return {}
    return parsed


def _clean_text(value):
    """Return *value* as a stripped string, or None when empty or not scalar."""
    if isinstance(value, bool):
        return None
    if isinstance(value, (int, float)):
        return str(value)
    if isinstance(value, str):
        return value.strip() or None
    return None


def _parse_iso_date(value, field):
    """Return *value* parsed as ``YYYY-MM-DD``, or None (logged) if unusable."""
    if not value:
        return None
    try:
        return date.fromisoformat(value)
    except (ValueError, TypeError):
        logger.warning("Ignoring unparseable %s from metadata: %r", field, value)
        return None


def _escape_like(term):
    """Escape LIKE wildcards in *term* so it is matched literally."""
    return term.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


def _iso_or_none(value):
    """Return ``value.isoformat()`` or None when *value* is falsy."""
    return value.isoformat() if value else None


def _permit_summary(p):
    """Serialise the list-view fields of permit *p* to a JSON-ready dict."""
    return {
        "id": p.id,
        "permit_number": p.permit_number,
        "applicant_name": p.applicant_name,
        "permit_type": p.permit_type,
        "location": p.location,
        "source_file": p.source_file,
        "source_system": p.source_system,
        "status": p.status,
        "archive_status": p.archive_status,
        "issue_date": _iso_or_none(p.issue_date),
        "upload_date": _iso_or_none(p.upload_date),
    }


def _permit_to_dict(p):
    """Serialise permit *p* with all detail fields (summary plus the rest)."""
    return {
        **_permit_summary(p),
        "permit_holder_name": p.permit_holder_name,
        "issuer_name": p.issuer_name,
        "expiry_date": _iso_or_none(p.expiry_date),
        "applicable_law": p.applicable_law,
        "work_type": p.work_type,
        "water_type": p.water_type,
        "embankment_type": p.embankment_type,
        "archive_nomination": p.archive_nomination,
        "retention_years": p.retention_years,
        "extracted_text": p.extracted_text,
        "error_message": p.error_message,
    }


def _apply_metadata(permit, meta, user_source_system):
    """Copy extracted metadata onto *permit*; user-supplied values win.

    ``permit_number`` and ``applicant_name`` keep any value already set from
    the upload form. ``source_system`` prefers the form value, then the LLM
    value, then the literal ``"upload"``.
    """
    permit.permit_number = permit.permit_number or _clean_text(meta.get("permit_number"))
    permit.applicant_name = permit.applicant_name or _clean_text(meta.get("applicant_name"))
    for field in (
        "permit_holder_name",
        "issuer_name",
        "location",
        "applicable_law",
        "work_type",
        "water_type",
        "embankment_type",
    ):
        setattr(permit, field, _clean_text(meta.get(field)))
    permit.permit_type = _clean_text(meta.get("permit_type")) or "onbekend"
    permit.source_system = (
        user_source_system or _clean_text(meta.get("source_system")) or "upload"
    )

    for field in ("issue_date", "expiry_date"):
        parsed = _parse_iso_date(meta.get(field), field)
        if parsed is not None:
            setattr(permit, field, parsed)


def _process_permit(permit, filename, file_bytes, user_source_system):
    """Run OCR, metadata extraction and archive classification for *permit*.

    Mutates *permit* in place and sets ``status`` to ``"processed"`` on
    success. Any exception from the SDK calls propagates to the caller.
    """
    ext = filename.rsplit(".", 1)[1].lower()
    mime = _MIME_TYPES.get(ext, "application/octet-stream")
    data_uri = f"data:{mime};base64,{base64.b64encode(file_bytes).decode()}"

    # Step 1: OCR — extract text from document
    ocr_result = druppie.call("vision", "ocr", {
        "image_source": data_uri,
        "prompt": "Extraheer alle tekst uit dit vergunningdocument. Bewaar de structuur.",
    })
    # ``or ""`` also covers an explicit None under the "text" key.
    extracted_text = ocr_result.get("text") or ""
    permit.extracted_text = extracted_text

    # Step 2: LLM — extract structured metadata from text
    meta_result = druppie.call("llm", "chat", {
        "prompt": METADATA_EXTRACTION_PROMPT + extracted_text[:4000],
        "system": "Je bent een metadata-extractie specialist voor Nederlandse watervergunningen. Antwoord ALLEEN met valid JSON.",
    })
    meta = _parse_llm_json(meta_result.get("answer") or "{}")
    _apply_metadata(permit, meta, user_source_system)

    # Step 3: Compute archive status (BR-01 to BR-06)
    nominatie, jaren, arch_status = _compute_archive_status(
        permit.permit_type,
        permit.issue_date.isoformat() if permit.issue_date else None,
    )
    permit.archive_nomination = nominatie
    permit.retention_years = jaren
    permit.archive_status = arch_status
    permit.status = "processed"


# ---------------------------------------------------------------------------
# Routes
# ---------------------------------------------------------------------------


@api.route("/info")
def info():
    """Return the configured application name."""
    from app.config import settings

    return jsonify(app_name=settings.app_name)


@api.route("/permits/upload", methods=["POST"])
def upload_permit():
    """Upload permit document — OCR + metadata extraction + archive classification.

    Expects multipart form data with a ``file`` part and optional
    ``permit_number``, ``applicant_name`` and ``source_system`` fields.
    Returns 400 for a missing, disallowed, oversized or empty file.
    Otherwise the permit is stored and returned as JSON; when processing
    fails the stored permit carries ``status == "error"`` and an
    ``error_message``.
    """
    if "file" not in request.files:
        return jsonify(error="Geen bestand geüpload."), 400
    upload = request.files["file"]
    if not upload.filename or not _allowed_file(upload.filename):
        return jsonify(error="Ongeldig bestandstype. Upload JPG, PNG of PDF."), 400
    # Read one byte past the limit: enough to detect an oversized upload
    # without buffering the whole thing.
    file_bytes = upload.read(MAX_FILE_SIZE + 1)
    if len(file_bytes) > MAX_FILE_SIZE:
        return jsonify(error="Bestand te groot (max 25 MB)."), 400
    if not file_bytes:
        return jsonify(error="Leeg bestand."), 400

    # Kept separately so a value recognised by the LLM can fill the gap when
    # the user did not supply one.
    user_source_system = request.form.get("source_system", "").strip()

    with _db_session() as db:
        permit = Permit(
            permit_number=request.form.get("permit_number", "").strip() or None,
            applicant_name=request.form.get("applicant_name", "").strip() or None,
            source_file=upload.filename,
            source_system=user_source_system or "upload",
            status="processing",
        )
        db.add(permit)
        db.commit()
        db.refresh(permit)

        try:
            _process_permit(permit, upload.filename, file_bytes, user_source_system)
        except Exception as exc:
            # Boundary handler: the failure is recorded on the permit instead
            # of failing the request, so the upload stays visible to the user.
            logger.exception("Processing failed for permit %s", permit.id)
            permit.status = "error"
            permit.error_message = str(exc)[:500]

        db.commit()
        db.refresh(permit)
        return jsonify(_permit_to_dict(permit))


@api.route("/permits")
def list_permits():
    """Return summaries of all permits, newest first."""
    with _db_session() as db:
        permits = db.query(Permit).order_by(Permit.created_at.desc()).all()
        return jsonify([_permit_summary(p) for p in permits])


@api.route("/permits/search")
def search_permits():
    """Search across all metadata fields (FR-01).

    Query parameter ``q`` is matched case-insensitively as a literal
    substring; an empty ``q`` returns every permit.
    """
    q = request.args.get("q", "").strip()
    with _db_session() as db:
        query = db.query(Permit)
        if q:
            pattern = f"%{_escape_like(q)}%"
            columns = (
                Permit.permit_number,
                Permit.applicant_name,
                Permit.permit_holder_name,
                Permit.location,
                Permit.permit_type,
                Permit.applicable_law,
                Permit.work_type,
                Permit.water_type,
                Permit.extracted_text,
            )
            query = query.filter(functools.reduce(
                operator.or_,
                (column.ilike(pattern, escape="\\") for column in columns),
            ))
        results = query.order_by(Permit.created_at.desc()).all()
        return jsonify([_permit_summary(p) for p in results])


@api.route("/permits/stats")
def permit_stats():
    """Dashboard stats: total count plus counts per type and archive status."""
    with _db_session() as db:
        # One query for just the two grouping columns, so the full rows
        # (including the extracted text) are not loaded.
        rows = db.query(Permit.permit_type, Permit.archive_status).all()
    by_type = Counter((permit_type or "onbekend").lower() for permit_type, _ in rows)
    by_archive = Counter(status or "onbekend" for _, status in rows)
    return jsonify(total=len(rows), by_type=dict(by_type), by_archive=dict(by_archive))


# NOTE(review): the rule had no URL variable although the view functions take
# ``permit_id``. The untyped converter is used because the type of
# ``Permit.id`` is not visible here -- tighten to ``<int:...>`` or
# ``<uuid:...>`` once confirmed.
@api.route("/permits/<permit_id>")
def get_permit(permit_id):
    """Return the full detail of one permit, or 404 when it does not exist."""
    with _db_session() as db:
        permit = db.query(Permit).filter(Permit.id == permit_id).first()
        if not permit:
            return jsonify(error="Niet gevonden"), 404
        return jsonify(_permit_to_dict(permit))


@api.route("/permits/<permit_id>", methods=["DELETE"])
def delete_permit(permit_id):
    """Delete one permit; returns ``{"ok": true}`` or 404 when not found."""
    with _db_session() as db:
        permit = db.query(Permit).filter(Permit.id == permit_id).first()
        if not permit:
            return jsonify(error="Niet gevonden"), 404
        db.delete(permit)
        db.commit()
        return jsonify(ok=True)