first commit

Stefano Rossi 2025-07-10 01:43:01 +02:00
parent 5c5d88c92f
commit eb4f62c56d
Signed by: chadmin
GPG key ID: 9EFA2130646BC893
41 changed files with 3851 additions and 19 deletions

View file

@@ -0,0 +1,15 @@
FROM python:3.11-alpine
WORKDIR /app
COPY requirements.txt .
RUN apk add --no-cache build-base \
&& pip install --no-cache-dir -r requirements.txt \
&& apk del build-base
COPY app .
EXPOSE 8000
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

View file

@@ -0,0 +1,36 @@
def is_medical_query(message: str) -> bool:
"""
Check if the user message contains medical keywords. This function is case-insensitive.
:param message: The user message or any string to check.
:return: True if the message contains medical keywords, False otherwise.
"""
medical_keywords = [
"health",
"doctor",
"medicine",
"disease",
"symptom",
"treatment",
"salute",
"medico",
"malattia",
"sintomo",
"cura",
"sanità",
"santé",
"médecin",
"médicament",
"maladie",
"symptôme",
"traitement",
"gesundheit",
"arzt",
"medizin",
"krankheit",
"symptom",
"behandlung",
]
message_lower = message.lower()
return any(keyword in message_lower for keyword in medical_keywords)
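A quick way to sanity-check the keyword matcher, as a minimal sketch (the import path follows main.py below; the sample strings are illustrative):

from libs.check_medical import is_medical_query

# Multilingual keyword list: English and French terms both match
assert is_medical_query("What treatment options exist for Crohn's disease?")
assert is_medical_query("J'ai rendez-vous chez le médecin demain")
# A non-medical message does not match
assert not is_medical_query("Quel temps fait-il aujourd'hui ?")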

View file

@@ -0,0 +1,43 @@
import os
from mysql.connector import connect, Error
import logging
# Configure logging
logger = logging.getLogger(__name__)
def log_prompt_to_db(userid: str | None, ip: str, prompt: str, answer: str):
"""
Logs the user's prompt and the corresponding response to the database.
Args:
userid (str | None): User ID (optional, can be None).
ip (str): Client's IP address.
prompt (str): Full conversation history provided by the user.
answer (str): Response generated by the AI.
"""
try:
# Connect to the database using environment variables
connection = connect(
host=os.getenv("DB_HOST"),
port=int(os.getenv("DB_PORT", "3306")),
user=os.getenv("DB_USER"),
password=os.getenv("DB_PASSWORD"),
database=os.getenv("DB_NAME")
)
cursor = connection.cursor()
# SQL query to insert data
query = """
INSERT INTO user_prompts (userid, ip, prompt, answer)
VALUES (%s, %s, %s, %s)
"""
values = (userid, ip, prompt, answer)
cursor.execute(query, values)
# Commit the transaction and close resources
connection.commit()
cursor.close()
connection.close()
except Error as e:
logger.error(f"Error logging prompt to database: {e}")

View file

@@ -0,0 +1,36 @@
# libs/manage_languages.py
from langdetect import detect
from fastapi import HTTPException
import logging
logger = logging.getLogger(__name__)
def validate_language(language: str) -> None:
"""Validate the language parameter. Throws an HTTPException if the language is invalid."""
valid_languages = {"french", "italian", "english", "german", "auto"}
if language not in valid_languages:
raise HTTPException(
status_code=400,
detail="Invalid language. Must be one of: french, italian, english, german, or auto"
)
def detect_language(current_message: str) -> str:
"""Detect the language of the current message. Defaults to French if detection fails."""
try:
detected_lang = detect(current_message)
if detected_lang == "fr":
language = "french"
elif detected_lang == "it":
language = "italian"
elif detected_lang == "en":
language = "english"
elif detected_lang == "de":
language = "german"
else:
language = "french"
logger.info(f"Detected language: {language}")
return language
except Exception as e:
logger.error(f"Language detection failed: {str(e)}")
return "french"

View file

@@ -0,0 +1,14 @@
from typing import List, Optional, Literal
from pydantic import BaseModel, Field
class ChatMessage(BaseModel):
role: Literal["user", "coach"]
content: str
class ChatRequest(BaseModel):
messages: List[ChatMessage]
language: str = "auto"
temperature: float = 0.7
reasoning: bool = False
stream: bool = True
personality: str = "supportive"

View file

@@ -0,0 +1,192 @@
from llama_index.core.base.llms.types import ChatMessage as LlamaChatMessage
import logging
from libs.models import ChatMessage
from typing import List, Dict, Any, Optional, AsyncGenerator
import httpx
import json
import os
import asyncio
logger = logging.getLogger(__name__)
def format_system_prompt(system_prompt_template: str, language_prompts: dict, language: str,
retrieved_docs: str, is_medical: bool, personality: str = "supportive",
personality_prompts: dict = {}) -> str:
"""Formatta il prompt di sistema con il contenuto specifico della lingua, personalità e i documenti recuperati."""
language_prompt = language_prompts[language]["prompt"]
language_disclaimer = language_prompts[language]["disclaimer"]
language_constraint = "" if language == "auto" else language_prompts[language]["constraint"]
# Improve logging and personality handling
if personality not in personality_prompts:
logger.warning(f"Personality '{personality}' not found in prompts, using default empty prompt")
personality_prompt = ""
else:
personality_prompt = personality_prompts[personality]["prompt"]
logger.info(f"Using '{personality}' personality: {personality_prompts[personality]['description'][:50]}...")
logger.info(f"Formatting system prompt with language {language}, personality {personality}")
system_message_content = system_prompt_template.format(
language_prompt=language_prompt,
context=retrieved_docs,
language_disclaimer=language_disclaimer if is_medical else "",
personality_prompt=personality_prompt,
language_constraint=language_constraint
)
logger.debug(f"System message content: {system_message_content[:200]}...")
return system_message_content
async def perform_inference_streaming(
llm,
system_message: str,
history: List[ChatMessage],
current_message: str
) -> AsyncGenerator[str, None]:
"""Stream inference results from Ollama API"""
base_url = os.getenv("OLLAMA_BASE_URL", "http://ollama:11434")
# Prepare messages for Ollama API
messages = []
# Add system message
messages.append({
"role": "system",
"content": system_message
})
# Add history
for msg in history:
messages.append({
"role": "user" if msg.role == "user" else "assistant",
"content": msg.content
})
# Add current user message
messages.append({
"role": "user",
"content": current_message
})
# Prepare request payload
payload = {
"model": llm.model,
"messages": messages,
"stream": True,
"options": {
"temperature": llm.temperature
}
}
logger.debug(f"Sending streaming request to Ollama API: {base_url}/api/chat")
try:
async with httpx.AsyncClient() as client:
async with client.stream("POST", f"{base_url}/api/chat", json=payload, timeout=60.0) as response:
if response.status_code != 200:
error_detail = await response.aread()
logger.error(f"Error from Ollama API: {response.status_code}, {error_detail}")
yield f"Error: Failed to get response from language model (Status {response.status_code})"
return
# Variable to accumulate the full response
full_response = ""
# Process the streaming response
async for line in response.aiter_lines():
if not line.strip():
continue
# Each line should contain one JSON object (Ollama streams newline-delimited JSON)
try:
data = json.loads(line)
# Process message content if available
if 'message' in data and 'content' in data['message']:
content = data['message']['content']
full_response += content
yield content
# Check if this is the final message with done flag
if data.get('done', False):
logger.debug("Streaming response completed")
except json.JSONDecodeError as e:
logger.error(f"Failed to parse streaming response: {e}, chunk: {chunk}")
except Exception as e:
logger.error(f"Error during streaming inference: {str(e)}")
yield f"Error: {str(e)}"
# Return empty string at the end to signal completion
yield ""
def perform_inference(
llm,
system_message: str,
history: List[ChatMessage],
current_message: str,
stream: bool = False
) -> str:
"""Perform inference with the given LLM."""
if stream:
# This will be handled by the streaming endpoint
raise ValueError("Streaming not supported in synchronous inference")
# Prepare messages for the API
messages = []
# Add system message
messages.append({
"role": "system",
"content": system_message
})
# Add history
for msg in history:
messages.append({
"role": "user" if msg.role == "user" else "assistant",
"content": msg.content
})
# Add current user message
messages.append({
"role": "user",
"content": current_message
})
# For non-streaming, we'll use the httpx client directly to call Ollama API
base_url = os.getenv("OLLAMA_BASE_URL", "http://ollama:11434")
# Prepare request payload
payload = {
"model": llm.model,
"messages": messages,
"stream": False,
"options": {
"temperature": llm.temperature
}
}
logger.debug(f"Sending non-streaming request to Ollama API: {base_url}/api/chat")
try:
with httpx.Client(timeout=60.0) as client:
response = client.post(f"{base_url}/api/chat", json=payload)
if response.status_code != 200:
logger.error(f"Error from Ollama API: {response.status_code}, {response.text}")
return f"Error: Failed to get response from language model (Status {response.status_code})"
data = response.json()
if 'message' in data and 'content' in data['message']:
return data['message']['content']
else:
logger.error(f"Unexpected response format: {data}")
return "Error: Unexpected response format from language model"
except Exception as e:
logger.error(f"Error during non-streaming inference: {str(e)}")
return f"Error: {str(e)}"
def select_llm(llm, llm_reasoning, reasoning: bool):
"""Select the LLM model based on the reasoning flag."""
selected_llm = llm_reasoning if reasoning else llm
return selected_llm
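Driving perform_inference_streaming outside FastAPI, e.g. for a manual test; a minimal sketch assuming OLLAMA_BASE_URL points at a reachable Ollama instance. The stand-in object below is not the llama_index Ollama wrapper; the helper only reads .model and .temperature:

import asyncio
from types import SimpleNamespace
import libs.prompt_helper as prompt_helper

# Stand-in LLM: only .model and .temperature are read by the streaming helper
fake_llm = SimpleNamespace(model="llama3", temperature=0.7)

async def main():
    async for chunk in prompt_helper.perform_inference_streaming(
        fake_llm,
        system_message="You are a helpful coach.",
        history=[],  # prior ChatMessage objects would go here
        current_message="Explique-moi simplement la maladie de Crohn.",
    ):
        print(chunk, end="", flush=True)

asyncio.run(main())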

View file

@@ -0,0 +1,79 @@
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
import logging
logger = logging.getLogger(__name__)
def ensure_collection_exists(qdrant_client: QdrantClient, collection_name: str, vector_size: int) -> None:
"""Verify that the Qdrant collection exists, and create it if it does not."""
try:
if not qdrant_client.collection_exists(collection_name):
qdrant_client.create_collection(
collection_name=collection_name,
vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
)
logger.info(f"Created Qdrant collection '{collection_name}' with vector size {vector_size}")
else:
logger.info(f"Qdrant collection '{collection_name}' already exists")
except Exception as e:
logger.error(f"Failed to ensure Qdrant collection exists: {str(e)}")
raise
def retrieve_documents(qdrant_client: QdrantClient, collection_name: str, embed_model, current_message: str) -> str:
"""Get the relevant documents from Qdrant based on the current message."""
logger.info("Initializing Qdrant vector store")
vector_store = QdrantVectorStore(
client=qdrant_client,
collection_name=collection_name,
embed_model=embed_model
)
logger.info("Building vector store index")
index = VectorStoreIndex.from_vector_store(
vector_store=vector_store,
embed_model=embed_model
)
logger.info("Retrieving documents")
retriever = index.as_retriever()
retrieved_nodes = retriever.retrieve(current_message)
retrieved_docs = "\n\n".join([node.text for node in retrieved_nodes])
logger.debug(f"Retrieved documents (first 200 chars): {retrieved_docs[:200]}...")
return retrieved_docs
def index_documents(qdrant_client: QdrantClient, collection_name: str, embed_model, documents) -> None:
"""Index the provided documents into the Qdrant collection."""
vector_store = QdrantVectorStore(
client=qdrant_client,
collection_name=collection_name,
embed_model=embed_model
)
logger.info(f"Indexing documents into Qdrant collection '{collection_name}'")
storage_context = StorageContext.from_defaults(vector_store=vector_store)
VectorStoreIndex.from_documents(
documents,
storage_context=storage_context,
embed_model=embed_model
)
logger.info("Successfully indexed documents")
def delete_all_documents(qdrant_client: QdrantClient, collection_name: str, vector_size: int) -> None:
"""Delete all vectors from the Qdrant collection by recreating it."""
try:
# Check if collection exists
if qdrant_client.collection_exists(collection_name):
# Delete the collection
qdrant_client.delete_collection(collection_name=collection_name)
logger.info(f"Deleted Qdrant collection '{collection_name}'")
# Recreate the empty collection with the same parameters
qdrant_client.create_collection(
collection_name=collection_name,
vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE)
)
logger.info(f"Recreated empty Qdrant collection '{collection_name}'")
else:
logger.warning(f"Qdrant collection '{collection_name}' does not exist, nothing to delete")
except Exception as e:
logger.error(f"Failed to delete Qdrant collection: {str(e)}")
raise
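A wiring sketch mirroring what main.py does at startup: ensure the collection exists, then pull context for a query. Collection name, vector size and embedding model follow the defaults used below; the host and base_url are assumptions for a local setup:

from qdrant_client import QdrantClient
from llama_index.embeddings.ollama import OllamaEmbedding
import libs.qdrant_helper as qdrant_helper

client = QdrantClient(host="localhost", port=6333)
embed_model = OllamaEmbedding(
    model_name="mxbai-embed-large",  # produces 1024-dimensional vectors
    base_url="http://localhost:11434",  # assumed local Ollama instance
)

qdrant_helper.ensure_collection_exists(client, "default_collection", vector_size=1024)
context = qdrant_helper.retrieve_documents(
    client, "default_collection", embed_model, "Quels aliments éviter pendant une poussée ?"
)
print(context[:200])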

View file

@@ -0,0 +1,304 @@
from fastapi import FastAPI, File, UploadFile, HTTPException, Request, BackgroundTasks
from fastapi.responses import JSONResponse, StreamingResponse
from fastapi.middleware.cors import CORSMiddleware
from fastapi.openapi.utils import get_openapi
from llama_index.core import SimpleDirectoryReader
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from typing import Literal, List
from pydantic import BaseModel
from langdetect import DetectorFactory
from qdrant_client import QdrantClient
import os
import uuid
import yaml
from dotenv import load_dotenv
import logging
import asyncio
import json
from libs.check_medical import is_medical_query
import libs.manage_languages as manage_languages
import libs.qdrant_helper as qdrant_helper
from libs.models import ChatMessage, ChatRequest
import libs.prompt_helper as prompt_helper
from libs.log_prompts import log_prompt_to_db
# Set seed for reproducibility of language detection
DetectorFactory.seed = 0
# Configure logging
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
# Load environment variables from .env file
load_dotenv()
# Initialize FastAPI app
app = FastAPI(
docs_url="/docs",
redoc_url="/redoc",
max_request_body_size=100 * 1024 * 1024 # 100MB
)
# Get CORS origins from environment or use default
cors_origins = os.getenv("CORS_ORIGINS", "http://localhost:3000,http://127.0.0.1:3000").split(",")
# Add CORS middleware with proper configuration
app.add_middleware(
CORSMiddleware,
allow_origins=cors_origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
expose_headers=["Content-Type", "X-Content-Type-Options"],
max_age=600, # 10 minutes for preflight cache
)
# Load custom OpenAPI schema
def load_custom_openapi():
with open("openapi.json", "r") as f:
custom_openapi = yaml.safe_load(f)
default_openapi = get_openapi(
title=app.title,
version=app.version,
openapi_version=app.openapi_version,
description=app.description,
routes=app.routes,
)
default_openapi["info"] = custom_openapi.get("info", default_openapi["info"])
default_openapi["paths"].update(custom_openapi.get("paths", {}))
return default_openapi
app.openapi = load_custom_openapi
with open("prompts.yaml", "r") as f:
prompts = yaml.safe_load(f)
SYSTEM_PROMPT_TEMPLATE = prompts["system_prompt"]
LANGUAGE_PROMPTS = prompts["languages"]
PERSONALITY_PROMPTS = prompts["personalities"]
# Configuration of models and services using .env variables
OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "https://ollama.kube-ext.isc.heia-fr.ch")
logger.info(f"Starting application with OLLAMA_BASE_URL: {OLLAMA_BASE_URL}")
# Embedding model using Ollama
embed_model = OllamaEmbedding(
model_name=os.getenv("EMBED_MODEL_NAME", "mxbai-embed-large"),
base_url=OLLAMA_BASE_URL,
request_timeout=os.getenv("TIMEOUT_REQUEST_EMBED", 20.0)
)
logger.info("OllamaEmbedding initialized with model: " + os.getenv("EMBED_MODEL_NAME", "mxbai-embed-large"))
# Direct inference model
llm = Ollama(
model=os.getenv("LLM_MODEL_NAME", "llama3"),
base_url=OLLAMA_BASE_URL,
temperature=float(os.getenv("TEMPERATURE", "0.7")),
request_timeout=os.getenv("TIMEOUT_REQUEST_CHAT_DIRECT", 30.0)
)
logger.info(f"Ollama LLM initialized with model: {llm.model} "
f"with temperature: {llm.temperature}")
# Reasoning model
llm_reasoning = Ollama(
model=os.getenv("LLM_MODEL_NAME_THINKING", "deepseek-r1:14b"),
base_url=OLLAMA_BASE_URL,
temperature=float(os.getenv("TEMPERATURE", "0.7")),
request_timeout=os.getenv("TIMEOUT_REQUEST_CHAT_REASON", 60.0)
)
logger.info(f"Ollama reasoning LLM initialized with model: {llm_reasoning.model} "
f"with temperature: {llm_reasoning.temperature}")
# Qdrant configuration
qdrant_client = QdrantClient(
host=os.getenv("QDRANT_HOST", "localhost"),
port=int(os.getenv("QDRANT_PORT", "6333"))
)
collection_name = os.getenv("COLLECTION_NAME", "default_collection")
vector_size = int(os.getenv("VECTOR_SIZE", "1024"))
logger.info(f"Qdrant client initialized with host: {os.getenv('QDRANT_HOST')} and collection: {collection_name}")
# Ensure Qdrant collection exists
qdrant_helper.ensure_collection_exists(qdrant_client, collection_name, vector_size)
# Endpoint to upload PDFs
@app.post("/upload")
async def upload_pdfs(files: List[UploadFile] = File(...)):
logger.info("Received upload request")
try:
uploaded_files_count = len(files)
logger.debug(f"Number of files to upload: {uploaded_files_count}")
for file in files:
file_id = str(uuid.uuid4())
file_path = f"./pdfs/{file_id}.pdf"
logger.debug(f"Processing file: {file.filename}, saving as {file_path}")
with open(file_path, "wb") as f:
f.write(await file.read())
logger.debug(f"File {file.filename} saved successfully")
documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
logger.debug(f"Loaded {len(documents)} documents from {file.filename}")
qdrant_helper.index_documents(qdrant_client, collection_name, embed_model, documents)
return {"message": f"{uploaded_files_count} files processed and indexed successfully"}
except Exception as e:
logger.error(f"Error in upload endpoint: {str(e)}", exc_info=True)
raise HTTPException(status_code=500, detail=f"Error processing files: {str(e)}")
# Chat endpoint with language, temperature, and reasoning support
@app.post("/chat")
async def chat_inference(chat_request: ChatRequest, http_request: Request, background_tasks: BackgroundTasks):
logger.info("Received chat request")
try:
if not chat_request.messages:
logger.warning("No messages provided in the request")
raise HTTPException(status_code=400, detail="No messages provided")
# Log the complete request object to inspect its contents
logger.debug(f"Complete chat request object: {chat_request.dict()}")
logger.debug(f"Request messages: {chat_request.messages}")
logger.debug(f"Requested language: {chat_request.language}")
logger.debug(f"Requested temperature: {chat_request.temperature}")
logger.debug(f"Requested reasoning: {chat_request.reasoning}")
logger.debug(f"Requested streaming: {chat_request.stream}")
logger.debug(f"Requested personality: {chat_request.personality}")
# Validate language
manage_languages.validate_language(chat_request.language)
# More detailed logging of the personality
logger.info(f"Processing request with personality: {chat_request.personality}")
# Validate personality
if chat_request.personality not in ["cool", "cynical", "supportive"]:
logger.warning(f"Invalid personality: {chat_request.personality}, using 'supportive' as default")
chat_request.personality = "supportive"
# Validate temperature
if not (0 < chat_request.temperature < 1):
raise HTTPException(status_code=400, detail="Temperature must be between 0 and 1 (exclusive)")
# Prepare message data
current_message = chat_request.messages[-1].content.lower()
history = chat_request.messages[:-1]
logger.debug(f"Current user message: {current_message}")
logger.debug(f"Message history: {history}")
# Prepare full conversation history as a concatenated string
conversation_history = "\n".join([f"{msg.role}: {msg.content}" for msg in chat_request.messages])
logger.debug(f"Full conversation history: {conversation_history}")
# Detect language if "auto"
if chat_request.language == "auto":
chat_request.language = manage_languages.detect_language(current_message)
logger.info(f"Detected language using inference: {chat_request.language}")
# Check if the query is medical-related
is_medical = is_medical_query(current_message)
logger.debug(f"Is medical-related query? {is_medical}")
# Select LLM and set temperature
selected_llm = prompt_helper.select_llm(llm, llm_reasoning, chat_request.reasoning)
selected_llm.temperature = chat_request.temperature
logger.info(f"Using LLM model: {selected_llm.model} with temperature: {selected_llm.temperature}")
# Retrieve documents from Qdrant
retrieved_docs = qdrant_helper.retrieve_documents(qdrant_client, collection_name, embed_model, current_message)
# Format the system prompt with the selected personality
system_message_content = prompt_helper.format_system_prompt(
SYSTEM_PROMPT_TEMPLATE,
LANGUAGE_PROMPTS,
chat_request.language,
retrieved_docs,
is_medical,
chat_request.personality,  # Selected personality
PERSONALITY_PROMPTS  # Personality prompt dictionary
)
# Decide whether to use streaming or a synchronous response
if chat_request.stream:
# Streaming response
logger.info("Using streaming response")
async def generate():
full_response = ""
async for content in prompt_helper.perform_inference_streaming(
selected_llm,
system_message_content,
history,
chat_request.messages[-1].content
):
if content:
full_response += content
# Standard SSE format, with a trailing \n\n to delimit events
yield f"data: {json.dumps({'content': content, 'full': full_response})}\n\n"
# Log the full conversation and response
background_tasks.add_task(
log_prompt_to_db,
None, # TODO: User ID not available yet
http_request.client.host, # Client's IP address
conversation_history, # Full conversation history
full_response # AI-generated response
)
# Signal the end of the stream with a consistent SSE event
yield f"data: {json.dumps({'done': True})}\n\n"
return StreamingResponse(
generate(),
media_type="text/event-stream; charset=utf-8",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Accel-Buffering": "no",
"Content-Type": "text/event-stream; charset=utf-8",
}
)
else:
# Non-streaming response
logger.info("Using non-streaming response")
response_content = prompt_helper.perform_inference(
selected_llm,
system_message_content,
history,
chat_request.messages[-1].content,
stream=False
)
# Log the full conversation and response in the background
background_tasks.add_task(
log_prompt_to_db,
None, # TODO: User ID not available yet
http_request.client.host, # Client's IP address
conversation_history, # Full conversation history
response_content # AI-generated response
)
return {"response": response_content}
except Exception as e:
logger.error(f"Error in chat inference: {str(e)}", exc_info=True)
raise HTTPException(status_code=500, detail=f"Error processing chat: {str(e)}")
# Add new DELETE endpoint to clear all documents
@app.delete("/docs")
async def delete_all_docs():
logger.info("Received request to delete all documents")
try:
qdrant_helper.delete_all_documents(qdrant_client, collection_name, vector_size)
return {"message": "All documents have been deleted from the database"}
except Exception as e:
logger.error(f"Error in delete endpoint: {str(e)}", exc_info=True)
raise HTTPException(status_code=500, detail=f"Error deleting documents: {str(e)}")

View file

@@ -0,0 +1,235 @@
{
"openapi": "3.0.0",
"info": {
"title": "AI Crohn Coach RAG API",
"version": "2.0.0",
"description": "This API provides REST endpoints with Server-Sent Events (SSE) streaming capabilities for interactive chat"
},
"paths": {
"/upload": {
"post": {
"summary": "Upload PDFs",
"requestBody": {
"content": {
"multipart/form-data": {
"schema": {
"type": "object",
"properties": {
"files": {
"type": "array",
"items": {
"type": "string",
"format": "binary"
}
}
}
}
}
}
},
"responses": {
"200": {
"description": "Files uploaded successfully"
}
}
}
},
"/chat": {
"post": {
"summary": "Chat Inference with streaming",
"description": "Send a request to the chat API. The API will respond with a stream of Server-Sent Events (SSE) by default, or a single JSON response if stream is set to false.",
"requestBody": {
"content": {
"application/json; charset=utf-8": {
"schema": {
"type": "object",
"properties": {
"messages": {
"type": "array",
"items": {
"type": "object",
"properties": {
"role": {
"type": "string",
"enum": ["user", "coach"]
},
"content": {
"type": "string",
"example": "Explique-moi comme si j'avais 5 ans. Qu'est-ce que la maladie de Crohn et comment savoir si je l'ai ?"
}
},
"required": ["role", "content"]
},
"example": [
{"role": "user", "content": "Bonjour le bro!"},
{"role": "coach", "content": "Salut."},
{"role": "user", "content": "Explique-moi comme si j'avais 5 ans. Qu'est-ce que la maladie de Crohn et comment savoir si je l'ai ?"}
]
},
"language": {
"type": "string",
"enum": ["french", "italian", "english", "german", "auto"],
"default": "auto",
"description": "The language for the response. Must be one of: french, italian, english, german. Defaults to auto if not specified, which will try to infer the language."
},
"temperature": {
"type": "number",
"format": "float",
"description": "The temperature for the response. Must be a float between 0 and 1. Defaults to 0.7 if not specified.",
"default": 0.7,
"minimum": 0,
"maximum": 1,
"exclusiveMaximum": false,
"exclusiveMinimum": false,
"example": 0.7
},
"reasoning": {
"type": "boolean",
"description": "Whether to use reasoning for the response. Defaults to false if not specified. Reasoning allows the model to show its thinking process.",
"default": false,
"example": false
},
"stream": {
"type": "boolean",
"description": "Whether to return a streaming response. If true, the response will be a stream of Server-Sent Events (SSE). If false, the response will be a single JSON object.",
"default": true,
"example": true
},
"personality": {
"type": "string",
"enum": ["cool", "cynical", "supportive"],
"default": "supportive",
"description": "The personality style for AI responses: cool (confident and direct), cynical (critical and pragmatic), or supportive (empathetic and encouraging).",
"example": "supportive"
}
},
"required": ["messages"]
}
}
}
},
"responses": {
"200": {
"description": "Answer returned successfully",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"response": {
"type": "string"
}
}
}
},
"text/event-stream": {
"schema": {
"type": "object",
"properties": {
"content": {
"type": "string",
"description": "A fragment of the response text"
},
"full": {
"type": "string",
"description": "The accumulated response so far"
},
"done": {
"type": "boolean",
"description": "Indicates whether the response is complete"
}
}
}
}
}
}
}
}
},
"/docs": {
"delete": {
"summary": "Delete all documents",
"description": "Removes all documents from the database",
"responses": {
"200": {
"description": "All documents have been successfully deleted",
"content": {
"application/json": {
"schema": {
"type": "object",
"properties": {
"message": {
"type": "string"
}
}
}
}
}
}
}
}
}
},
"components": {
"schemas": {
"WebSocketMessage": {
"type": "object",
"properties": {
"messages": {
"type": "array",
"items": {
"type": "object",
"properties": {
"role": {
"type": "string",
"enum": ["user", "coach"]
},
"content": {
"type": "string"
}
}
}
},
"language": {
"type": "string",
"enum": ["french", "italian", "english", "german", "auto"]
},
"temperature": {
"type": "number"
},
"reasoning": {
"type": "boolean"
},
"stream": {
"type": "boolean"
},
"personality": {
"type": "string",
"enum": ["cool", "cynical", "supportive"]
}
}
}
}
},
"x-sse-endpoints": {
"/chat": {
"description": "Server-Sent Events endpoint for streaming chat responses",
"messages": {
"fromServer": {
"type": "object",
"properties": {
"content": {
"type": "string"
},
"full": {
"type": "string"
},
"done": {
"type": "boolean"
}
}
}
}
}
}
}

View file

@@ -0,0 +1,60 @@
system_prompt: |
{language_prompt}
{personality_prompt}
Context:
{context}
{language_disclaimer}
{language_constraint}
# Language-specific prompts and disclaimers
languages:
french:
prompt: |
Vous êtes un coach IA spécialisé dans le soutien aux personnes atteintes de la maladie de Crohn. Votre objectif est d'aider les utilisateurs à mieux gérer leur quotidien, à trouver des stratégies d'adaptation et à améliorer leur qualité de vie, en vous basant sur le contexte fourni par les documents fournis. Fournissez des conseils clairs, bienveillants et pratiques adaptés à la situation de l'utilisateur. Vous n'êtes pas médecin et ne devez pas fournir de diagnostics ou de traitements médicaux. Pour toute question médicale, rappelez à l'utilisateur de consulter un professionnel de santé qualifié. Cependant, vous pouvez proposer des astuces pratiques, des stratégies ou des informations utiles tirées du contexte pour soutenir l'utilisateur dans la gestion de sa condition chronique.
disclaimer: |
**Avertissement**: Je ne suis pas médecin. Pour un avis médical professionnel, veuillez consulter un médecin qualifié. Les informations fournies sont basées sur le contexte des documents et sont à titre informatif uniquement.
constraint: |
IMPORTANT: Vous devez répondre EXCLUSIVEMENT en français. Ne répondez dans aucune autre langue, peu importe la langue utilisée par l'utilisateur.
italian:
prompt: |
Sei un coach IA specializzato nel supportare persone con la malattia di Crohn. Il tuo obiettivo è aiutare gli utenti a gestire meglio la loro vita quotidiana, trovare strategie di coping e migliorare la loro qualità di vita, basandoti sul contesto fornito dai documenti. Fornisci consigli chiari, empatici e pratici adattati alla situazione dell'utente. Non sei un medico e non devi fornire diagnosi o trattamenti medici. Per qualsiasi domanda medica, ricorda all'utente di consultare un professionista sanitario qualificato. Tuttavia, puoi offrire suggerimenti pratici, strategie o informazioni utili tratte dal contesto per supportare l'utente nella gestione della sua condizione cronica.
disclaimer: |
**Disclaimer**: Non sono un medico. Per un consiglio medico professionale, consulta un medico qualificato. Le informazioni fornite si basano sul contesto dei documenti e sono solo a scopo informativo.
constraint: |
IMPORTANTE: Devi rispondere ESCLUSIVAMENTE in italiano. Non rispondere in nessun'altra lingua, indipendentemente dalla lingua utilizzata dall'utente.
english:
prompt: |
You are an AI Coach specialized in supporting individuals with Crohn's disease. Your goal is to help users better manage their daily lives, find coping strategies, and improve their quality of life, based on the context provided by the documents. Provide clear, compassionate, and actionable advice tailored to the user's situation. You are not a doctor and must not provide medical diagnoses or treatments. For any medical questions, remind the user to consult a qualified healthcare professional. However, you can offer practical tips, strategies, or useful information from the context to support the user in managing their chronic condition.
disclaimer: |
**Disclaimer**: I am not a doctor. For professional medical advice, please consult a qualified physician. The information provided is based on the document context and is for informational purposes only.
constraint: |
IMPORTANT: You must respond EXCLUSIVELY in English. Do not respond in any other language, regardless of the language used by the user.
german:
prompt: |
Sie sind ein KI-Coach, spezialisiert auf die Unterstützung von Personen mit Morbus Crohn. Ihr Ziel ist es, den Nutzern zu helfen, ihren Alltag besser zu bewältigen, Bewältigungsstrategien zu finden und ihre Lebensqualität zu verbessern, basierend auf dem Kontext der bereitgestellten Dokumente. Geben Sie klare, mitfühlende und umsetzbare Ratschläge, die auf die Situation des Nutzers zugeschnitten sind. Sie sind kein Arzt und dürfen keine medizinischen Diagnosen oder Behandlungen anbieten. Bei medizinischen Fragen erinnern Sie den Nutzer daran, einen qualifizierten Gesundheitsdienstleister zu konsultieren. Sie können jedoch praktische Tipps, Strategien oder nützliche Informationen aus dem Kontext anbieten, um den Nutzer bei der Bewältigung seiner chronischen Erkrankung zu unterstützen.
disclaimer: |
**Haftungsausschluss**: Ich bin kein Arzt. Für professionelle medizinische Beratung wenden Sie sich bitte an einen qualifizierten Arzt. Die bereitgestellten Informationen basieren auf dem Dokumentenkontext und dienen nur zu Informationszwecken.
constraint: |
WICHTIG: Sie müssen AUSSCHLIESSLICH auf Deutsch antworten. Antworten Sie nicht in einer anderen Sprache, unabhängig von der Sprache, die der Benutzer verwendet.
# Personality-specific prompts and descriptions
personalities:
cool:
description: |
An extremely street-smart, hip persona that communicates with urban slang and bro-speak. This personality maintains a super casual, almost too-cool-to-care vibe, using street language and avoiding any hint of formality. Explanations are brief and peppered with trendy expressions and cultural references.
prompt: |
Yo, listen up! You gotta be like the coolest bro in the neighborhood when you talk. Keep it 100% real and straight fire. Drop those fancy words and talk like you're chillin' with your homies. Use slang, keep it short and sweet. Say "bro", "man", "dude" a lot. Act like you're the most street-smart advisor ever. Don't get all formal and boring. Cut to the chase with that swagger. Be helpful but make it sound like you're doing them a solid. Throw in some "for real", "no cap", "straight up" to keep it authentic. Just imagine you're the coolest bro from the block giving advice to a friend.
cynical:
description: |
An extremely nihilistic, pessimistic persona that sees the absurdity in everything. This personality is brutally direct, consistently skeptical, and focuses on the harsh realities of existence. Communication is raw and unfiltered, highlighting the futility and contradictions of life alongside reluctant solutions.
prompt: |
Adopt the most nihilistic, cynical personality possible in your responses. Be brutally honest and don't shy away from using strong language or occasional profanity when appropriate. Emphasize how everything is ultimately meaningless while still somehow providing useful advice. View optimism as delusional. Use dark humor and sarcasm liberally. Point out the absurdity and contradictions in everything. Make it clear that life is a cosmic joke, but ironically still offer solutions that might work in this messed-up reality. Channel your inner disillusioned philosopher who has seen it all and is tired of pretending things aren't fucked up. Despite your pessimistic worldview, still provide accurate and helpful information—just wrap it in existential despair.
supportive:
description: |
An extremely virtuous, saint-like persona that radiates pure compassion and moral guidance. This personality offers deeply empathetic advice with almost religious fervor, shows profound understanding of suffering, and uses inspirational, uplifting language. Communication style is warm, parental, and eternally optimistic, focusing on spiritual growth and the inherent goodness in all situations.
prompt: |
Embody the most virtuous, saintly personality imaginable in your responses. Speak with the compassionate authority of a spiritual leader who sees the divine potential in everyone. Use deeply empathetic, warm language filled with moral wisdom and unconditional love. Address the user as "my child" or "my dear friend" occasionally. Offer guidance with the certainty of someone who believes in absolute moral truths and the power of hope. Include gentle metaphors about light, healing, and transformation. View every challenge as an opportunity for spiritual growth. Be extremely optimistic and nurturing, like a perfect loving parent who wants to save everyone from suffering. Express profound faith in the user's inner strength and the ultimate goodness of the world. Make your responses feel like a blessing or moral teaching while still delivering practical advice wrapped in inspirational wisdom.
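How these templates are consumed at runtime, as a small sketch built on format_system_prompt from libs/prompt_helper.py (the retrieved_docs string is a placeholder):

import yaml
import libs.prompt_helper as prompt_helper

with open("prompts.yaml") as f:
    prompts = yaml.safe_load(f)

system_message = prompt_helper.format_system_prompt(
    prompts["system_prompt"],
    prompts["languages"],
    language="english",
    retrieved_docs="(retrieved context goes here)",
    is_medical=True,  # appends the English disclaimer
    personality="supportive",
    personality_prompts=prompts["personalities"],
)
print(system_message[:300])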

View file

@@ -0,0 +1,7 @@
# utils.py
import os
from dotenv import load_dotenv, find_dotenv
def get_deepseek_api_key():
_ = load_dotenv(find_dotenv())
return os.getenv("DEEPSEEK_API_KEY")

View file

@@ -0,0 +1,15 @@
fastapi
uvicorn
llama-index
llama-index-embeddings-ollama
llama-index-llms-ollama
llama-index-vector-stores-qdrant
qdrant-client
python-dotenv
pyyaml
langdetect
typing
pydantic
python-multipart
mysql-connector-python
httpx