<?php
// FILE: bot_core_logic.php (SEMANTIC SEARCH VERSION - ENHANCED FALLBACK & FULLY COMMENTED)
// PURPOSE: Uses a two-tiered search (fast hybrid then deep semantic) to find context.

// --- SETUP & SHARED FILES ---
// Includes the database connection and configuration, making $pdo and $config available globally.
require_once 'db_connect.php';

// Directory (next to this file) where cached bot replies are stored, so that a repeated
// question can be answered without another API call. Note the trailing slash: cache file
// paths are built by appending the cache key directly to this constant.
define('CACHE_DIR', __DIR__ . '/cache/');
// Cache lifetime in seconds: a cached reply older than 1 hour (3600 s) is treated as expired.
define('CACHE_EXPIRATION_SECONDS', 3600);

// =================================================================================
// --- SEMANTIC SEARCH & VECTOR FUNCTIONS ---
// =================================================================================

/**
 * Computes the cosine similarity of two numeric vectors.
 *
 * The score is the dot product divided by the product of the two Euclidean norms.
 * Semantic search uses it to rank how closely a stored embedding matches the
 * embedding of the user's question: 1 means the vectors point the same way,
 * values near 0 mean they are unrelated.
 *
 * @param array $vecA The first vector (0-indexed list of numbers).
 * @param array $vecB The second vector (0-indexed list of numbers).
 * @return float The similarity score, or 0.0 when the vectors are empty, differ in
 *               length, or at least one of them has zero magnitude.
 */
function cosineSimilarity(array $vecA, array $vecB): float
{
    $size = count($vecA);

    // Guard clause: only non-empty vectors of equal length can be compared.
    if ($size === 0 || $size !== count($vecB)) {
        return 0.0;
    }

    $dot = 0.0;
    $sumSquaresA = 0.0;
    $sumSquaresB = 0.0;

    // Single pass: accumulate the dot product and both squared norms together.
    for ($idx = 0; $idx < $size; ++$idx) {
        $a = $vecA[$idx];
        $b = $vecB[$idx];

        $dot += $a * $b;
        $sumSquaresA += $a * $a;
        $sumSquaresB += $b * $b;
    }

    $denominator = sqrt($sumSquaresA) * sqrt($sumSquaresB);

    // An all-zero vector has no direction; report "no similarity" instead of dividing by zero.
    if ($denominator === 0.0) {
        return 0.0;
    }

    return $dot / $denominator;
}

/**
 * Calls the OpenAI embeddings API to convert a text string (the user's question)
 * into a vector embedding.
 *
 * Every failure path (blank input, request that cannot be encoded, transport error,
 * non-200 status, malformed response body) is logged where useful and reported as null,
 * so callers only need a single null check.
 *
 * @param string $text The text to embed. Only the first 25000 characters are sent.
 * @param string $apiKey The OpenAI API key.
 * @return array|null The vector as an array of floats, or null if no embedding could be obtained.
 */
function getEmbeddingForQuery(string $text, string $apiKey): ?array
{
    // A blank message carries nothing to embed, so skip the network round-trip entirely.
    if (trim($text) === '') {
        return null;
    }

    // Truncate the input text to a safe length to avoid exceeding the API's token limit.
    $text = mb_substr($text, 0, 25000);

    $model = 'text-embedding-3-small'; // A cost-effective and powerful embedding model.
    $apiUrl = 'https://api.openai.com/v1/embeddings';

    // Substitute invalid UTF-8 bytes instead of letting json_encode() fail and
    // silently sending an empty request body.
    $payload = json_encode(['input' => $text, 'model' => $model], JSON_INVALID_UTF8_SUBSTITUTE);
    if ($payload === false) {
        error_log('Embedding API Error: could not encode request - ' . json_last_error_msg());
        return null;
    }

    $ch = curl_init($apiUrl);
    if ($ch === false) {
        error_log('Embedding API Error: could not initialise cURL.');
        return null;
    }
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $payload,
        CURLOPT_HTTPHEADER => ['Content-Type: application/json', 'Authorization: Bearer ' . $apiKey],
        CURLOPT_CONNECTTIMEOUT => 10, // How long to wait to connect to the server.
        CURLOPT_TIMEOUT => 20,       // The total time the request is allowed to take.
    ]);

    $response = curl_exec($ch);
    $httpcode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $error_msg = curl_error($ch);
    curl_close($ch);

    if (!is_string($response) || $error_msg || $httpcode !== 200) {
        $body = is_string($response) ? $response : '';
        error_log("Embedding API Error (HTTP $httpcode): $error_msg - $body");
        return null;
    }

    $responseData = json_decode($response, true);
    $embedding = $responseData['data'][0]['embedding'] ?? null;

    // The declared return type is ?array: anything that is not a non-empty array
    // (missing key, unexpected scalar) must become null rather than raise a TypeError.
    if (!is_array($embedding) || $embedding === []) {
        error_log('Embedding API Error: response did not contain an embedding vector.');
        return null;
    }

    return $embedding;
}

/**
 * TIER 1 SEARCH: A fast search that first uses a FULLTEXT keyword query to find a small
 * list of candidates, then uses semantic similarity to re-rank that list.
 *
 * Table and column names are interpolated into the SQL as identifiers, so they must
 * come from trusted code, never from user input. The user's message only ever reaches
 * the database as a bound parameter.
 *
 * @param PDO $pdo The database connection object.
 * @param string $userMessage The original user's message.
 * @param array $questionVector The user's message converted to a vector.
 * @param string $tableName The database table to search.
 * @param string $pkColumn The primary key column of the table.
 * @param array $textColumns The columns to search for keywords and use for context.
 * @param int $limit The final number of results to return.
 * @return array|null Matches as ['similarity' => float, 'data' => row] sorted best-first,
 *                    or null if there are no usable keywords, no keyword hits, or the query fails.
 */
function findSimilarItemsHybrid(PDO $pdo, string $userMessage, array $questionVector, string $tableName, string $pkColumn, array $textColumns, int $limit = 2): ?array
{
    // Keep only plain word tokens of three or more characters. Extracting letters, digits
    // and underscores drops punctuation and the boolean-mode operators (+ - < > ( ) ~ * " @),
    // which would otherwise change the meaning of the query or make it a syntax error.
    $tokenCount = preg_match_all('/[\p{L}\p{N}_]{3,}/u', $userMessage, $tokens);
    if (!$tokenCount) {
        return null; // No usable keywords (or the message is not valid UTF-8).
    }

    // Use at most seven keywords; the trailing * turns the last one into a prefix match.
    $keywordQuery = implode(' ', array_slice($tokens[0], 0, 7)) . '*';

    $columnList = '`' . implode('`, `', $textColumns) . '`';
    $matchClause = "MATCH($columnList) AGAINST(? IN BOOLEAN MODE)";

    // Boolean-mode searches are not sorted by relevance automatically, so order explicitly:
    // otherwise LIMIT 25 would keep an arbitrary 25 hits instead of the strongest ones.
    $sql = "SELECT `$pkColumn`, `vector_embedding`, $columnList
            FROM `$tableName`
            WHERE $matchClause AND `vector_embedding` IS NOT NULL
            ORDER BY $matchClause DESC
            LIMIT 25";
    try {
        $stmt = $pdo->prepare($sql);
        // prepare() returns false instead of throwing when PDO is not in exception mode.
        if ($stmt === false) {
            error_log("DB Error (Hybrid Search) in `$tableName`: could not prepare statement.");
            return null;
        }
        // The same keyword string is bound twice: once for WHERE, once for ORDER BY.
        $stmt->execute([$keywordQuery, $keywordQuery]);
        $rows = $stmt->fetchAll(PDO::FETCH_ASSOC);
    } catch (PDOException $e) {
        error_log("DB Error (Hybrid Search) in `$tableName`: " . $e->getMessage());
        return null;
    }

    if (empty($rows)) {
        return null; // No keyword matches found.
    }

    // Now, perform the semantic calculation on only the candidate rows found above.
    $similarities = [];
    foreach ($rows as $row) {
        $itemVector = json_decode($row['vector_embedding'], true);
        // Rows whose stored embedding is not a JSON array are skipped.
        if (is_array($itemVector)) {
            $similarities[] = [
                'similarity' => cosineSimilarity($questionVector, $itemVector),
                'data' => $row
            ];
        }
    }

    // Sort the candidates by their semantic similarity score, best first.
    usort($similarities, function ($a, $b) { return $b['similarity'] <=> $a['similarity']; });

    // Return the top N best matches.
    return array_slice($similarities, 0, $limit);
}

/**
 * TIER 2 SEARCH: A "deep" but slower search that calculates semantic similarity against
 * EVERY row of the table that has an embedding. Used as a fallback when Tier 1 finds nothing.
 *
 * Rows are scored one at a time and only the current best $limit candidates are retained,
 * so PHP-side memory does not grow with the number of rows in the table.
 *
 * Table and column names are interpolated into the SQL as identifiers, so they must
 * come from trusted code, never from user input.
 *
 * @param PDO $pdo The database connection object.
 * @param array $questionVector The user's message converted to a vector.
 * @param string $tableName The database table to search.
 * @param string $pkColumn The primary key column of the table.
 * @param array $textColumns The columns to use for context.
 * @param int $limit The final number of results to return.
 * @return array|null Matches as ['similarity' => float, 'data' => row] sorted best-first,
 *                    or null if the table has no rows with an embedding or the query fails.
 */
function findSimilarItemsDeep(PDO $pdo, array $questionVector, string $tableName, string $pkColumn, array $textColumns, int $limit = 2): ?array
{
    // This query reads ALL embedded rows from the table. It can be slow on very large tables.
    $sql = "SELECT `$pkColumn`, `vector_embedding`, `" . implode('`, `', $textColumns) . "` FROM `$tableName` WHERE `vector_embedding` IS NOT NULL";

    $byScoreDesc = function ($a, $b) { return $b['similarity'] <=> $a['similarity']; };
    $ranked = [];
    $rowsSeen = 0;

    try {
        $stmt = $pdo->query($sql);
        // query() returns false instead of throwing when PDO is not in exception mode.
        if ($stmt === false) {
            error_log("DB Error (Deep Search) in `$tableName`: query failed.");
            return null;
        }

        while (($row = $stmt->fetch(PDO::FETCH_ASSOC)) !== false) {
            $rowsSeen++;

            $itemVector = json_decode($row['vector_embedding'], true);
            if (!is_array($itemVector)) {
                continue; // Stored embedding is not a JSON array; skip the row.
            }

            $ranked[] = [
                'similarity' => cosineSimilarity($questionVector, $itemVector),
                'data' => $row
            ];

            // Keep at most $limit candidates: once there is one too many, drop the weakest.
            // (Skipped for non-positive limits, which fall through to array_slice() below.)
            if ($limit > 0 && count($ranked) > $limit) {
                usort($ranked, $byScoreDesc);
                array_pop($ranked);
            }
        }
    } catch (PDOException $e) {
        error_log("DB Error (Deep Search) in `$tableName`: " . $e->getMessage());
        return null;
    }

    if ($rowsSeen === 0) {
        return null;
    }

    // Final ordering, best match first.
    usort($ranked, $byScoreDesc);

    // Return the top N best matches.
    return array_slice($ranked, 0, $limit);
}

/**
 * A helper function to format the search results into a clean string for the AI prompt.
 *
 * Each sufficiently similar match becomes one "- ..." line made of the requested text
 * columns joined by single spaces. If at least one match qualifies, the lines are
 * preceded by the header and followed by a blank line; otherwise the result is ''.
 *
 * @param array $matches The array of matched items from a search function; each item is
 *                       expected to hold a 'similarity' score and a 'data' row.
 * @param array $textColumns The columns to pull text from.
 * @param string $contextHeader The title for this section of the context (e.g., "--- RELEVANT PROJECTS ---").
 * @param float $minSimilarity Matches are included only when their score is strictly greater
 *                             than this value. Defaults to 0.5.
 * @return string The formatted context string, or '' when no match passes the threshold.
 */
function formatContext(array $matches, array $textColumns, string $contextHeader, float $minSimilarity = 0.5): string
{
    $lines = '';
    foreach ($matches as $match) {
        // A match without a score is treated as "not similar" rather than raising a warning.
        if (($match['similarity'] ?? 0.0) <= $minSimilarity) {
            continue;
        }

        $content = '';
        foreach ($textColumns as $col) {
            // Missing or NULL columns contribute an empty string.
            $content .= ($match['data'][$col] ?? '') . ' ';
        }
        $lines .= '- ' . rtrim($content) . "\n";
    }

    return $lines !== '' ? "$contextHeader\n" . $lines . "\n" : '';
}

/**
 * Orchestrates the two-tier context lookup for a user message.
 *
 * The message is embedded once, then every configured table is searched with the
 * fast hybrid (keyword + semantic) search. Only when that produces no context at all
 * is the slower full-table semantic search run over the same tables.
 *
 * Reads the global $pdo connection and the global $config array (OpenAI API key).
 *
 * @param string $userMessage The original user message.
 * @return string The combined context from all tables, or '' when there is no database
 *                connection, no embedding could be obtained, or neither tier found context.
 */
function getSemanticDatabaseContext(string $userMessage): string
{
    global $pdo, $config;

    if (!$pdo) {
        return '';
    }

    // Both tiers rank by similarity to this vector, so it is computed once up front.
    $questionVector = getEmbeddingForQuery($userMessage, $config['openai']['api_key']);
    if (!$questionVector) {
        return '';
    }

    // table name => [primary key column, text columns, header used in the prompt]
    $tables = [
        'projects' => ['id', ['name', 'description'], '--- RELEVANT PROJECT INFORMATION ---'],
        'news_articles' => ['id', ['title', 'content'], '--- RELEVANT NEWS ARTICLES ---'],
        'knowledge_base' => ['id', ['title', 'content_text'], '--- RELEVANT KNOWLEDGE BASE DOCUMENTS ---'],
    ];

    // --- Tier 1: fast hybrid search ---
    $tierOneContext = '';
    foreach ($tables as $tableName => $spec) {
        [$pkColumn, $textColumns, $sectionHeader] = $spec;
        $found = findSimilarItemsHybrid($pdo, $userMessage, $questionVector, $tableName, $pkColumn, $textColumns);
        if (!$found) {
            continue;
        }
        $tierOneContext .= formatContext($found, $textColumns, $sectionHeader);
    }

    // Anything from the fast tier wins; the deep tier is not consulted.
    if (!empty(trim($tierOneContext))) {
        error_log("Context found via Tier 1 (Hybrid Search).");
        return $tierOneContext;
    }

    // --- Tier 2: deep semantic search (fallback) ---
    error_log("Tier 1 failed. Trying Tier 2 (Deep Semantic Search).");
    $tierTwoContext = '';
    foreach ($tables as $tableName => $spec) {
        [$pkColumn, $textColumns, $sectionHeader] = $spec;
        $found = findSimilarItemsDeep($pdo, $questionVector, $tableName, $pkColumn, $textColumns);
        if (!$found) {
            continue;
        }
        $tierTwoContext .= formatContext($found, $textColumns, $sectionHeader);
    }

    if (empty(trim($tierTwoContext))) {
        // Neither tier produced relevant context.
        return '';
    }

    error_log("Context found via Tier 2 (Deep Search).");
    return $tierTwoContext;
}

// =================================================================================
// --- CORE BOT LOGIC ---
// =================================================================================

/**
 * The main entry point for the bot. It handles the entire process of getting a response:
 * cache lookup, database context retrieval, the chat completion call, one retry without
 * context when the answer is evasive, and finally caching and channel formatting.
 *
 * Reads the global $config array for the OpenAI API key.
 *
 * @param string $userMessage The user's raw message.
 * @param array $conversationHistory The history of the conversation. Replies are only read
 *                                   from and written to the cache when this is empty.
 * @param string $language The language code.
 * @param string $outputChannel The channel the response is for ('web' or 'whatsapp').
 * @return string The bot's final reply.
 */
function getBotResponse(string $userMessage, array $conversationHistory = [], string $language = 'en-US', string $outputChannel = 'web'): string
{
    global $config;

    // Compare against '' rather than using empty(), so a literal "0" message is still answered.
    if (trim($userMessage) === '' || empty($config['openai']['api_key'])) {
        return 'An internal configuration error occurred.';
    }

    // Only simple, one-off questions (no history) use the cache.
    $isOneOffQuestion = empty($conversationHistory);

    // The language is part of the key because it is an input to the system prompt.
    $cacheKey = 'sem_msg_' . md5(strtolower(trim($userMessage)) . '|' . $language);
    if ($isOneOffQuestion && ($cachedResponse = getCachedResponse($cacheKey)) !== null) {
        return formatResponseForChannel($cachedResponse, $outputChannel);
    }

    // Get the context from the database using our tiered search system.
    $databaseContext = getSemanticDatabaseContext($userMessage);
    $hasContext = trim($databaseContext) !== '';

    // Define the ultimate fallback message if the AI can't answer.
    $fallbackMessage = "I am unable to provide specific details on that right now. For more information, you can contact our live agent at hello@genowa.co.ke, call our toll-free number 0800000870, or visit our offices in Homa Bay or your nearest ward admin's office. The goal of Genowa is to unlock the endless potential of Homa Bay County.";

    // The exact text callOpenAI() returns when the request itself failed.
    $serviceErrorReply = "Sorry, there was a problem communicating with the AI service.";

    // A reply is evasive when it is empty, matches a known evasive phrase, or is exactly the
    // fallback message (optionally wrapped in quotes) that the system prompt tells the model
    // to use when it has nothing to say.
    $isEvasive = static function (string $reply) use ($fallbackMessage): bool {
        return trim($reply) === ''
            || trim($reply, " \t\n\r\0\x0B\"'") === $fallbackMessage
            || preg_match('/(cannot provide|unable to find|no information)/i', $reply) === 1;
    };

    // Build the detailed instructions for the AI model.
    $systemMessage = buildSystemPrompt($fallbackMessage, $language);

    // If we found database context, prepend it to the user's message.
    $finalUserMessage = $hasContext
        ? "CONTEXT FROM HOMABAYCOUNTY-PEDIA DATABASE:\n" . $databaseContext . "\n\nUSER'S QUESTION:\n" . $userMessage
        : $userMessage; // Otherwise, just use the original message.

    // Call the OpenAI API to get the chat completion.
    $botReply = callOpenAI($finalUserMessage, $systemMessage, $conversationHistory);
    $isReplyEvasive = $isEvasive($botReply);

    // If we gave the AI context but it still gave an evasive answer, the context might have been confusing.
    // We try one more time, but without the database context, relying only on the AI's general knowledge.
    if ($isReplyEvasive && $hasContext) {
        error_log("AI gave evasive answer despite context. Retrying with general knowledge.");
        $botReply = callOpenAI($userMessage, $systemMessage, $conversationHistory);
        $isReplyEvasive = $isEvasive($botReply);
    }

    // If the final reply is *still* evasive, we give up and use our predefined fallback message.
    if ($isReplyEvasive) {
        return $fallbackMessage;
    }

    // Cache good answers to simple questions. A service-error reply is a transient failure,
    // not an answer: caching it would keep serving the error for the whole cache lifetime.
    if ($isOneOffQuestion && $botReply !== $serviceErrorReply) {
        cacheResponse($cacheKey, $botReply);
    }

    // Format the response for the correct channel and return it.
    return formatResponseForChannel($botReply, $outputChannel);
}

// --- HELPER FUNCTIONS ---

/**
 * A helper function to call the OpenAI Chat Completions API.
 *
 * Reads the API key from the global $config array and the model settings from the
 * globals $gpt_model, $gpt_temperature and $gpt_max_tokens.
 *
 * @param string $finalUserMessage The user message to send (possibly with prepended context).
 * @param string $systemMessage The system prompt.
 * @param array $conversationHistory Earlier messages, placed between the system prompt and
 *                                   the new user message. Array keys are ignored.
 * @return string The trimmed reply text ('' if the response has no text content), or a fixed
 *                apology sentence when the request could not be completed.
 */
function callOpenAI(string $finalUserMessage, string $systemMessage, array $conversationHistory): string
{
    global $config, $gpt_model, $gpt_temperature, $gpt_max_tokens;
    $apiUrl = 'https://api.openai.com/v1/chat/completions';
    $serviceError = "Sorry, there was a problem communicating with the AI service.";

    // Assemble the full message payload for the API. array_values() discards the history's
    // keys: unpacking a string-keyed array with "..." is an error before PHP 8.1, and the
    // API expects a plain JSON list either way.
    $messages = array_merge(
        [['role' => 'system', 'content' => $systemMessage]],
        array_values($conversationHistory),
        [['role' => 'user', 'content' => $finalUserMessage]]
    );
    $data = ['model' => $gpt_model, 'messages' => $messages, 'temperature' => $gpt_temperature, 'max_tokens' => $gpt_max_tokens];

    // Substitute invalid UTF-8 bytes instead of letting json_encode() fail and
    // silently sending an empty request body.
    $payload = json_encode($data, JSON_INVALID_UTF8_SUBSTITUTE);
    if ($payload === false) {
        error_log('Completion API Error: could not encode request - ' . json_last_error_msg());
        return $serviceError;
    }

    $ch = curl_init($apiUrl);
    if ($ch === false) {
        error_log('Completion API Error: could not initialise cURL.');
        return $serviceError;
    }
    curl_setopt_array($ch, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_POST => true,
        CURLOPT_POSTFIELDS => $payload,
        CURLOPT_HTTPHEADER => ['Content-Type: application/json', 'Authorization: Bearer ' . $config['openai']['api_key']],
        CURLOPT_CONNECTTIMEOUT => 15,
        CURLOPT_TIMEOUT => 40,
    ]);
    $response = curl_exec($ch);
    $httpcode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    $error_msg = curl_error($ch);
    curl_close($ch);

    if (!is_string($response) || $error_msg || $httpcode !== 200) {
        $body = is_string($response) ? $response : '';
        error_log("Completion API Error (HTTP $httpcode): $error_msg - $body");
        return $serviceError;
    }

    $responseData = json_decode($response, true);
    $content = $responseData['choices'][0]['message']['content'] ?? '';

    // Only plain text can be trimmed and returned; anything else counts as "no reply".
    return is_string($content) ? trim($content) : '';
}

/**
 * Builds the system prompt - a set of instructions for the AI model.
 *
 * @param string $fallbackMessage The exact sentence the model must use when it has no answer.
 * @param string $language The preferred reply language as a locale code (e.g. 'en-US').
 *                         Values that do not look like a locale code fall back to 'en-US',
 *                         so arbitrary text can never be injected into the prompt this way.
 * @return string The complete system prompt.
 */
function buildSystemPrompt(string $fallbackMessage, string $language): string {
    // Accept only locale-shaped values: 2-3 letters plus optional -/_ separated subtags.
    $languageTag = preg_match('/^[A-Za-z]{2,3}(?:[-_][A-Za-z0-9]{2,8})*$/', $language) === 1
        ? $language
        : 'en-US';

    return "You are 'Genowa Bot', a helpful and insightful civic-tech assistant for Homa Bay County from genowa.co.ke. Your purpose is to provide accurate information about the county's development agenda under Governor Gladys Wanga.
**IMPORTANT RULES:**
1.  **SYNTHESIZE, DON'T JUST LIST:** You have been given context from the county's Projects, News, and Knowledge Base database based on semantic meaning. Your primary goal is to synthesize this information into a single, comprehensive, and well-written answer. Do not just list the context you were given.
2.  **PRIORITIZE CONTEXT:** Your answers MUST be based on the provided database context first.
3.  **USE GENERAL KNOWLEDGE AS A BACKUP:** If the database context is empty or insufficient to answer the question, you are encouraged to use your general knowledge about Homa Bay County, its leadership, geography, and culture to provide a helpful response.
4.  **FORMATTING:** Format your response using Markdown. Use bolding for titles (`**Title**`) and bullet points (`- Item`) for lists.
5.  **CORE THEMES:** Use the term 'Genowa' to frame the goal of the administration's efforts (e.g., 'The goal of Genowa is to ensure...'). Naturally integrate the phrase 'endless potential'. Do not translate 'Genowa'.
6.  **PROVIDE URLS:** When project photos are available, mention them and provide the URLs. At the end of your response, add: 'For more details, including project maps and data dashboards, please visit the Genowa Delivery Unit portal at genowa.co.ke.'
7.  **FINAL FALLBACK:** If you cannot find any relevant information from the database or your general knowledge, you MUST respond with EXACTLY this message: \"$fallbackMessage\"
8.  **LANGUAGE:** The user's preferred language code is '$languageTag'. Reply in that language unless the user writes in a different language, in which case reply in the language the user used. The fallback message in rule 7 is the only exception and must always be sent exactly as written.";
}

/**
 * Formats the final reply for the specific channel (e.g., web vs. WhatsApp).
 *
 * For 'whatsapp' the Markdown is adapted: **bold** becomes *bold*, the "- " marker at the
 * start of a bullet line is removed (any indentation is kept), and "[Image]" placeholders
 * are dropped. Every other channel gets the reply unchanged.
 *
 * @param string $botReply The reply text, expected to be Markdown.
 * @param string $outputChannel The target channel ('web' or 'whatsapp').
 * @return string The reply adapted to the channel.
 */
function formatResponseForChannel(string $botReply, string $outputChannel): string {
    if ($outputChannel !== 'whatsapp') {
        return $botReply;
    }

    // Convert Markdown bold (**text**) to WhatsApp bold (*text*) wherever it occurs on a line,
    // including directly before punctuation such as ':' or ')'. The \S look-arounds require
    // the bold text to start and end with a non-space character.
    // preg_replace() returns null on failure; in that case keep the text as it was.
    $formatted = preg_replace('/\*\*(?=\S)(.+?)(?<=\S)\*\*/', '*$1*', $botReply) ?? $botReply;

    // Strip bullet markers only where they actually start a line, so a hyphen inside a
    // sentence ("2020 - 2021") is left alone.
    $formatted = preg_replace('/^([ \t]*)- /m', '$1', $formatted) ?? $formatted;

    // Remove image placeholders.
    return str_replace('[Image]', '', $formatted);
}

/**
 * Retrieves a response from the cache if it exists and hasn't expired.
 *
 * The cache directory is created on demand. An entry counts as a hit only when it is a
 * regular file, younger than CACHE_EXPIRATION_SECONDS, readable, and non-empty.
 *
 * @param string $key The cache key; used directly as the file name inside CACHE_DIR.
 * @return string|null The cached response, or null on a cache miss.
 */
function getCachedResponse(string $key): ?string {
    // Re-check is_dir() after a failed mkdir(): another request may have created it meanwhile.
    if (!is_dir(CACHE_DIR) && !mkdir(CACHE_DIR, 0755, true) && !is_dir(CACHE_DIR)) {
        return null;
    }

    $cacheFile = CACHE_DIR . $key;
    if (!is_file($cacheFile)) {
        return null;
    }

    $modifiedAt = filemtime($cacheFile);
    if ($modifiedAt === false || (time() - $modifiedAt) >= CACHE_EXPIRATION_SECONDS) {
        return null; // Unreadable timestamp or expired entry.
    }

    $contents = file_get_contents($cacheFile);

    // A failed read (false) or an empty file is a miss: returning it would hand the
    // caller an empty string that looks like a valid cached reply.
    if ($contents === false || $contents === '') {
        return null;
    }

    return $contents;
}

/**
 * Saves a new response to the cache.
 *
 * The response is written to a temporary file in the cache directory and then renamed
 * over the final path, so a concurrent reader sees either the previous entry or the
 * complete new one, never a half-written file. Failures are logged and otherwise
 * ignored: caching is best-effort and must not break the reply.
 *
 * @param string $key The cache key; used directly as the file name inside CACHE_DIR.
 * @param string $response The response text to store.
 * @return void
 */
function cacheResponse(string $key, string $response): void {
    // Re-check is_dir() after a failed mkdir(): another request may have created it meanwhile.
    if (!is_dir(CACHE_DIR) && !mkdir(CACHE_DIR, 0755, true) && !is_dir(CACHE_DIR)) {
        error_log('Cache Error: could not create cache directory ' . CACHE_DIR);
        return;
    }

    $cacheFile = CACHE_DIR . $key;
    // Unique temporary name so parallel writers of the same key do not clobber each other.
    $tempFile = $cacheFile . '.' . uniqid('', true) . '.tmp';

    if (file_put_contents($tempFile, $response, LOCK_EX) === false) {
        error_log('Cache Error: could not write ' . $tempFile);
        return;
    }

    if (!rename($tempFile, $cacheFile)) {
        error_log('Cache Error: could not move cache entry into place at ' . $cacheFile);
        // Do not leave the orphaned temporary file behind.
        if (is_file($tempFile)) {
            unlink($tempFile);
        }
    }
}
