<?php
/**
 * RagIndexer - The RAG System Orchestrator
 * 
 * This script indexes all documentation for the RAG system:
 * - Codebase (classes, functions, API endpoints)
 * - Database schemas (tables, columns, conventions)
 * - API specifications (endpoints, parameters, responses)
 * 
 * Run this script to build/update the RAG index:
 *   php lib/RagIndexer.php
 */

require_once __DIR__ . '/../config.php';

/**
 * Builds the markdown documentation index consumed by the RAG system.
 *
 * The index has three generated parts plus a README:
 * - codebase/: one markdown file per PHP class (lib/, api/) and per
 *   selected root script
 * - database/: schema conventions and the import logger tables
 * - api/: one markdown file per HTTP endpoint
 *
 * All methods are static; call RagIndexer::run() to (re)build the index.
 */
class RagIndexer
{
    const VERSION = '1.0.0';
    const OUTPUT_DIR = '/lamp/www/importer/docs/rag';

    /**
     * Output root of the current run. Null until run() sets it, in which
     * case OUTPUT_DIR is used.
     */
    private static $outputRoot = null;

    /**
     * Run all indexers.
     *
     * @param string|null $outputDir Directory to write the index into.
     *                               Defaults to self::OUTPUT_DIR.
     * @throws RuntimeException If a directory cannot be created or a file
     *                          cannot be written.
     */
    public static function run($outputDir = null) {
        self::$outputRoot = ($outputDir === null) ? self::OUTPUT_DIR : (string) $outputDir;
        $outputDir = self::outputRoot();

        echo "RAG Indexer v" . self::VERSION . "\n";
        echo str_repeat('=', 60) . "\n\n";

        // Ensure output directory exists
        self::ensureDirectory($outputDir);

        // Run each indexer; the README goes last so it can list what was written
        self::indexCodebase();
        self::indexDatabase();
        self::indexApi();
        self::generateMainIndex();

        echo "\n" . str_repeat('=', 60) . "\n";
        echo "Indexing complete. Output: " . $outputDir . "\n";
    }

    /**
     * Return the output root for the current run (OUTPUT_DIR by default).
     */
    private static function outputRoot() {
        return (self::$outputRoot === null) ? self::OUTPUT_DIR : self::$outputRoot;
    }

    /**
     * Create a directory (recursively) unless it already exists.
     *
     * @throws RuntimeException If the directory cannot be created.
     */
    private static function ensureDirectory($dir) {
        // The trailing is_dir() covers the case where another process
        // created the directory between the first check and mkdir().
        if (!is_dir($dir) && !mkdir($dir, 0755, true) && !is_dir($dir)) {
            throw new RuntimeException("Unable to create directory: $dir");
        }
    }

    /**
     * Write a file, failing loudly instead of reporting a false success.
     *
     * @throws RuntimeException If the file cannot be written.
     */
    private static function writeFile($path, $content) {
        if (file_put_contents($path, $content) === false) {
            throw new RuntimeException("Unable to write file: $path");
        }
    }

    /**
     * Index all PHP library files
     */
    private static function indexCodebase() {
        echo "Indexing codebase...\n";

        $outputDir = self::outputRoot() . '/codebase';
        self::ensureDirectory($outputDir);

        // Index lib directory
        self::indexPhpDirectory(__DIR__, $outputDir . '/lib');

        // Index api directory
        self::indexPhpDirectory(__DIR__ . '/../api', $outputDir . '/api');

        // Index root PHP files
        self::indexRootPhpFiles($outputDir . '/root');

        echo "  Codebase indexed.\n";
    }

    /**
     * Index a directory of PHP files.
     *
     * Writes `<file>.md` into $outputDir for every `*.php` file in
     * $sourceDir that declares a class. Files without a class are skipped.
     */
    private static function indexPhpDirectory($sourceDir, $outputDir) {
        self::ensureDirectory($outputDir);

        $files = glob($sourceDir . '/*.php');
        if ($files === false) {
            // glob() returns false on error; treat it as "nothing to index"
            $files = array();
        }

        foreach ($files as $file) {
            $basename = basename($file);
            if ($basename === 'RagIndexer.php') {
                continue; // Skip self
            }

            $content = file_get_contents($file);
            if ($content === false) {
                echo "  Skipped (unreadable): $basename\n";
                continue;
            }

            $doc = self::extractClassDocumentation($content, $basename);
            if ($doc) {
                self::writeFile($outputDir . '/' . $basename . '.md', $doc);
                echo "  Indexed: $basename\n";
            }
        }
    }

    /**
     * Index root PHP files.
     *
     * Only the scripts listed below are indexed; missing ones are skipped.
     */
    private static function indexRootPhpFiles($outputDir) {
        self::ensureDirectory($outputDir);

        $rootFiles = array(
            'config.php',
            'index.php',
            'arrival.php',
            'insert.php',
            'upload.php',
            'table_operations.php',
            'viewer.php',
            'dashboard.php',
            'mockcrud.php'
        );

        foreach ($rootFiles as $file) {
            $path = dirname(__DIR__) . '/' . $file;
            if (!is_file($path)) {
                continue;
            }

            $content = file_get_contents($path);
            if ($content === false) {
                echo "  Skipped (unreadable): $file\n";
                continue;
            }

            $doc = self::extractScriptDocumentation($content, $file);
            if ($doc) {
                self::writeFile($outputDir . '/' . $file . '.md', $doc);
                echo "  Indexed: $file\n";
            }
        }
    }

    /**
     * Extract class documentation from PHP file.
     *
     * @param string $content  PHP source code
     * @param string $filename File name shown in the generated markdown
     * @return string|null Markdown document, or null when the source
     *                     declares no class
     */
    private static function extractClassDocumentation($content, $filename) {
        // Anchor to the start of a line so the word "class" inside a
        // comment ("Helper class for ...") is not taken as a declaration.
        if (!preg_match('/^[ \t]*(?:(?:abstract|final|readonly)\s+)*class\s+(\w+)/m', $content, $classMatch)) {
            return null;
        }
        $className = $classMatch[1];

        // Extract file docblock
        $docblock = self::extractDocblock($content);

        // Extract all methods with docblocks.
        // The docblock sub-pattern cannot run past its own closing "*/", so a
        // docblock is only paired with the function that directly follows it.
        $docPattern = '\/\*\*[^*]*\*+(?:[^\/*][^*]*\*+)*\/';
        $methodPattern = '/(' . $docPattern . ')\s*'
            . '((?:(?:public|protected|private|static|abstract|final)\s+)*'
            . 'function\s+&?\s*(\w+)\s*\([^{;]*)/';

        $methods = array();
        if (preg_match_all($methodPattern, $content, $matches, PREG_SET_ORDER)) {
            foreach ($matches as $match) {
                // Collapse multi-line signatures onto one line
                $signature = trim(preg_replace('/\s+/', ' ', $match[2]));

                // Methods without an explicit modifier are public in PHP
                $visibility = 'Public';
                $modifiers = (string) strstr($signature, 'function', true);
                if (preg_match('/\b(private|protected)\b/', $modifiers, $visMatch)) {
                    $visibility = ucfirst($visMatch[1]);
                }

                $desc = self::extractDescription($match[1]);

                $methods[] = array(
                    'name' => $match[3],
                    'visibility' => $visibility,
                    'description' => $desc ? $desc : 'No description available',
                    'signature' => $signature
                );
            }
        }

        // Extract constants (declarations only, optionally with visibility/type)
        $constants = array();
        $constPattern = '/^[ \t]*(?:(?:public|protected|private|final)\s+)*const\s+(?:[\w\\\\?|]+\s+)?(\w+)\s*=/m';
        if (preg_match_all($constPattern, $content, $constMatches)) {
            $constants = $constMatches[1];
        }

        // Build markdown
        $md = "# Class: $className\n\n";
        $md .= "**File:** `$filename`\n\n";

        if ($docblock) {
            $md .= "\n## Description\n\n" . self::formatDocblock($docblock) . "\n";
        }

        if (!empty($constants)) {
            $md .= "\n## Constants\n\n";
            foreach ($constants as $const) {
                $md .= "- `$const`\n";
            }
        }

        if (!empty($methods)) {
            $md .= "\n## Methods\n\n";
            foreach ($methods as $method) {
                $md .= "### " . $method['visibility'] . "::" . $method['name'] . "()\n\n";
                $md .= $method['description'] . "\n\n";
                $md .= "```php\n" . $method['signature'] . "\n```\n\n";
            }
        }

        return $md;
    }

    /**
     * Extract documentation from script file (non-class).
     *
     * @param string $content  PHP source code
     * @param string $filename File name shown in the generated markdown
     * @return string Markdown document
     */
    private static function extractScriptDocumentation($content, $filename) {
        $docblock = self::extractDocblock($content);

        // Extract requires/includes: the first quoted path of each statement,
        // without a leading "/" (as in `__DIR__ . '/lib/Foo.php'`).
        // The lookbehind rejects variables and members such as $include.
        $requires = array();
        $requirePattern = '/(?<![\w$>])(?:require|include)(?:_once)?\b[^;\n]*?[\'"]\/?([\w\/.\-]+)[\'"]/';
        if (preg_match_all($requirePattern, $content, $reqMatches)) {
            $requires = $reqMatches[1];
        }

        // Extract main functionality description
        $description = self::extractDescription($docblock);

        // Build markdown
        $md = "# Script: $filename\n\n";

        if ($description) {
            $md .= "## Description\n\n" . $description . "\n\n";
        }

        if (!empty($requires)) {
            $md .= "\n## Dependencies\n\n";
            foreach ($requires as $req) {
                $md .= "- `$req`\n";
            }
        }

        return $md;
    }

    /**
     * Extract the first docblock comment from PHP content.
     *
     * @return string|null Docblock text without the delimiters, or null
     *                     when the content has no docblock
     */
    private static function extractDocblock($content) {
        if (preg_match('/\/\*\*([\s\S]*?)\*\//', $content, $match)) {
            return trim($match[1]);
        }
        return null;
    }

    /**
     * Remove comment delimiters and the leading "*" margin from a docblock.
     *
     * Accepts a docblock with or without its delimiters. Only horizontal
     * whitespace is stripped around the margin so that blank lines, and
     * therefore paragraph breaks, survive.
     */
    private static function stripDocblockMarkers($docblock) {
        $text = str_replace(array("\r\n", "\r"), "\n", (string) $docblock);
        $text = preg_replace('/^\s*\/\*+/', '', $text);
        $text = preg_replace('/\*+\/\s*$/', '', $text);
        $text = preg_replace('/^[ \t]*\*[ \t]?/m', '', $text);
        return trim($text);
    }

    /**
     * Extract description from docblock.
     *
     * @return string|null The first paragraph (text before the first blank
     *                     line or the first tag line), or null when empty
     */
    private static function extractDescription($docblock) {
        if (!$docblock) return null;

        $text = self::stripDocblockMarkers($docblock);

        // A paragraph ends at a blank line or where the tag section starts
        $paragraphs = preg_split('/\n\s*\n|\n(?=@)/', $text);
        $first = trim($paragraphs[0]);

        return $first !== '' ? $first : null;
    }

    /**
     * Format docblock for markdown.
     *
     * Strips the comment markers and turns @param / @return tags into
     * bullet points, keeping the declared types.
     */
    private static function formatDocblock($docblock) {
        $text = self::stripDocblockMarkers($docblock);

        // @param <type> $name  ->  - Parameter `$name` (`<type>`)
        $text = preg_replace(
            '/@param\s+(\S+)\s+((?:&|\.\.\.)*\$\w+)/',
            '- Parameter `${2}` (`${1}`)',
            $text
        );

        // @param $name (no type)  ->  - Parameter `$name`
        $text = preg_replace('/@param\s+((?:&|\.\.\.)*\$\w+)/', '- Parameter `${1}`', $text);

        // @return <type>  ->  - Returns: `<type>`
        $text = preg_replace('/@return\s+(\S+)/', '- Returns: `${1}`', $text);

        return $text;
    }

    /**
     * Index database schemas
     */
    private static function indexDatabase() {
        echo "\nIndexing database schemas...\n";

        $outputDir = self::outputRoot() . '/database';
        self::ensureDirectory($outputDir);

        // Document schema conventions
        self::generateConventionsDoc($outputDir);

        // Index import logger tables
        self::generateImportLoggerDoc($outputDir);

        echo "  Database schemas indexed.\n";
    }

    /**
     * Generate conventions documentation (conventions.md)
     */
    private static function generateConventionsDoc($outputDir) {
        $md = "# Database Schema Conventions\n\n";
        $md .= "## Primary Key Naming\n\n";
        $md .= "Dynamic, table-specific naming convention:\n\n";
        $md .= "- Pattern: `{singular_table}_id`\n";
        $md .= "- Example: `propietarios` table -> `propietario_id` primary key\n\n";

        $md .= "## Standard Audit Fields (Spanish Canonical)\n\n";
        $md .= "| Field | Type | Constraint | Description |\n";
        $md .= "|-------|------|------------|-------------|\n";
        $md .= "| `alta_db` | TIMESTAMP | DEFAULT CURRENT_TIMESTAMP | Timestamp of record insertion |\n";
        $md .= "| `alta_por` | VARCHAR(32) | DEFAULT 'system' | User who inserted the record |\n";
        $md .= "| `ultimo_cambio` | TIMESTAMP | NULL DEFAULT NULL | Timestamp of last modification |\n";
        $md .= "| `ultimo_cambio_por` | VARCHAR(32) | NULL | User who made last change |\n\n";

        $md .= "## Table Naming\n\n";
        $md .= "- Use plural form: `propietarios`, `departments`, `alumnos_becados`\n\n";

        $md .= "## Character Set\n\n";
        $md .= "- `utf8mb4` with `utf8mb4_unicode_ci` collation\n\n";

        $md .= "## Storage Engine\n\n";
        $md .= "- `InnoDB` for transaction support\n\n";

        $md .= "## Singular/Plural Mapping\n\n";
        $md .= "| Plural | Singular |\n";
        $md .= "|--------|----------|\n";
        $md .= "| propietarios | propietario |\n";
        $md .= "| departments | department |\n";
        $md .= "| alumnos_becados | alumno_becado |\n";
        $md .= "| eleyeme_cfdi_emitidos | eleyeme_cfdi_emitido |\n";

        self::writeFile($outputDir . '/conventions.md', $md);
        echo "  Generated: conventions.md\n";
    }

    /**
     * Generate import logger documentation (import_logger.md)
     */
    private static function generateImportLoggerDoc($outputDir) {
        $md = "# Import Logger Schema\n\n";
        $md .= "## Table: import_logs\n\n";
        $md .= "Tracks every import operation with full schema capture.\n\n";

        $md .= "| Column | Type | Description |\n";
        $md .= "|--------|------|-------------|\n";
        $md .= "| `import_log_id` | VARCHAR(32) | UUID primary key |\n";
        $md .= "| `database_name` | VARCHAR(64) | Target database name |\n";
        $md .= "| `table_name` | VARCHAR(64) | Target table name |\n";
        $md .= "| `file_name` | VARCHAR(255) | Original uploaded filename |\n";
        $md .= "| `file_type` | ENUM('XLSX','CSV') | File type |\n";
        $md .= "| `file_size` | INT | File size in bytes |\n";
        $md .= "| `operation_status` | ENUM | 'pending', 'success', 'error', 'partial' |\n";
        $md .= "| `rows_processed` | INT | Total rows processed |\n";
        $md .= "| `rows_inserted` | INT | Successfully inserted rows |\n";
        $md .= "| `rows_failed` | INT | Failed row count |\n";
        $md .= "| `alta_db` | TIMESTAMP | Import timestamp |\n";
        $md .= "| `alta_por` | VARCHAR(32) | User who performed import |\n\n";

        $md .= "## Table: import_schema_logs\n\n";
        $md .= "Stores column definitions for each import.\n\n";

        $md .= "| Column | Type | Description |\n";
        $md .= "|--------|------|-------------|\n";
        $md .= "| `import_schema_log_id` | VARCHAR(32) | UUID primary key |\n";
        $md .= "| `import_log_id` | VARCHAR(32) | FK to import_logs |\n";
        $md .= "| `column_name` | VARCHAR(64) | Column name |\n";
        $md .= "| `column_order` | INT | Position in table (1-based) |\n";
        $md .= "| `data_type` | VARCHAR(50) | MySQL data type |\n";
        $md .= "| `length_values` | VARCHAR(255) | Length/precision |\n";
        $md .= "| `is_nullable` | BOOLEAN | NULL constraint |\n";
        $md .= "| `is_indexed` | BOOLEAN | Whether column has an index |\n";
        $md .= "| `column_comment` | TEXT | Column description |\n";

        self::writeFile($outputDir . '/import_logger.md', $md);
        echo "  Generated: import_logger.md\n";
    }

    /**
     * Index API specifications
     */
    private static function indexApi() {
        echo "\nIndexing API specifications...\n";

        $outputDir = self::outputRoot() . '/api';
        self::ensureDirectory($outputDir);

        // Endpoint name => one-line description
        $endpoints = array(
            'getDatabases' => 'List all databases',
            'getTables' => 'List tables in a database',
            'getTableRows' => 'Get paginated table data',
            'getTableNames' => 'Get table names from import history',
            'getImportSchema' => 'Get schema preview',
            'getLastImportSchema' => 'Get last successful import schema',
            'getProgress' => 'Get import progress',
            'tableOperations' => 'Copy/move/rename tables',
        );

        foreach ($endpoints as $name => $description) {
            self::indexApiEndpoint($outputDir, $name, $description);
        }

        echo "  API specifications indexed.\n";
    }

    /**
     * Static description of every documented endpoint.
     *
     * Each entry holds the request line, the title of the parameter section,
     * the parameter lines and the lines of the example JSON response.
     *
     * @return array Map of endpoint name => spec
     */
    private static function endpointSpecs() {
        return array(
            'getDatabases' => array(
                'request' => 'GET /api/getDatabases.php',
                'paramsTitle' => 'Query Parameters',
                'params' => array(
                    'None required.',
                ),
                'response' => array(
                    '{',
                    '  "status": "ok",',
                    '  "databases": ["db1", "db2"],',
                    '  "count": 2',
                    '}',
                ),
            ),
            'getTables' => array(
                'request' => 'GET /api/getTables.php?db={database}',
                'paramsTitle' => 'Query Parameters',
                'params' => array(
                    '- `db` (required): Database name',
                ),
                'response' => array(
                    '{',
                    '  "status": "ok",',
                    '  "tables": [',
                    '    {',
                    '      "name": "table1",',
                    '      "rows": 100,',
                    '      "engine": "InnoDB"',
                    '    }',
                    '  ]',
                    '}',
                ),
            ),
            'getTableRows' => array(
                'request' => 'GET /api/getTableRows.php?db={database}&table={table}&limit={limit}&offset={offset}',
                'paramsTitle' => 'Query Parameters',
                'params' => array(
                    '- `db` (required): Database name',
                    '- `table` (required): Table name',
                    '- `limit` (optional, default 50): Number of rows',
                    '- `offset` (optional, default 0): Row offset',
                ),
                'response' => array(
                    '{',
                    '  "status": "ok",',
                    '  "table_name": "table1",',
                    '  "database": "db1",',
                    '  "columns": ["col1", "col2"],',
                    '  "rows": [',
                    '    {"col1": "value1", "col2": "value2"}',
                    '  ],',
                    '  "total_rows": 100,',
                    '  "has_more": true',
                    '}',
                ),
            ),
            'getTableNames' => array(
                'request' => 'GET /api/getTableNames.php?search={search}&limit={limit}',
                'paramsTitle' => 'Query Parameters',
                'params' => array(
                    '- `search` (optional): Filter table names',
                    '- `limit` (optional, default 20): Max results',
                ),
                'response' => array(
                    '{',
                    '  "status": "ok",',
                    '  "table_names": [',
                    '    {',
                    '      "name": "propietarios",',
                    '      "last_imported": "2025-12-30 20:43:55",',
                    '      "import_count": 5',
                    '    }',
                    '  ],',
                    '  "count": 1',
                    '}',
                ),
            ),
            'getImportSchema' => array(
                'request' => 'GET /api/getImportSchema.php?table={table}&db={database}',
                'paramsTitle' => 'Query Parameters',
                'params' => array(
                    '- `table` (required): Table name',
                    "- `db` (optional, default 'default'): Database name",
                ),
                'response' => array(
                    '{',
                    '  "status": "ok",',
                    '  "table_name": "table1",',
                    '  "database": "db1",',
                    '  "schema": [...],',
                    '  "columns": [...]',
                    '}',
                ),
            ),
            'getLastImportSchema' => array(
                'request' => 'GET /api/getLastImportSchema.php?table={table}&db={database}',
                'paramsTitle' => 'Query Parameters',
                'params' => array(
                    '- `table` (required): Table name',
                    '- `db` (optional): Database name',
                ),
                'response' => array(
                    '{',
                    '  "status": "ok",',
                    '  "schema": [...],',
                    '  "import_date": "2025-12-30 20:43:55",',
                    '  "import_log_id": "uuid"',
                    '}',
                ),
            ),
            'getProgress' => array(
                'request' => 'GET /api/getProgress.php?import_log_id={import_log_id}',
                'paramsTitle' => 'Query Parameters',
                'params' => array(
                    '- `import_log_id` (required): Import operation UUID',
                ),
                'response' => array(
                    '{',
                    '  "status": "ok",',
                    '  "import_log_id": "uuid",',
                    '  "phase": "insert",',
                    '  "progress": 50,',
                    '  "rows_processed": 500,',
                    '  "rows_inserted": 498,',
                    '  "rows_failed": 2,',
                    '  "errors": [...]',
                    '}',
                ),
            ),
            'tableOperations' => array(
                'request' => 'POST /api/tableOperations.php',
                'paramsTitle' => 'Body Parameters (JSON)',
                'params' => array(
                    "- `operation` (required): 'copy', 'move', 'rename', or 'delete'",
                    '- `source_db` (required): Source database name',
                    '- `source_table` (required): Source table name',
                    '- `target_db` (optional): Target database name (for copy/move)',
                    '- `target_table` (optional): Target table name (for copy/move/rename)',
                    '- `options` (optional): Operation-specific options',
                ),
                'response' => array(
                    '{',
                    '  "status": "ok",',
                    '  "success": true,',
                    '  "message": "Table copied successfully",',
                    '  "rows_copied": 100',
                    '}',
                ),
            ),
        );
    }

    /**
     * Generate documentation for an API endpoint.
     *
     * Writes `<name>.md` into $outputDir. Endpoints without an entry in
     * endpointSpecs() get a file holding only the title and description.
     */
    private static function indexApiEndpoint($outputDir, $name, $description) {
        $md = "# Endpoint: $name\n\n";
        $md .= "**Description:** $description\n\n";

        $specs = self::endpointSpecs();
        if (isset($specs[$name])) {
            $spec = $specs[$name];

            $md .= "\n## " . $spec['request'] . "\n\n";
            $md .= "### " . $spec['paramsTitle'] . "\n\n";
            $md .= implode("\n", $spec['params']) . "\n\n";
            $md .= "### Response (200 OK)\n\n";
            $md .= "```json\n";
            $md .= implode("\n", $spec['response']) . "\n";
            $md .= "```\n\n";
        }

        self::writeFile($outputDir . '/' . $name . '.md', $md);
        echo "  Indexed: $name.md\n";
    }

    /**
     * Render the contents of a directory as an indented text tree.
     *
     * Directories are listed first, then files, each group sorted by name.
     *
     * @param string $dir        Directory to list
     * @param string $prefix     Indentation carried down from parent levels
     * @param array  $extraFiles File names to list at this level even if
     *                           they do not exist yet
     * @return string One line per entry; empty when $dir cannot be read
     */
    private static function renderTree($dir, $prefix = '', $extraFiles = array()) {
        $names = is_dir($dir) ? scandir($dir) : false;
        if ($names === false) {
            $names = array();
        }

        $dirs = array();
        $files = array();
        foreach ($names as $name) {
            if ($name === '.' || $name === '..') {
                continue;
            }
            if (is_dir($dir . '/' . $name)) {
                $dirs[] = $name;
            } else {
                $files[] = $name;
            }
        }
        foreach ($extraFiles as $extra) {
            if (!in_array($extra, $files, true)) {
                $files[] = $extra;
            }
        }
        sort($dirs, SORT_STRING);
        sort($files, SORT_STRING);

        $tree = '';
        $total = count($dirs) + count($files);
        $position = 0;

        foreach ($dirs as $name) {
            $position++;
            $isLast = ($position === $total);
            $tree .= $prefix . ($isLast ? '└── ' : '├── ') . $name . "/\n";
            $tree .= self::renderTree($dir . '/' . $name, $prefix . ($isLast ? '    ' : '│   '));
        }
        foreach ($files as $name) {
            $position++;
            $tree .= $prefix . ($position === $total ? '└── ' : '├── ') . $name . "\n";
        }

        return $tree;
    }

    /**
     * Generate main index file (README.md).
     *
     * The "Index Structure" section is rendered from the files actually
     * present in the output directory, so it always matches what the other
     * indexers wrote.
     */
    private static function generateMainIndex() {
        $outputDir = self::outputRoot();
        $rootLabel = ($outputDir === self::OUTPUT_DIR) ? 'docs/rag' : basename($outputDir);

        $md = "# RAG Index for PHP Importer Application\n\n";
        $md .= "**Version:** " . self::VERSION . "\n\n";
        $md .= "## Index Structure\n\n";
        $md .= "```\n";
        $md .= $rootLabel . "/\n";
        // README.md is written below, so list it even on the first run
        $md .= self::renderTree($outputDir, '', array('README.md'));
        $md .= "```\n\n";

        $md .= "## Usage\n\n";
        $md .= "To query this index for LLM context:\n\n";
        $md .= "```php\n";
        $md .= "require_once 'lib/RagQuery.php';\n\n";
        $md .= "\$context = RagQuery::getContext(\"How do I add a new column to the import_logs table?\");\n";
        $md .= "```\n\n";

        $md .= "## Query Examples\n\n";
        $md .= "### Codebase Questions\n";
        $md .= "- \"What does the DatabaseHelper class do?\"\n";
        $md .= "- \"How do I add a new column type to SchemaDetector?\"\n";
        $md .= "- \"Explain the import logging flow\"\n\n";

        $md .= "### Database Questions\n";
        $md .= "- \"What tables are in the importer database?\"\n";
        $md .= "- \"What is the primary key for the propietarios table?\"\n";
        $md .= "- \"Show me the import_logs schema\"\n\n";

        $md .= "### API Questions\n";
        $md .= "- \"How do I get a list of tables?\"\n";
        $md .= "- \"What parameters does getTableRows accept?\"\n";
        $md .= "- \"What does the tableOperations endpoint do?\"\n\n";

        $md .= "### Conventions Questions\n";
        $md .= "- \"What are the standard audit fields?\"\n";
        $md .= "- \"How are primary keys named?\"\n";
        $md .= "- \"What is the character set?\"\n\n";

        self::writeFile($outputDir . '/README.md', $md);
        echo "\n  Generated: README.md\n";
    }
}

// Run only when this file is the script being executed from the command line
// (php lib/RagIndexer.php). php_sapi_name() always returns a non-empty string,
// so it must be compared against 'cli' rather than used as a boolean;
// otherwise the indexer would also run whenever this file is merely included.
if (
    php_sapi_name() === 'cli'
    && isset($_SERVER['SCRIPT_FILENAME'])
    && realpath($_SERVER['SCRIPT_FILENAME']) === realpath(__FILE__)
) {
    RagIndexer::run();
}
