<?php
/**
 * SchemaDetector - The Listener
 * Infers column types by analyzing data patterns
 */

class SchemaDetector {

    /**
     * Column-name pattern that marks identifier-like fields (ids, codes, keys, folios).
     * Shared by name-based inference and conflict resolution so that both
     * always agree on what counts as an ID field.
     */
    private const ID_NAME_PATTERN = '/_id$|^id$|codigo|_code$|^code$|_key$|^key$|folio/i';

    /**
     * Base types accepted in a metadata (hint) row.
     */
    private const HINT_TYPES = ['text', 'varchar', 'int', 'decimal', 'date', 'datetime', 'boolean', 'enum'];

    /**
     * Detect schema from rows of data
     *
     * For every header the column values are analyzed (name-based and
     * value-based detection), then an optional hint for that header
     * overrides the detected type/length and supplies a comment.
     *
     * @param array $rows    Data rows; each row is read by header key
     * @param array $headers Column headers, in output order
     * @param array $hints   Optional hints keyed by header, each shaped like
     *                       ['type' => ..., 'length' => ..., 'comment' => ...]
     *                       (the shape parseMetadataRow() produces)
     * @return array List of column definitions, one per header
     */
    public static function detectSchema(array $rows, array $headers, array $hints = []) {
        $schema = [];

        foreach ($headers as $header) {
            // One entry per row (null when the row has no such key) so that
            // missing cells count towards "nullable" instead of vanishing.
            $values = self::columnValues($rows, $header);

            // Analyze the column
            $columnInfo = self::analyzeColumn($values, $header);

            // Apply hints if available
            $comment = '';
            if (isset($hints[$header]) && is_array($hints[$header])) {
                $hint = $hints[$header];
                $columnInfo = self::applyHint($columnInfo, $hint);

                if (!empty($hint['comment'])) {
                    $comment = $hint['comment'];
                }
            }

            $schema[] = [
                'original' => $header,
                'name' => DataCleaner::sanitizeColumnName($header),
                'type' => $columnInfo['type'],
                'length' => $columnInfo['length'],
                'nullable' => $columnInfo['nullable'],
                'indexed' => $columnInfo['indexed'],
                'primary' => false, // UUID is always primary
                'auto_increment' => false,
                'comment' => $comment,
                // Conflict detection metadata
                'detection_method' => $columnInfo['detection_method'] ?? 'unknown',
                'conflict_detected' => $columnInfo['conflict_detected'] ?? false,
                'name_based_suggestion' => $columnInfo['name_based_suggestion'] ?? null,
                'value_based_suggestion' => $columnInfo['value_based_suggestion'] ?? null,
                'conflict_reason' => $columnInfo['conflict_reason'] ?? null
            ];
        }

        return $schema;
    }

    /**
     * Collect the values of one column, one entry per row.
     *
     * Unlike array_column(), rows that lack the key contribute null rather
     * than being skipped, so the result always has count($rows) entries.
     *
     * @param array      $rows   Data rows (arrays, or objects with public properties)
     * @param int|string $header Column key
     * @return array
     */
    private static function columnValues(array $rows, $header): array {
        return array_map(
            static function ($row) use ($header) {
                if (is_array($row)) {
                    return array_key_exists($header, $row) ? $row[$header] : null;
                }
                if (is_object($row)) {
                    return $row->{$header} ?? null;
                }
                return null;
            },
            $rows
        );
    }

    /**
     * Override the detected type/length of a column with a user hint.
     *
     * @param array $columnInfo Result of analyzeColumn()
     * @param array $hint       ['type' => string, 'length' => mixed, ...]; missing keys are tolerated
     * @return array $columnInfo with 'type' and 'length' adjusted
     */
    private static function applyHint(array $columnInfo, array $hint): array {
        $hintLength = $hint['length'] ?? null;

        // Map hint type to system type (anything from "(" onwards is dropped)
        $baseType = strtolower(preg_replace('/\(.*/', '', (string)($hint['type'] ?? '')));
        if ($baseType === '') {
            // No usable type in the hint: keep what detection found
            return $columnInfo;
        }

        if ($baseType === 'text') {
            // "text" with a length means a bounded string
            if ($hintLength) {
                $columnInfo['type'] = 'VARCHAR';
                $columnInfo['length'] = $hintLength;
            } else {
                $columnInfo['type'] = 'TEXT';
                $columnInfo['length'] = null;
            }
            return $columnInfo;
        }

        if ($baseType === 'enum') {
            $columnInfo['type'] = 'ENUM';
            $columnInfo['length'] = $hintLength; // Preserve ('Vigente','Cancelado')
            return $columnInfo;
        }

        $hintedType = strtoupper($baseType);
        if ($hintLength) {
            $columnInfo['length'] = $hintLength;
        } elseif ($hintedType !== $columnInfo['type']) {
            // The detected length belongs to a different type (e.g. VARCHAR's 50
            // must not leak into DATE), so fall back to a default for the hinted type.
            $columnInfo['length'] = self::defaultLengthFor($hintedType);
        }
        $columnInfo['type'] = $hintedType;

        return $columnInfo;
    }

    /**
     * Default length used when a hint changes the type without giving a length.
     *
     * @param string $type Upper-case type name
     * @return int|string|null
     */
    private static function defaultLengthFor(string $type) {
        if ($type === 'DECIMAL') {
            return '10,2';
        }
        if ($type === 'VARCHAR') {
            return 255;
        }
        return null;
    }

    /**
     * Try to interpret a row as a metadata (hint) row.
     *
     * Each non-empty cell is parsed as "type[, length][, comment]". The row
     * is accepted only when at least half of its non-empty cells parse.
     *
     * @param array $rowValues Cell values keyed by header
     * @return array|null Hints keyed by header, or null when the row does
     *                    not look like a metadata row
     */
    public static function parseMetadataRow(array $rowValues) {
        $totalValues = count(array_filter($rowValues, fn($v) => !DataCleaner::isEmpty($v)));

        if ($totalValues === 0) {
            return null;
        }

        $hints = [];
        foreach ($rowValues as $header => $value) {
            if (DataCleaner::isEmpty($value)) {
                continue;
            }

            // Only add a hint if we successfully parsed a type
            $hint = self::parseHintCell(trim((string)$value));
            if ($hint !== null) {
                $hints[$header] = $hint;
            }
        }

        $matchCount = count($hints);

        return ($matchCount > 0 && ($matchCount / $totalValues) >= 0.5) ? $hints : null;
    }

    /**
     * Parse one metadata cell into a hint.
     *
     * Recognized forms:
     *  - ENUM, ('a','b'), comment      (parenthesized value list kept verbatim)
     *  - varchar(50), comment          (length written inline after the type)
     *  - decimal(10,2), comment        (precision/scale written inline)
     *  - type, length, comment         (CSV form; decimal may use "type, p, s")
     *
     * @param string $value Trimmed, non-empty cell text
     * @return array|null ['type' => string, 'length' => mixed, 'comment' => ?string]
     *                    or null when the cell does not start with a known type
     */
    private static function parseHintCell(string $value): ?array {
        // Special case: ENUM with parentheses format.
        // Parsed manually because the value list itself contains commas.
        if (stripos($value, 'enum') === 0) {
            $parenStart = strpos($value, '(');
            // strrpos: take the last ")" so the whole value list is captured
            $parenEnd = strrpos($value, ')');
            if ($parenStart !== false && $parenEnd !== false && $parenEnd > $parenStart) {
                return [
                    'type' => 'enum',
                    'length' => substr($value, $parenStart, $parenEnd - $parenStart + 1),
                    // Everything after the closing paren is the comment
                    'comment' => trim(substr($value, $parenEnd + 1), ' ,')
                ];
            }
        }

        // Inline length: "varchar(50)" / "decimal(10,2)". Must be handled before
        // the CSV split, which would otherwise cut "decimal(10,2)" at its comma.
        if (preg_match('/^([a-z]+)\s*\(\s*(\d+)\s*(?:,\s*(\d+)\s*)?\)\s*(?:,(.*))?$/is', $value, $m)) {
            $inlineType = strtolower($m[1]);
            if (!in_array($inlineType, self::HINT_TYPES, true)) {
                return null;
            }

            $scale = $m[3] ?? '';
            $rest = trim($m[4] ?? '');
            $comment = null;
            if ($rest !== '') {
                $comment = implode(', ', array_map('trim', str_getcsv($rest, ',', '"', '\\')));
            }

            return [
                'type' => $inlineType,
                'length' => $scale !== '' ? $m[2] . ',' . $scale : (int)$m[2],
                'comment' => $comment
            ];
        }

        // Parse CSV format: type, length, comment
        $parts = array_map('trim', str_getcsv($value, ',', '"', '\\'));

        $type = strtolower((string)array_shift($parts));
        $baseType = preg_replace('/\(.*/', '', $type);

        // Validate type
        if (!in_array($baseType, self::HINT_TYPES, true)) {
            return null;
        }

        // Parse length parameter
        $length = null;
        if (!empty($parts)) {
            $lengthRaw = array_shift($parts);

            if (strtolower($lengthRaw) === 'null') {
                $length = null;
            } elseif ($baseType === 'decimal' && is_numeric($lengthRaw) && !empty($parts) && is_numeric($parts[0])) {
                // "decimal, 10, 2" -> precision and scale are two CSV fields
                $scaleRaw = array_shift($parts);
                $length = $lengthRaw . ',' . $scaleRaw;
            } elseif (preg_match('/^\((\d+)\s*,\s*(\d+)\)$/', $lengthRaw, $lm)) {
                // Quoted "(10,2)" field
                $length = $lm[1] . ',' . $lm[2];
            } elseif (is_numeric($lengthRaw)) {
                $length = (int)$lengthRaw;
            } else {
                // Not a length, put it back for comment processing
                array_unshift($parts, $lengthRaw);
            }
        }

        // Parse comment (remaining parts)
        $comment = !empty($parts) ? implode(', ', $parts) : null;

        return [
            'type' => $type,
            'length' => $length,
            'comment' => $comment
        ];
    }

    /**
     * Analyze a single column to determine its characteristics
     *
     * Runs both name-based and value-based detection and resolves any
     * conflict between them. A column whose values are all empty gets the
     * VARCHAR(255) default without consulting its name.
     *
     * @param array  $values     All values of the column (empty ones included)
     * @param string $columnName Original header
     * @return array type, length, nullable, indexed plus detection metadata
     */
    private static function analyzeColumn(array $values, string $columnName): array {
        // Filter out null/empty values for analysis
        $nonEmptyValues = array_filter($values, function($v) {
            return !DataCleaner::isEmpty($v);
        });

        // If all values are null/empty
        if (empty($nonEmptyValues)) {
            return [
                'type' => 'VARCHAR',
                'length' => 255,
                'nullable' => true,
                'indexed' => false,
                'detection_method' => 'default',
                'conflict_detected' => false
            ];
        }

        // Always run both detections
        $nameBasedType = self::inferTypeFromName($columnName);
        $valueBasedType = self::inferType($nonEmptyValues);
        $valueBasedResult = [
            'type' => $valueBasedType,
            'length' => self::inferLength($nonEmptyValues, $valueBasedType)
        ];

        // Resolve conflicts between name-based and value-based detection
        $resolved = self::resolveTypeConflict(
            $columnName,
            $nameBasedType,
            $valueBasedResult,
            $nonEmptyValues
        );

        return [
            'type' => $resolved['type'],
            'length' => $resolved['length'],
            // Nullable when at least one value was filtered out as empty
            'nullable' => count($nonEmptyValues) < count($values),
            // Indexed when the column name looks like an identifier
            'indexed' => self::shouldBeIndexed($columnName, $nonEmptyValues),
            'detection_method' => $resolved['detection_method'],
            'conflict_detected' => $resolved['conflict_detected'],
            'name_based_suggestion' => $resolved['name_based_suggestion'] ?? null,
            'value_based_suggestion' => $resolved['value_based_suggestion'] ?? null,
            'conflict_reason' => $resolved['conflict_reason'] ?? null
        ];
    }

    /**
     * Resolve conflicts between name-based and value-based type detection
     *
     * Priority rules:
     * 1. Data Compatibility - VARCHAR suggested by name but values need TEXT
     *    or a longer VARCHAR: values win
     * 2. Semantic Meaning - ID/Date fields keep name-based type
     * 3. Precision Preservation - Money fields keep name-based precision
     * 4. Value-Based Fallback - No name pattern = use values
     * Any other disagreement is settled in favour of the name-based type.
     *
     * @param string $columnName Column name
     * @param array|null $nameBased ['type' => 'TYPE', 'length' => X] or null
     * @param array $valueBased ['type' => 'TYPE', 'length' => X]
     * @param array $values Actual (non-empty) column values
     * @return array Resolution result with metadata
     */
    private static function resolveTypeConflict(string $columnName, ?array $nameBased, array $valueBased, array $values): array {
        // CASE 1: No name pattern - use value-based detection
        if ($nameBased === null) {
            return [
                'type' => $valueBased['type'],
                'length' => $valueBased['length'],
                'detection_method' => 'value_based',
                'conflict_detected' => false
            ];
        }

        $nameType = $nameBased['type'];
        $nameLength = $nameBased['length'];
        $valueType = $valueBased['type'];
        $valueLength = $valueBased['length'];

        // CASE 2: Both agree
        if ($nameType === $valueType && $nameLength === $valueLength) {
            return [
                'type' => $nameType,
                'length' => $nameLength,
                'detection_method' => 'both_agree',
                'conflict_detected' => false
            ];
        }

        // CASE 3: Conflict - apply priority rules
        $nameLabel = self::describeType($nameType, $nameLength);
        $valueLabel = self::describeType($valueType, $valueLength);

        // PRIORITY 1: Data Compatibility Check
        // Name suggests VARCHAR but values are long text -> use values
        if ($nameType === 'VARCHAR' && $valueType === 'TEXT') {
            return self::conflictResult(
                $valueBased,
                'value_based_override',
                $nameLabel,
                $valueType,
                'Values exceed VARCHAR max length, using TEXT'
            );
        }

        // Name-based VARCHAR length too small for the actual values
        if ($nameType === 'VARCHAR' && $valueType === 'VARCHAR') {
            $maxValueLength = 0;
            foreach ($values as $value) {
                $maxValueLength = max($maxValueLength, mb_strlen((string)$value, 'UTF-8'));
            }

            if ($nameLength && $maxValueLength > $nameLength) {
                return self::conflictResult(
                    ['type' => 'VARCHAR', 'length' => $valueLength], // length calculated from values
                    'value_based_override',
                    $nameLabel,
                    $valueLabel,
                    "Values up to {$maxValueLength} chars won't fit in VARCHAR({$nameLength})"
                );
            }
        }

        // PRIORITY 2: Semantic Meaning - ID fields
        // ID fields stay VARCHAR even if values look numeric
        if (preg_match(self::ID_NAME_PATTERN, $columnName)) {
            return self::conflictResult(
                $nameBased,
                'name_based_semantic',
                $nameLabel,
                $valueType,
                'ID fields must remain VARCHAR to support alphanumeric values'
            );
        }

        // PRIORITY 2: Semantic Meaning - Date fields
        // Date-named fields stay DATE even if values look like strings
        if ($nameType === 'DATE' && $valueType === 'VARCHAR') {
            return self::conflictResult(
                $nameBased,
                'name_based_semantic',
                $nameType,
                $valueType,
                'Date field name detected, using DATE type for SQL compatibility'
            );
        }

        // PRIORITY 3: Precision Preservation - Money fields
        // Keep the DECIMAL precision even if current values are integers
        if ($nameType === 'DECIMAL' && $valueType === 'INT') {
            return self::conflictResult(
                $nameBased,
                'name_based_precision',
                $nameLabel,
                $valueType,
                'Money field detected, preserving decimal precision for future values'
            );
        }

        // PRIORITY 3: Precision Preservation - Different DECIMAL precisions
        // Use name-based precision (10,4 for rates vs 10,2 for standard money)
        if ($nameType === 'DECIMAL' && $valueType === 'DECIMAL') {
            return self::conflictResult(
                $nameBased,
                'name_based_precision',
                $nameLabel,
                $valueLabel,
                'Using name-based precision for money field'
            );
        }

        // DEFAULT: Name-based wins for any other conflicts
        return self::conflictResult(
            $nameBased,
            'name_based',
            $nameLabel,
            $valueLabel,
            'Name pattern matched, using name-based type'
        );
    }

    /**
     * Build the result array for a detected conflict.
     *
     * @param array  $chosen     ['type' => ..., 'length' => ...] that won
     * @param string $method     Value for 'detection_method'
     * @param string $nameLabel  Human-readable name-based suggestion
     * @param string $valueLabel Human-readable value-based suggestion
     * @param string $reason     Why $chosen won
     * @return array
     */
    private static function conflictResult(array $chosen, string $method, string $nameLabel, string $valueLabel, string $reason): array {
        return [
            'type' => $chosen['type'],
            'length' => $chosen['length'],
            'detection_method' => $method,
            'conflict_detected' => true,
            'name_based_suggestion' => $nameLabel,
            'value_based_suggestion' => $valueLabel,
            'conflict_reason' => $reason
        ];
    }

    /**
     * Render a type as "TYPE(length)", or just "TYPE" when it has no length.
     *
     * @param string          $type
     * @param int|string|null $length
     * @return string
     */
    private static function describeType(string $type, $length): string {
        return $type . ($length ? "({$length})" : '');
    }

    /**
     * Infer data type from column name using pattern matching
     *
     * Rules are tried in order; the first matching pattern wins.
     *
     * @param string $columnName Original header (matched case-insensitively)
     * @return array|null ['type' => 'TYPE', 'length' => value] or null if no pattern matches
     */
    private static function inferTypeFromName(string $columnName): ?array {
        // Convert to lowercase for case-insensitive matching
        $name = strtolower($columnName);

        // For the generic "date" rule, drop the words update/validate/candidate
        // when they start a name token: they contain "date" without being dates
        // ("updated_by", "candidate_name"). Concatenated names such as
        // "pickupdate" are left alone because the word is preceded by a letter.
        $nameForDateRule = (string)preg_replace('/(?<![a-z])(?:update|validate|candidate)/', '', $name);

        // [pattern, subject, type, length]
        $rules = [
            // RULE 1: ID fields (*_id, codigo, *_code, *_key, folio)
            [self::ID_NAME_PATTERN, $name, 'VARCHAR', 100],

            // RULE 2: High-precision money (exchange rates)
            ['/exchange.*rate|interchange|tipo.*cambio|^tasa$/i', $name, 'DECIMAL', '10,4'],
            // "rate" as its own token; "rate_" needs a token boundary on the left
            // so that "corporate_name" or "separate_flag" are not taken for rates
            ['/^rate$|_rate$|(?:^|_)rate_/i', $name, 'DECIMAL', '10,4'],

            // RULE 3: Standard money fields
            ['/_amount$|_fee$|_payment$|_commission$|_percentage$|_convert$|_mxn$|_usd$|_eur$|_price$|_cost$|_total$|_subtotal$|precio|pago|comision/i', $name, 'DECIMAL', '10,2'],

            // RULE 4: Date fields (check-in/out, fecha, timestamps, anything with "date")
            ['/check.*in|check.*out|fecha|created_at$|updated_at$/i', $name, 'DATE', null],
            ['/date/i', $nameForDateRule, 'DATE', null],

            // RULE 5: Email fields
            ['/email|correo/i', $name, 'VARCHAR', 255],

            // RULE 6: Phone fields
            ['/phone|telefono|^tel$|_tel$|celular/i', $name, 'VARCHAR', 20],

            // RULE 7: Boolean/status fields
            ['/^is_|^has_|_status$|_estado$|^activo$|^active$|^enabled$/i', $name, 'VARCHAR', 50],

            // RULE 8: URL fields
            ['/_url$|_link$|website|sitio.*web/i', $name, 'VARCHAR', 500],

            // RULE 9: Description/notes fields
            ['/_description$|_notes$|_comentarios$|_observaciones$|^descripcion$|^notas$|^comentarios$|^observaciones$/i', $name, 'TEXT', null],

            // RULE 10: Quantity/count fields
            ['/_count$|_quantity$|_qty$|^no_of_|^number_of_|cantidad|^numero$/i', $name, 'INT', null],
        ];

        foreach ($rules as [$pattern, $subject, $type, $length]) {
            if (preg_match($pattern, $subject)) {
                return ['type' => $type, 'length' => $length];
            }
        }

        // No pattern matched - return null to trigger value-based detection
        return null;
    }

    /**
     * Infer data type from values
     *
     * A type is chosen when at least 80% of the values match it. Integers and
     * decimals are counted together: a numeric column containing any decimal
     * value is DECIMAL, so mixed columns neither truncate nor fall back to VARCHAR.
     *
     * @param array $values Non-empty column values
     * @return string One of INT, DECIMAL, DATE, TEXT, VARCHAR
     */
    private static function inferType(array $values): string {
        $total = count($values);
        if ($total === 0) {
            // Nothing to learn from; a zero threshold would otherwise yield INT
            return 'VARCHAR';
        }

        $intCount = 0;
        $floatCount = 0;
        $dateCount = 0;
        $longTextCount = 0;

        foreach ($values as $value) {
            $value = trim((string)$value);

            // Check if integer
            if (preg_match('/^-?\d+$/', $value)) {
                $intCount++;
                continue;
            }

            // Check if float/decimal
            if (is_numeric($value)) {
                $floatCount++;
                continue;
            }

            // Check if date (see isDate() for the accepted formats)
            if (self::isDate($value)) {
                $dateCount++;
                continue;
            }

            // Check if long text (>500 characters, counted like inferLength() does)
            if (mb_strlen($value, 'UTF-8') > 500) {
                $longTextCount++;
            }
        }

        // Determine type based on majority
        $threshold = $total * 0.8; // 80% threshold

        if ($floatCount === 0 && $intCount >= $threshold) {
            return 'INT';
        }

        if ($floatCount > 0 && ($intCount + $floatCount) >= $threshold) {
            return 'DECIMAL';
        }

        if ($dateCount >= $threshold) {
            return 'DATE';
        }

        if ($longTextCount > 0) {
            return 'TEXT';
        }

        // Default to VARCHAR
        return 'VARCHAR';
    }

    /**
     * Infer appropriate length for the column
     *
     * @param array  $values Non-empty column values
     * @param string $type   Type chosen by inferType()
     * @return int|string|null null for types without a length, '10,2' for
     *                         DECIMAL, otherwise a VARCHAR size of 50/100/255/500
     */
    private static function inferLength(array $values, string $type) {
        // TEXT, date types and INT don't need a length
        if (in_array($type, ['TEXT', 'DATE', 'DATETIME', 'INT'], true)) {
            return null;
        }

        // For DECIMAL, return precision
        if ($type === 'DECIMAL') {
            return '10,2';
        }

        // For VARCHAR, calculate max length in characters
        $maxLength = 0;
        foreach ($values as $value) {
            $maxLength = max($maxLength, mb_strlen((string)$value, 'UTF-8'));
        }

        // Add 20% buffer, then pick the first standard size that holds it
        $buffered = (int)($maxLength * 1.2);
        foreach ([50, 100, 255, 500] as $size) {
            if ($buffered <= $size) {
                return $size;
            }
        }

        // Larger than every standard size: cap at the biggest one
        return 500;
    }

    /**
     * Determine if column should be indexed
     *
     * Decided purely from the column name: id, codigo*, departamento*,
     * anything containing "nombre", or key.
     *
     * @param string $columnName Original header
     * @param array  $values     Column values (currently not consulted)
     * @return bool
     */
    private static function shouldBeIndexed(string $columnName, array $values): bool {
        return preg_match('/^id$|^codigo|^departamento|nombre|^key$/i', $columnName) === 1;
    }

    /**
     * Check if value matches date pattern
     *
     * Accepts Y-m-d, d/m/Y, m/d/Y, Y/m/d, d-m-Y and m-d-Y. The value must
     * round-trip exactly through the format, which rejects impossible dates
     * and values without zero padding.
     *
     * @param string $value
     * @return bool
     */
    private static function isDate(string $value): bool {
        foreach (['Y-m-d', 'd/m/Y', 'm/d/Y', 'Y/m/d', 'd-m-Y', 'm-d-Y'] as $format) {
            $parsed = DateTimeImmutable::createFromFormat($format, $value);
            if ($parsed && $parsed->format($format) === $value) {
                return true;
            }
        }

        return false;
    }

    /**
     * Check if values are unique (potential primary key candidate)
     *
     * @param array $values
     * @return bool True when no value occurs more than once
     */
    private static function areValuesUnique(array $values): bool {
        return count(array_unique($values)) === count($values);
    }
}
