<?php
/**
 * DataCleaner - The Purifier
 * Cleanses data: UTF-8 normalization, whitespace trimming, newline→semicolon conversion
 */

require_once __DIR__ . '/DateFormatDetector.php';

class DataCleaner {

    /**
     * Clean a single cell value.
     *
     * Pipeline: cast to string, normalize to UTF-8 (NFC when intl is available),
     * trim/collapse whitespace, then turn embedded newlines into semicolons.
     *
     * @param mixed  $value Raw cell value
     * @param string $type  Declared column type; accepted for API compatibility,
     *                      currently not used by the cleaning pipeline
     * @return string|null Cleaned value, or null when the value is null/empty
     *                     before or after cleaning
     */
    public static function clean($value, $type = 'VARCHAR') {
        if ($value === null || $value === '') {
            return null;
        }

        $value = self::normalizeUTF8((string)$value);
        $value = self::trimWhitespace($value);
        $value = self::convertNewlinesToSemicolons($value);

        return $value === '' ? null : $value;
    }

    /**
     * Normalize a value to valid UTF-8 in NFC form.
     *
     * @param string $value Text in UTF-8 or a single-byte Latin encoding
     * @return string UTF-8 text
     */
    public static function normalizeUTF8($value) {
        if (!mb_check_encoding($value, 'UTF-8')) {
            // 'auto' detection only considers mbstring's detect order (ASCII and
            // UTF-8 by default), which by definition cannot match here. Fall back
            // to ISO-8859-1 - the same assumption detectAndConvertEncoding() makes -
            // because every byte sequence is convertible from it.
            $value = mb_convert_encoding($value, 'UTF-8', 'ISO-8859-1');
        }

        // NFC composition is only available with the intl extension
        if (class_exists('Normalizer')) {
            $normalized = Normalizer::normalize($value, Normalizer::FORM_C);
            // normalize() returns false on failure; keep the input in that case
            if (is_string($normalized)) {
                $value = $normalized;
            }
        }

        return $value;
    }

    /**
     * Strip surrounding whitespace and collapse runs of spaces.
     *
     * Only runs of two or more space characters are collapsed; tabs and
     * newlines inside the value are left untouched.
     *
     * @param string $value
     * @return string
     */
    public static function trimWhitespace($value) {
        $value = trim($value);

        return preg_replace('/  +/', ' ', $value);
    }

    /**
     * Convert newlines to semicolons for easier parsing.
     * Example: "email1@test.com\nemail2@test.com" → "email1@test.com;email2@test.com"
     *
     * Consecutive semicolons are collapsed and trailing ones removed.
     *
     * @param string $value
     * @return string
     */
    public static function convertNewlinesToSemicolons($value) {
        // "\r\n" must come first so a Windows line break yields one semicolon
        $value = str_replace(["\r\n", "\r", "\n"], ';', $value);
        $value = preg_replace('/;+/', ';', $value);

        return rtrim($value, ';');
    }

    /**
     * Clean every value of a row, preserving its keys.
     *
     * @param array $row   Row values keyed by column
     * @param array $types Optional column types keyed like $row (default 'VARCHAR')
     * @return array Cleaned row with the same keys
     */
    public static function cleanRow(array $row, array $types = []) {
        $cleaned = [];
        foreach ($row as $key => $value) {
            $cleaned[$key] = self::clean($value, $types[$key] ?? 'VARCHAR');
        }
        return $cleaned;
    }

    /**
     * Sanitize a column name into a lowercase [a-z0-9_] SQL identifier.
     *
     * Accented Latin letters are transliterated, common symbols are spelled out
     * (e.g. "%" → "_percentage"), control characters are dropped and everything
     * else becomes an underscore.
     *
     * @param mixed $name Raw header text
     * @return string Sanitized identifier; "col_<uniqid>" when nothing usable remains
     */
    public static function sanitizeColumnName($name): string
    {
        $name = trim((string)$name);

        // Transliterate accented characters to their ASCII equivalents
        $accentedChars = [
            'á' => 'a', 'à' => 'a', 'â' => 'a', 'ä' => 'a', 'ã' => 'a', 'å' => 'a',
            'é' => 'e', 'è' => 'e', 'ê' => 'e', 'ë' => 'e',
            'í' => 'i', 'ì' => 'i', 'î' => 'i', 'ï' => 'i',
            'ó' => 'o', 'ò' => 'o', 'ô' => 'o', 'ö' => 'o', 'õ' => 'o',
            'ú' => 'u', 'ù' => 'u', 'û' => 'u', 'ü' => 'u',
            'ñ' => 'n', 'ç' => 'c',
            'Á' => 'A', 'À' => 'A', 'Â' => 'A', 'Ä' => 'A', 'Ã' => 'A', 'Å' => 'A',
            'É' => 'E', 'È' => 'E', 'Ê' => 'E', 'Ë' => 'E',
            'Í' => 'I', 'Ì' => 'I', 'Î' => 'I', 'Ï' => 'I',
            'Ó' => 'O', 'Ò' => 'O', 'Ô' => 'O', 'Ö' => 'O', 'Õ' => 'O',
            'Ú' => 'U', 'Ù' => 'U', 'Û' => 'U', 'Ü' => 'U',
            'Ñ' => 'N', 'Ç' => 'C'
        ];
        $name = strtr($name, $accentedChars);

        // Spell out common symbols BEFORE stripping invalid characters so their
        // meaning survives. Runs as a second pass so that a transliterated
        // letter can still complete a symbol such as "°C".
        $symbolReplacements = [
            // Percentage (most common in business data)
            '%' => '_percentage',

            // Currency symbols
            '$' => '_dollars',
            '£' => '_pounds',
            '€' => '_euros',
            '¥' => '_yen',

            // Temperature (strtr prefers the longest key, so "°C" wins over "°")
            '°C' => '_celsius',
            '°F' => '_fahrenheit',
            '°' => '_degrees',

            // Number/Hash (common for ID fields)
            '#' => '_number',

            // Other common business symbols
            '@' => '_at',
            '&' => '_and',
        ];
        $name = strtr($name, $symbolReplacements);

        // Drop invisible/control characters. The class is pure ASCII, so no /u
        // modifier: with /u an invalid UTF-8 header would make preg_replace()
        // return null and wipe out the whole name.
        $name = preg_replace('/[\x00-\x1F\x7F]/', '', $name);

        // Replace any remaining invalid chars with underscore, then tidy up
        $name = preg_replace('/[^a-zA-Z0-9_]/', '_', $name);
        $name = preg_replace('/_+/', '_', $name);
        $name = trim($name, '_');
        $name = mb_strtolower($name);

        // Enforce non-empty
        return $name !== '' ? $name : 'col_' . uniqid();
    }


    /**
     * Sanitize a table name into a lowercase [a-z0-9_] SQL identifier.
     *
     * A trailing ".xls"/".xlsx" extension is removed first.
     *
     * @param mixed $name Raw name, typically an uploaded file name
     * @return string Sanitized identifier; "imported_data" when nothing usable remains
     */
    public static function sanitizeTableName($name): string
    {
        $name = preg_replace('/\.xlsx?$/i', '', (string)$name);
        $name = mb_strtolower($name);

        // Anything outside a-z, 0-9 and underscore becomes an underscore
        $name = preg_replace('/[^a-z0-9_]/', '_', $name);
        $name = trim($name, '_');

        if (empty($name)) {
            $name = 'imported_data';
        }

        return $name;
    }

    /**
     * Parse and format date value for MySQL.
     * Handles both Excel numeric dates and text dates with intelligent format detection.
     *
     * Resolution order:
     *   1. Excel serial number (numeric value within Excel's date range)
     *   2. DateFormatDetector, when the column's values are supplied
     *   3. A fixed list of common formats (day-first tried before month-first)
     *   4. strtotime() as a last resort
     *
     * @param mixed $value The date value from Excel
     * @param string $targetFormat The target format: 'DATE' or 'DATETIME'
     * @param array|null $columnValues All values from the column (for intelligent format detection)
     * @param string|null $columnName Column name (for hints like 'check_in', 'fecha')
     * @return string|null Formatted date string (YYYY-MM-DD or YYYY-MM-DD HH:MM:SS) or null
     */
    public static function parseDate($value, $targetFormat = 'DATE', $columnValues = null, $columnName = null) {
        if ($value === null) {
            return null;
        }

        $value = trim((string)$value);
        if ($value === '') {
            return null;
        }

        $outputFormat = ($targetFormat === 'DATETIME') ? 'Y-m-d H:i:s' : 'Y-m-d';

        // Case 1: Excel numeric date (e.g., 45324)
        if (is_numeric($value)) {
            $excelDate = floatval($value);

            // Excel serials run from 1 up to 2958465 (9999-12-31). Anything
            // outside that range is not a serial and falls through to text parsing.
            if ($excelDate >= 1 && $excelDate < 2958466) {
                // 25569 is the Excel serial of 1970-01-01 (the Unix epoch).
                // Round rather than truncate: fractional days are binary floats,
                // so 14:30:00 may otherwise come out as 14:29:59.
                $unixTimestamp = (int) round(($excelDate - 25569) * 86400);

                // Serials carry no timezone, so format in UTC; date() would shift
                // the result by the server offset and can land on the previous day.
                return gmdate($outputFormat, $unixTimestamp);
            }
        }

        // Case 2: Text date formats with intelligent detection
        // Use DateFormatDetector to analyze column context and determine the correct format
        if ($columnValues !== null && is_array($columnValues)) {
            // Filter out null/empty values for analysis
            $validValues = array_filter($columnValues, function($v) {
                return $v !== null && $v !== '' && trim((string)$v) !== '';
            });

            if (!empty($validValues)) {
                $format = DateFormatDetector::detectFormat($validValues, $columnName ?? '');
                $result = DateFormatDetector::parseDate($value, $format, $targetFormat);

                if ($result !== null) {
                    return $result;
                }
            }
        }

        // Case 3: Try common formats without context (legacy fallback)
        $formats = [
            // 4-digit year formats
            'd/m/Y',        // 30/11/2025 (DD/MM/YYYY - European/Latin American)
            'd-m-Y',        // 30-11-2025 (DD-MM-YYYY)
            'd.m.Y',        // 30.11.2025
            'm/d/Y',        // 11/30/2025 (MM/DD/YYYY - American)
            'm-d-Y',        // 11-30-2025
            'm.d.Y',        // 11.30.2025
            'n/j/Y',        // 5/3/2023 (M/D/YYYY)
            'j/n/Y',        // 3/5/2023 (D/M/YYYY)
            'Y-m-d',        // 2025-11-30 (ISO)
            'Y/m/d',        // 2025/11/30
            'Y-m-d H:i:s',  // 2025-11-30 14:30:00
            'm/d/Y H:i:s',  // 11/30/2025 14:30:00
            'd/m/Y H:i:s',  // 30/11/2025 14:30:00
            // 2-digit year formats (with pivot logic applied)
            'd/m/y',        // 30/11/25 (DD/MM/YY)
            'd-m-y',        // 30-11-25
            'd.m.y',        // 30.11.25
            'm/d/y',        // 11/30/25 (MM/DD/YY)
            'm-d-y',        // 11-30-25
            'm.d.y',        // 11.30.25
        ];

        foreach ($formats as $format) {
            // The leading "!" resets every field that the format does not parse,
            // so date-only input yields 00:00:00 instead of the current time.
            $date = DateTime::createFromFormat('!' . $format, $value);
            if ($date === false) {
                continue;
            }

            // createFromFormat() silently rolls over out-of-range fields
            // (month 30 becomes +2 years) and only reports it as a warning.
            // Treat any warning as "this format does not fit". getLastErrors()
            // returns false instead of an array on PHP 8.2+ when all is clean.
            $errors = DateTime::getLastErrors();
            if (is_array($errors) && ($errors['warning_count'] > 0 || $errors['error_count'] > 0)) {
                continue;
            }

            $isTwoDigitYearFormat = strpos($format, 'y') !== false;

            if ($isTwoDigitYearFormat) {
                // PHP pivots 2-digit years at 70; re-apply our own pivot:
                // 00-49 => 2000-2049, 50-99 => 1950-1999
                if (preg_match('/(\d{2})$/', $value, $matches)) {
                    $twoDigitYear = (int) $matches[1];
                    $adjustedYear = ($twoDigitYear <= 49) ? (2000 + $twoDigitYear) : (1900 + $twoDigitYear);
                    $date->setDate($adjustedYear, (int) $date->format('m'), (int) $date->format('d'));
                }
            } elseif ((int) $date->format('Y') < 100) {
                // "Y" can accept fewer than four digits, turning "30/11/25" into
                // year 0025. Skip so the 2-digit-year formats get their turn.
                continue;
            }

            return $date->format($outputFormat);
        }

        // Case 4: Try strtotime as last resort
        $timestamp = strtotime($value);
        if ($timestamp !== false) {
            return date($outputFormat, $timestamp);
        }

        // If all parsing fails, return null
        return null;
    }

    /**
     * Check if value is empty (null, empty string, or whitespace only).
     *
     * @param mixed $value
     * @return bool
     */
    public static function isEmpty($value) {
        return $value === null || $value === '' || trim($value) === '';
    }

    /**
     * Validate email format.
     *
     * Accepts a single address or several separated by semicolons. Blank
     * segments are ignored, so an empty value is reported as valid.
     *
     * @param mixed $value
     * @return bool False as soon as one non-blank segment is not a valid address
     */
    public static function isEmail($value) {
        foreach (explode(';', (string)$value) as $email) {
            $email = trim($email);
            // Compare against '' explicitly: empty() would also skip the
            // segment "0" and let it pass as a valid address.
            if ($email !== '' && !filter_var($email, FILTER_VALIDATE_EMAIL)) {
                return false;
            }
        }
        return true;
    }

    // ========================================
    // CSV-SPECIFIC CLEANING METHODS
    // ========================================

    /**
     * Auto-detect CSV delimiter by analyzing first few lines.
     * Tests: comma, semicolon, tab, pipe
     *
     * Delimiter characters inside double-quoted fields are ignored. On a tie
     * the candidate listed first wins.
     *
     * @param string $filePath Path to CSV file
     * @param int $sampleLines Number of lines to sample (default: 5)
     * @return string Detected delimiter (default: ',')
     */
    public static function detectDelimiter($filePath, $sampleLines = 5) {
        $handle = fopen($filePath, 'r');
        if (!$handle) {
            return ','; // Default fallback
        }

        // Candidates in order of likelihood; this order also breaks ties
        $delimiters = [',', ';', "\t", '|'];
        $delimiterCounts = array_fill_keys($delimiters, 0);

        // Check the limit before reading so no extra line is consumed
        $lineCount = 0;
        while ($lineCount < $sampleLines && ($line = fgets($handle)) !== false) {
            // Drop quoted segments so "Doe, John" does not vote for the comma
            $unquoted = preg_replace('/"[^"]*"/', '', $line);
            if ($unquoted === null) {
                $unquoted = $line;
            }

            foreach ($delimiters as $delimiter) {
                $delimiterCounts[$delimiter] += substr_count($unquoted, $delimiter);
            }
            $lineCount++;
        }
        fclose($handle);

        // Pick the highest count; strict ">" keeps the earlier candidate on ties
        // and leaves the comma default in place when nothing was found.
        $detectedDelimiter = ',';
        $highestCount = 0;
        foreach ($delimiterCounts as $delimiter => $count) {
            if ($count > $highestCount) {
                $highestCount = $count;
                $detectedDelimiter = $delimiter;
            }
        }

        return $detectedDelimiter;
    }

    /**
     * Detect and convert file encoding to UTF-8.
     * Handles common encodings: UTF-8, ISO-8859-1, Windows-1252
     *
     * @param string $filePath Path to file
     * @return string|false Content converted to UTF-8 (BOM removed), or false
     *                      when the file cannot be read
     */
    public static function detectAndConvertEncoding($filePath) {
        $content = file_get_contents($filePath);
        if ($content === false) {
            // Make the failure explicit instead of feeding false to string functions
            return false;
        }

        // Strip BOM first if present
        $content = self::stripBOM($content);
        if ($content === '') {
            return '';
        }

        $encoding = mb_detect_encoding($content, ['UTF-8', 'ISO-8859-1', 'Windows-1252', 'ASCII'], true);

        if ($encoding && $encoding !== 'UTF-8') {
            $content = mb_convert_encoding($content, 'UTF-8', $encoding);
        } elseif (!$encoding) {
            // Fallback: assume ISO-8859-1 if detection fails
            $content = mb_convert_encoding($content, 'UTF-8', 'ISO-8859-1');
        }

        return $content;
    }

    /**
     * Strip UTF-8 BOM (Byte Order Mark) if present.
     * BOM can mess up the first header column.
     *
     * @param string $content File content
     * @return string Content without BOM
     */
    public static function stripBOM($content) {
        // UTF-8 BOM is the byte sequence EF BB BF
        $bom = "\xEF\xBB\xBF";
        if (substr($content, 0, 3) === $bom) {
            return substr($content, 3);
        }
        return $content;
    }

    /**
     * Normalize row length to match header count.
     * Pads short rows with null, truncates long rows.
     *
     * @param array $row Current row
     * @param int $expectedLength Expected column count
     * @return array Normalized row
     */
    public static function normalizeRowLength(array $row, $expectedLength) {
        $currentLength = count($row);

        if ($currentLength === $expectedLength) {
            return $row;
        }

        return ($currentLength < $expectedLength)
            ? array_pad($row, $expectedLength, null)
            : array_slice($row, 0, $expectedLength);
    }

    /**
     * Parse CSV file and return structured data (similar to XLSX output).
     * Handles encoding detection, delimiter detection, and row normalization.
     *
     * The first non-blank line is treated as the header row. Blank lines are
     * skipped and are not counted as inconsistent rows.
     *
     * @param string $filePath Path to CSV file
     * @return array ['headers' => [...], 'rows' => [...], 'metadata' => [...]]
     * @throws Exception When the file cannot be read
     */
    public static function parseCSV($filePath) {
        // Read and convert first so an unreadable file fails loudly
        $content = self::detectAndConvertEncoding($filePath);
        if ($content === false) {
            throw new Exception("Unable to open CSV file: $filePath");
        }

        $delimiter = self::detectDelimiter($filePath);

        // Parse the converted content from an in-memory stream: no temp file
        // on disk to clean up (or leak) if parsing throws.
        $handle = fopen('php://temp', 'r+');
        if (!$handle) {
            throw new Exception("Unable to open CSV file: $filePath");
        }

        $headers = [];
        $rows = [];
        $headerCount = 0;
        $headersRead = false;
        $inconsistentRows = 0;

        try {
            fwrite($handle, $content);
            rewind($handle);

            while (($row = fgetcsv($handle, 0, $delimiter, '"', '\\')) !== false) {
                // fgetcsv() reports a blank line as a single null field
                if ($row === [null]) {
                    continue;
                }

                // First row is headers
                if (!$headersRead) {
                    foreach ($row as $header) {
                        $headers[] = self::sanitizeColumnName(trim((string)$header));
                    }
                    $headerCount = count($headers);
                    $headersRead = true;
                    continue;
                }

                // Normalize row length to match headers
                if (count($row) !== $headerCount) {
                    $inconsistentRows++;
                    $row = self::normalizeRowLength($row, $headerCount);
                }

                // Clean each cell
                $cleanedRow = [];
                foreach ($row as $cell) {
                    $cleanedRow[] = self::clean($cell);
                }

                $rows[] = $cleanedRow;
            }
        } finally {
            fclose($handle);
        }

        return [
            'headers' => $headers,
            'rows' => $rows,
            'metadata' => [
                'total_rows' => count($rows),
                'delimiter' => $delimiter,
                'delimiter_name' => self::getDelimiterName($delimiter),
                'inconsistent_rows' => $inconsistentRows,
                'encoding_converted' => true
            ]
        ];
    }

    /**
     * Get human-readable name for delimiter.
     *
     * @param string $delimiter
     * @return string 'Comma', 'Semicolon', 'Tab', 'Pipe' or 'Unknown'
     */
    private static function getDelimiterName($delimiter) {
        $names = [
            ',' => 'Comma',
            ';' => 'Semicolon',
            "\t" => 'Tab',
            '|' => 'Pipe'
        ];
        return $names[$delimiter] ?? 'Unknown';
    }
}
