<?php
/**
 * upload.php - Gates 1 & 2: The Arrival and The Listening
 * Receives XLSX files, parses them, detects schema, cleanses data
 */

session_start(); // Enable session for passing import_log_id to insert.php

require_once __DIR__ . '/config.php';
require_once __DIR__ . '/lib/ErrorHandler.php';
require_once __DIR__ . '/lib/DataCleaner.php';
require_once __DIR__ . '/lib/SchemaDetector.php';
require_once __DIR__ . '/lib/ImportLogger.php';
require_once __DIR__ . '/lib/ChunkReader.php';
require_once __DIR__ . '/lib/ProgressTracker.php';
require_once __DIR__ . '/vendor/autoload.php';

use PhpOffice\PhpSpreadsheet\IOFactory;
use PhpOffice\PhpSpreadsheet\Reader\Xlsx;

// Key under which ProgressTracker records the progress of this request
$progressKey = ProgressTracker::generateKey();

try {
    // ===== GATE 1: THE ARRIVAL =====

    // The 'xlsx' form field must be present and PHP must report a clean upload
    $uploadSucceeded = isset($_FILES['xlsx']) && $_FILES['xlsx']['error'] === UPLOAD_ERR_OK;
    if (!$uploadSucceeded) {
        ErrorHandler::jsonError('No file uploaded or upload error occurred', 'UPLOAD_ERROR', 400);
    }

    $file = $_FILES['xlsx'];

    // Reject anything larger than the configured limit (limit is reported in MB)
    if ($file['size'] > MAX_FILE_SIZE) {
        $maxSizeMb = MAX_FILE_SIZE / 1024 / 1024;
        ErrorHandler::jsonError('File size exceeds maximum allowed size (' . $maxSizeMb . 'MB)', 'FILE_TOO_LARGE', 400);
    }

    // The lower-cased extension of the client-supplied name must be whitelisted
    $fileExt = strtolower(pathinfo($file['name'], PATHINFO_EXTENSION));
    $extensionAllowed = in_array($fileExt, ALLOWED_EXTENSIONS);
    if (!$extensionAllowed) {
        ErrorHandler::jsonError('Invalid file type. Only .xlsx and .csv files are allowed', 'INVALID_FILE_TYPE', 400);
    }

    // Store the upload under a generated name inside the temp upload directory
    $uniqueFilename = uniqid('upload_', true) . '.' . $fileExt;
    $uploadPath = UPLOAD_DIR . $uniqueFilename;

    $stored = move_uploaded_file($file['tmp_name'], $uploadPath);
    if ($stored === false) {
        ErrorHandler::jsonError('Failed to save uploaded file', 'SAVE_ERROR', 500);
    }

    // Start progress on a provisional 0-100 scale; Gate 2 restarts it with the real row count
    ProgressTracker::start($progressKey, 100, 'parsing');
    ProgressTracker::update($progressKey, 5, 'File uploaded, starting analysis...');

    // ===== GATE 2: THE LISTENING =====

    // Initialise both accumulators before branching. Every parsing path below
    // only appends to $rows, so a sheet with no non-empty data rows (or only a
    // header row) would otherwise leave $rows undefined when it is counted
    // after parsing.
    $rows = [];
    $skippedRows = 0;

    // Branch based on file type
    if ($fileExt === 'csv') {
        // ===== CSV PARSING PATH =====

        ProgressTracker::update($progressKey, 10, 'Parsing CSV file...');

        $csvData = DataCleaner::parseCSV($uploadPath);

        $headers = $csvData['headers'];
        $totalRows = count($csvData['rows']);

        // Restart progress on a per-row scale now that the row count is known
        ProgressTracker::start($progressKey, $totalRows, 'parsing');
        ProgressTracker::update($progressKey, 0, "Analyzing $totalRows rows...");

        // Convert rows from indexed arrays to associative arrays keyed by header;
        // cells missing from a short row become null
        foreach ($csvData['rows'] as $idx => $row) {
            $rowData = [];
            foreach ($headers as $hIdx => $header) {
                $rowData[$header] = $row[$hIdx] ?? null;
            }
            $rows[] = $rowData;

            // Update progress every 10 rows
            if ($idx % 10 === 0) {
                $message = "Analyzing row " . ($idx + 1) . " of $totalRows...";
                ProgressTracker::update($progressKey, $idx + 1, $message);
            }
        }

        ProgressTracker::update($progressKey, $totalRows, "Analysis complete! Found $totalRows rows.");

        // Store CSV metadata for response
        $csvMetadata = $csvData['metadata'];
        $metadataHints = []; // CSV doesn't support metadata row hints

    } else {
        // ===== XLSX PARSING PATH - CHUNK-BASED FOR LARGE FILES =====

        // Step 1: Get file metadata (lightweight operation)
        ProgressTracker::update($progressKey, 10, 'Reading Excel file structure...');

        $metadata = ChunkReader::getFileMetadata($uploadPath);
        $highestRow = $metadata['total_rows'];
        $highestColumn = $metadata['highest_column'];
        $highestColumnIndex = $metadata['total_columns'];

        // Update progress with actual row count
        ProgressTracker::start($progressKey, $highestRow, 'parsing');
        ProgressTracker::update($progressKey, 0, "Analyzing $highestRow rows...");

        ErrorHandler::logError('XLSX file metadata loaded', [
            'total_rows' => $highestRow,
            'total_columns' => $highestColumnIndex
        ]);

        // Step 2: Read headers (row 1 only)
        $headerData = ChunkReader::readHeaders($uploadPath);
        $headers = $headerData['headers'];
        $headerMap = $headerData['headerMap'];

        // Gate 2.5: The Whisper - Check for metadata row
        $metadataHints = [];
        $startRow = 2;

        if ($highestRow >= 2) {
            $row2Data = ChunkReader::readRow($uploadPath, 2, $headerMap, $highestColumnIndex);
            $detectedHints = SchemaDetector::parseMetadataRow($row2Data);

            if ($detectedHints) {
                $metadataHints = $detectedHints;
                $startRow = 3; // Skip row 2 as it is metadata
            }
        }

        // Step 3: Determine if we should use chunk processing
        $totalDataRows = $highestRow - $startRow + 1;
        $useChunkProcessing = ($totalDataRows > MAX_ROWS_IN_MEMORY);

        if ($useChunkProcessing) {
            ErrorHandler::logError('Large file detected - using chunk processing', [
                'total_data_rows' => $totalDataRows,
                'max_rows_in_memory' => MAX_ROWS_IN_MEMORY,
                'chunk_size' => CHUNK_SIZE
            ]);

            // Step 4: Read data rows chunk by chunk. The callback appends to
            // $rows / $skippedRows by reference; $highestRow is captured so the
            // progress message can report the real total.
            ChunkReader::readInChunks(
                $uploadPath,
                $headerMap,
                $highestColumnIndex,
                $startRow,
                $highestRow,
                CHUNK_SIZE,
                function($chunkRows, $chunkStart, $chunkEnd) use (&$rows, &$skippedRows, $progressKey, $highestRow) {
                    // Accumulate rows from each chunk
                    foreach ($chunkRows as $rowData) {
                        // A row is empty when every cell is null or whitespace-only
                        $isEmpty = true;
                        foreach ($rowData as $value) {
                            if ($value !== null && trim((string)$value) !== '') {
                                $isEmpty = false;
                                break;
                            }
                        }

                        if ($isEmpty) {
                            $skippedRows++;
                        } else {
                            $rows[] = $rowData;
                        }
                    }

                    // Update progress: rows handled so far, out of the sheet's last row
                    $currentTotal = count($rows) + $skippedRows;
                    $message = "Analyzing row $chunkEnd of $highestRow...";
                    ProgressTracker::update($progressKey, $currentTotal, $message);

                    ErrorHandler::logError('Processed chunk', [
                        'rows_in_chunk' => count($chunkRows),
                        'total_rows_so_far' => $currentTotal,
                        'chunk_range' => "$chunkStart-$chunkEnd"
                    ]);
                }
            );
        } else {
            // Small file - read all at once.
            // The workbook is loaded through IOFactory with its default reader
            // settings. NOTE(review): a data-only reader is deliberately not used
            // here, since it would not load the number formats that
            // getFormattedValue() below relies on - confirm against PhpSpreadsheet docs.
            $spreadsheet = IOFactory::load($uploadPath);
            $worksheet = $spreadsheet->getActiveSheet();

            for ($row = $startRow; $row <= $highestRow; $row++) {
                $rowData = [];
                $isEmpty = true;

                for ($col = 1; $col <= $highestColumnIndex; $col++) {
                    $cell = $worksheet->getCell([$col, $row]);

                    // For numeric cells, use getFormattedValue() to preserve European number formatting
                    // For other types (text, dates, formulas), use getValue() to get raw value
                    $dataType = $cell->getDataType();
                    if ($dataType === \PhpOffice\PhpSpreadsheet\Cell\DataType::TYPE_NUMERIC) {
                        // Get formatted string (preserves European commas as decimal separators)
                        $cellValue = $cell->getFormattedValue();
                    } else {
                        // Get raw value for non-numeric cells
                        $cellValue = $cell->getValue();
                    }

                    $cleanedValue = DataCleaner::clean($cellValue);

                    $header = $headerMap[$col];
                    $rowData[$header] = $cleanedValue;

                    if ($cleanedValue !== null) {
                        $isEmpty = false;
                    }
                }

                // Skip empty rows (counted and recorded as a row-level error)
                if ($isEmpty) {
                    $skippedRows++;
                    ErrorHandler::trackRowError($row, 'All cells empty');
                    continue;
                }

                $rows[] = $rowData;

                // Update progress every 50 rows
                if (($row - $startRow + 1) % 50 === 0) {
                    $message = "Analyzing row " . ($row - $startRow + 1) . " of " . ($highestRow - $startRow + 1) . "...";
                    ProgressTracker::update($progressKey, $row - $startRow + 1, $message);
                }
            }

            // Free memory
            $spreadsheet->disconnectWorksheets();
            unset($spreadsheet);
        }
    }

    // Parsing is done: publish the final row count and move the tracker to the 'ready' phase
    $totalRows = count($rows);
    ProgressTracker::update($progressKey, $totalRows, "Analysis complete! Found $totalRows rows.");
    ProgressTracker::setPhase($progressKey, 'ready', 'Ready to insert');

    // Table name: a non-empty 'tableName' form field wins; otherwise it is
    // derived from the uploaded file's base name. Both go through sanitizeTableName().
    $requestedTableName = isset($_POST['tableName']) ? trim($_POST['tableName']) : '';
    if (!empty($requestedTableName)) {
        $tableName = DataCleaner::sanitizeTableName($requestedTableName);
    } else {
        $originalFilename = pathinfo($file['name'], PATHINFO_FILENAME);
        $tableName = DataCleaner::sanitizeTableName($originalFilename);
    }

    // ===== KEEP UPLOADED FILE WITH SEARCHABLE NAME =====
    // Create persistent copy with table name + timestamp for easy finding
    $uploadsDir = __DIR__ . '/uploads/';

    // Create the directory when missing. The trailing is_dir() re-check tolerates
    // another request creating it between our check and mkdir().
    if (!is_dir($uploadsDir) && !mkdir($uploadsDir, 0777, true) && !is_dir($uploadsDir)) {
        ErrorHandler::logError('Failed to create uploads directory', [
            'directory' => $uploadsDir
        ]);
    }

    $timestamp = date('Y-m-d_H-i-s');
    $persistentFilename = "{$tableName}_{$timestamp}.{$fileExt}";
    $persistentPath = $uploadsDir . $persistentFilename;

    // Copy the temp file to persistent storage. This stays best-effort (a failed
    // copy does not abort the upload), but the outcome is logged truthfully
    // instead of always reporting success.
    if (copy($uploadPath, $persistentPath)) {
        ErrorHandler::logError('File saved for persistence', [
            'table' => $tableName,
            'original_name' => $file['name'],
            'persistent_path' => $persistentPath,
            'persistent_filename' => $persistentFilename
        ]);
    } else {
        ErrorHandler::logError('Failed to save persistent copy of uploaded file', [
            'table' => $tableName,
            'original_name' => $file['name'],
            'persistent_path' => $persistentPath,
            'persistent_filename' => $persistentFilename
        ]);
    }

    /**
     * Decide whether a column should be dropped as junk.
     *
     * A column is junk when its name looks auto-generated (column_<digits>,
     * case-insensitive) or when none of its values holds anything other than
     * whitespace. Every positive decision is logged through ErrorHandler.
     *
     * NOTE: an empty $columnData (no rows at all) also counts as "all values empty".
     *
     * @param string $columnName Header name of the column
     * @param array  $columnData Values of that column, one entry per row
     * @return bool True when the column should be filtered out
     */
    function isJunkColumn($columnName, $columnData) {
        // Rule 1: generic auto-generated name (e.g. column_18, column_19)
        $looksAutoGenerated = preg_match('/^column_\d+$/i', $columnName);
        if ($looksAutoGenerated) {
            ErrorHandler::logError('Junk column detected: generic name pattern', ['column' => $columnName]);
            return true;
        }

        // Rule 2: the first value with real content proves the column is not junk
        foreach ($columnData as $cell) {
            if ($cell === null) {
                continue;
            }
            if (trim($cell) !== '') {
                return false;
            }
        }

        ErrorHandler::logError('Junk column detected: all values empty', ['column' => $columnName]);
        return true;
    }

    /**
     * Drop junk columns from a header list.
     *
     * Each header's values are pulled out of $rows and judged by isJunkColumn().
     * Only the header list is filtered; $rows itself is not modified. When at
     * least one column is dropped, a summary is logged.
     *
     * @param array $headers Column names in file order
     * @param array $rows    Parsed rows, each an associative array keyed by header
     * @return array The headers that survived, in their original order
     */
    function filterJunkColumns($headers, $rows) {
        $kept = [];
        $dropped = [];

        foreach ($headers as $header) {
            // All values found under this header across the rows
            $values = array_column($rows, $header);

            if (isJunkColumn($header, $values)) {
                $dropped[] = $header;
                continue;
            }

            $kept[] = $header;
        }

        if (count($dropped) > 0) {
            ErrorHandler::logError('Junk columns filtered out', [
                'removed_count' => count($dropped),
                'columns' => $dropped,
                'original_count' => count($headers),
                'clean_count' => count($kept)
            ]);
        }

        return $kept;
    }

    /**
     * Calculate schema match score between file headers and saved schema.
     *
     * The score is the share of file headers whose name also appears in the
     * saved schema, expressed as a percentage. Names are compared as exact
     * strings: a loose comparison would treat numeric-looking names such as
     * "1e3" and "1000" (or "01" and "1") as the same column.
     *
     * @param array $fileHeaders Column names found in the uploaded file
     * @param array $savedSchema Saved column definitions, each with a 'name' key
     * @return int|float Percentage in the range 0-100 (0 when there are no file headers)
     */
    function calculateSchemaMatchScore($fileHeaders, $savedSchema) {
        // name => true lookup: exact-key membership, one pass instead of a scan per header
        $savedNames = [];
        foreach ($savedSchema as $col) {
            $savedNames[(string) $col['name']] = true;
        }

        $matchingCount = 0;
        $totalCount = count($fileHeaders);

        foreach ($fileHeaders as $header) {
            if (isset($savedNames[(string) $header])) {
                $matchingCount++;
            }
        }

        // Guard against division by zero for a file without headers
        $score = ($totalCount > 0) ? ($matchingCount / $totalCount) * 100 : 0;

        ErrorHandler::logError('Schema match score calculated', [
            'matching_columns' => $matchingCount,
            'total_file_columns' => $totalCount,
            'saved_schema_columns' => count($savedSchema),
            'score' => round($score, 2) . '%'
        ]);

        return $score;
    }

    // ===== FILTER JUNK COLUMNS BEFORE PROCESSING =====
    $originalHeaderCount = count($headers);
    $headers = filterJunkColumns($headers, $rows);
    $cleanHeaderCount = count($headers);

    // Log only when the filter actually removed something
    $removedHeaderCount = $originalHeaderCount - $cleanHeaderCount;
    if ($removedHeaderCount !== 0) {
        ErrorHandler::logError('Headers cleaned', [
            'original_count' => $originalHeaderCount,
            'clean_count' => $cleanHeaderCount,
            'removed' => $removedHeaderCount
        ]);
    }

    // Schema detection runs on the cleaned headers. For chunk-processed XLSX
    // files with more rows than SCHEMA_SAMPLE_SIZE, only an evenly spaced sample
    // of the already-loaded rows is handed to the detector.
    $sampleForSchema = $fileExt === 'xlsx'
        && !empty($useChunkProcessing)
        && count($rows) > SCHEMA_SAMPLE_SIZE;

    $rowsForSchema = $rows;
    if ($sampleForSchema) {
        $totalRows = count($rows);
        // Step between sampled rows; at least 1 because $totalRows exceeds the sample size
        $interval = floor($totalRows / SCHEMA_SAMPLE_SIZE);

        $rowsForSchema = [];
        for ($i = 0; $i < SCHEMA_SAMPLE_SIZE; $i++) {
            $index = $i * $interval;
            if ($index >= $totalRows) {
                continue;
            }
            $rowsForSchema[] = $rows[$index];
        }

        ErrorHandler::logError('Using sampled rows for schema detection', [
            'total_rows' => $totalRows,
            'sample_size' => count($rowsForSchema)
        ]);
    }

    $autoDetectedSchema = SchemaDetector::detectSchema($rowsForSchema, $headers, $metadataHints);

    /**
     * Fuzzy check of the file headers against a saved schema.
     *
     * Computes the match score with calculateSchemaMatchScore(), compares it to
     * the threshold and logs the outcome.
     *
     * @param array     $fileHeaders  Column names found in the uploaded file
     * @param array     $memorySchema Saved column definitions
     * @param int|float $threshold    Minimum score (percent) to count as a match; default 70
     * @return array ['matches' => bool, 'score' => int|float]
     */
    function canUseSchemaMemoryFuzzy($fileHeaders, $memorySchema, $threshold = 70) {
        $matchScore = calculateSchemaMatchScore($fileHeaders, $memorySchema);

        $verdict = [
            'matches' => $matchScore >= $threshold,
            'score' => $matchScore
        ];

        ErrorHandler::logError('Fuzzy schema match check', [
            'score' => round($matchScore, 2) . '%',
            'threshold' => $threshold . '%',
            'result' => $verdict['matches'] ? 'MATCH' : 'NO MATCH'
        ]);

        return $verdict;
    }

    /**
     * OLD: Check if all file headers exist in a saved schema (subset matching).
     *
     * NOTE: Kept for backward compatibility, but fuzzy matching is now preferred.
     *
     * Names are compared as exact strings. A loose comparison would let
     * numeric-looking names such as "1e3" and "1000" (or "01" and "1") pass as
     * the same column.
     *
     * @param array $fileHeaders  Column names found in the uploaded file
     * @param array $memorySchema Saved column definitions, each with a 'name' key
     * @return bool True when every file header is present in the saved schema
     *              (also true for an empty header list)
     */
    function canUseSchemaMemory($fileHeaders, $memorySchema) {
        // name => true lookup for exact-key membership tests
        $memoryNames = [];
        foreach ($memorySchema as $col) {
            $memoryNames[(string) $col['name']] = true;
        }

        // A single header missing from memory rules the saved schema out
        foreach ($fileHeaders as $header) {
            if (!isset($memoryNames[(string) $header])) {
                return false;
            }
        }

        return true;
    }

    /**
     * Default value used for a column that is missing from the uploaded file.
     *
     * The type is matched case-insensitively against bare type names (no length
     * or precision suffix). Text types default to an empty string, integer and
     * boolean types to 0, decimal/float types to '0.00', and date/time types as
     * well as any unrecognised type to null.
     *
     * @param string $type Column type name
     * @return string|int|null
     */
    function getDefaultValueForColumnType($type) {
        static $defaultsByType = [
            // Text
            'VARCHAR' => '',
            'TEXT' => '',
            'ENUM' => '',
            // Integers
            'INT' => 0,
            'INTEGER' => 0,
            'BIGINT' => 0,
            'SMALLINT' => 0,
            'TINYINT' => 0,
            // Decimals / floats
            'DECIMAL' => '0.00',
            'NUMERIC' => '0.00',
            'FLOAT' => '0.00',
            'DOUBLE' => '0.00',
            // Dates and times
            'DATE' => null,
            'DATETIME' => null,
            'TIMESTAMP' => null,
            // Booleans
            'BOOLEAN' => 0,
            'BOOL' => 0
        ];

        $normalizedType = strtoupper($type);

        // Unknown types fall through to null, same as the date/time entries
        return $defaultsByType[$normalizedType] ?? null;
    }

    /**
     * Merge the auto-detected schema with the schema saved from a previous import.
     *
     * Columns present in the file come first, in auto-detected order: when a
     * saved definition with the same name exists it replaces the detected one,
     * otherwise the detected column is kept and flagged 'is_new_column'.
     * Saved columns the file does not contain are appended afterwards, made
     * nullable, flagged 'missing_in_file' and given a type-based 'default_value'.
     * New and missing columns are logged.
     *
     * @param array $autoSchema   Column definitions detected from the file
     * @param array $memorySchema Column definitions saved from a previous import
     * @param array $headers      File headers (not used by the merge itself)
     * @return array Merged list of column definitions
     */
    function mergeSchemasWithMemory($autoSchema, $memorySchema, $headers) {
        $merged = [];
        $addedNames = [];
        $absentNames = [];

        // Saved columns keyed by name; entries are removed as the file claims them,
        // so whatever remains afterwards is missing from the file
        $unclaimed = [];
        foreach ($memorySchema as $savedCol) {
            $unclaimed[$savedCol['name']] = $savedCol;
        }

        // Pass 1: columns that exist in the file
        foreach ($autoSchema as $detectedCol) {
            $name = $detectedCol['name'];

            if (!isset($unclaimed[$name])) {
                // Not known from memory: keep the detected settings, flag for the UI
                $detectedCol['is_new_column'] = true;
                $merged[] = $detectedCol;
                $addedNames[] = $name;
                continue;
            }

            // Known from memory: the saved definition wins
            $merged[] = $unclaimed[$name];
            unset($unclaimed[$name]);
        }

        // Pass 2: saved columns the file did not provide
        foreach ($unclaimed as $name => $savedCol) {
            $savedCol['nullable'] = true;
            $savedCol['missing_in_file'] = true; // Flag for INSERT logic + UI
            $savedCol['default_value'] = getDefaultValueForColumnType($savedCol['type']);

            $merged[] = $savedCol;
            $absentNames[] = $name;
        }

        if (count($addedNames) > 0) {
            ErrorHandler::logError('New columns detected (not in saved schema)', [
                'count' => count($addedNames),
                'columns' => $addedNames
            ]);
        }

        if (count($absentNames) > 0) {
            ErrorHandler::logError('Missing columns (in saved schema but not in file)', [
                'count' => count($absentNames),
                'columns' => $absentNames
            ]);
        }

        return $merged;
    }

    // ===== SCHEMA MEMORY LOOKUP WITH FUZZY MATCHING =====
    // Try to retrieve schema from previous successful import.
    // Defaults below describe the auto-detection outcome; they are only
    // overwritten when a saved schema matches well enough.
    $schemaSource = 'auto';
    $importDate = null;
    $schema = $autoDetectedSchema;
    $schemaMatchScore = null;

    try {
        // Get database instance and connection
        $db = DatabaseHelper::getInstance();
        $conn = $db->getConnection();

        // Try to select the default database to search for import history
        // Use simple SELECT query instead of createAndSelectDatabase to avoid socket issues
        // (DB_NAME is a configuration constant, not request input)
        $dbCheckResult = $conn->query("SELECT SCHEMA_NAME FROM INFORMATION_SCHEMA.SCHEMATA WHERE SCHEMA_NAME = '" . DB_NAME . "'");

        if ($dbCheckResult && $dbCheckResult->num_rows > 0) {
            // Database exists, select it
            if ($conn->select_db(DB_NAME)) {
                ErrorHandler::logError('Checking schema memory for table: ' . $tableName, ['database' => DB_NAME]);

                // Note: We don't know the target database name yet (user hasn't selected it)
                // So we query without database filter - this returns schema from ANY database
                $previousSchema = ImportLogger::getLastSuccessfulSchema($tableName, null);

                if ($previousSchema !== null && !empty($previousSchema['schema'])) {
                    // FUZZY MATCHING: Check if match score >= 70%
                    $fuzzyResult = canUseSchemaMemoryFuzzy($headers, $previousSchema['schema'], 70);

                    if ($fuzzyResult['matches']) {
                        // Score >= 70% - use saved schema
                        $schema = mergeSchemasWithMemory($autoDetectedSchema, $previousSchema['schema'], $headers);
                        $schemaSource = 'memory';
                        $importDate = $previousSchema['import_date'];
                        $schemaMatchScore = round($fuzzyResult['score'], 2);

                        // Count column categories for the log entry
                        $fileColumnCount = count($headers);
                        $newColumnCount = count(array_filter($schema, fn($col) => isset($col['is_new_column']) && $col['is_new_column']));
                        $missingColumnCount = count(array_filter($schema, fn($col) => isset($col['missing_in_file']) && $col['missing_in_file']));

                        ErrorHandler::logError('✓ Schema loaded from memory (fuzzy match)', [
                            'table' => $tableName,
                            'import_date' => $importDate,
                            'match_score' => $schemaMatchScore . '%',
                            'threshold' => '70%',
                            'total_columns' => count($schema),
                            'file_columns' => $fileColumnCount,
                            'new_columns' => $newColumnCount,
                            'missing_columns' => $missingColumnCount
                        ]);
                    } else {
                        // Score < 70% - fall back to auto-detection (score is still reported)
                        $schemaMatchScore = round($fuzzyResult['score'], 2);

                        ErrorHandler::logError('Schema match score below threshold - using auto-detection', [
                            'table' => $tableName,
                            'match_score' => $schemaMatchScore . '%',
                            'threshold' => '70%'
                        ]);
                    }
                } else {
                    ErrorHandler::logError('No previous schema found in memory for table: ' . $tableName);
                }
            } else {
                ErrorHandler::logError('Failed to select database, using auto-detection: ' . $conn->error);
            }
        } else {
            // Database doesn't exist yet - first import, use auto-detection
            ErrorHandler::logError('Database ' . DB_NAME . ' does not exist yet (first import), using auto-detection');
        }
    } catch (Throwable $memEx) {
        // Schema memory is an optional optimisation: any failure here - including
        // engine errors such as a missing class or a TypeError on a malformed
        // saved schema, which are Error rather than Exception - must fall back to
        // auto-detection instead of aborting the upload.
        ErrorHandler::logError('Schema memory lookup failed, using auto-detection: ' . $memEx->getMessage());
        $schema = $autoDetectedSchema;
        $schemaSource = 'auto';
        $importDate = null; // never report a memory import date alongside an auto schema
    }

    // ===== STORE METADATA FOR LOGGING IN INSERT.PHP =====
    // Facts about the uploaded file, handed to insert.php through the session
    $fileMetadata = [
        'file_name' => $file['name'],
        'file_type' => strtoupper($fileExt),
        'file_size' => $file['size'],
        'file_path' => $persistentPath,  // Persistent file location
        'table_name' => $tableName
    ];

    // CSV uploads additionally carry the parser's delimiter/encoding findings
    $hasCsvMetadata = $fileExt === 'csv' && isset($csvMetadata);
    if ($hasCsvMetadata) {
        $fileMetadata += [
            'csv_delimiter' => $csvMetadata['delimiter_name'],
            'csv_encoding' => $csvMetadata['encoding'] ?? 'UTF-8',
            'csv_normalized_rows' => $csvMetadata['inconsistent_rows'] ?? 0
        ];
    }

    // insert.php reads this when it starts the import log
    $_SESSION['file_metadata'] = $fileMetadata;

    // The temporary upload is no longer needed (a persistent copy was made earlier)
    @unlink($uploadPath);

    // Prepare response stats shared by both file types
    $stats = [
        'valid_rows' => count($rows),
        'errors' => ErrorHandler::getErrors()
    ];

    // Add file-type specific stats
    if ($fileExt === 'csv') {
        $stats['file_type'] = 'CSV';
        $stats['delimiter'] = $csvMetadata['delimiter_name'];
        $stats['total_rows'] = $csvMetadata['total_rows'];
        $stats['skipped_rows'] = 0;

        // 'inconsistent_rows' is treated as optional, matching how it is read
        // for the session metadata
        $inconsistentRows = $csvMetadata['inconsistent_rows'] ?? 0;
        if ($inconsistentRows > 0) {
            $stats['inconsistent_rows'] = $inconsistentRows;
            $stats['note'] = 'Some rows had inconsistent column counts and were normalized';
        }
    } else {
        $stats['file_type'] = 'XLSX';
        // Count data rows only: everything from $startRow on. This excludes the
        // header row and, when present, the metadata hint row, so that
        // total_rows equals valid_rows + skipped_rows.
        $stats['total_rows'] = max(0, $highestRow - $startRow + 1);
        $stats['skipped_rows'] = $skippedRows;
    }

    // Assemble the success payload: parsed data, the chosen schema and where it
    // came from, statistics, and the key the client uses to poll progress
    $rowCount = count($rows);
    $columnCount = count($headers);

    $response = [];
    $response['status'] = 'ok';
    $response['message'] = sprintf('Parsed %d rows, %d columns', $rowCount, $columnCount);
    $response['tableName'] = $tableName;
    $response['headers'] = $headers;
    $response['schema'] = $schema;
    $response['rows'] = $rows;
    $response['stats'] = $stats;
    $response['schema_source'] = $schemaSource;
    $response['schema_import_date'] = $importDate;
    $response['schema_match_score'] = $schemaMatchScore;
    $response['progress_key'] = $progressKey;

    ErrorHandler::jsonSuccess($response);

} catch (Exception $e) {
    // An exception escaped one of the gates above. Remove the temporary upload
    // if it was already written ($uploadPath is unset when the failure happened
    // before the file was moved), then hand the exception to ErrorHandler.
    // Note: this catches Exception only; PHP Error/TypeError instances are not
    // intercepted here.
    if (isset($uploadPath) && file_exists($uploadPath)) {
        @unlink($uploadPath);
    }

    ErrorHandler::handleException($e);
}
