<?php
/**
 * ChunkReader - Memory-Efficient Excel Reader
 * Reads large XLSX files in chunks to avoid memory exhaustion
 */

// Ensure PhpSpreadsheet is loaded
if (!class_exists('PhpOffice\PhpSpreadsheet\IOFactory')) {
    require_once dirname(__DIR__) . '/vendor/autoload.php';
}

use PhpOffice\PhpSpreadsheet\Reader\IReadFilter;
use PhpOffice\PhpSpreadsheet\IOFactory;
use PhpOffice\PhpSpreadsheet\Reader\Xlsx;

/**
 * Row-window read filter for PhpSpreadsheet readers.
 * Accepts cells from a configured span of rows, plus the header row,
 * so a reader using this filter can skip every other cell.
 */
class ChunkReadFilter implements IReadFilter {
    /** First row of the current window (inclusive). */
    private $firstRow = 0;

    /** Last row of the current window (inclusive). */
    private $lastRow = 0;

    /**
     * Define the window: $chunkSize rows beginning at $startRow.
     */
    public function setRows($startRow, $chunkSize) {
        $this->firstRow = $startRow;
        $this->lastRow = $startRow + $chunkSize - 1;
    }

    /**
     * Decide whether the reader should keep the cell on the given row.
     * The column and worksheet name play no part in the decision.
     */
    public function readCell(string $columnAddress, int $row, string $worksheetName = ''): bool {
        // Row 1 (the header row) is accepted for every window.
        $isHeaderRow = ($row == 1);
        $insideWindow = ($row >= $this->firstRow && $row <= $this->lastRow);

        return $insideWindow || $isHeaderRow;
    }
}

/**
 * Static helpers for reading XLSX worksheets without keeping more rows in
 * memory than necessary.
 *
 * Every method builds an Xlsx reader in values-only mode and loads the file
 * through THAT reader, so the attached row filter is honoured. Only the
 * active worksheet of the workbook is inspected.
 */
class ChunkReader {

    /**
     * Get the row and column extent of the active worksheet.
     *
     * No row filter is applied here: the highest row/column can only be
     * measured on the cells that were actually loaded, so the sheet's values
     * are read in full (values-only mode).
     *
     * @param string $filePath Path to XLSX file
     * @return array Keys: 'total_rows', 'total_columns', 'highest_column'
     */
    public static function getFileMetadata($filePath) {
        $spreadsheet = self::createReader()->load($filePath);

        try {
            $worksheet = $spreadsheet->getActiveSheet();

            $highestRow = $worksheet->getHighestRow();
            $highestColumn = $worksheet->getHighestColumn();
            $highestColumnIndex = \PhpOffice\PhpSpreadsheet\Cell\Coordinate::columnIndexFromString($highestColumn);

            return [
                'total_rows' => $highestRow,
                'total_columns' => $highestColumnIndex,
                'highest_column' => $highestColumn
            ];
        } finally {
            // Free memory even when reading the dimensions fails
            $spreadsheet->disconnectWorksheets();
            unset($worksheet, $spreadsheet);
        }
    }

    /**
     * Read headers from row 1.
     *
     * Each header is passed through DataCleaner::sanitizeColumnName(); a cell
     * with a null value falls back to "Column_<n>". Repeated names receive a
     * numeric suffix (_1, _2, ...) so that every header is unique.
     *
     * @param string $filePath Path to XLSX file
     * @return array Keys: 'headers' (list of names), 'headerMap' (1-based
     *               column index => name), 'columnCount'
     */
    public static function readHeaders($filePath) {
        // Restrict the load to row 1
        $headerFilter = new ChunkReadFilter();
        $headerFilter->setRows(1, 1);

        $spreadsheet = self::createReader($headerFilter)->load($filePath);

        try {
            $worksheet = $spreadsheet->getActiveSheet();
            $highestColumn = $worksheet->getHighestColumn();
            $highestColumnIndex = \PhpOffice\PhpSpreadsheet\Cell\Coordinate::columnIndexFromString($highestColumn);

            $headers = [];
            $headerMap = [];
            for ($col = 1; $col <= $highestColumnIndex; $col++) {
                $cellValue = $worksheet->getCell([$col, 1])->getValue();
                $header = DataCleaner::sanitizeColumnName($cellValue ?? "Column_$col");

                // Ensure uniqueness. Strict comparison keeps numeric-looking
                // names such as "1" and "01" distinct.
                $originalHeader = $header;
                $counter = 1;
                while (in_array($header, $headers, true)) {
                    $header = $originalHeader . '_' . $counter;
                    $counter++;
                }

                $headers[] = $header;
                $headerMap[$col] = $header;
            }

            return [
                'headers' => $headers,
                'headerMap' => $headerMap,
                'columnCount' => $highestColumnIndex
            ];
        } finally {
            // Free memory
            $spreadsheet->disconnectWorksheets();
            unset($worksheet, $spreadsheet);
        }
    }

    /**
     * Read a specific row (for metadata row detection).
     *
     * Values are returned exactly as stored in the cells; unlike the chunk
     * and sample readers, no DataCleaner::clean() pass is applied.
     *
     * @param string $filePath Path to XLSX file
     * @param int $rowNumber Row to read
     * @param array $headerMap Column index to header name mapping
     * @param int $highestColumnIndex Number of columns
     * @return array Header name => raw cell value
     */
    public static function readRow($filePath, $rowNumber, $headerMap, $highestColumnIndex) {
        // Restrict the load to the requested row
        $rowFilter = new ChunkReadFilter();
        $rowFilter->setRows($rowNumber, 1);

        $spreadsheet = self::createReader($rowFilter)->load($filePath);

        try {
            $worksheet = $spreadsheet->getActiveSheet();

            $rowData = [];
            for ($col = 1; $col <= $highestColumnIndex; $col++) {
                $rowData[$headerMap[$col]] = $worksheet->getCell([$col, $rowNumber])->getValue();
            }

            return $rowData;
        } finally {
            // Free memory
            $spreadsheet->disconnectWorksheets();
            unset($worksheet, $spreadsheet);
        }
    }

    /**
     * Read rows in chunks using a callback function.
     *
     * The file is re-loaded once per chunk with a row filter covering only
     * that chunk, so at most $chunkSize data rows (plus the header row the
     * filter always admits) are held at a time. Rows whose cleaned values are
     * all null are left out of the chunk and reported through
     * ErrorHandler::trackRowError().
     *
     * @param string $filePath Path to XLSX file
     * @param array $headerMap Column index to header name mapping
     * @param int $highestColumnIndex Number of columns
     * @param int $startRow First data row to read
     * @param int $endRow Last row in file
     * @param int $chunkSize Number of rows to read at once
     * @param callable $callback Called as $callback($rows, $firstRow, $lastRow) for each chunk
     * @throws \InvalidArgumentException When $chunkSize is smaller than 1
     */
    public static function readInChunks($filePath, $headerMap, $highestColumnIndex, $startRow, $endRow, $chunkSize, callable $callback) {
        // A step of zero or less would never advance past $endRow
        if ($chunkSize < 1) {
            throw new \InvalidArgumentException('Chunk size must be at least 1');
        }

        // One reader and one filter are reused; the filter window is moved
        // before every load.
        $chunkFilter = new ChunkReadFilter();
        $reader = self::createReader($chunkFilter);

        for ($currentRow = $startRow; $currentRow <= $endRow; $currentRow += $chunkSize) {
            $chunkFilter->setRows($currentRow, $chunkSize);

            // Load only this chunk
            $spreadsheet = $reader->load($filePath);

            try {
                $worksheet = $spreadsheet->getActiveSheet();
                $chunkEnd = min($currentRow + $chunkSize - 1, $endRow);

                $chunkRows = [];
                for ($row = $currentRow; $row <= $chunkEnd; $row++) {
                    $rowData = self::extractCleanRow($worksheet, $row, $headerMap, $highestColumnIndex);

                    // Skip empty rows
                    if ($rowData !== null) {
                        $chunkRows[] = $rowData;
                    } else {
                        ErrorHandler::trackRowError($row, 'All cells empty');
                    }
                }

                // Call callback with this chunk
                $callback($chunkRows, $currentRow, $chunkEnd);
            } finally {
                // Free memory before the next chunk, also when the callback throws
                $spreadsheet->disconnectWorksheets();
                unset($worksheet, $spreadsheet);
            }
        }
    }

    /**
     * Sample rows for schema detection.
     *
     * When the range holds no more than $sampleSize rows, every row is read
     * (via readInChunks). Otherwise rows are taken at a fixed interval
     * starting from $startRow, and the file is loaded a single time with a
     * filter that admits only the chosen rows. Rows whose cleaned values are
     * all null are dropped from the sample.
     *
     * @param string $filePath Path to XLSX file
     * @param array $headerMap Column index to header name mapping
     * @param int $highestColumnIndex Number of columns
     * @param int $startRow First data row
     * @param int $endRow Last data row
     * @param int $sampleSize Maximum number of rows to sample
     * @return array List of rows, each an array of header name => cleaned value
     */
    public static function sampleRows($filePath, $headerMap, $highestColumnIndex, $startRow, $endRow, $sampleSize) {
        // Nothing can be sampled, and the interval below would divide by zero
        if ($sampleSize < 1) {
            return [];
        }

        $totalRows = $endRow - $startRow + 1;

        // If file is small, read all rows
        if ($totalRows <= $sampleSize) {
            $allRows = [];
            self::readInChunks($filePath, $headerMap, $highestColumnIndex, $startRow, $endRow, CHUNK_SIZE,
                function ($chunkRows) use (&$allRows) {
                    $allRows = array_merge($allRows, $chunkRows);
                }
            );
            return $allRows;
        }

        // Sampling interval for even distribution; kept as an integer so the
        // derived row numbers are integers too.
        $interval = (int) floor($totalRows / $sampleSize);

        // Row numbers to sample, stored as keys for constant-time lookup
        $wantedRows = [];
        for ($i = 0; $i < $sampleSize; $i++) {
            $rowNumber = $startRow + ($i * $interval);
            if ($rowNumber > $endRow) {
                break;
            }
            $wantedRows[$rowNumber] = true;
        }

        // Filter admitting exactly the sampled rows, so one load is enough
        $sampleFilter = new class($wantedRows) implements IReadFilter {
            private $rows;

            public function __construct(array $rows) {
                $this->rows = $rows;
            }

            public function readCell(string $columnAddress, int $row, string $worksheetName = ''): bool {
                return isset($this->rows[$row]);
            }
        };

        $sampleRows = [];
        $spreadsheet = self::createReader($sampleFilter)->load($filePath);

        try {
            $worksheet = $spreadsheet->getActiveSheet();

            foreach (array_keys($wantedRows) as $rowNumber) {
                $rowData = self::extractCleanRow($worksheet, $rowNumber, $headerMap, $highestColumnIndex);
                if ($rowData !== null) {
                    $sampleRows[] = $rowData;
                }
            }
        } finally {
            // Free memory
            $spreadsheet->disconnectWorksheets();
            unset($worksheet, $spreadsheet);
        }

        ErrorHandler::logError('Sampled rows for schema detection', [
            'total_rows' => $totalRows,
            'sample_size' => count($sampleRows),
            'interval' => $interval
        ]);

        return $sampleRows;
    }

    /**
     * Build an Xlsx reader in values-only mode, optionally restricted by a read filter.
     */
    private static function createReader(?IReadFilter $filter = null): Xlsx {
        $reader = new Xlsx();
        $reader->setReadDataOnly(true);

        if ($filter !== null) {
            $reader->setReadFilter($filter);
        }

        return $reader;
    }

    /**
     * Collect one worksheet row as header name => DataCleaner::clean()'d value.
     * Returns null when every cleaned value in the row is null.
     */
    private static function extractCleanRow($worksheet, $rowNumber, $headerMap, $highestColumnIndex): ?array {
        $rowData = [];
        $isEmpty = true;

        for ($col = 1; $col <= $highestColumnIndex; $col++) {
            $cleanedValue = DataCleaner::clean($worksheet->getCell([$col, $rowNumber])->getValue());
            $rowData[$headerMap[$col]] = $cleanedValue;

            if ($cleanedValue !== null) {
                $isEmpty = false;
            }
        }

        return $isEmpty ? null : $rowData;
    }
}
