<?php
/**
 * tabler.php - The Ledger Whisperer
 * 
 * Converts ANY bank statement PDF (text-based or scanned) into a TSV file with strict 5-column format:
 *   Día, Concepto / Referencia, cargo, Abono, Saldo
 * 
 * Protocol: tabler.mother.tongue.protocol.v1
 * Doctrine: Miserable-First → Zoom-Out → Reconcile → Export
 * 
 * USAGE:
 *   php tabler.php input.pdf output.txt --lang=es --debug=0 --audit=audit.json
 * 
 * @version 1.6.1
 * @author  "Forrest" - calm, ruthless builder
 */

// ============================================================================
// STATIC CONFIGURATION
// ============================================================================

// Version string reported by show_version().
define('TABLER_VERSION', '1.6.1');
// Protocol identifier reported next to the version by show_version().
define('TABLER_PROTOCOL', 'tabler.mother.tongue.protocol.v1');

// Output column order (fixed): internal keys for the five TSV columns.
define('OUTPUT_COLUMNS', ['day', 'description', 'debit', 'credit', 'balance']);

// Default DPI for rasterization (default value of pdf_to_image()'s $dpi parameter).
define('DEFAULT_DPI', 300);

// Money formatting used by format_money(): decimal mark, thousands mark and
// number of decimal places of every amount that is written out.
define('DECIMAL_SEPARATOR', '.');
define('THOUSAND_SEPARATOR', ',');
define('DECIMAL_PLACES', 2);

// ============================================================================
// CLI ARGUMENT PARSING
// ============================================================================

/**
 * Parse the command-line arguments into an options array.
 *
 * Long options accept their value either as the next argument
 * ("--lang es") or inline ("--lang=es"), which is the form shown in the
 * usage text. Short options only accept the next-argument form.
 *
 * Recognised options:
 *   --help, -h, /?      sets 'help' and stops parsing immediately
 *   --debug, -d         sets 'debug' to true; "--debug=0|false|no|off" sets it to false
 *   --audit, -a         sets 'audit' to true, or to a path when one is given
 *                       ("--audit=file", or a following argument that does not start with "-")
 *   --lang, -l          sets 'lang'
 *   --cache-dir, -c     sets 'cache_dir'
 *
 * Every other argument is positional: the first one becomes 'input', the
 * second 'output', further ones are ignored.
 *
 * @param array $argv Raw argument vector, script name first
 * @return array Keys: input, output, lang, debug, audit, cache_dir, help
 */
function parse_cli_args($argv) {
    $result = [
        'input'       => null,
        'output'      => null,
        'lang'        => 'es',
        'debug'       => false,
        'audit'       => false,
        'cache_dir'   => null,
        'help'        => false,
    ];

    $args = array_values(array_slice($argv, 1)); // Skip script name
    $total = count($args);

    for ($i = 0; $i < $total; $i++) {
        $original = $args[$i];
        $arg = (string)$original;
        $inline = null;

        // Split "--option=value" into the option name and its inline value.
        if (strncmp($arg, '--', 2) === 0) {
            $eq = strpos($arg, '=');
            if ($eq !== false) {
                $inline = substr($arg, $eq + 1);
                $arg = substr($arg, 0, $eq);
            }
        }

        switch ($arg) {
            case '--help':
            case '-h':
            case '/?':
                $result['help'] = true;
                return $result;

            case '--debug':
            case '-d':
                // A bare flag enables debugging; an inline value may switch it off.
                $result['debug'] = $inline === null
                    ? true
                    : !in_array(strtolower($inline), ['0', 'false', 'no', 'off'], true);
                break;

            case '--audit':
            case '-a':
                if ($inline !== null) {
                    $result['audit'] = $inline !== '' ? $inline : true;
                    break;
                }
                $result['audit'] = true;
                // Optional path: only consumed when it does not look like another option.
                if (isset($args[$i + 1]) && substr($args[$i + 1], 0, 1) !== '-') {
                    $result['audit'] = $args[++$i];
                }
                break;

            case '--lang':
            case '-l':
                if ($inline !== null) {
                    if ($inline !== '') {
                        $result['lang'] = $inline;
                    }
                } elseif (isset($args[$i + 1])) {
                    $result['lang'] = $args[++$i];
                }
                break;

            case '--cache-dir':
            case '-c':
                if ($inline !== null) {
                    if ($inline !== '') {
                        $result['cache_dir'] = $inline;
                    }
                } elseif (isset($args[$i + 1])) {
                    $result['cache_dir'] = $args[++$i];
                }
                break;

            default:
                // Positional arguments (unknown options are kept verbatim, as before).
                if ($result['input'] === null) {
                    $result['input'] = $original;
                } elseif ($result['output'] === null) {
                    $result['output'] = $original;
                }
                break;
        }
    }

    return $result;
}

/**
 * Print the command-line help text (usage, arguments, options and the
 * output format description).
 *
 * The text is echoed as-is; nothing is returned.
 */
function show_help() {
    echo <<<HELP
The Ledger Whisperer - Bank Statement PDF to TSV Converter

USAGE:
  php tabler.php <input.pdf> [output.txt] [--lang=es] [--debug] [--audit[=audit.json]] [--cache-dir=<dir>]

ARGUMENTS:
  input.pdf        Path to input PDF file (required)
  output.txt       Path to output TSV file (optional, defaults to input basename + .txt)

OPTIONS:
  --lang, -l      Language for header detection (default: es)
  --debug, -d     Enable debug mode (generates overlay images)
  --audit[=file]   Generate audit artifact (default: audit.json)
  --cache-dir, -c  Custom cache directory (default: system temp)
  --help, -h      Show this help message

OUTPUT FORMAT (TSV):
  Día<TAB>Concepto / Referencia<TAB>cargo<TAB>Abono<TAB>Saldo
  
  - Día: DD-MM-YYYY (normalized)
  - Concepto / Referencia: Description text (tabs replaced with spaces)
  - cargo: Debit amount (money out), formatted 1,234.56 or 0.00
  - Abono: Credit amount (money in), formatted 1,234.56 or 0.00
  - Saldo: Running balance, formatted 1,234.56

EXAMPLE:
  php tabler.php bank_statement.pdf bank_statement.txt --audit=audit.json

HELP;
}

/**
 * Print the program name, its version and the protocol identifier on one line.
 */
function show_version() {
    printf("tabler.php version %s (%s)\n", TABLER_VERSION, TABLER_PROTOCOL);
}

// ============================================================================
// ERROR HANDLING
// ============================================================================

/**
 * Exception type thrown by tabler_error() for fatal conditions.
 */
class TablerError extends \Exception {}

/**
 * Abort the current operation by throwing a TablerError.
 *
 * The $previous parameter is declared explicitly nullable (an implicit
 * "Type $x = null" is deprecated as of PHP 8.4) and accepts any Throwable,
 * so that engine errors (\Error) can be chained as the cause as well.
 *
 * @param string          $message  Human-readable description of the failure
 * @param int             $code     Error code stored on the exception (default 1)
 * @param \Throwable|null $previous Underlying cause, if any
 * @throws TablerError always
 */
function tabler_error($message, $code = 1, ?\Throwable $previous = null) {
    throw new TablerError($message, $code, $previous);
}

/**
 * Echo a message prefixed with "[WARNING] " and terminated by a newline.
 *
 * @param string $message Text of the warning
 */
function tabler_warning($message) {
    echo '[WARNING] ' . $message . "\n";
}

/**
 * Echo a message prefixed with "[INFO] " and terminated by a newline.
 *
 * @param string $message Text of the notice
 */
function tabler_info($message) {
    echo '[INFO] ' . $message . "\n";
}

/**
 * Echo a "[DEBUG] " line, but only when debugging is switched on.
 *
 * @param string $message Text of the debug line
 * @param bool   $debug   Output is produced only when this is truthy
 */
function tabler_debug($message, $debug = false) {
    if (!$debug) {
        return;
    }

    echo '[DEBUG] ' . $message . "\n";
}

// ============================================================================
// PII REDACTION
// ============================================================================

/**
 * Replace account-like identifiers in a string with redaction markers.
 *
 * Rules are applied in order:
 *   1. 18 contiguous digits (the length of a Mexican CLABE)   -> [CLABE REDACTED]
 *   2. 12-16 digits written as groups of four, optionally
 *      separated by a space or dash (marker kept for
 *      backward compatibility)                                -> [CLABE REDACTED]
 *   3. RFC tax IDs: 3-4 letters + 6 digits + 3 alphanumerics  -> [RFC REDACTED]
 *   4. Legacy RFC-like pattern: 2 letters + 8 digits          -> [RFC REDACTED]
 *   5. Any remaining run of 9-17 digits                       -> [ACCOUNT# REDACTED]
 *
 * Previously an 18-digit CLABE and a real RFC slipped through unredacted,
 * and a final "IBAN" pass repeated rule 2 without ever being able to match.
 *
 * @param mixed $text Value to redact; non-strings are returned untouched
 * @return mixed Redacted string, or the original value when it is not a string
 */
function redact_pii($text) {
    if (!is_string($text)) {
        return $text;
    }

    $rules = [
        '/\b[0-9]{18}\b/' => '[CLABE REDACTED]',
        '/\b[0-9]{4}[-\s]?[0-9]{4}[-\s]?[0-9]{4}[-\s]?[0-9]{0,4}\b/' => '[CLABE REDACTED]',
        '/\b[A-Z]{3,4}[0-9]{6}[A-Z0-9]{3}\b/' => '[RFC REDACTED]',
        '/\b[A-Z]{2}[0-9]{8}\b/' => '[RFC REDACTED]',
        '/\b[0-9]{9,17}\b/' => '[ACCOUNT# REDACTED]',
    ];

    foreach ($rules as $pattern => $marker) {
        $replaced = preg_replace($pattern, $marker, $text);
        // preg_replace() returns null on a PCRE error; keep the text we have in that case.
        if ($replaced !== null) {
            $text = $replaced;
        }
    }

    return $text;
}

/**
 * Run redact_pii() over every string cell of a row.
 *
 * Keys are preserved and non-string cells are copied through unchanged.
 *
 * @param array $row Row to redact
 * @return array Copy of the row with its string cells redacted
 */
function redact_row($row) {
    $redacted = $row;

    foreach ($row as $field => $cell) {
        if (!is_string($cell)) {
            continue;
        }
        $redacted[$field] = redact_pii($cell);
    }

    return $redacted;
}

// ============================================================================
// FILE UTILITIES
// ============================================================================

/**
 * Make sure a directory exists, creating it (and missing parents) if needed.
 *
 * The result of mkdir() is now checked. is_dir() is re-tested after a failed
 * mkdir() so that a directory created concurrently by another process is not
 * reported as an error.
 *
 * @param string $path Directory path
 * @return string The same path, for call chaining
 * @throws TablerError (via tabler_error) when the directory cannot be created
 */
function ensure_directory($path) {
    if (is_dir($path)) {
        return $path;
    }

    if (!mkdir($path, 0755, true) && !is_dir($path)) {
        tabler_error("Unable to create directory: $path");
    }

    return $path;
}

/**
 * Compute the SHA-256 digest of a file's contents.
 *
 * @param string $filepath Path of the file to hash
 * @return string|false Whatever hash_file() returns for the path
 */
function compute_file_hash($filepath) {
    $algorithm = 'sha256';

    return hash_file($algorithm, $filepath);
}

/**
 * Return the size of a file in bytes.
 *
 * @param string $filepath Path of the file to measure
 * @return int|false Whatever filesize() returns for the path
 */
function get_file_size($filepath) {
    $bytes = filesize($filepath);

    return $bytes;
}

/**
 * Check that the input path points at something we can treat as a PDF.
 *
 * A file is accepted when its extension is "pdf" (now compared
 * case-insensitively, so "STATEMENT.PDF" is fine) or, failing that, when
 * content sniffing reports "application/pdf". The sniffing step is skipped
 * when the fileinfo function is not available instead of failing fatally.
 * Directories are rejected explicitly.
 *
 * @param string $filepath Path supplied by the user
 * @return bool Always true when the function returns
 * @throws TablerError (via tabler_error) when the path is missing, not a
 *                     regular file, or not recognisable as a PDF
 */
function validate_pdf($filepath) {
    if (!file_exists($filepath)) {
        tabler_error("Input file does not exist: $filepath");
    }

    if (!is_file($filepath)) {
        tabler_error("Input path is not a regular file: $filepath");
    }

    $extension = strtolower((string)pathinfo($filepath, PATHINFO_EXTENSION));
    if ($extension === 'pdf') {
        return true;
    }

    // No usable extension: fall back to content sniffing when it is available.
    $mime = function_exists('mime_content_type') ? mime_content_type($filepath) : false;
    if ($mime !== 'application/pdf') {
        tabler_error("Input file must be a PDF: $filepath");
    }

    return true;
}

// ============================================================================
// PDF UTILITIES (using external tools)
// ============================================================================

/**
 * Determine the number of pages of a PDF using external tools.
 *
 * Strategies, in order:
 *   1. pdfinfo (poppler): read the "Pages:" line.
 *   2. pdftotext: count the form-feed characters in the extracted text
 *      (pdftotext emits one per page unless -nopgbrk is given). The previous
 *      version piped through `head` and always answered 1.
 *   3. identify (ImageMagick): one output line per page, trusted only when
 *      the command succeeds so that error messages are not counted as pages.
 *
 * @param string $filepath Path of the PDF
 * @return int Page count, never less than 1 for strategies 2 and 3
 */
function pdf_page_count($filepath) {
    $escaped_path = escapeshellarg($filepath);

    // 1. pdfinfo (poppler)
    $output = [];
    $return_var = 0;
    exec("pdfinfo $escaped_path 2>&1", $output, $return_var);

    foreach ($output as $line) {
        if (preg_match('/Pages:\s*(\d+)/', $line, $matches)) {
            return (int)$matches[1];
        }
    }

    // 2. pdftotext: each page of the text dump ends with a form feed
    $output = [];
    exec("pdftotext $escaped_path - 2>&1", $output, $return_var);

    if ($return_var === 0 && !empty($output)) {
        return max(1, substr_count(implode("\n", $output), "\f"));
    }

    // 3. identify (ImageMagick): one line per page
    $output = [];
    exec("identify $escaped_path 2>&1", $output, $return_var);

    if ($return_var !== 0) {
        return 1;
    }

    return max(1, count($output));
}

/**
 * Extract text from a PDF with `pdftotext -layout`.
 *
 * @param string   $filepath Path of the PDF
 * @param int|null $page     Zero-based page index, or null for the whole document
 * @param bool     $debug    Emit debug lines through tabler_debug()
 * @return array ['success' => true, 'text' => string] on success,
 *               ['success' => false, 'text' => '', 'error' => string] otherwise
 */
function pdf_to_text($filepath, $page = null, $debug = false) {
    // CRITICAL: -layout keeps the column structure of tabular data intact.
    $parts = ["pdftotext -layout"];

    if ($page !== null) {
        // pdftotext numbers pages from 1; -f/-l select a single page.
        $parts[] = "-f " . ($page + 1) . " -l " . ($page + 1);
    }

    $parts[] = escapeshellarg($filepath);
    $parts[] = "-";  // Output to stdout
    $cmd = implode(" ", $parts);

    tabler_debug("Extracting text from PDF: " . basename($filepath), $debug);

    $lines = [];
    $exit_code = 0;
    exec($cmd, $lines, $exit_code);

    $output_text = implode("\n", $lines);
    $failure = ['success' => false, 'text' => '', 'error' => $output_text];

    // Usage/help text in the output means the command line was not accepted.
    $looks_like_usage = strpos($output_text, 'Usage:') !== false
        || strpos($output_text, 'pdftotext version') !== false;
    if ($looks_like_usage) {
        tabler_debug("pdftotext returned usage info (possible command issue)", $debug);
        return $failure;
    }

    if ($exit_code !== 0 || empty(trim($output_text))) {
        tabler_debug("pdftotext failed or returned empty output", $debug);
        return $failure;
    }

    return ['success' => true, 'text' => $output_text];
}

/**
 * Run `pdftohtml -xml -stdout` on a PDF and return its output.
 *
 * @param string   $filepath Path of the PDF
 * @param int|null $page     Zero-based page index, or null for the whole document
 * @param bool     $debug    Emit debug lines through tabler_debug()
 * @return array ['success' => true, 'html' => string] on success,
 *               ['success' => false, 'html' => '', 'error' => string] otherwise
 */
function pdf_to_html_with_bbox($filepath, $page = null, $debug = false) {
    // Optional single-page selection (the tool numbers pages from 1).
    $page_range = '';
    if ($page !== null) {
        $page_range = " -f " . ($page + 1) . " -l " . ($page + 1);
    }

    $cmd = "pdftohtml -xml -stdout" . $page_range . " " . escapeshellarg($filepath);

    tabler_debug("Extracting HTML with bbox from PDF: " . basename($filepath), $debug);

    $lines = [];
    $exit_code = 0;
    exec($cmd, $lines, $exit_code);

    $succeeded = $exit_code === 0 && !empty($lines);
    if (!$succeeded) {
        tabler_debug("pdftohtml failed or returned empty output", $debug);
        return ['success' => false, 'html' => '', 'error' => implode("\n", $lines)];
    }

    return ['success' => true, 'html' => implode("\n", $lines)];
}

/**
 * Rasterize a PDF (or one page of it) to an image with ImageMagick `convert`.
 *
 * The density is forced to a positive integer before it is placed on the
 * command line; it used to be interpolated verbatim, which let a crafted
 * value inject shell commands. Non-positive or non-numeric values fall back
 * to DEFAULT_DPI.
 *
 * @param string   $filepath     Path of the PDF
 * @param string   $output_image Path of the image to write
 * @param int|null $page         Page index passed as ImageMagick's "[n]" selector,
 *                               or null for no selector
 * @param int      $dpi          Rasterization density
 * @param bool     $debug        Emit debug lines through tabler_debug()
 * @return array ['success' => true, 'path' => string] on success,
 *               ['success' => false, 'error' => string] otherwise
 */
function pdf_to_image($filepath, $output_image, $page = null, $dpi = DEFAULT_DPI, $debug = false) {
    $density = (int)$dpi;
    if ($density <= 0) {
        $density = (int)DEFAULT_DPI;
    }

    // For ImageMagick, the page selector goes after the filename, inside the same argument.
    $input_spec = $page !== null
        ? escapeshellarg($filepath . "[" . $page . "]")
        : escapeshellarg($filepath);
    $escaped_output = escapeshellarg($output_image);

    $cmd = "convert -density $density $input_spec $escaped_output";

    tabler_debug("Rasterizing PDF page to image: " . basename($output_image), $debug);

    $output = [];
    $return_var = 0;
    exec($cmd . " 2>&1", $output, $return_var);

    if ($return_var !== 0 || !file_exists($output_image)) {
        tabler_debug("Image conversion failed: " . implode("\n", $output), $debug);
        return ['success' => false, 'error' => implode("\n", $output)];
    }

    return ['success' => true, 'path' => $output_image];
}

// ============================================================================
// OCR UTILITIES (Tesseract)
// ============================================================================

/**
 * OCR an image with Tesseract, producing plain text and hOCR markup.
 *
 * Fixes over the previous version:
 *  - Arguments are passed through escapeshellarg() only. They used to be
 *    wrapped in an extra pair of double quotes, which made the shell hand
 *    Tesseract a file name that literally contained quote characters.
 *  - The "hocr" and "txt" configs are requested together, so a successful
 *    run yields both the markup and the text (hOCR mode alone writes no .txt).
 *  - The hOCR output is looked up as "<base>.hocr" first and "<base>.html"
 *    second, since the extension depends on the Tesseract version.
 *  - The output base is derived safely when the image name has no extension
 *    or when a directory name contains a dot.
 *
 * If the combined run fails, a plain-text-only run is attempted.
 *
 * @param string $image_path Path of the image to recognise
 * @param bool   $debug      Emit debug lines through tabler_debug()
 * @return array On success: ['success' => true, 'text' => string,
 *               'hocr' => string|null, 'confidence' => int|float];
 *               on failure: ['success' => false, 'error' => string]
 */
function tesseract_ocr($image_path, $debug = false) {
    // Strip the extension only when the last dot belongs to the file name itself.
    $last_dot = strrpos($image_path, '.');
    $last_sep = max((int)strrpos($image_path, '/'), (int)strrpos($image_path, '\\'));
    $base_path = ($last_dot !== false && $last_dot > $last_sep)
        ? substr($image_path, 0, $last_dot)
        : $image_path;

    $hocr_candidates = [$base_path . '.hocr', $base_path . '.html'];
    $txt_path = $base_path . '.txt';

    $escaped_image = escapeshellarg($image_path);
    $escaped_base = escapeshellarg($base_path);

    tabler_debug("Running Tesseract OCR on: " . basename($image_path), $debug);

    // One pass that writes both the hOCR markup and the plain text.
    $output = [];
    $return_var = 0;
    exec("tesseract $escaped_image $escaped_base hocr txt", $output, $return_var);

    if ($return_var !== 0) {
        tabler_debug("Tesseract failed, falling back to plain text", $debug);

        // Fallback to plain text only
        $output = [];
        exec("tesseract $escaped_image $escaped_base", $output, $return_var);

        if (file_exists($txt_path)) {
            $fallback_text = file_get_contents($txt_path);
            return [
                'success' => true,
                'text' => $fallback_text === false ? '' : $fallback_text,
                'hocr' => null,
                'confidence' => 0,
            ];
        }

        return ['success' => false, 'error' => 'Tesseract failed'];
    }

    // Load the hOCR markup (word positions) from whichever file was written.
    $hocr = null;
    $confidence = 0;

    foreach ($hocr_candidates as $candidate) {
        if (!file_exists($candidate)) {
            continue;
        }
        $markup = file_get_contents($candidate);
        if ($markup !== false) {
            $hocr = $markup;
            $confidence = calculate_ocr_confidence($candidate);
        }
        break;
    }

    $text = '';
    if (file_exists($txt_path)) {
        $contents = file_get_contents($txt_path);
        $text = $contents === false ? '' : $contents;
    }

    return [
        'success' => true,
        'text' => $text,
        'hocr' => $hocr,
        'confidence' => $confidence,
    ];
}

/**
 * Extract words and their bounding boxes from hOCR markup.
 *
 * Handles the markup Tesseract emits
 * (<span class='ocrx_word' title='bbox x0 y0 x1 y1; x_wconf NN'>word</span>,
 * single or double quotes, any attribute order, optional inline tags such as
 * <strong> around the word) as well as the legacy
 * <span class="wordN" title="bbox ..."> form.
 *
 * The previous implementation requested PREG_SET_ORDER but indexed the result
 * as if it were in pattern order, read a capture group that did not exist,
 * and used a pattern that could not match Tesseract's output, so it produced
 * no usable words.
 *
 * @param string $hocr_html hOCR document or fragment
 * @return array List of ['word' => string,
 *                        'bbox' => ['x0' => int, 'y0' => int, 'x1' => int, 'y1' => int],
 *                        'line' => int]. For 'ocrx_word' spans 'line' is the
 *               1-based count of 'ocr_line' elements opened before the word
 *               (0 when none); for legacy spans it is the N of "wordN".
 */
function parse_hocr_bbox($hocr_html) {
    if (empty($hocr_html)) {
        return [];
    }

    // Byte offsets of every text-line element, used to number words by line.
    $line_offsets = [];
    if (preg_match_all('/\bclass=(["\'])ocr_line\1/i', $hocr_html, $line_hits, PREG_OFFSET_CAPTURE)) {
        foreach ($line_hits[0] as $hit) {
            $line_offsets[] = $hit[1];
        }
    }
    $line_total = count($line_offsets);

    // Group 1: attribute string, group 3: N of a legacy "wordN" class, group 4: inner markup.
    $word_pattern = '/<span\b([^>]*\bclass=(["\'])(?:ocrx_word|word(\d+))\2[^>]*)>(.*?)<\/span>/is';
    if (!preg_match_all($word_pattern, $hocr_html, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)) {
        return [];
    }

    $words = [];
    $line_no = 0;

    foreach ($matches as $match) {
        if (!preg_match('/\bbbox\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)/', $match[1][0], $box)) {
            continue; // A word without coordinates is of no use for layout analysis.
        }

        // Advance the line counter past every line element opened before this word.
        $offset = $match[0][1];
        while ($line_no < $line_total && $line_offsets[$line_no] < $offset) {
            $line_no++;
        }

        $legacy_index = isset($match[3]) ? $match[3][0] : '';

        $words[] = [
            'word' => trim(html_entity_decode(strip_tags($match[4][0]), ENT_QUOTES, 'UTF-8')),
            'bbox' => [
                'x0' => (int)$box[1],
                'y0' => (int)$box[2],
                'x1' => (int)$box[3],
                'y1' => (int)$box[4],
            ],
            'line' => $legacy_index !== '' ? (int)$legacy_index : $line_no,
        ];
    }

    return $words;
}

/**
 * Estimate the OCR confidence of an hOCR file as the mean word confidence.
 *
 * Tesseract records each word's confidence as "x_wconf NN" inside the title
 * attribute (after the bbox and a semicolon). The previous pattern expected a
 * fifth bare number after the bbox, never matched, and therefore always
 * reported 100. That legacy layout is still accepted as a fallback.
 *
 * @param string $hocr_path Path of the hOCR file
 * @return int|float Mean confidence, or 100 when the file is unreadable or
 *                   carries no confidence data
 */
function calculate_ocr_confidence($hocr_path) {
    $html = is_readable($hocr_path) ? file_get_contents($hocr_path) : false;
    if ($html === false || $html === '') {
        return 100; // Assume perfect if no confidence data
    }

    $scores = [];
    if (preg_match_all('/\bx_wconf\s+(\d+(?:\.\d+)?)/', $html, $matches)) {
        $scores = $matches[1];
    } elseif (preg_match_all('/title="bbox\s*\d+\s*\d+\s*\d+\s*\d+\s*(\d+)"/', $html, $matches)) {
        $scores = $matches[1];
    }

    if (count($scores) === 0) {
        return 100; // Assume perfect if no confidence data
    }

    return array_sum($scores) / count($scores);
}

// ============================================================================
// CACHE UTILITIES
// ============================================================================

/**
 * Build the cache file path for a name and create its directory.
 *
 * Layout: <cache_dir>/<first 2 hex chars of sha256(name)>[/<subdir>]/<remaining hex chars>
 *
 * @param string $cache_dir Root of the cache
 * @param string $filename  Name the cache entry is keyed on (hashed, not opened)
 * @param string $subdir    Optional extra directory level
 * @return string Full path of the cache entry
 */
function get_cache_path($cache_dir, $filename, $subdir = '') {
    $digest = hash('sha256', $filename);

    $segments = [$cache_dir, substr($digest, 0, 2)];
    if (!empty($subdir)) {
        $segments[] = $subdir;
    }

    $directory = implode(DIRECTORY_SEPARATOR, $segments);
    ensure_directory($directory);

    return $directory . DIRECTORY_SEPARATOR . substr($digest, 2);
}

/**
 * Read a cache entry.
 *
 * @param string $cache_path Path of the cache entry
 * @return string|false|null File contents, or null when the file does not exist
 */
function cache_get($cache_path) {
    return file_exists($cache_path) ? file_get_contents($cache_path) : null;
}

/**
 * Write a cache entry, replacing any previous contents.
 *
 * @param string $cache_path Path of the cache entry
 * @param mixed  $data       Payload handed to file_put_contents()
 */
function cache_put($cache_path, $data) {
    $target = $cache_path;

    file_put_contents($target, $data);
}

// ============================================================================
// DATE PARSING AND NORMALIZATION
// ============================================================================

// Global holding the statement year; starts as null and is written by
// set_statement_year() and read by get_statement_year().
// normalize_date() falls back to it for dates printed without a year
// (e.g. the BBVA "05/ENE" format).
$GLOBALS['tabler_statement_year'] = null;

/**
 * Remember the statement year used for dates that are printed without one
 * (such as the BBVA format).
 *
 * @param int $year Year to apply to year-less dates; stored as an integer
 */
function set_statement_year($year) {
    $GLOBALS['tabler_statement_year'] = intval($year);
}

/**
 * Return the statement year stored for year-less dates.
 *
 * @return int|null The stored year, or null when none has been set
 */
function get_statement_year() {
    $stored_year = $GLOBALS['tabler_statement_year'];

    return $stored_year;
}

/**
 * Find the statement period in header text.
 *
 * Tried in order:
 *   1. "Periodo DEL DD/MM/YYYY AL DD/MM/YYYY"
 *   2. "DEL DD/MM/YYYY AL DD/MM/YYYY"
 *   3. the first standalone year of the form 20YY
 * The reported year is the year of the start date (cases 1 and 2) or the
 * standalone year (case 3).
 *
 * @param string $text The text to search
 * @return array|null ['start_date' => string|null, 'end_date' => string|null,
 *                    'year' => int], or null when nothing matched
 */
function extract_statement_period($text) {
    $range_patterns = [
        // "Periodo DEL DD/MM/YYYY AL DD/MM/YYYY"
        '/periodo\s+del\s+(\d{1,2}\/\d{1,2}\/(\d{4}))\s+al\s+(\d{1,2}\/\d{1,2}\/\d{4})/i',
        // "DEL DD/MM/YYYY AL DD/MM/YYYY"
        '/del\s+(\d{1,2}\/\d{1,2}\/(\d{4}))\s+al\s+(\d{1,2}\/\d{1,2}\/\d{4})/i',
    ];

    foreach ($range_patterns as $regex) {
        if (preg_match($regex, $text, $hit)) {
            return [
                'start_date' => $hit[1],
                'end_date' => $hit[3],
                'year' => (int)$hit[2],
            ];
        }
    }

    // Last resort: a bare "20YY" anywhere in the text.
    if (!preg_match('/\b(20\d{2})\b/', $text, $hit)) {
        return null;
    }

    return [
        'start_date' => null,
        'end_date' => null,
        'year' => (int)$hit[1],
    ];
}

/**
 * Normalize a date string to DD-MM-YYYY.
 *
 * Known layouts are matched explicitly (most specific first). Anything else
 * goes through date_parse(), and is accepted only when the string really
 * contains a month and a day and forms a valid calendar date. The previous
 * strtotime() fallback also accepted time-only and relative strings
 * ("2024" parsed as 20:24, "12.50", "tomorrow") and turned them into
 * today's date.
 *
 * Dates without a year use $default_year, then the statement year from
 * get_statement_year(), then the current year.
 *
 * @param string   $date_str     Raw date text
 * @param string   $lang         Language hint (currently not used by the parser)
 * @param int|null $default_year Year to apply when the text carries none
 * @return string|null DD-MM-YYYY, or null when the text cannot be parsed
 */
function normalize_date($date_str, $lang = 'es', $default_year = null) {
    if (empty($date_str)) {
        return null;
    }

    $date_str = trim($date_str);

    // Skip if already normalized
    if (preg_match('/^\d{2}-\d{2}-\d{4}$/', $date_str)) {
        return $date_str;
    }

    // Use default year from parameter, global, or current year
    if ($default_year === null) {
        $default_year = get_statement_year();
    }
    if ($default_year === null) {
        $default_year = (int)date('Y');
    }

    // Common date patterns - ORDER MATTERS (most specific first)
    $patterns = [
        // DD-MMM-YYYY or DD/MMM/YYYY (e.g., 10-ENE-2024, 31-DIC-2023)
        '/^(\d{1,2})[-\/]([A-Za-z]{3,})[-\/](\d{4})$/' => 'd-M-y',
        // DD-MMM-YY (e.g., 10-ENE-24)
        '/^(\d{1,2})[-\/]([A-Za-z]{3,})[-\/](\d{2})$/' => 'd-M-yy',
        // DD/MM/YYYY or DD-MM-YYYY
        '/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/' => 'd-m-y',
        // DD/MM/YY or DD-MM-YY
        '/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{2})$/' => 'd-m-yy',
        // YYYY-MM-DD
        '/^(\d{4})[\/\-](\d{1,2})[\/\-](\d{1,2})$/' => 'y-m-d',
        // DD Month YYYY (Spanish)
        '/^(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})$/' => 'd M y',
        // Day DD Month YYYY
        '/^[A-Za-z]+\s+(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})$/' => 'd M y',
        // BBVA format: DD/MMM (without year, e.g., "05/ENE")
        '/^(\d{1,2})\/([A-Za-z]{3})$/' => 'd/M',
    ];

    foreach ($patterns as $pattern => $type) {
        if (preg_match($pattern, $date_str, $matches)) {
            array_shift($matches); // Remove full match

            switch ($type) {
                case 'd-m-y':
                    return sprintf('%02d-%02d-%04d', (int)$matches[0], (int)$matches[1], (int)$matches[2]);
                case 'd-m-yy':
                    // Two-digit years: 00-49 -> 20xx, 50-99 -> 19xx
                    $year = (int)$matches[2];
                    $year = $year < 50 ? 2000 + $year : 1900 + $year;
                    return sprintf('%02d-%02d-%04d', (int)$matches[0], (int)$matches[1], $year);
                case 'y-m-d':
                    return sprintf('%02d-%02d-%04d', (int)$matches[2], (int)$matches[1], (int)$matches[0]);
                case 'd-M-y':
                    return normalize_month_name($matches[1], $matches[0], $matches[2]);
                case 'd-M-yy':
                    $year = (int)$matches[2];
                    $year = $year < 50 ? 2000 + $year : 1900 + $year;
                    return normalize_month_name($matches[1], $matches[0], $year);
                case 'd M y':
                    return normalize_month_name($matches[1], $matches[0], $matches[2]);
                case 'd/M':
                    // BBVA format without year - use default year
                    return normalize_month_name($matches[1], $matches[0], $default_year);
            }
        }
    }

    // Last resort: PHP's own date parser, restricted to real calendar dates.
    $parsed = date_parse($date_str);
    if ($parsed['error_count'] === 0 && $parsed['month'] !== false && $parsed['day'] !== false) {
        $year = $parsed['year'] !== false ? (int)$parsed['year'] : (int)$default_year;
        if (checkdate((int)$parsed['month'], (int)$parsed['day'], $year)) {
            return sprintf('%02d-%02d-%04d', (int)$parsed['day'], (int)$parsed['month'], $year);
        }
    }

    return null; // Could not parse
}

/**
 * Build a DD-MM-YYYY date from a month name, a day and a year.
 *
 * Full names and common abbreviations in English, Spanish, Portuguese and
 * French are recognised. Matching ignores case, accents and a trailing
 * period, so "FÉVRIER", "fevrier" and "Sept." all resolve. Previously
 * upper-case accented names and dotted abbreviations were not found.
 *
 * @param string     $month_name Month name or abbreviation
 * @param int|string $day        Day of month
 * @param int|string $year       Four-digit year
 * @return string|null DD-MM-YYYY, or null when the month name is unknown
 */
function normalize_month_name($month_name, $day, $year) {
    static $lookup = null;

    if ($lookup === null) {
        // Names are stored lower-case and without accents; see the folding below.
        $names_by_month = [
            1  => ['january', 'jan', 'enero', 'ene', 'janeiro', 'janvier', 'janv'],
            2  => ['february', 'feb', 'febrero', 'fevereiro', 'fev', 'fevrier', 'fevr'],
            3  => ['march', 'mar', 'marzo', 'marco', 'mars'],
            4  => ['april', 'apr', 'abril', 'abr', 'avril', 'avr'],
            5  => ['may', 'mayo', 'maio', 'mai'],
            6  => ['june', 'jun', 'junio', 'junho', 'juin'],
            7  => ['july', 'jul', 'julio', 'julho', 'juillet', 'juil'],
            8  => ['august', 'aug', 'agosto', 'ago', 'aout', 'aou'],
            9  => ['september', 'sep', 'sept', 'septiembre', 'setiembre', 'setembro', 'set', 'septembre'],
            10 => ['october', 'oct', 'octubre', 'outubro', 'out', 'octobre'],
            11 => ['november', 'nov', 'noviembre', 'novembro', 'novembre'],
            12 => ['december', 'dec', 'diciembre', 'dic', 'dezembro', 'dez', 'decembre'],
        ];

        $lookup = [];
        foreach ($names_by_month as $number => $names) {
            foreach ($names as $name) {
                $lookup[$name] = $number;
            }
        }
    }

    // Fold accents byte-wise first so the plain ASCII strtolower() is sufficient.
    $accents = [
        'Á' => 'a', 'À' => 'a', 'Â' => 'a', 'Ã' => 'a', 'á' => 'a', 'à' => 'a', 'â' => 'a', 'ã' => 'a',
        'É' => 'e', 'È' => 'e', 'Ê' => 'e', 'é' => 'e', 'è' => 'e', 'ê' => 'e',
        'Í' => 'i', 'í' => 'i',
        'Ó' => 'o', 'Ô' => 'o', 'Õ' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o',
        'Ú' => 'u', 'Û' => 'u', 'Ü' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u',
        'Ç' => 'c', 'ç' => 'c',
    ];
    $key = rtrim(trim((string)$month_name), '.');
    $key = strtolower(strtr($key, $accents));

    if (isset($lookup[$key])) {
        return sprintf('%02d-%02d-%04d', (int)$day, $lookup[$key], (int)$year);
    }

    return null;
}

/**
 * Legacy alias of normalize_month_name(), kept for backward compatibility.
 *
 * @param string     $month_name Month name or abbreviation
 * @param int|string $day        Day of month
 * @param int|string $year       Year
 * @return string|null Whatever normalize_month_name() returns
 */
function normalize_spanish_month($month_name, $day, $year) {
    $normalized = normalize_month_name($month_name, $day, $year);

    return $normalized;
}

// ============================================================================
// MONEY PARSING AND FORMATTING
// ============================================================================

/**
 * Parse a printed amount into a float.
 *
 * Accepts English (1,234.56) and Spanish/European (1.234,56) notation,
 * currency symbols and surrounding text. The decimal mark is chosen as
 * follows:
 *   - both "." and "," present: the one that appears last is the decimal mark;
 *   - one kind present more than once: thousands separators;
 *   - one separator followed by exactly three digits and preceded by one to
 *     three digits without a leading zero (1,234 / 1.234): thousands separator;
 *   - any other single separator (12.5, 1234,56, 0.125): decimal mark.
 *
 * Negative amounts are recognised with a leading minus, a trailing minus
 * ("1,234.56-") or accounting parentheses ("(1,234.56)").
 *
 * Differences from the previous behaviour: "12.5" used to become 125,
 * "1234,56" used to become 123456, float input was mangled the same way,
 * trailing-minus and parenthesised negatives lost their sign, "0" and 0
 * returned null, and text without any digit returned 0.0 instead of null.
 *
 * @param mixed $amount_str Amount as printed (string) or an int/float
 * @return float|null Parsed value, or null when the input is empty or holds no digit
 */
function parse_money($amount_str) {
    if ($amount_str === null || $amount_str === '') {
        return null;
    }

    // Real numbers need no parsing; stringifying them would lose the decimal intent.
    if (is_int($amount_str) || is_float($amount_str)) {
        return (float)$amount_str;
    }

    $raw = trim((string)$amount_str);
    if ($raw === '') {
        return null;
    }

    // Remove currency symbols, letters and whitespace
    $clean = preg_replace('/[^\d\.\,\-\+]/', '', $raw);
    if ($clean === null || !preg_match('/\d/', $clean)) {
        return null;
    }

    $negative = preg_match('/^\(.*\)$/s', $raw) === 1
        || $clean[0] === '-'
        || substr($clean, -1) === '-';

    $digits = str_replace(['-', '+'], '', $clean);

    // Decide which character, if any, acts as the decimal mark.
    $last_dot = strrpos($digits, '.');
    $last_comma = strrpos($digits, ',');
    $decimal_mark = null;

    if ($last_dot !== false && $last_comma !== false) {
        $decimal_mark = $last_dot > $last_comma ? '.' : ',';
    } elseif ($last_dot !== false || $last_comma !== false) {
        $mark = $last_dot !== false ? '.' : ',';
        $groups = explode($mark, $digits);
        $tail = end($groups);
        $is_grouping = count($groups) > 2
            || (strlen($tail) === 3 && preg_match('/^[1-9]\d{0,2}$/', $groups[0]) === 1);
        $decimal_mark = $is_grouping ? null : $mark;
    }

    if ($decimal_mark !== null) {
        $cut = strrpos($digits, $decimal_mark);
        $whole = str_replace(['.', ','], '', substr($digits, 0, $cut));
        $fraction = str_replace(['.', ','], '', (string)substr($digits, $cut + 1));
        $normalized = $whole . '.' . $fraction;
    } else {
        $normalized = str_replace(['.', ','], '', $digits);
    }

    $value = (float)$normalized;

    return $negative ? -$value : $value;
}

/**
 * Format a number for output using the configured separators and decimals.
 *
 * Grouping is delegated to number_format(). The previous hand-written
 * grouping loop treated the minus sign as a digit and produced strings such
 * as "-,123.00" or "-,123,456.00" for negative amounts.
 *
 * @param mixed $value Number or numeric string
 * @return string Formatted amount (e.g. "1,234.56"); "0.00" for null, empty
 *                or non-numeric input
 */
function format_money($value) {
    if ($value === null || $value === '' || !is_numeric($value)) {
        return '0.00';
    }

    return number_format((float)$value, DECIMAL_PLACES, DECIMAL_SEPARATOR, THOUSAND_SEPARATOR);
}

// ============================================================================
// TABLE HEADER DETECTION (MULTI-LANGUAGE)
// ============================================================================

/**
 * Return the header keywords used to recognise table columns, per column
 * and per language.
 *
 * Keywords are matched as case-insensitive substrings by the header
 * detection functions. All previous entries are kept. Native French and
 * Portuguese terms were added where the lists only carried English words
 * ("retrait", "descricao", "debito", ...), together with accented spellings
 * of the Spanish and French terms.
 *
 * @param string $lang Language code (not used to filter; the full table is returned)
 * @return array column => [language code => list of keywords]
 */
function get_header_patterns($lang = 'es') {
    // Multi-language header patterns
    $patterns = [
        // Date column headers
        'date' => [
            'es' => ['fecha', 'dia', 'fecHA', 'dia de mov', 'movimiento', 'operacion', 'día', 'operación'],
            'en' => ['date', 'day'],
            'fr' => ['date', 'jour'],
            'pt' => ['data', 'dia'],
        ],
        // Description column headers
        'description' => [
            'es' => ['concepto', 'referencia', 'descripcion', 'detalle', 'movimiento', 'descripción'],
            'en' => ['description', 'reference', 'details', 'memo'],
            'fr' => ['description', 'reference', 'libelle', 'référence', 'libellé'],
            'pt' => ['description', 'reference', 'detalhes', 'descricao', 'descrição', 'referencia', 'referência'],
        ],
        // Debit column headers
        'debit' => [
            'es' => ['cargo', 'debit', 'retir', 'retiros', 'importe', 'cantidad'],
            'en' => ['debit', 'withdrawal', 'amount'],
            'fr' => ['debit', 'retirement', 'montant', 'débit', 'retrait'],
            'pt' => ['debit', 'retirada', 'valor', 'debito', 'débito'],
        ],
        // Credit column headers
        'credit' => [
            'es' => ['abono', 'credit', 'ing', 'ingreso', 'deposito', 'depósito'],
            'en' => ['credit', 'deposit'],
            'fr' => ['credit', 'versement', 'crédit'],
            'pt' => ['credit', 'deposito', 'credito', 'crédito', 'depósito'],
        ],
        // Balance column headers
        'balance' => [
            'es' => ['saldo', 'balance', 'saldo disponible', 'saldo operación', 'saldo operacion', 'saldo liquidación', 'saldo liquidacion'],
            'en' => ['balance', 'running balance'],
            'fr' => ['solde', 'balance'],
            'pt' => ['saldo', 'balance'],
        ],
    ];

    return $patterns;
}

/**
 * Report which table columns are announced by header keywords in the lines.
 *
 * Matching is a substring test that ignores case and accents, so "DÍA",
 * "DESCRIPCIÓN" or "DEPÓSITO" match the keywords "dia", "descripcion" and
 * "deposito". Previously accented text never matched its unaccented
 * keyword, and upper-case accented text never matched at all.
 *
 * @param array  $lines Text lines to inspect
 * @param string $lang  Language code whose keyword lists are used
 * @return array column => true for every column with at least one keyword hit
 *               (empty when nothing matched or the language is unknown)
 */
function detect_headers($lines, $lang = 'es') {
    $patterns = get_header_patterns($lang);

    // Lower-case and strip accents. The accent map works on UTF-8 byte
    // sequences, so the ASCII-only strtolower() is enough afterwards.
    $fold = static function ($text) {
        static $accents = [
            'Á' => 'a', 'À' => 'a', 'Â' => 'a', 'Ã' => 'a', 'á' => 'a', 'à' => 'a', 'â' => 'a', 'ã' => 'a',
            'É' => 'e', 'È' => 'e', 'Ê' => 'e', 'é' => 'e', 'è' => 'e', 'ê' => 'e',
            'Í' => 'i', 'í' => 'i',
            'Ó' => 'o', 'Ô' => 'o', 'Õ' => 'o', 'ó' => 'o', 'ô' => 'o', 'õ' => 'o',
            'Ú' => 'u', 'Û' => 'u', 'Ü' => 'u', 'ú' => 'u', 'û' => 'u', 'ü' => 'u',
            'Ç' => 'c', 'ç' => 'c',
        ];
        return strtolower(strtr((string)$text, $accents));
    };

    // Fold the keywords of the requested language once, up front.
    $needles = [];
    foreach ($patterns as $column => $lang_patterns) {
        if (isset($lang_patterns[$lang])) {
            $needles[$column] = array_map($fold, $lang_patterns[$lang]);
        }
    }

    $detected = [];

    foreach ($lines as $line) {
        $haystack = $fold($line);

        foreach ($needles as $column => $column_needles) {
            foreach ($column_needles as $needle) {
                if ($needle !== '' && strpos($haystack, $needle) !== false) {
                    $detected[$column] = true;
                    break;
                }
            }
        }
    }

    return $detected;
}

/**
 * Decide whether a single line is a table header row.
 *
 * The line counts as a header when keywords of at least three different
 * columns are found in it for the given language.
 *
 * @param string $line Line of text
 * @param string $lang Language code whose keyword lists are used
 * @return bool True when three or more columns are announced by the line
 */
function is_header_row($line, $lang = 'es') {
    // detect_headers() returns one entry per column that has a keyword hit,
    // so for a single line its size is the number of matching columns.
    $matched_columns = detect_headers([$line], $lang);

    // A header row should match at least 3 column patterns
    return count($matched_columns) >= 3;
}

// ============================================================================
// COLUMN DETECTION (X-POSITION CLUSTERING)
// ============================================================================

/**
 * Find column x-positions by binning the left edges (bbox x0) of tokens.
 *
 * Tokens are grouped into fixed-width bins; every bin holding at least
 * $min_tokens tokens is reported as a column located at the mean x0 of its
 * tokens. The bin width and the threshold were hard-coded (50 and 3) and are
 * now optional parameters with those defaults, because a suitable width
 * depends on the coordinate scale of the input.
 *
 * The result is ordered with the spaceship operator: the earlier
 * subtraction-based comparator returned a float that usort() truncates to an
 * integer, so columns less than one unit apart compared as equal.
 *
 * @param array     $tokens     Tokens; those without ['bbox']['x0'] are ignored
 * @param bool      $debug      Emit debug lines through tabler_debug()
 * @param int|float $bin_size   Width of a bin in bbox units (default 50)
 * @param int       $min_tokens Minimum number of tokens for a bin to count (default 3)
 * @return array|null List of ['x' => number, 'count' => int] sorted by x, or
 *                    null when no token carries an x position
 */
function detect_column_positions($tokens, $debug = false, $bin_size = 50, $min_tokens = 3) {
    if (!is_numeric($bin_size) || $bin_size <= 0) {
        $bin_size = 50;
    }

    // Collect the left edge of every token that has one
    $positions = [];

    foreach ($tokens as $token) {
        if (!isset($token['bbox']['x0'])) {
            continue;
        }
        $positions[] = $token['bbox']['x0'];
    }

    if (empty($positions)) {
        return null;
    }

    sort($positions);

    // Simple binning to find column clusters
    $bins = [];

    foreach ($positions as $pos) {
        $bins[(int)($pos / $bin_size)][] = $pos;
    }

    // Keep the bins that are dense enough to be a column
    $column_positions = [];

    foreach ($bins as $positions_in_bin) {
        $population = count($positions_in_bin);
        if ($population >= $min_tokens) {
            $column_positions[] = [
                'x' => array_sum($positions_in_bin) / $population,
                'count' => $population,
            ];
        }
    }

    // Sort by x position
    usort($column_positions, function ($a, $b) {
        return $a['x'] <=> $b['x'];
    });

    tabler_debug("Detected " . count($column_positions) . " column positions", $debug);

    return $column_positions;
}

/**
 * Tag every positioned token with the index of its nearest column.
 *
 * Distance is measured between the token's bbox x0 and each column's 'x'.
 * On a tie the earlier column wins. Tokens without an x0 are returned
 * untouched, and so is the whole list when no columns are given.
 *
 * @param array $tokens           Tokens, optionally carrying ['bbox']['x0']
 * @param array $column_positions Columns as a list of ['x' => number, ...]
 * @return array The tokens, with a 'column' key added where a position was known
 */
function assign_tokens_to_columns($tokens, $column_positions) {
    if (empty($column_positions)) {
        return $tokens;
    }

    foreach ($tokens as $index => $token) {
        if (!isset($token['bbox']['x0'])) {
            continue;
        }

        $left_edge = $token['bbox']['x0'];
        $nearest = 0;
        $shortest = PHP_INT_MAX;

        foreach ($column_positions as $col_index => $col_pos) {
            $gap = abs($left_edge - $col_pos['x']);
            if ($gap < $shortest) {
                $shortest = $gap;
                $nearest = $col_index;
            }
        }

        $tokens[$index]['column'] = $nearest;
    }

    return $tokens;
}

// ============================================================================
// ROW DETECTION
// ============================================================================

/**
 * Decide whether a group of tokens looks like a transaction row.
 *
 * A transaction row needs BOTH:
 *  - a token that starts with a date (DD-MM-YYYY, DD/MM/YY, DD-Mon-YYYY,
 *    "Month DD, YYYY"), and
 *  - at least one digit somewhere outside that date (an amount or balance).
 *
 * The date text itself is excluded from the numeric check; otherwise every
 * dated row would trivially satisfy the "has a number" requirement.
 *
 * @param array $tokens Tokens; each is either ['text' => string, ...] or a plain string.
 * @param bool  $debug  Unused; kept for signature compatibility.
 * @return bool True when the row has a date and a separate numeric value.
 */
function is_transaction_row($tokens, $debug = false) {
    // Spanish statements use e.g. 31-DIC-2023, 10-ENE-2024, 01/01/2024.
    $date_patterns = [
        '/^\d{1,2}[\-\/]\d{1,2}[\-\/]\d{2,4}/',      // DD-MM-YY or DD/MM/YYYY
        '/^\d{1,2}[-\/][A-Za-z]{3,}[-\/]\d{2,4}/',   // DD-Month-YYYY
        '/^[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}/',       // Month DD, YYYY
    ];

    $has_date = false;
    $has_number = false;

    foreach ($tokens as $token) {
        $text = (string)($token['text'] ?? (is_string($token) ? $token : ''));

        // Strip a leading date so its digits are not mistaken for an amount.
        $remainder = $text;
        foreach ($date_patterns as $pattern) {
            if (preg_match($pattern, $text, $match)) {
                $has_date = true;
                $remainder = substr($text, strlen($match[0]));
                break;
            }
        }

        // A numeric value must contain at least one digit; bare punctuation
        // such as "-" or "S.A." does not count.
        if (preg_match('/\d/', $remainder)) {
            $has_number = true;
        }

        if ($has_date && $has_number) {
            return true;
        }
    }

    return false;
}

/**
 * Report whether a text line appears to carry a monetary amount.
 *
 * Both patterns must be found somewhere in the line: a digit, a `.` or `,`
 * separator and another digit; and a run of one to three digits optionally
 * followed by comma-separated thousands groups and a two-digit decimal part
 * (Santander-style values such as 1,234.56 or 4.189,16).
 *
 * @param string $line Raw text line.
 * @return bool True when both patterns match.
 */
function is_amount_line($line) {
    $required_patterns = [
        '/[\d][\.\,][\d]/',
        '/[\d]{1,3}(?:,[\d]{3})*(?:\.[\d]{2})?/',
    ];

    foreach ($required_patterns as $pattern) {
        if (!preg_match($pattern, $line)) {
            return false;
        }
    }

    return true;
}

/**
 * Report whether a line opens the way a transaction line does.
 *
 * After optional leading whitespace the line must begin either with one or two
 * digits followed by `-` or `/` (DD-... / DD/...), or with a word of three or
 * more letters followed by whitespace and one or two digits (Month DD).
 *
 * @param string $line Raw text line.
 * @return bool True when either opening pattern matches.
 */
function looks_like_transaction_start($line) {
    $openers = [
        '/^\s*\d{1,2}[-\/]/',             // Starts with DD- or DD/
        '/^\s*[A-Za-z]{3,}\s+\d{1,2}/',   // Starts with a month-like word and a day
    ];

    foreach ($openers as $opener) {
        if (preg_match($opener, $line)) {
            return true;
        }
    }

    return false;
}

/**
 * Decide whether a row carries exactly one numeric value (the running balance).
 *
 * A token counts as numeric when, after trimming, it consists solely of
 * digits, `.`, `,`, `-`, `+` AND contains at least one digit. Punctuation-only
 * tokens (a lone "-" used as an empty-cell marker, "...", etc.) are not counted.
 *
 * @param array $tokens Tokens; each is either ['text' => string, ...] or a plain string.
 * @return bool True when exactly one token is numeric.
 */
function is_balance_row($tokens) {
    $numeric_count = 0;

    foreach ($tokens as $token) {
        $text = trim((string)($token['text'] ?? (is_string($token) ? $token : '')));

        // Require a digit so separators on their own are not taken for a number.
        if (preg_match('/^[\d\.\,\-\+]+$/', $text) && preg_match('/\d/', $text)) {
            $numeric_count++;
        }
    }

    return $numeric_count === 1;
}

/**
 * Decide whether a row contains at least one numeric value (debit, credit, or both).
 *
 * A token counts as numeric only when it contains at least one digit. Tokens
 * that merely contain punctuation (".", ",", "-", "+"), such as "S.A." or a
 * lone dash, are not counted.
 *
 * @param array $tokens Tokens; each is either ['text' => string, ...] or a plain string.
 * @return bool True when one or more tokens contain a digit.
 */
function is_amount_row($tokens) {
    foreach ($tokens as $token) {
        $text = (string)($token['text'] ?? (is_string($token) ? $token : ''));

        if (preg_match('/\d/', $text)) {
            return true;
        }
    }

    return false;
}

// ============================================================================
// PAGE CONTINUITY DETECTION
// ============================================================================

/**
 * Decide whether $line continues the content of $prev_line.
 *
 * Returns true when either:
 *  - the previous line (ignoring trailing whitespace) ENDS with a continuation
 *    marker: "-", "—", "..." or "…"; or
 *  - neither line starts with a date-like prefix (DD- or DD/), i.e. both are
 *    free text.
 *
 * @param string $line      Current line.
 * @param string $prev_line Line immediately before it.
 * @param bool   $debug     Unused; kept for signature compatibility.
 * @return bool True when the current line is treated as a continuation.
 */
function is_page_continuation($line, $prev_line, $debug = false) {
    $continuation_indicators = ['-', '—', '...', '…'];

    // Only the END of the previous line matters: a hyphen inside a date such as
    // 01-01-2024 must not mark the line as continued.
    $prev_tail = rtrim((string)$prev_line);

    foreach ($continuation_indicators as $indicator) {
        $length = strlen($indicator);
        if (strlen($prev_tail) >= $length && substr($prev_tail, -$length) === $indicator) {
            return true;
        }
    }

    $date_start = '/^\s*\d{1,2}[\/\-]/';
    $prev_has_date = preg_match($date_start, (string)$prev_line);
    $curr_has_date = preg_match($date_start, (string)$line);

    // Two consecutive undated lines are most likely one wrapped description.
    return !$prev_has_date && !$curr_has_date;
}

/**
 * Fold wrapped description lines into the transaction line they belong to.
 *
 * Lines are trimmed and processed in order:
 *  - a blank line flushes the pending entry;
 *  - a line starting with a date-like prefix (DD- or DD/) flushes the pending
 *    entry and starts a new one;
 *  - any other line is appended to the pending entry, separated by " | ".
 *
 * Blankness is tested with a strict comparison against '' so that a line
 * consisting of "0" is kept as content rather than treated as empty.
 *
 * @param array $lines Raw text lines.
 * @return array Merged entries, in input order.
 */
function merge_continuation_lines($lines) {
    $merged = [];
    $buffer = null; // null = nothing pending; distinct from the valid content "0"

    foreach ($lines as $line) {
        $trimmed = trim((string)$line);

        if ($trimmed === '') {
            if ($buffer !== null) {
                $merged[] = $buffer;
                $buffer = null;
            }
            continue;
        }

        // A leading date marks the start of a new transaction.
        $has_date = preg_match('/^\s*\d{1,2}[\/\-]/', $trimmed) === 1;

        if ($buffer === null) {
            $buffer = $trimmed;
        } elseif ($has_date) {
            $merged[] = $buffer;
            $buffer = $trimmed;
        } else {
            $buffer .= ' | ' . $trimmed;
        }
    }

    if ($buffer !== null) {
        $merged[] = $buffer;
    }

    return $merged;
}

// ============================================================================
// NON-TRANSACTION REGION FILTERING
// ============================================================================

/**
 * Decide whether a line belongs to a non-transaction region of the statement
 * (page header/footer, legal notice, summary table, glossary, fiscal stamp...).
 *
 * The checks are deliberately broad substring / regex matches:
 *  1. a line that is only a 7-digit number, or only a 15+ digit barcode number;
 *  2. legal-notice, summary/total and glossary phrases;
 *  3. table column headers, including letter-spaced variants ("F E C H A");
 *  4. three or more column-name words on the same line;
 *  5. bank-specific header/footer phrases (Santander, BBVA).
 *
 * Matching is case-insensitive. Accented capitals used in Spanish are folded
 * to lowercase explicitly because strtolower()/stripos() only fold ASCII, which
 * would otherwise let "PÁGINA" or "INFORMACIÓN FISCAL" slip through.
 *
 * @param string $line Raw text line.
 * @return bool True when the line should be excluded from transaction parsing.
 */
function is_non_transaction_region($line) {
    // ASCII lowercase plus explicit folding of accented capitals (UTF-8 byte sequences).
    $lower = strtr(strtolower($line), [
        'Á' => 'á', 'É' => 'é', 'Í' => 'í', 'Ó' => 'ó', 'Ú' => 'ú', 'Ü' => 'ü', 'Ñ' => 'ñ',
    ]);
    $trimmed = trim($line);

    // =========================================================================
    // STANDALONE DOCUMENT/ACCOUNT NUMBERS (Page header artifacts)
    // =========================================================================
    // A line that is ONLY a 7-digit number (like "0448912") is a document number
    // printed at the top of a page, not a transaction folio (folios come with a
    // description on the same line).
    if (preg_match('/^\d{7}$/', $trimmed)) {
        return true;
    }

    // Barcode-like line: *0124044891249179944001* or 0124044891249179944001
    if (preg_match('/^\*?\d{15,}\*?$/', $trimmed)) {
        return true;
    }

    // Legal notices
    $notice_patterns = [
        'legal', 'condiciones de service', 'privacy policy', 'terms of use',
        'powered by', 'generated by', 'statement generated',
    ];

    // Summaries and totals
    $summary_patterns = [
        'subtotal', 'total', 'suma', 'average', 'maximum', 'minimum',
        'beginning balance', 'ending balance', 'opening balance',
        'grand total', 'sub total', 'page subtotal',
        'cheques pagados',      // BBVA summary line
        'manejo de cuenta',     // BBVA summary line
        'anualidad',            // BBVA summary line
        'operaciones',          // BBVA summary line (when standalone)
        'cargos objetados',     // BBVA summary line
        'abonos objetados',     // BBVA summary line
        'total comisiones',     // BBVA summary line
        'interes nominal',      // BBVA summary line
        'antes de impuestos',   // BBVA summary line
        'depósitos / abonos',   // BBVA summary section
        'comisiones (-)',       // BBVA summary section
        'intereses a favor',    // BBVA summary section
        'otros cargos',         // BBVA summary section
        'cuadro resumen',       // BBVA summary section
        'gráfico de movimientos', // BBVA summary section
        'concepto cantidad porcentaje', // BBVA summary table header
        'folio:',               // BBVA footer content
        'no. cuenta',           // BBVA footer content
        'no. cliente',          // BBVA footer content
    ];

    // Glossary pages
    $glossary_patterns = [
        'glossary', 'legend', 'key:', 'abbreviations',
    ];

    // All patterns are lowercase, so a plain strpos() against the folded line is
    // a case-insensitive match.
    foreach ([$notice_patterns, $summary_patterns, $glossary_patterns] as $pattern_group) {
        foreach ($pattern_group as $pattern) {
            if (strpos($lower, $pattern) !== false) {
                return true;
            }
        }
    }

    // =========================================================================
    // PAGE HEADER DETECTION (CRITICAL for multi-line description handling)
    // =========================================================================
    // Repeated page headers must NOT be merged into transaction descriptions.

    // Table column headers (appear at top of each page), e.g.
    // "F E C H A   FOLIO   DESCRIPCION   DEPOSITOS   RETIROS   SALDO".
    // Some PDFs render spaced-out letters like "F E C H A" instead of "FECHA".
    $column_header_patterns = [
        // Spaced-out column headers (OCR artifacts)
        '/F\s*E\s*C\s*H\s*A/i',                    // F E C H A
        '/D\s*E\s*S\s*C\s*R\s*I\s*P\s*C\s*I\s*O\s*N/i',  // D E S C R I P C I O N
        '/D\s*E\s*P\s*O\s*S\s*I\s*T\s*O\s*S/i',    // D E P O S I T O S
        '/R\s*E\s*T\s*I\s*R\s*O\s*S/i',            // R E T I R O S
        '/S\s*A\s*L\s*D\s*O/i',                    // S A L D O

        // Normal column headers (multiple columns on same line)
        '/fecha\s+folio\s+descripcion/i',
        '/descripcion\s+depositos?\s+retiros?/i',
        '/depositos?\s+retiros?\s+saldo/i',
        '/fecha\s+concepto\s+cargo/i',
        '/concepto\s+cargo\s+abono/i',
        '/cargo\s+abono\s+saldo/i',
        '/date\s+description\s+debit/i',
        '/description\s+debit\s+credit/i',
        '/debit\s+credit\s+balance/i',
    ];

    foreach ($column_header_patterns as $pattern) {
        if (preg_match($pattern, $trimmed)) {
            return true;
        }
    }

    // A line containing 3+ column-like words is most likely a header. Overlapping
    // entries (e.g. 'deposito' and 'depositos') are each counted when present.
    $column_words = ['fecha', 'folio', 'descripcion', 'concepto', 'depositos', 'deposito',
                     'retiros', 'retiro', 'saldo', 'cargo', 'abono', 'balance',
                     'date', 'description', 'debit', 'credit', 'amount', 'reference'];
    $column_word_count = 0;
    foreach ($column_words as $word) {
        if (strpos($lower, $word) !== false) {
            $column_word_count++;
        }
    }
    if ($column_word_count >= 3) {
        return true;
    }

    // Bank-specific header/footer patterns (Santander, BBVA, etc.)
    $bank_patterns = [
        'cuenta santander',
        'cuenta pyme',
        'codigo de cliente',
        'periodo :',
        'periodo:',
        'periodo —',
        'banco santander mexico',
        'institucion de banca multiple',
        'grupo financiero',
        'crupo financiro',  // OCR typo
        'estado de cuenta',
        'moneda nacional',
        'moneda —',
        'sucursal —',
        'sucursal 0',
        'telefono —',
        'corte al',
        'corteal',
        'resumen intereses',
        'intereses brutos',
        'intereses netos',
        'isr retenido',
        'gat nominal',
        'gat real',
        'inversion creciente',
        'depositos |',
        'saldo final $',
        'fecha folio',
        'descripcion deposito retiro',
        'unidad especializada',
        'fecha y hora de expedicion',
        'fecha y hora de certificacion',
        'regimen fiscal',
        'lugar de expedicion',
        'metodo de pago',
        'unidad de medida',
        'r.f.c.',
        'rfc:',
        'clabe',
        'estado de cuenta integral',
        'resumen informativo',
        'informacion a clientes',
        'grafico cuenta',
        'detalle de movimientos',
        'saldo promedio',
        'tasa bruta',
        'dias del periodo',
        'comisiones cobradas',
        'i.s.r. retenido',
        'agradeceremos nos comunique',
        'objeciones en un plazo',
        'prolongacion paseo de la reforma',
        'alcaldia alvaro obregon',
        'www.ipab.org.mx',
        'https://www.gob.mx',
        'ueac@santander',
        'telefonos en la ciudad',
        'numero, tipo clase',
        'obligaciones a su favor',
        'recibe las consultas',
        'reclamaciones o aclaraciones',
        'electronico',
        'para mas informacion visita',
        'uuid del timbrado',
        'sello digital',
        'cadena original',
        'certificado del emisor',
        'certificado del sat',
        'folio interno',
        'tipo de comprobante',
        'uso de cfdi',
        'domicilio fiscal',
        'mensajes importantes',
        'apreciable cliente',
        'estimado cliente',
        'importante!',
        'condusef',
        'super linea',
        'supernet',
        'supermovil',
        'contact center',
        'ahorro bancario',
        'ipab',
        'depositos bancarios',
        'prestamos y creditos',
        'unidad de atencion',
        'solucionespyme@',
        'dimo',
        'transferencias de dinero',
        'logotipos',
        'banco de m',
        'situacion fiscal',
        'datos fiscales',
        'bsm970519',
        'pagina',
        'página',
        'fechay hora',
        'fecha y hora',
        'informacion fiscal',
        'información fiscal',
        // Additional page header patterns
        'su conformidad',           // Footer text that appears at page breaks
        'cualquier aclaracion',     // Footer text
        'consulte su estado',       // Footer text
        'conserve este documento',  // Footer text
        // NOTE: Overly broad patterns were removed because they filtered valid
        // transactions: 'este documento', 'para cualquier', 'en caso de',
        // 'si tiene alguna', 'comuniquese con', 'linea de atencion',
        // 'atencion a clientes', 'servicio al cliente'

        // BBVA-specific patterns
        'maestra pyme bbva',
        'bbva mexico',
        'bbva méxico',
        'informacion financiera',
        'información financiera',
        'rendimiento',
        'comportamiento',
        'tasa bruta anual',
        'detalle de movimientos realizados',
        'torre bbva reforma',
        'av paseo de la reforma',
        'sucursal :',
        'direccion:',
        'dirección:',
        'cuenta :',
        'cliente :',
        'periodo del',
        'fecha oper liq',
        'oper liq',
        'saldo operacion liquidacion',
        'saldo operación liquidación',
        'cargos abonos',
        'informacion adicional',
        'información adicional',
        'consulta de saldos',
        'banca por internet',
        'linea bbva',
        'línea bbva',
        'app bbva',
        'cajeros automaticos',
        'cajeros automáticos',
        'sucursales bbva',
        'www.bbva.mx',
        'bbva.mx',
        'grupo financiero bbva',
        'bbva bancomer',
        'bancomer',
    ];

    foreach ($bank_patterns as $pattern) {
        if (strpos($lower, $pattern) !== false) {
            return true;
        }
    }

    // NOTE: There is intentionally no uppercase-ratio check: it filtered valid
    // transaction descriptions such as "ABONO TRANSFERENCIA ENLACE SF", which are
    // 100% uppercase. The pattern-based filters above are relied on instead.

    return false;
}

// ============================================================================
// AMOUNT VALIDATION (Filter reference numbers from real amounts)
// ============================================================================

/**
 * Tell real transaction amounts apart from reference numbers.
 *
 * Heuristics, applied to the text after trailing non-numeric characters
 * (such as ":" or spaces) are stripped:
 *  - a null parsed value is never valid;
 *  - text that does not end in a separator plus two digits (e.g. 5280089) is
 *    only accepted when the absolute value is below 100;
 *  - absolute values above 100 million are rejected;
 *  - 6 to 10 digits with no "." or "," at all is treated as a reference number.
 *
 * @param string     $text  The text to check
 * @param float|null $value The parsed numeric value
 * @return bool True if this looks like a valid amount
 */
function is_valid_transaction_amount($text, $value) {
    if (is_null($value)) {
        return false;
    }

    // Drop trailing junk, then any whitespace that junk was hiding.
    $candidate = trim(preg_replace('/[^0-9.,]+$/', '', trim($text)));
    $magnitude = abs($value);

    // Amounts are expected to end in a separator plus two decimals; integers
    // are tolerated only when small.
    $ends_with_decimals = preg_match('/[.,]\d{2}$/', $candidate) === 1;
    if (!$ends_with_decimals && $magnitude >= 100) {
        return false;
    }

    // Upper sanity bound of 100 million.
    if ($magnitude > 100000000) {
        return false;
    }

    // Reference-number shape: 6-10 digits and no separator anywhere.
    $digit_count = strlen(preg_replace('/[^0-9]/', '', $candidate));
    $has_separator = preg_match('/[.,]/', $candidate) === 1;
    $reference_like = $digit_count >= 6 && $digit_count <= 10 && !$has_separator;

    return !$reference_like;
}

/**
 * Validate that a date is a real calendar date within a reasonable range for
 * bank statements.
 *
 * Accepts only the exact shape DD-MM-YYYY. The year must lie between 2000 and
 * the current year, the day/month/year combination must exist in the Gregorian
 * calendar (so 30-02-2023 or 31-04-2022 are rejected), and the date must not be
 * more than one month after "now".
 *
 * @param string $date_str Date in DD-MM-YYYY format
 * @return bool True if date is valid
 */
function validate_transaction_date($date_str) {
    if (!preg_match('/^(\d{2})-(\d{2})-(\d{4})$/', $date_str, $m)) {
        return false;
    }

    $day = (int)$m[1];
    $month = (int)$m[2];
    $year = (int)$m[3];

    // Year range: 2000 to the current year (statements should not be future-dated).
    $current_year = (int)date('Y');
    if ($year < 2000 || $year > $current_year) {
        return false;
    }

    // checkdate() validates the month range AND the day against the actual
    // month length, including leap years.
    if (!checkdate($month, $day, $year)) {
        return false;
    }

    // Within the current year, still reject dates more than 1 month ahead.
    $date_ts = mktime(0, 0, 0, $month, $day, $year);
    $future_limit = strtotime('+1 month');

    return $date_ts <= $future_limit;
}

// ============================================================================
// ROW ASSEMBLY
// ============================================================================

/**
 * Build a ledger row (day, description, debit, credit, balance) from the
 * tokens of one transaction.
 *
 * - day:         first token for which normalize_date() returns a non-null value.
 * - amounts:     every token for which parse_money() returns a non-null value.
 *                With two or more amounts the largest is taken as the balance;
 *                the next one becomes the debit and, if present, the third
 *                becomes the credit. A single amount is taken as the balance.
 * - description: all remaining non-empty tokens joined with " | ".
 *
 * @param array      $tokens           Tokens; each is ['text' => string, ...] or a plain string.
 * @param array|null $column_positions Optional column descriptors; when given, tokens are
 *                                     first tagged via assign_tokens_to_columns().
 * @param bool       $debug            Unused; kept for signature compatibility.
 * @return array Row with keys day, description, debit, credit, balance, raw, confidence.
 */
function assemble_row($tokens, $column_positions = null, $debug = false) {
    $row = [
        'day'          => null,
        'description'  => '',
        'debit'        => null,
        'credit'       => null,
        'balance'      => null,
        'raw'          => $tokens, // tokens as received, before column tagging
        'confidence'   => 100,
    ];

    // If we have column positions, assign tokens
    if ($column_positions !== null) {
        $tokens = assign_tokens_to_columns($tokens, $column_positions);
    }

    // Date: the first token that normalizes to a date wins.
    foreach ($tokens as $token) {
        $text = $token['text'] ?? (is_string($token) ? $token : '');
        $normalized = normalize_date($text);
        if ($normalized !== null) {
            $row['day'] = $normalized;
            break;
        }
    }

    // Collect every token that parses as money.
    $amounts = [];
    foreach ($tokens as $token) {
        $text = $token['text'] ?? (is_string($token) ? $token : '');
        $amount = parse_money($text);
        if ($amount !== null) {
            $amounts[] = ['value' => $amount, 'text' => $text];
        }
    }

    // Heuristic for Santander statements: with 2+ amounts the largest is most
    // likely the running balance and the smaller ones are debit/credit.
    if (count($amounts) >= 2) {
        // Sort by value descending. The spaceship operator is required here:
        // usort() casts the comparator result to int, so returning a float
        // difference would treat amounts less than 1 apart as equal.
        usort($amounts, function ($a, $b) {
            return $b['value'] <=> $a['value'];
        });

        $row['balance'] = $amounts[0]['value'];

        if (count($amounts) >= 3) {
            // Two further amounts: one debit, one credit.
            $row['debit'] = $amounts[1]['value'];
            $row['credit'] = $amounts[2]['value'];
        } else {
            // Exactly one further amount: recorded as a debit. Telling debit
            // from credit needs more context than this row provides.
            $row['debit'] = $amounts[1]['value'];
        }
    } elseif (count($amounts) === 1) {
        // Single amount - could be balance or amount; stored as the balance.
        $row['balance'] = $amounts[0]['value'];
    }

    // Description is everything that is not a clear amount or date.
    $description_parts = [];
    foreach ($tokens as $token) {
        $text = $token['text'] ?? (is_string($token) ? $token : '');
        if (normalize_date($text) !== null) continue;
        // Only purely numeric-looking money tokens are dropped; text that merely
        // contains a parsable amount stays in the description.
        if (parse_money($text) !== null && preg_match('/^[\d\.\,\-\+]+$/', trim($text))) continue;
        if (!empty(trim($text))) {
            $description_parts[] = trim($text);
        }
    }
    $row['description'] = implode(' | ', $description_parts);

    return $row;
}

// ============================================================================
// LEDGER RECONCILIATION
// ============================================================================

/**
 * Check that each row's balance follows from the previous balance plus the
 * row's credit minus its debit, and fill in balances that are missing.
 *
 * Every row gets 'reconciliation_status' ('ok' or 'fail') and
 * 'reconciliation_error' (null or a message). Rows before the first known
 * balance are not checked. A row without a balance receives the computed
 * running balance. Differences up to 0.01 are tolerated.
 *
 * @param array $rows  Ledger rows with optional 'debit', 'credit', 'balance' entries.
 * @param bool  $debug Passed through to tabler_debug() when a mismatch is logged.
 * @return array ['rows' => annotated rows, 'failures' => mismatch details,
 *                'is_valid' => true when there were no mismatches].
 */
function reconcile_ledger($rows, $debug = false) {
    $balance = null;
    $reconciliation_failures = [];

    // Rows are updated through their index rather than a by-reference foreach,
    // so no dangling reference to the last element is left behind.
    foreach (array_keys($rows) as $index) {
        $rows[$index]['reconciliation_status'] = 'ok';
        $rows[$index]['reconciliation_error'] = null;

        $debit = $rows[$index]['debit'] ?? 0;
        $credit = $rows[$index]['credit'] ?? 0;
        $balance_raw = $rows[$index]['balance'] ?? null;

        if ($balance === null) {
            // No running balance yet: adopt this row's balance if it has one.
            if ($balance_raw !== null) {
                $balance = $balance_raw;
            }
            continue;
        }

        if ($balance_raw === null) {
            // Infer balance from previous + amounts
            $balance = $balance + $credit - $debit;
            $rows[$index]['balance'] = $balance;
            continue;
        }

        // Validate balance progression
        $expected_balance = $balance + $credit - $debit;

        // Allow small floating point differences
        if (abs($balance_raw - $expected_balance) > 0.01) {
            $rows[$index]['reconciliation_status'] = 'fail';
            $rows[$index]['reconciliation_error'] = sprintf(
                'Expected balance %.2f, got %.2f (diff: %.2f)',
                $expected_balance,
                $balance_raw,
                $balance_raw - $expected_balance
            );

            $reconciliation_failures[] = [
                'row' => $index,
                'expected' => $expected_balance,
                'actual' => $balance_raw,
                'debit' => $debit,
                'credit' => $credit,
            ];

            tabler_debug(
                "Reconciliation failure at row $index: expected $expected_balance, got $balance_raw",
                $debug
            );
        }

        // The stated balance is trusted going forward, even after a mismatch.
        $balance = $balance_raw;
    }

    return [
        'rows' => $rows,
        'failures' => $reconciliation_failures,
        'is_valid' => empty($reconciliation_failures),
    ];
}

// ============================================================================
// INFER SIGN FROM BALANCE DELTAS
// ============================================================================

/**
 * Derive debit/credit amounts from the change between consecutive balances.
 *
 * For each row whose balance and whose predecessor's balance are both known,
 * a rising balance is stored as a credit (debit cleared) and a falling balance
 * as a debit (credit cleared). An unchanged balance leaves the row as is. A row
 * with an unknown balance breaks the chain: the row after it has nothing to
 * compare against.
 *
 * @param array $rows Ledger rows, each with a 'balance' entry (may be null).
 * @return array The rows with 'debit' / 'credit' filled in where inferable.
 */
function infer_amounts_from_balance($rows) {
    $previous = null;

    foreach (array_keys($rows) as $position) {
        $current = $rows[$position]['balance'];

        if ($previous !== null && $current !== null) {
            $change = $current - $previous;

            if ($change > 0) {
                // Balance went up: money came in.
                $rows[$position]['credit'] = $change;
                $rows[$position]['debit'] = null;
            } elseif ($change < 0) {
                // Balance went down: money went out.
                $rows[$position]['debit'] = abs($change);
                $rows[$position]['credit'] = null;
            }
        }

        // Always carry the current balance forward, known or not.
        $previous = $current;
    }

    return $rows;
}

// ============================================================================
// EXPORT TO TSV
// ============================================================================

/**
 * Write the ledger rows to a tab-delimited file with the fixed 5-column header
 * (Día, Concepto / Referencia, cargo, Abono, Saldo).
 *
 * Tabs inside a description are replaced by spaces; debit, credit and balance
 * are rendered through format_money(). Missing row entries become empty / null
 * inputs to the formatter.
 *
 * @param array  $rows        Ledger rows (day, description, debit, credit, balance).
 * @param string $output_path Destination file; opened in write mode.
 * @param bool   $debug       Unused; kept for signature compatibility.
 * @return bool Always true once the file has been written.
 */
function export_tsv($rows, $output_path, $debug = false) {
    $out = fopen($output_path, 'w');

    if ($out === false) {
        tabler_error("Cannot open output file: $output_path");
    }

    // Header line
    fputcsv($out, ['Día', 'Concepto / Referencia', 'cargo', 'Abono', 'Saldo'], "\t");

    // One line per ledger row
    foreach ($rows as $entry) {
        $day = $entry['day'] ?? '';
        $description = str_replace("\t", ' ', $entry['description'] ?? '');
        $debit = format_money($entry['debit'] ?? null);
        $credit = format_money($entry['credit'] ?? null);
        $balance = format_money($entry['balance'] ?? null);

        fputcsv($out, [$day, $description, $debit, $credit, $balance], "\t");
    }

    fclose($out);

    tabler_info("Exported " . count($rows) . " rows to: $output_path");

    return true;
}

// ============================================================================
// GENERATE AUDIT
// ============================================================================

/**
 * Write the audit artifact as pretty-printed JSON.
 *
 * Adds 'generated_at', 'protocol' and 'version' to the data, passes every
 * string value (at any nesting depth) through redact_pii(), and writes the
 * result to $audit_path.
 *
 * The redaction callback is an anonymous function: a named function declared
 * inside this body would be re-declared, and fatally fail, the second time
 * generate_audit() is called in the same process.
 *
 * @param array  $audit_data Audit information collected by the pipeline.
 * @param string $audit_path Destination file for the JSON document.
 * @return void
 */
function generate_audit($audit_data, $audit_path) {
    $audit_data['generated_at'] = date('Y-m-d H:i:s');
    $audit_data['protocol'] = TABLER_PROTOCOL;
    $audit_data['version'] = TABLER_VERSION;

    // Redact PII from every string leaf of the audit data.
    array_walk_recursive($audit_data, function (&$item, $key) {
        if (is_string($item)) {
            $item = redact_pii($item);
        }
    });

    file_put_contents($audit_path, json_encode($audit_data, JSON_PRETTY_PRINT));

    tabler_info("Generated audit artifact: $audit_path");
}

// ============================================================================
// GENERATE DEBUG OVERLAY
// ============================================================================

/**
 * Draw outline boxes for the detected regions onto a copy of a page image
 * using ImageMagick's `convert`.
 *
 * Every value placed on the command line is quoted exactly once with
 * escapeshellarg(); wrapping its output in additional double quotes would make
 * the quotes part of the file name and re-enable shell expansion. Coordinates
 * are formatted as plain numbers, and regions without a complete bounding box
 * are skipped.
 *
 * @param string $image_path  Source image.
 * @param array  $regions     Regions, each with 'bbox' => ['x0','y0','x1','y1'] and an
 *                            optional 'color' (defaults to 'red').
 * @param string $output_path Destination image.
 * @return void
 */
function generate_debug_overlay($image_path, $regions, $output_path) {
    $cmd = 'convert ' . escapeshellarg($image_path);

    foreach ($regions as $region) {
        $bbox = $region['bbox'] ?? null;
        if (!isset($bbox['x0'], $bbox['y0'], $bbox['x1'], $bbox['y1'])) {
            continue; // cannot draw a rectangle without all four corners
        }

        $color = $region['color'] ?? 'red';

        // %F is locale-independent, so the decimal separator is always ".".
        $rectangle = sprintf(
            'rectangle %F,%F %F,%F',
            $bbox['x0'],
            $bbox['y0'],
            $bbox['x1'],
            $bbox['y1']
        );

        $cmd .= ' -fill none -stroke ' . escapeshellarg($color) . ' -strokewidth 2';
        $cmd .= ' -draw ' . escapeshellarg($rectangle);
    }

    $cmd .= ' ' . escapeshellarg($output_path);

    exec($cmd);
}

// ============================================================================
// MAIN PIPELINE
// ============================================================================

/**
 * Run the full conversion pipeline for one PDF.
 *
 * Stages, in order: ingest (validate + page count), linear crawl (per-page
 * text extraction with OCR fallback), zoom-out (header / column detection and
 * continuation merging), row assembly (stateful, with legacy fallback when
 * fewer than 3 rows are produced), post-processing validation, reconciliation
 * (smart, with legacy fallback) and TSV export. Timing and counters for each
 * stage are collected into an audit structure that is written out when an
 * audit path is supplied.
 *
 * @param array $options Keys read: 'input', 'output' (optional; defaults to the
 *                       input path with its extension replaced by .txt),
 *                       'debug', 'audit', 'cache_dir', 'lang'
 * @return array ['success' => true, 'output' => string, 'row_count' => int,
 *                'reconciliation_valid' => mixed]
 */
function run_pipeline($options) {
    $input = $options['input'];
    $output = $options['output'] ?? substr($input, 0, strrpos($input, '.')) . '.txt';
    $debug = $options['debug'];
    $audit_path = $options['audit'];
    $cache_dir = $options['cache_dir'];
    $lang = $options['lang'];
    
    $audit = [
        'input_file' => basename($input),
        'input_hash' => compute_file_hash($input),
        'input_size' => get_file_size($input),
        'pipeline_stages' => [],
        'miseries' => [],
        'confidence_stats' => [],
        'page_count' => 0,
        'row_count' => 0,
        'reconciliation_failures' => [],
        'warnings' => [],
    ];
    
    // ============================================================================
    // STAGE 1: INGEST
    // ============================================================================
    
    tabler_info("Stage 1: Ingest starting");
    
    $stage_start = microtime(true);
    
    validate_pdf($input);
    $page_count = pdf_page_count($input);
    
    $audit['page_count'] = $page_count;
    $audit['pipeline_stages'][] = [
        'name' => 'ingest',
        'duration' => microtime(true) - $stage_start,
        'page_count' => $page_count,
    ];
    
    tabler_info("Ingested $page_count pages from: $input");
    
    // ============================================================================
    // STAGE 2: LINEAR CRAWL
    // ============================================================================
    
    tabler_info("Stage 2: Linear Crawl starting");
    
    $stage_start = microtime(true);
    
    $all_lines = [];
    $all_tokens = [];
    $miseries = [];
    $ocr_confidences = [];
    
    for ($page = 0; $page < $page_count; $page++) {
        tabler_info("Processing page " . ($page + 1) . " of $page_count");
        
        // Reset per page: without this, a page whose extraction fails would
        // silently re-use the previous page's lines (or hit an undefined
        // variable on the first page).
        $lines = [];
        
        $page_data = [
            'page' => $page,
            'raw_text' => '',
            'tokens' => [],
            'ocr_confidence' => null,
            'miseries' => [],
        ];
        
        // Try text extraction first
        $text_result = pdf_to_text($input, $page, $debug);
        
        if ($text_result['success'] && !empty(trim($text_result['text']))) {
            $page_data['raw_text'] = $text_result['text'];
            $lines = explode("\n", $text_result['text']);
            
            // Extract tokens with positions if available
            $html_result = pdf_to_html_with_bbox($input, $page, $debug);
            
            if ($html_result['success'] && !empty($html_result['html'])) {
                $page_data['tokens'] = parse_html_tokens($html_result['html']);
            }
            
            $page_data['extraction_method'] = 'text';
        } else {
            // Fallback to OCR
            tabler_info("Text extraction failed, using OCR for page " . ($page + 1));
            
            // Rasterize page to image
            $image_path = $cache_dir . DIRECTORY_SEPARATOR . "page_$page.png";
            $raster_result = pdf_to_image($input, $image_path, $page, DEFAULT_DPI, $debug);
            
            if ($raster_result['success']) {
                // Run Tesseract
                $ocr_result = tesseract_ocr($image_path, $debug);
                
                if ($ocr_result['success']) {
                    $page_data['raw_text'] = $ocr_result['text'];
                    $lines = explode("\n", $ocr_result['text']);
                    
                    // Parse HOCR for positions
                    if ($ocr_result['hocr']) {
                        $page_data['tokens'] = parse_hocr_bbox($ocr_result['hocr']);
                    }
                    
                    $page_data['ocr_confidence'] = $ocr_result['confidence'];
                    $ocr_confidences[] = $ocr_result['confidence'];
                } else {
                    // $lines stays empty for this page
                    $miseries[] = [
                        'page' => $page,
                        'type' => 'ocr_failure',
                        'message' => 'Tesseract failed on page',
                    ];
                }
            } else {
                // $lines stays empty for this page
                $miseries[] = [
                    'page' => $page,
                    'type' => 'raster_failure',
                    'message' => 'Could not rasterize page for OCR',
                ];
            }
            
            $page_data['extraction_method'] = 'ocr';
        }
        
        // Collect raw observations
        foreach ($lines as $line_index => $line) {
            $trimmed = trim($line);
            
            if (empty($trimmed)) {
                continue;
            }
            
            // Try to extract statement period from header lines (for BBVA format)
            // Check all pages since page 1 might fail OCR and period might be on page 2
            if (get_statement_year() === null) {
                $period = extract_statement_period($trimmed);
                if ($period !== null && $period['year'] !== null) {
                    set_statement_year($period['year']);
                    tabler_info("Detected statement year: " . $period['year'] . " from page " . ($page + 1));
                }
            }
            
            $all_lines[] = [
                'page' => $page,
                'line' => $line_index,
                'text' => $trimmed,
            ];
        }
        
        foreach ($page_data['tokens'] as $token) {
            $all_tokens[] = [
                'page' => $page,
                'text' => $token['word'] ?? '',
                'bbox' => $token['bbox'] ?? null,
            ];
        }
        
        // Detect page-specific miseries
        detect_page_miseries($lines, $page, $miseries);
    }
    
    $audit['miseries'] = array_merge($audit['miseries'], $miseries);
    $audit['confidence_stats'] = [
        'mean' => count($ocr_confidences) > 0 ? array_sum($ocr_confidences) / count($ocr_confidences) : 100,
        'min' => count($ocr_confidences) > 0 ? min($ocr_confidences) : 100,
        'max' => count($ocr_confidences) > 0 ? max($ocr_confidences) : 100,
        'count' => count($ocr_confidences),
    ];
    
    $audit['pipeline_stages'][] = [
        'name' => 'linear_crawl',
        'duration' => microtime(true) - $stage_start,
        'lines_found' => count($all_lines),
        'tokens_found' => count($all_tokens),
        'miseries_count' => count($miseries),
    ];
    
    tabler_info("Linear crawl found " . count($all_lines) . " lines and " . count($all_tokens) . " tokens");
    
    // ============================================================================
    // STAGE 3: ZOOM-OUT (LAYOUT GRAPH + PAGE IMAGES)
    // ============================================================================
    
    tabler_info("Stage 3: Zoom-Out starting");
    
    $stage_start = microtime(true);
    
    // Detect table header
    $header_row_index = null;
    
    foreach ($all_lines as $index => $line_data) {
        if (is_header_row($line_data['text'], $lang)) {
            $header_row_index = $index;
            break;
        }
    }
    
    if ($header_row_index !== null) {
        tabler_info("Detected header at line " . ($header_row_index + 1));
    } else {
        tabler_warning("No header row detected, will infer from structure");
        $audit['warnings'][] = 'No header row detected';
    }
    
    // Detect column positions from tokens
    $column_positions = detect_column_positions($all_tokens, $debug);
    
    // Detect page breaks and continuations
    $merged_lines = merge_continuation_lines(array_column($all_lines, 'text'));
    
    $audit['pipeline_stages'][] = [
        'name' => 'zoom_out',
        'duration' => microtime(true) - $stage_start,
        'column_positions' => count($column_positions ?? []),
        'merged_lines' => count($merged_lines),
    ];
    
    tabler_info("Zoom-Out identified " . count($merged_lines) . " merged lines");
    
    // ============================================================================
    // STAGE 4: ROW ASSEMBLY (Using Stateful Transaction Assembly)
    // ============================================================================
    
    tabler_info("Stage 4: Row Assembly starting");
    
    $stage_start = microtime(true);
    
    // Use the new stateful transaction assembly for better multi-line handling
    // This uses layout-based column detection from pdftotext -layout output
    $raw_lines = array_column($all_lines, 'text');
    
    // Try stateful assembly first (better for layout-preserved text)
    $assembled_rows = assemble_transactions_stateful($raw_lines, $debug);
    $assembly_method = 'stateful';
    
    tabler_debug("Stateful assembly produced " . count($assembled_rows) . " rows", $debug);
    
    // If stateful assembly produced few results, fall back to legacy method
    if (count($assembled_rows) < 3) {
        tabler_info("Stateful assembly produced few rows, trying legacy method");
        
        $assembled_rows = [];
        $assembly_method = 'legacy';
        
        foreach ($merged_lines as $line) {
            // Skip non-transaction regions
            if (is_non_transaction_region($line)) {
                continue;
            }
            
            // Tokenize line
            $tokens = tokenize_line($line);
            
            // Skip if not a transaction row
            if (!is_transaction_row($tokens)) {
                continue;
            }
            
            // Assemble row
            $row = assemble_row($tokens, $column_positions, $debug);
            $assembled_rows[] = $row;
        }
        
        tabler_debug("Legacy assembly produced " . count($assembled_rows) . " rows", $debug);
    }
    
    $audit['pipeline_stages'][] = [
        'name' => 'row_assembly',
        'duration' => microtime(true) - $stage_start,
        'rows_assembled' => count($assembled_rows),
        // Record the method that actually produced the rows; deriving it from
        // the final row count mislabels a legacy run that yields 3+ rows.
        'method' => $assembly_method,
    ];
    
    tabler_info("Row Assembly produced " . count($assembled_rows) . " rows");
    
    // ============================================================================
    // STAGE 4.5: POST-PROCESSING VALIDATION
    // ============================================================================
    
    tabler_info("Stage 4.5: Post-processing validation starting");
    
    $stage_start_validation = microtime(true);
    
    // Clean and validate rows before reconciliation
    $rows_before_cleaning = count($assembled_rows);
    $assembled_rows = validate_and_clean_rows($assembled_rows, $debug);
    $rows_after_cleaning = count($assembled_rows);
    
    $audit['pipeline_stages'][] = [
        'name' => 'post_processing_validation',
        'duration' => microtime(true) - $stage_start_validation,
        'rows_before' => $rows_before_cleaning,
        'rows_after' => $rows_after_cleaning,
        'rows_removed' => $rows_before_cleaning - $rows_after_cleaning,
    ];
    
    tabler_info("Post-processing: $rows_before_cleaning rows -> $rows_after_cleaning valid rows");
    
    // ============================================================================
    // STAGE 5: RECONCILIATION (Using Smart Reconciliation)
    // ============================================================================
    
    tabler_info("Stage 5: Reconciliation starting");
    
    $stage_start = microtime(true);
    
    // Use smart reconciliation that corrects debit/credit based on balance deltas
    $reconciliation = smart_reconcile($assembled_rows, $debug);
    $assembled_rows = $reconciliation['rows'];
    
    // If smart reconciliation had failures, try legacy reconciliation
    if (!$reconciliation['is_valid']) {
        tabler_debug("Smart reconciliation had failures, trying legacy method", $debug);
        
        // Check if we have explicit debit/credit columns
        $has_amount_columns = false;
        
        foreach ($assembled_rows as $row) {
            if ($row['debit'] !== null || $row['credit'] !== null) {
                $has_amount_columns = true;
                break;
            }
        }
        
        if (!$has_amount_columns && count($assembled_rows) > 0) {
            tabler_info("No explicit amount columns found, inferring from balance deltas");
            $assembled_rows = infer_amounts_from_balance($assembled_rows);
        }
        
        // Re-run reconciliation
        $reconciliation = reconcile_ledger($assembled_rows, $debug);
        $assembled_rows = $reconciliation['rows'];
    }
    
    $audit['reconciliation_failures'] = $reconciliation['failures'];
    
    $audit['pipeline_stages'][] = [
        'name' => 'reconciliation',
        'duration' => microtime(true) - $stage_start,
        'is_valid' => $reconciliation['is_valid'],
        'failure_count' => count($reconciliation['failures']),
    ];
    
    if ($reconciliation['is_valid']) {
        tabler_info("Reconciliation passed with " . count($assembled_rows) . " valid rows");
    } else {
        tabler_warning("Reconciliation had " . count($reconciliation['failures']) . " failures (see audit)");
    }
    
    // ============================================================================
    // STAGE 6: EXPORT
    // ============================================================================
    
    tabler_info("Stage 6: Export starting");
    
    $stage_start = microtime(true);
    
    export_tsv($assembled_rows, $output, $debug);
    
    $audit['row_count'] = count($assembled_rows);
    $audit['pipeline_stages'][] = [
        'name' => 'export',
        'duration' => microtime(true) - $stage_start,
        'output_file' => $output,
    ];
    
    // Generate audit if requested
    if ($audit_path) {
        generate_audit($audit, $audit_path);
    }
    
    tabler_info("Export complete: $output");
    
    return [
        'success' => true,
        'output' => $output,
        'row_count' => count($assembled_rows),
        'reconciliation_valid' => $reconciliation['is_valid'],
    ];
}

// ============================================================================
// HELPER FUNCTIONS
// ============================================================================

/**
 * Extract text tokens (and bounding boxes where present) from <text> elements.
 *
 * Recognised element shape: <text x="123" y="456" w="789" h="012">content</text>.
 * Only all-digit attribute values are read. A token gets a bbox when an x or a
 * y attribute is present; x1 / y1 are x0 + w and y0 + h, or 0 when the width /
 * height attribute is missing.
 *
 * @param string $html Markup to scan
 * @return array List of ['word' => string, 'bbox' => array|null]
 */
function parse_html_tokens($html) {
    $found = preg_match_all('/<text[^>]*>([^<]*)<\/text>/', $html, $elements, PREG_SET_ORDER);
    
    if (!$found) {
        return [];
    }
    
    $tokens = [];
    
    foreach ($elements as $element) {
        $tag = $element[0];
        
        // Each attribute is null when absent from the element
        $x = preg_match('/x="(\d+)"/', $tag, $hit) ? (int)$hit[1] : null;
        $y = preg_match('/y="(\d+)"/', $tag, $hit) ? (int)$hit[1] : null;
        $w = preg_match('/w="(\d+)"/', $tag, $hit) ? (int)$hit[1] : null;
        $h = preg_match('/h="(\d+)"/', $tag, $hit) ? (int)$hit[1] : null;
        
        $bbox = null;
        
        if ($x !== null || $y !== null) {
            $left = $x ?? 0;
            $top = $y ?? 0;
            
            $bbox = [
                'x0' => $left,
                'y0' => $top,
                'x1' => $w === null ? 0 : $left + $w,
                'y1' => $h === null ? 0 : $top + $h,
            ];
        }
        
        $tokens[] = ['word' => $element[1], 'bbox' => $bbox];
    }
    
    return $tokens;
}

/**
 * Split a line into whitespace-delimited tokens.
 *
 * @param string $line The line to split
 * @return array List of ['text' => string], one entry per run of non-whitespace
 */
function tokenize_line($line) {
    preg_match_all('/(\S+)/', $line, $found);
    
    return array_map(function ($word) {
        return ['text' => $word];
    }, $found[1]);
}

/**
 * Break a layout-preserved line (pdftotext -layout) into its columns.
 *
 * A run of two or more whitespace characters is treated as a column
 * boundary; single spaces stay inside a column.
 * Example: "10-ENE-2024   ABONO TRANSFERENCIA SPEI   1,234.56   5,678.90"
 * yields four columns.
 *
 * @param string $line The line from pdftotext -layout
 * @return array Sequentially indexed list of non-blank column values
 */
function parse_layout_columns($line) {
    $columns = [];
    
    foreach (preg_split('/\s{2,}/', trim($line)) as $piece) {
        // Drop pieces that are blank once trimmed
        if (trim($piece) !== '') {
            $columns[] = $piece;
        }
    }
    
    return $columns;
}

/**
 * Tell whether a line begins with a date, which marks the start of a transaction.
 *
 * The line is trimmed first, then tested against each supported date shape.
 *
 * @param string $line The line to check
 * @return bool True if the trimmed line starts with one of the date shapes
 */
function line_starts_with_date($line) {
    $date_prefixes = [
        // DD-MMM-YYYY (e.g., 10-ENE-2024)
        '/^\d{1,2}[-\/][A-Za-z]{3}[-\/]\d{2,4}/',
        // DD/MM/YYYY or DD-MM-YYYY
        '/^\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4}/',
        // Month DD, YYYY
        '/^[A-Za-z]{3,}\s+\d{1,2},?\s+\d{4}/',
        // BBVA format: DD/MMM DD/MMM (e.g., "05/ENE 05/ENE"),
        // operation date followed by liquidation date, no year
        '/^\d{1,2}\/[A-Za-z]{3}\s+\d{1,2}\/[A-Za-z]{3}/',
        // Single DD/MMM without year, followed by whitespace (e.g., "05/ENE ...")
        '/^\d{1,2}\/[A-Za-z]{3}\s/',
    ];
    
    $candidate = trim($line);
    
    foreach ($date_prefixes as $prefix) {
        if (preg_match($prefix, $candidate)) {
            return true;
        }
    }
    
    return false;
}

/**
 * Pull the date, monetary amounts, folio and description out of one
 * layout-preserved line.
 *
 * The line is split into columns with parse_layout_columns(). Numeric-looking
 * columns are parsed with parse_money() and kept as amounts only when
 * is_valid_transaction_amount() accepts them; other non-zero numeric columns
 * are treated as a reference number (folio) and end up in the description as
 * a "FOLIO: ..." prefix. Everything else is description text.
 *
 * Two layouts are handled:
 * - BBVA: the line starts with two DD/MMM dates ("05/ENE 05/ENE S39 ...");
 *   the first date is used and the remainder of the line is split into columns.
 * - Standard (Santander, etc.): the date is either a column of its own or a
 *   prefix of the first column, optionally followed by a folio and text.
 *
 * @param string $line The line to parse
 * @return array Keys: 'date', 'amounts' (list of ['value', 'text',
 *               'column_index']), 'description', 'column_count', 'folio',
 *               'is_bbva'
 */
function extract_amounts_from_layout($line) {
    $columns = parse_layout_columns($line);
    
    $amounts = [];
    $description_parts = [];
    $date = null;
    $folio = null;
    
    // BBVA format detection: DD/MMM DD/MMM at the start
    // Pattern: "05/ENE 05/ENE S39 SERV BANCA INTERNET..."
    $is_bbva = (bool) preg_match(
        '/^(\d{1,2}\/[A-Za-z]{3})\s+(\d{1,2}\/[A-Za-z]{3})\s+(.*)$/',
        trim($line),
        $bbva_match
    );
    
    if ($is_bbva) {
        // First date is the operation date
        $date = normalize_date($bbva_match[1]);
        
        // Remainder holds: COD DESCRIPTION REFERENCIA AMOUNTS
        foreach (parse_layout_columns($bbva_match[3]) as $index => $col) {
            $col = trim($col);
            
            if (empty($col)) {
                continue;
            }
            
            // Strip trailing non-numeric characters before testing for a number
            $col_clean = preg_replace('/[^0-9.,\-\+]+$/', '', $col);
            
            if (!preg_match('/^[\-\+]?[\d\.\,]+$/', $col_clean)) {
                // Description part
                $description_parts[] = $col;
                continue;
            }
            
            $amount = parse_money($col_clean);
            
            if ($amount === null) {
                continue;
            }
            
            if (is_valid_transaction_amount($col, $amount)) {
                $amounts[] = [
                    'value' => $amount,
                    'text' => $col,
                    'column_index' => $index,
                ];
            } elseif ($amount != 0) {
                // Reference number
                $folio = $col;
            }
        }
    } else {
        // Standard format processing (Santander, etc.)
        foreach ($columns as $index => $col) {
            $col = trim($col);
            
            if (empty($col)) {
                continue;
            }
            
            // Whole column is a date: take the first one that validates
            $normalized_date = normalize_date($col);
            
            if ($normalized_date !== null && $date === null && validate_transaction_date($normalized_date)) {
                $date = $normalized_date;
                continue;
            }
            
            // First column may START with a date (e.g., "04-MAR-2024 4260547 ABONO..."),
            // i.e. date, folio and description share one column
            $date_extracted_from_col = false;
            
            if ($date === null && $index === 0) {
                // DD-MMM-YYYY first, otherwise DD/MM/YYYY (or DD-MM-YY...)
                $has_date_prefix =
                    preg_match('/^(\d{1,2}[-\/][A-Za-z]{3}[-\/]\d{4})\s+(.*)$/', $col, $m) ||
                    preg_match('/^(\d{1,2}[-\/]\d{1,2}[-\/]\d{2,4})\s+(.*)$/', $col, $m);
                
                if ($has_date_prefix) {
                    $prefix_date = normalize_date($m[1]);
                    
                    if ($prefix_date !== null && validate_transaction_date($prefix_date)) {
                        $date = $prefix_date;
                        $col = $m[2]; // Rest of the column (may include folio + text)
                        $date_extracted_from_col = true;
                    }
                }
            }
            
            // Remainder after a date prefix is description, possibly led by a folio:
            // "0000000 CARGO TRANSFERENCIA ENLACE..." -> folio + description
            if ($date_extracted_from_col && !empty($col)) {
                if (preg_match('/^(\d{5,10})\s+(.*)$/', $col, $folio_match)) {
                    $folio = $folio_match[1];
                    $description_parts[] = $folio_match[2];
                } else {
                    $description_parts[] = $col;
                }
                
                continue; // No numeric check for this column
            }
            
            // Digits followed by text is folio + description, NOT a numeric value
            if (preg_match('/^(\d{5,10})\s+(.+)$/', $col, $folio_desc_match)) {
                $folio = $folio_desc_match[1];
                $description_parts[] = $folio_desc_match[2];
                continue;
            }
            
            // Numeric check (trailing non-numeric chars like ":" stripped first)
            $col_clean = preg_replace('/[^0-9.,\-\+]+$/', '', $col);
            
            if (!preg_match('/^[\-\+]?[\d\.\,]+$/', $col_clean)) {
                // It's part of the description
                $description_parts[] = $col;
                continue;
            }
            
            $amount = parse_money($col_clean);
            
            if ($amount === null) {
                continue;
            }
            
            // CRITICAL: is_valid_transaction_amount filters out reference numbers
            if (is_valid_transaction_amount($col, $amount)) {
                $amounts[] = [
                    'value' => $amount,
                    'text' => $col,
                    'column_index' => $index,
                ];
            } elseif ($amount != 0) {
                // Likely a reference number (FOLIO)
                $folio = $col;
            }
        }
    }
    
    // A folio, when found, leads the description
    if ($folio !== null) {
        array_unshift($description_parts, "FOLIO: $folio");
    }
    
    return [
        'date' => $date,
        'amounts' => $amounts,
        'description' => implode(' ', $description_parts),
        'column_count' => count($columns),
        'folio' => $folio,
        'is_bbva' => $is_bbva,
    ];
}

/**
 * Decide whether a line continues the previous transaction's description
 * rather than starting a new transaction.
 *
 * A line qualifies when, after trimming, it is non-empty, is not a
 * non-transaction region (per is_non_transaction_region()), carries no date,
 * and extract_amounts_from_layout() finds description text in it. A line that
 * has only amounts and no description text does not qualify.
 *
 * @param string $line The line to check
 * @return bool True if this looks like a description continuation
 */
function is_description_continuation($line) {
    $candidate = trim($line);
    
    // Blank lines and header/footer style regions never continue a description
    if (empty($candidate) || is_non_transaction_region($candidate)) {
        return false;
    }
    
    $layout = extract_amounts_from_layout($candidate);
    
    // A date means a new transaction begins here
    if ($layout['date'] !== null) {
        return false;
    }
    
    // With no date, the line is a continuation exactly when it carries description text
    return !empty($layout['description']);
}

/**
 * Choose the separator used when appending continuation text to a description.
 *
 * Text that opens with a labelled field (REFERENCIA:, BENEFICIARIO:, "NO.",
 * etc., case-insensitive) is joined with ' | '; anything else is joined with
 * a single space.
 *
 * @param string $text The continuation text
 * @return string ' | ' for labelled fields, ' ' otherwise
 */
function get_description_separator($text) {
    $candidate = trim($text);
    
    $field_patterns = [
        '/^REFERENCIA\s*:/i',
        '/^BENEFICIARIO\s*:/i',
        '/^FOLIO\s*:/i',
        '/^HORA\s*:/i',
        '/^FECHA\s*:/i',
        '/^CONCEPTO\s*:/i',
        '/^RFC\s*:/i',
        '/^CUENTA\s*:/i',
        '/^CLABE\s*:/i',
        '/^ORDENANTE\s*:/i',
        '/^RECEPTOR\s*:/i',
        '/^CLAVE\s*:/i',
        '/^NUMERO\s*:/i',
        '/^NO\.\s*/i',
        '/^NUM\.\s*/i',
    ];
    
    foreach ($field_patterns as $field_pattern) {
        if (preg_match($field_pattern, $candidate) === 1) {
            // Distinct field: set it apart visually
            return ' | ';
        }
    }
    
    // Natural text flow
    return ' ';
}

/**
 * Assemble transaction rows from layout lines with a two-state machine and
 * look-ahead, so that a transaction whose date, description and amounts are
 * spread over several lines ends up as one row.
 *
 * Doctrine ("Collect Greedily, Finalize Conservatively"):
 * - a dated line opens a transaction;
 * - following lines add description text (joined with the separator chosen by
 *   get_description_separator()) and any amounts they carry;
 * - a transaction is finalized once it has at least one amount and the next
 *   line is not a description continuation, or when the next dated line arrives.
 *
 * States:
 * - WAITING: no open transaction; undated lines are ignored
 * - COLLECTING: a transaction is open and still gathering lines
 *
 * @param array $lines Array of lines from the PDF
 * @param bool $debug Enable debug output
 * @return array Rows produced by finalize_transaction(), in input order
 */
function assemble_transactions_stateful($lines, $debug = false) {
    $transactions = [];
    $current_transaction = null;
    $state = 'WAITING';
    
    // Keep only trimmed, non-empty lines outside non-transaction regions
    $filtered_lines = [];
    foreach ($lines as $line) {
        $candidate = trim($line);
        if (empty($candidate) || is_non_transaction_region($candidate)) {
            continue;
        }
        $filtered_lines[] = $candidate;
    }
    $line_count = count($filtered_lines);
    
    foreach ($filtered_lines as $line_index => $trimmed) {
        $parsed = extract_amounts_from_layout($trimmed);
        $has_date = $parsed['date'] !== null;
        $has_amounts = count($parsed['amounts']) > 0;
        
        // LOOK-AHEAD: count the description continuations that directly follow
        $continuation_count = 0;
        for ($ahead = $line_index + 1;
             $ahead < $line_count && is_description_continuation($filtered_lines[$ahead]);
             $ahead++) {
            $continuation_count++;
        }
        $has_continuations = $continuation_count > 0;
        
        tabler_debug("Line $line_index: date=" . ($has_date ? $parsed['date'] : 'no') .
                     ", amounts=" . count($parsed['amounts']) .
                     ", continuations=$continuation_count" .
                     ", desc=" . substr($parsed['description'], 0, 40), $debug);
        
        if ($has_date) {
            // A dated line closes whatever was still being collected...
            if ($state === 'COLLECTING' && $current_transaction !== null) {
                $transactions[] = finalize_transaction($current_transaction);
            }
            
            // ...and opens a new transaction
            $current_transaction = [
                'day' => $parsed['date'],
                'description' => $parsed['description'],
                'amounts' => $parsed['amounts'],
                'lines' => [$trimmed],
                'is_bbva' => $parsed['is_bbva'] ?? false,
            ];
            
            if ($has_amounts && !$has_continuations) {
                // Complete on a single line: nothing ahead belongs to it
                $transactions[] = finalize_transaction($current_transaction);
                $current_transaction = null;
                $state = 'WAITING';
            } else {
                // GREEDY: amounts missing or continuations ahead, keep collecting
                $state = 'COLLECTING';
            }
            
            continue;
        }
        
        if ($state !== 'COLLECTING') {
            // WAITING and no date: nothing to attach this line to
            continue;
        }
        
        // Undated line while collecting: fold it into the open transaction
        if (!empty($parsed['description'])) {
            $separator = get_description_separator($parsed['description']);
            $current_transaction['description'] .= $separator . $parsed['description'];
        }
        
        if ($has_amounts) {
            $current_transaction['amounts'] = array_merge(
                $current_transaction['amounts'],
                $parsed['amounts']
            );
        }
        
        $current_transaction['lines'][] = $trimmed;
        
        // GREEDY: finalize only with at least one amount AND no continuations ahead
        if (count($current_transaction['amounts']) >= 1 && !$has_continuations) {
            $transactions[] = finalize_transaction($current_transaction);
            $current_transaction = null;
            $state = 'WAITING';
        }
    }
    
    // Flush a transaction left open at end of input: it needs amounts, or at
    // least a date together with a description
    if ($current_transaction !== null) {
        $has_any_amount = count($current_transaction['amounts']) > 0;
        $has_day_and_text = $current_transaction['day'] !== null
            && !empty($current_transaction['description']);
        
        if ($has_any_amount || $has_day_and_text) {
            $transactions[] = finalize_transaction($current_transaction);
        }
    }
    
    return $transactions;
}

/**
 * Turn an assembled transaction into an output row with debit, credit and balance.
 *
 * The direction of a lone amount is guessed from keywords in the upper-cased
 * description: credit keywords (e.g. "ABONO") are tested first and win; debit
 * keywords (e.g. "PAGO", "CARGO", "CGO") are only tested when no credit keyword
 * matched. An amount with no keyword match is provisionally stored as a credit.
 *
 * How amounts are distributed (amounts are ordered by 'column_index' first,
 * except in the single-amount non-BBVA case):
 * - BBVA ('is_bbva' truthy, at least one amount): two neighbouring amounts that
 *   differ by less than 0.01 are treated as a balance pair and the second one
 *   becomes the balance; the first unpaired amount becomes the debit or credit.
 * - 3+ amounts: the last three are read as DEPOSITOS | RETIROS | SALDO, i.e.
 *   credit, debit, balance (keywords are not consulted).
 * - 2 amounts: [amount, balance].
 * - 1 amount: balance only.
 *
 * @param array $transaction Expects keys 'day', 'description', 'amounts' (list of
 *                           arrays with 'value' and optional 'column_index'),
 *                           'lines', and optionally 'is_bbva'
 * @return array Row with keys day, description, debit, credit, balance, raw, confidence
 */
function finalize_transaction($transaction) {
    $row = [
        'day' => $transaction['day'],
        'description' => trim($transaction['description']),
        'debit' => null,
        'credit' => null,
        'balance' => null,
        'raw' => $transaction['lines'],
        'confidence' => 100,
    ];

    $amounts = $transaction['amounts'];
    $count = count($amounts);
    $is_bbva = $transaction['is_bbva'] ?? false;

    // Keyword lookup against the upper-cased description
    $haystack = strtoupper($row['description']);
    $mentions_any = function (array $keywords) use ($haystack) {
        foreach ($keywords as $keyword) {
            if (strpos($haystack, $keyword) !== false) {
                return true;
            }
        }
        return false;
    };

    // Credit (money in) keywords take priority; debit (money out) keywords are
    // only consulted when none of the credit keywords matched.
    $is_credit = $mentions_any(['ABONO', 'DEPOSITO', 'INGRESO', 'TRANSFERENCIA RECIBIDA', 'SPEI RECIBIDO', 'T20 SPEI RECIBIDO']);
    $is_debit = !$is_credit && $mentions_any(['SPEI ENVIADO', 'T17 SPEI ENVIADO', 'PAGO', 'CARGO', 'CGO', 'RETIRO', 'COMISION', 'COM ', 'I V A', 'IVA ', 'IMPTO', 'APORT', 'RETENCION', 'ISR']);

    // Column that receives a lone transaction amount. Unknown direction falls
    // back to credit; the other column keeps its initial null.
    $direction = $is_debit ? 'debit' : 'credit';

    // Left-to-right ordering of the amounts by the column they were found in
    $by_column = function ($left, $right) {
        return ($left['column_index'] ?? 0) - ($right['column_index'] ?? 0);
    };

    // BBVA layout: CARGOS | ABONOS | SALDO_OPERACION | SALDO_LIQUIDACION
    if ($is_bbva && $count >= 1) {
        usort($amounts, $by_column);

        $unpaired = [];
        $paired_balance = null;
        $pos = 0;

        while ($pos < $count) {
            $next = $pos + 1;

            if ($next < $count && abs($amounts[$pos]['value'] - $amounts[$next]['value']) < 0.01) {
                // Two (nearly) equal neighbours are the two SALDO columns;
                // keep the second one and step over both.
                $paired_balance = $amounts[$next]['value'];
                $pos += 2;
            } else {
                // Anything else is a CARGO/ABONO amount
                $unpaired[] = $amounts[$pos];
                $pos++;
            }
        }

        $row['balance'] = $paired_balance;

        if ($unpaired !== []) {
            $row[$direction] = $unpaired[0]['value'];
        }

        return $row;
    }

    // Santander layout with all three columns: DEPOSITOS | RETIROS | SALDO
    if ($count >= 3) {
        usort($amounts, $by_column);

        $tail = array_slice($amounts, -3);
        $row['credit'] = $tail[0]['value'] ?? 0;   // DEPOSITOS
        $row['debit'] = $tail[1]['value'] ?? 0;    // RETIROS
        $row['balance'] = $tail[2]['value'];       // SALDO (rightmost)

        return $row;
    }

    // Two amounts: [amount, balance]
    if ($count === 2) {
        usort($amounts, $by_column);

        $row['balance'] = $amounts[1]['value'];
        $row[$direction] = $amounts[0]['value'];

        return $row;
    }

    // A single amount is assumed to be the balance
    if ($count === 1) {
        $row['balance'] = $amounts[0]['value'];
    }

    return $row;
}

/**
 * Clean footer/header content from description text.
 *
 * Each pattern below is anchored to the end of the string and removes the
 * footer fragment together with everything after it. Patterns are applied in
 * order, each one working on the output of the previous one, and the result
 * is trimmed.
 *
 * If a pattern fails inside PCRE (preg_replace() returns null), that pattern
 * is skipped and the text cleaned so far is kept, so a regex failure cannot
 * silently blank out the description.
 *
 * @param string $description The description to clean
 * @return string Cleaned description
 */
function clean_description_footer_content($description) {
    // Patterns that indicate footer content was merged into description
    $footer_patterns = [
        // BBVA footer patterns
        '/\s*\|\s*FOLIO:\s*\d+\s*No\.\s*Cuenta.*$/i',
        '/\s*FOLIO:\s*\d+\s*No\.\s*Cuenta.*$/i',
        '/\s*No\.\s*Cuenta\s*\|\s*No\.\s*Cliente.*$/i',
        '/\s*También le informamos.*$/i',
        '/\s*Con BBVA adelante.*$/i',
        '/\s*Cuadro resumen.*$/i',
        '/\s*gráfico de movimientos.*$/i',
        
        // Santander footer patterns
        '/\s*\|\s*Su conformidad.*$/i',
        '/\s*\|\s*Cualquier aclaracion.*$/i',
        '/\s*\|\s*Conserve este documento.*$/i',
        '/\s*\|\s*Para cualquier.*$/i',
        '/\s*\|\s*En caso de.*$/i',
        
        // Generic footer patterns
        '/\s*\|\s*Página\s*\d+.*$/i',
        '/\s*\|\s*Pagina\s*\d+.*$/i',
    ];
    
    foreach ($footer_patterns as $pattern) {
        $stripped = preg_replace($pattern, '', $description);
        
        // null signals a PCRE error (e.g. backtrack limit hit); keep the
        // last good text instead of propagating null to the next pattern.
        if ($stripped !== null) {
            $description = $stripped;
        }
    }
    
    return trim($description);
}

/**
 * Post-process and validate rows before reconciliation.
 *
 * Rows are checked in this order and dropped at the first failing check:
 *   1. date rejected by validate_transaction_date()
 *   2. year at the end of the date more than one year away from the statement
 *      year (only when get_statement_year() returns a non-null year)
 *   3. DD-MM-YYYY date later than one month from now
 *   4. description longer than 500 bytes after footer cleaning
 *   5. absolute balance above 100,000,000
 *   6. neither a date nor a balance
 * Surviving rows get their description footer-cleaned and have debit/credit
 * values smaller than 0.001 in magnitude replaced by null.
 *
 * @param array $rows Array of transaction rows
 * @param bool $debug Enable debug output
 * @return array Cleaned rows (re-indexed from 0)
 */
function validate_and_clean_rows($rows, $debug = false) {
    $cleaned = [];
    
    // Get statement year for date validation
    $statement_year = get_statement_year();
    
    // The future-date cut-off is the same for every row, so compute it once
    // instead of once per row.
    $future_limit = strtotime('+1 month');
    
    foreach ($rows as $index => $row) {
        // Debug: Show all rows being processed
        if ($debug && strpos($row['description'] ?? '', 'ENLACE SF') !== false) {
            tabler_debug("Row $index: ENLACE SF found - day={$row['day']}, balance={$row['balance']}, desc=" . substr($row['description'], 0, 50), true);
        }
        
        // Skip rows with invalid dates
        if ($row['day'] !== null && !validate_transaction_date($row['day'])) {
            tabler_debug("Row $index: Skipping invalid date: " . $row['day'], $debug);
            continue;
        }
        
        // Skip rows with dates that don't match the statement year (if known)
        // This filters out garbage rows from summary sections
        if ($row['day'] !== null && $statement_year !== null) {
            if (preg_match('/(\d{4})$/', $row['day'], $m)) {
                $row_year = (int)$m[1];
                // Allow statement year and adjacent years (for year-end statements)
                if (abs($row_year - $statement_year) > 1) {
                    tabler_debug("Row $index: Skipping date outside statement period: " . $row['day'] . " (statement year: $statement_year)", $debug);
                    continue;
                }
            }
        }
        
        // Skip rows with future dates (more than 1 month ahead)
        if ($row['day'] !== null) {
            if (preg_match('/^(\d{2})-(\d{2})-(\d{4})$/', $row['day'], $m)) {
                // Captures are day, month, year; mktime() takes month before day
                $row_ts = mktime(0, 0, 0, (int)$m[2], (int)$m[1], (int)$m[3]);
                if ($row_ts > $future_limit) {
                    tabler_debug("Row $index: Skipping future date: " . $row['day'], $debug);
                    continue;
                }
            }
        }
        
        // NOTE: We do NOT call is_non_transaction_region() on assembled descriptions.
        // That function is designed for raw PDF lines, not transaction descriptions.
        // Descriptions legitimately contain terms like "CLABE", "RFC:", "CUENTA" etc.
        // Pre-filtering in assemble_transactions_stateful() is sufficient.
        
        // Clean footer content from description
        $row['description'] = clean_description_footer_content($row['description']);
        
        // Skip rows with very long descriptions (likely garbage from footer/header)
        if (strlen($row['description']) > 500) {
            tabler_debug("Row $index: Skipping very long description (likely garbage)", $debug);
            continue;
        }
        
        // Skip rows with astronomical balance values (parsing errors)
        if ($row['balance'] !== null && abs($row['balance']) > 100000000) {
            tabler_debug("Row $index: Skipping astronomical balance: " . $row['balance'], $debug);
            continue;
        }
        
        // Skip rows with no date and no balance (likely continuation text)
        if ($row['day'] === null && $row['balance'] === null) {
            tabler_debug("Row $index: Skipping row with no date and no balance", $debug);
            continue;
        }
        
        // Clean up zero amounts
        if ($row['debit'] !== null && abs($row['debit']) < 0.001) {
            $row['debit'] = null;
        }
        if ($row['credit'] !== null && abs($row['credit']) < 0.001) {
            $row['credit'] = null;
        }
        
        $cleaned[] = $row;
    }
    
    tabler_debug("Cleaned " . count($rows) . " rows to " . count($cleaned) . " valid rows", $debug);
    
    return $cleaned;
}

/**
 * Smart reconciliation that uses balance deltas to correct debit/credit assignment.
 *
 * Rows are walked in order. Rows without a balance are left untouched, and the
 * first row with a balance only seeds the running balance. For every later
 * row, delta = balance - previous balance, then:
 * - debit only: swapped to credit when delta is positive and matches the
 *   amount; kept when delta is negative and matches; otherwise the amount is
 *   replaced by |delta| on the side given by delta's sign.
 * - credit only: swapped to debit when delta is negative and its magnitude
 *   matches the amount. A mismatching amount is deliberately left alone so it
 *   is still reported as a failure below.
 * - neither: the amount is inferred from delta.
 * - both: left as extracted.
 * Finally previous + credit - debit is compared with the row balance
 * (tolerance 0.01); mismatches are marked on the row and collected.
 *
 * @param array $rows Array of transaction rows
 * @param bool $debug Enable debug output
 * @return array Reconciliation result: 'rows' (rows with 'reconciliation_status'
 *               and 'reconciliation_error' added), 'failures', 'is_valid'
 */
function smart_reconcile($rows, $debug = false) {
    $prev_balance = null;
    $reconciliation_failures = [];
    
    foreach ($rows as $index => &$row) {
        $row['reconciliation_status'] = 'ok';
        $row['reconciliation_error'] = null;
        
        $balance = $row['balance'];
        
        if ($balance === null) {
            continue;
        }
        
        if ($prev_balance === null) {
            $prev_balance = $balance;
            continue;
        }
        
        // Calculate balance delta
        $delta = $balance - $prev_balance;
        
        // If we have a debit but no credit, check if it should be swapped
        if ($row['debit'] !== null && $row['credit'] === null) {
            $amount = $row['debit'];
            
            // If delta is positive, this should be a credit
            if ($delta > 0 && abs($delta - $amount) < 0.01) {
                $row['credit'] = $amount;
                $row['debit'] = null;
                tabler_debug("Row $index: Swapped debit to credit based on balance delta", $debug);
            }
            // If delta is negative, this should be a debit
            elseif ($delta < 0 && abs(abs($delta) - $amount) < 0.01) {
                // Already correct
            }
            // If neither matches, try to infer
            else {
                // Use delta to determine
                if ($delta > 0) {
                    $row['credit'] = abs($delta);
                    $row['debit'] = null;
                } else {
                    $row['debit'] = abs($delta);
                    $row['credit'] = null;
                }
                tabler_debug("Row $index: Inferred amount from balance delta: $delta", $debug);
            }
        }
        // If we have a credit but no debit, check if it should be swapped.
        // Amounts whose direction could not be determined upstream arrive here
        // as credits, so a balance that dropped by exactly this amount means
        // the amount belongs in the debit column.
        elseif ($row['credit'] !== null && $row['debit'] === null) {
            $amount = $row['credit'];
            
            if ($delta < 0 && abs(abs($delta) - $amount) < 0.01) {
                $row['debit'] = $amount;
                $row['credit'] = null;
                tabler_debug("Row $index: Swapped credit to debit based on balance delta", $debug);
            }
        }
        // If we have neither debit nor credit, infer from delta
        elseif ($row['debit'] === null && $row['credit'] === null) {
            if ($delta > 0) {
                $row['credit'] = abs($delta);
            } elseif ($delta < 0) {
                $row['debit'] = abs($delta);
            }
            tabler_debug("Row $index: Inferred amount from balance delta: $delta", $debug);
        }
        
        // Validate reconciliation
        $debit = $row['debit'] ?? 0;
        $credit = $row['credit'] ?? 0;
        $expected_balance = $prev_balance + $credit - $debit;
        
        if (abs($balance - $expected_balance) > 0.01) {
            $row['reconciliation_status'] = 'fail';
            $row['reconciliation_error'] = sprintf(
                'Expected %.2f, got %.2f (diff: %.2f)',
                $expected_balance,
                $balance,
                $balance - $expected_balance
            );
            
            $reconciliation_failures[] = [
                'row' => $index,
                'expected' => $expected_balance,
                'actual' => $balance,
                'debit' => $debit,
                'credit' => $credit,
                'date' => $row['day'],
                'description' => substr($row['description'], 0, 60),
            ];
            
            // Always log failures for debugging
            tabler_debug("RECONCILIATION FAILURE Row $index: date={$row['day']}, desc=" . substr($row['description'], 0, 40) .
                         ", expected=$expected_balance, got=$balance, debit=$debit, credit=$credit", true);
        }
        
        $prev_balance = $balance;
    }
    // Break the reference so the last element of $rows is no longer aliased
    unset($row);
    
    return [
        'rows' => $rows,
        'failures' => $reconciliation_failures,
        'is_valid' => empty($reconciliation_failures),
    ];
}

/**
 * Scan one page of extracted lines for signs of a bad extraction and append
 * the findings to $miseries.
 *
 * Two checks are made:
 * - more than one line accepted by is_header_row() adds a single
 *   'repeated_header' entry (severity high);
 * - every line whose trimmed length is between 1 and 4 bytes adds a
 *   'short_line' entry (severity low) carrying the line's key in $lines.
 * The header entry, if any, is appended before the short-line entries.
 *
 * @param array $lines Lines of the page
 * @param mixed $page Page identifier copied into each entry
 * @param array $miseries Collected findings (appended to in place)
 * @return void
 */
function detect_page_miseries($lines, $page, &$miseries) {
    $header_rows = 0;
    $short_line_findings = [];
    
    foreach ($lines as $line_key => $text) {
        if (is_header_row($text)) {
            $header_rows++;
        }
        
        $stripped = trim($text);
        $length = strlen($stripped);
        
        if ($length > 0 && $length < 5) {
            // Held back so that the header finding keeps its place in front.
            // The text is at most 4 bytes long, so it is quoted in full.
            $short_line_findings[] = [
                'page' => $page,
                'line' => $line_key,
                'type' => 'short_line',
                'message' => "Line too short: '" . $stripped . "' (possible truncation)",
                'severity' => 'low',
            ];
        }
    }
    
    // Repeated headers indicate possible column bleed
    if ($header_rows > 1) {
        $miseries[] = [
            'page' => $page,
            'type' => 'repeated_header',
            'message' => "Found {$header_rows} header-like rows on page (possible column bleed)",
            'severity' => 'high',
        ];
    }
    
    foreach ($short_line_findings as $finding) {
        $miseries[] = $finding;
    }
}

// ============================================================================
// SCRIPT ENTRY POINT
// ============================================================================

// Top-level driver: parse the CLI arguments, prepare the cache directory, run
// the pipeline and translate the outcome into console output and an exit code.
// This runs whenever the file is executed or included.
try {
    // Parse command line arguments
    $options = parse_cli_args($argv);
    
    // --help / -h / /? : print usage and leave with a success code
    if ($options['help']) {
        show_help();
        exit(0);
    }
    
    // No input file given: print usage, then report the error.
    // NOTE(review): tabler_error() presumably throws TablerError (handled by
    // the first catch below) — confirm; if it returned, execution would carry
    // on with an empty input.
    if (empty($options['input'])) {
        show_help();
        tabler_error("Input file is required");
    }
    
    // Setup cache directory
    // Default is a per-process folder ("tabler_<pid>") in the system temp dir
    if (empty($options['cache_dir'])) {
        $options['cache_dir'] = sys_get_temp_dir() . DIRECTORY_SEPARATOR . 'tabler_' . getmypid();
    }
    
    ensure_directory($options['cache_dir']);
    
    // Run the pipeline
    $result = run_pipeline($options);
    
    if ($result['success']) {
        echo "\n";
        tabler_info("SUCCESS: Output: " . $result['output']);
        tabler_info("Rows processed: " . $result['row_count']);
        
        // Reconciliation failures only produce a warning; the exit code stays 0
        if (!$result['reconciliation_valid']) {
            tabler_warning("Ledger reconciliation had failures - see audit for details");
        }
        
        exit(0);
    } else {
        tabler_error("Pipeline failed");
    }
} catch (TablerError $e) {
    // Application error: print the message and exit with the exception's
    // code, or 1 when that code is 0/empty
    echo "\n[ERROR] " . $e->getMessage() . "\n";
    exit($e->getCode() ?: 1);
} catch (\Exception $e) {
    // Any other exception: print the message plus stack trace and exit with 1
    echo "\n[UNHANDLED ERROR] " . $e->getMessage() . "\n";
    echo "Stack trace:\n" . $e->getTraceAsString() . "\n";
    exit(1);
}