#!/bin/bash
# adapted from code generated by an LLM; the script is thus under a CC0 license
# call: check-columns pdffilename
# returns success (0) if pdffile is likely one-column format; returns error code 1 if it is likely two-column
# The analysis is based on probability, so there is no guarantee!
# Manfred 2025-09-20 (2025-11-25)
#
# 2025-11-25: Only use pages 2-3 for the test, to speed it up.


#######################################
# Heuristically decide whether a PDF is laid out in one or two columns.
#
# Text is extracted with `pdftotext -layout`. Every non-empty line that is
# longer than 60% of the longest line is searched, within +/-10% of its own
# midpoint, for a run of at least 4 spaces that has more than 10 characters
# of text on each side. If more than 15% of all non-empty lines contain such
# a gutter, the document is reported as two-column.
#
# Arguments: $1 - path to the PDF file
# Outputs:   error messages on stderr only
# Returns:   0 if the PDF is probably single-column,
#            1 if it is likely two-column OR if any error occurred
# Requires:  pdftotext (poppler-utils), mktemp
#######################################
check_two_column_pdf() {
    local pdf_file="$1"

    if [[ ! -f "$pdf_file" ]]; then
        echo "Error: File '$pdf_file' not found" >&2
        return 1
    fi

    # Check if pdftotext is available
    if ! command -v pdftotext &> /dev/null; then
        echo "Error: pdftotext not found. Install poppler-utils." >&2
        return 1
    fi

    # Declare and assign separately so a mktemp failure is not masked by 'local'.
    local temp_text
    if ! temp_text=$(mktemp); then
        echo "Error: Could not create temporary file" >&2
        return 1
    fi

    # Extract text with layout preservation. Pages 2-3 are preferred; if that
    # range cannot be extracted (e.g. the document has fewer than 2 pages),
    # retry starting from the first page before giving up.
    if ! pdftotext -layout -f 2 -l 3 "$pdf_file" "$temp_text" 2>/dev/null \
        && ! pdftotext -layout -l 3 "$pdf_file" "$temp_text" 2>/dev/null; then
        echo "Error: Could not extract text from PDF" >&2
        rm -f "$temp_text"
        return 1
    fi

    # Check if file has content
    if [[ ! -s "$temp_text" ]]; then
        echo "Error: No text content extracted from PDF" >&2
        rm -f "$temp_text"
        return 1
    fi

    local line line_length
    local total_lines=0
    local two_col_candidates=0
    local max_line_length=0

    # First pass: find maximum line length to understand page width
    while IFS= read -r line; do
        line_length=${#line}
        if (( line_length > max_line_length )); then
            max_line_length=$line_length
        fi
    done < "$temp_text"

    # Only lines longer than 60% of the widest line are analysed (short lines
    # such as headings or paragraph ends say nothing about the column layout).
    local min_length=$(( max_line_length * 6 / 10 ))
    local mid_point search_window i left_side right_side

    # Second pass: analyze column structure. The loop reads straight from the
    # temp file, so the caller's stdin is left untouched.
    while IFS= read -r line; do
        [[ -z "$line" ]] && continue
        total_lines=$(( total_lines + 1 ))

        line_length=${#line}
        (( line_length > min_length )) || continue

        mid_point=$(( line_length / 2 ))
        search_window=$(( line_length / 10 ))  # 10% of line length

        # Check for a gap of spaces in the middle region
        for (( i = mid_point - search_window; i <= mid_point + search_window; i++ )); do
            (( i > 0 && i < line_length )) || continue

            # Look for a sequence of at least 4 spaces starting at position i
            [[ "${line:i:4}" == "    " ]] || continue

            # Check if both sides have substantial content
            left_side="${line:0:i}"
            right_side="${line:i}"

            # Remove trailing spaces on the left, leading spaces on the right
            left_side="${left_side%"${left_side##*[![:space:]]}"}"
            right_side="${right_side#"${right_side%%[![:space:]]*}"}"

            if (( ${#left_side} > 10 && ${#right_side} > 10 )); then
                two_col_candidates=$(( two_col_candidates + 1 ))
                break   # count each line at most once
            fi
        done
    done < "$temp_text"

    rm -f "$temp_text"

    if (( total_lines == 0 )); then
        echo "Error: No lines to analyze" >&2
        return 1
    fi

    # Conservative threshold: ratio > 0.15. Done in integer arithmetic (ratio
    # in thousandths, truncated) so no external calculator is required.
    if (( two_col_candidates * 1000 / total_lines > 150 )); then
        return 1    # likely two-column format
    fi
    return 0        # probably single column
}

# Entry point: analyse the PDF named by the first command-line argument.
# The script's exit status is the status of the last command run here:
# 0 = probably one-column, 1 = likely two-column or an error.
if [[ $# -lt 1 ]]; then
    echo "Usage: ${0##*/} pdffilename" >&2
    false   # report failure without 'exit', so sourcing this file stays safe
else
    check_two_column_pdf "$1"
fi
