#!/bin/bash
#
# File: check-titles
# Version: 0.10
#
# Checks whether the paper titles (CEURTITLE) in index.html occur on the first page of the linked PDF files.
# (C) 2025 by Manfred Jeusfeld. This script is made available under the
# Creative Commons Attribution-ShareAlike CC-BY-SA 4.0 license.
#
# The BASH script is part of the scripts used for CEUR-WS.org. No warranty whatsoever. No support.
#
# Requires the installation of certain packages, in particular perl and pdftotext (poppler-utils)
# Install on Debian-based systems with
#   sudo apt install perl poppler-utils
#   
# Note that this script is updated on a regular basis.
#
# The content of this script is also copied into check-pdf-errors.
#
# Manfred 2025-11-13 (2025-11-21)
#


# Preconditions: the script must be started in a volume directory that
# contains index.html and at least one PDF file.
if [ ! -f "index.html" ]; then
    echo "No file index.html found."
    exit 1
fi

# compgen -G succeeds only when the glob matches at least one entry. Unlike
# parsing the output of ls, it prints nothing to stderr when there is no match.
if ! compgen -G "*.pdf" > /dev/null; then
    echo "No file with filetype *.pdf in this directory."
    exit 1
fi

# The title check runs perl, which in turn calls pdftotext. Without this test a
# missing pdftotext would go unnoticed (its errors are discarded further below)
# and every single title would be reported as a mismatch.
for required_tool in perl pdftotext; do
    if ! command -v "$required_tool" > /dev/null 2>&1; then
        echo "Required program $required_tool not found. Please install it first."
        exit 1
    fi
done


# Print the heading of this check, framed by one empty line above and below,
# and start from the assumption that no title mismatch exists.
printf '\n%s\n\n' "(*) Check whether the paper titles in index.html match the title of the PDF file"
MISMATCHTITLE="no"

# Run the title check in Perl and capture everything it prints.
# The Perl program reads index.html as a single string (-0777), visits every
# <li> element that links to a PDF file and carries a CEURTITLE element, and
# prints one line per problem found. Its last line is a status marker of the
# form __MISMATCHTITLE__=yes|no which is evaluated by the shell code below.
# NOTE: the Perl program is enclosed in single quotes, so it must not contain
# any single quote character, not even inside a comment.
perl_output=$(perl -Mopen=locale -0777 -ne '
use strict;
use warnings;

my $any_mismatch = 0;

# Reduce a text to its lowercase ASCII letters and digits.
# Whitespace, punctuation, footnote marks such as * and all non-ASCII
# characters are dropped, so that "Cyber-Security" and "Cyber Security"
# both become "cybersecurity".
sub normalize_text {
    my ($text) = @_;
    $text = "" unless defined $text;
    $text = lc($text);
    $text =~ s/[^a-z0-9]//g;
    return $text;
}

# Turn an HTML fragment into plain text for comparison purposes:
# inline tags are removed, named entities (punctuation or non-ASCII letters,
# which normalize_text would drop anyway) are replaced by a space, and
# numeric entities are decoded to the character they stand for.
# Without this step a title like "A &amp; B" would normalize to "aampb".
sub html_to_text {
    my ($html) = @_;
    $html =~ s/<[^>]*>/ /g;
    $html =~ s/&[a-z][a-z0-9]*;/ /gi;
    $html =~ s/&#x([0-9a-f]{1,6});/chr(hex($1))/gie;
    $html =~ s/&#([0-9]{1,7});/chr($1)/ge;
    return $html;
}

# Return the text of the first page of a PDF file, or "" if it cannot be
# extracted. pdftotext is started with an argument list, i.e. without a
# shell, so that a file name taken from index.html can never be executed
# as shell code. The option -q keeps pdftotext silent on errors.
sub first_page_text {
    my ($pdf) = @_;
    # A leading dash would be mistaken for an option by pdftotext
    my $path = ($pdf =~ /^-/) ? "./$pdf" : $pdf;
    open(my $fh, "-|", "pdftotext", "-q", "-f", "1", "-l", "1", $path, "-")
        or return "";
    local $/;
    my $text = <$fh>;
    close($fh);
    return defined $text ? $text : "";
}

while (/<li[^>]*>(.*?)<\/li>/sg) {
    my $li = $1;

    # Only list items that link to a PDF file are of interest
    next unless $li =~ /<a href="([^"]+\.pdf)"/s;
    my $pdf = $1;

    my $title;
    if ($li =~ /<span class="CEURTITLE">(.*?)<\/span>/s) {
        $title = $1;
    } elsif ($li =~ /<a [^>]*class="CEURTITLE">(.*?)<\/a>/s) {
        $title = $1;
    } else {
        # Skip if no title element found
        next;
    }

    # Title as shown in messages: single spaces, no trailing " *"
    $title =~ s/\s+/ /g;
    $title =~ s/ \*$//;

    # Check if PDF file exists
    if (!-f $pdf) {
        print "Error: File $pdf missing\n";
        $any_mismatch = 1;
        next;
    }

    my $normtitle = normalize_text(html_to_text($title));
    my $normpage  = normalize_text(first_page_text($pdf));

    # A title without any ASCII letter or digit normalizes to the empty
    # string and cannot be compared in a meaningful way. It is reported
    # explicitly. (An empty regex pattern would silently reuse the last
    # successful pattern, which is why no regex is used for the comparison.)
    if ($normtitle eq "") {
        print "$pdf: CEURTITLE \"$title\" contains no ASCII letters or digits; cannot be compared with the first page\n";
        $any_mismatch = 1;
        next;
    }

    # Tolerant comparison: the condensed title has to occur somewhere in
    # the condensed text of the first page.
    if (index($normpage, $normtitle) < 0) {
        print "$pdf: Expected CEURTITLE \"$title\" not found on first page\n";
        $any_mismatch = 1;
    }
}

# Output the final status flag for the bash script to parse
if ($any_mismatch) {
    print "__MISMATCHTITLE__=yes\n";
} else {
    print "__MISMATCHTITLE__=no\n";
}
' index.html)

# Walk through the captured Perl output line by line: the status marker line
# sets MISMATCHTITLE, every other line is a message and is passed on unchanged.
status_marker="__MISMATCHTITLE__="
while IFS= read -r report_line; do
    case "$report_line" in
        "$status_marker"*)
            MISMATCHTITLE="${report_line#"$status_marker"}"
            ;;
        *)
            echo "$report_line"
            ;;
    esac
done <<< "$perl_output"


# Final verdict: print a hint on how to fix the problem if at least one title
# was not confirmed, otherwise print "ok". Both variants end with a line that
# holds a single space.
case "$MISMATCHTITLE" in
    yes)
        printf '%s\n' \
            " ===> Make sure that paper CEURTITLE in index.html is matching the title in the paper PDFs!" \
            "      Make sure that the title in the PDF file is not a bitmap." \
            " "
        ;;
    *)
        printf '%s\n' "ok" " "
        ;;
esac