From a17d8a05017123fc0faf911bcb57d204c61577aa Mon Sep 17 00:00:00 2001
From: Alexandre Duret-Lutz <adl@lrde.epita.fr>
Date: Wed, 3 Apr 2024 17:47:18 +0200
Subject: [PATCH] help2man: work around some utf8 issues
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

help2man used characters in the range 0x80,...,0x84 to mark special
sections/characters during its processing, but those bytes were also
occurring in other utf-8 characters, breaking the output.  For instance
the character '₁' (a subscript 1) is encoded as "0xE2 0x82 0x81" in
utf-8.

* tools/help2man: Tell perl that input and output should be assumed to
be utf-8.  Also use "private-use codepoints" for those special
characters to avoid any future conflict.
---
 tools/help2man | 46 +++++++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/tools/help2man b/tools/help2man
index bf1f075cb..82437df46 100755
--- a/tools/help2man
+++ b/tools/help2man
@@ -2,7 +2,8 @@
 
 # Generate a short man page from --help and --version output.
 # Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2009,
-# 2010, 2011, 2012, 2013, 2014, 2015, 2016 Free Software Foundation, Inc.
+# 2010, 2011, 2012, 2013, 2014, 2015, 2016  Free Software Foundation, Inc.
+# Later modified by the Spot authors.
 
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -26,6 +27,9 @@ use Getopt::Long;
 use Text::ParseWords qw(shellwords);
 use Text::Tabs qw(expand);
 use POSIX qw(strftime setlocale LC_ALL);
+use utf8;
+use open IN => ":encoding(UTF-8)";
+binmode STDOUT, ":encoding(UTF-8)";
 
 my $this_program = 'help2man';
 my $this_version = '1.47.4';
@@ -405,10 +409,10 @@ s/\n\n+/\n\n/g;
 s/([A-Za-z])-\n *([A-Za-z])/$1$2/g;
 
 # Temporarily exchange leading dots, apostrophes and backslashes for
-# tokens.
-s/^\./\x80/mg;
-s/^'/\x81/mg;
-s/\\/\x82/g;
+# tokens.  U+E000..U+F8FF are "private-use codepoints" (\x needs braces here).
+s/^\./\x{E080}/mg;
+s/^'/\x{E081}/mg;
+s/\\/\x{E082}/g;
 
 # Translators: patterns are used to match common program output. In the source
 # these strings are all of the form of "my $PAT_something = _('...');" and are
@@ -524,7 +528,7 @@ while (length)
     {
 	$matched .= $& if %append_match;
 	$indent = length ($4 || "$1$3");
-	$content = ".TP\n\x84$2\n\x84$5\n";
+	$content = ".TP\n\x{E084}$2\n\x{E084}$5\n";
 	unless ($4)
 	{
 	    # Indent may be different on second line.
@@ -536,7 +540,7 @@ while (length)
     elsif (s/^ {1,10}([+-]\S.*)\n//)
     {
 	$matched .= $& if %append_match;
-	$content = ".HP\n\x84$1\n";
+	$content = ".HP\n\x{E084}$1\n";
 	$indent = 80; # not continued
     }
 
@@ -545,7 +549,7 @@ while (length)
     {
 	$matched .= $& if %append_match;
 	$indent = length ($4 || "$1$3");
-	$content = ".TP\n\x84$2\n\x84$5\n";
+	$content = ".TP\n\x{E084}$2\n\x{E084}$5\n";
     }
 
     # Indented paragraph.
@@ -553,7 +557,7 @@ while (length)
     {
 	$matched .= $& if %append_match;
 	$indent = length $1;
-	$content = ".IP\n\x84$2\n";
+	$content = ".IP\n\x{E084}$2\n";
     }
 
     # Left justified paragraph.
@@ -569,7 +573,7 @@ while (length)
     while ($indent ? s/^ {$indent}(\S.*)\n// : s/^(\S.*)\n//)
     {
 	$matched .= $& if %append_match;
-	$content .= "\x84$1\n";
+	$content .= "\x{E084}$1\n";
     }
 
     # Move to next paragraph.
@@ -578,9 +582,9 @@ while (length)
     for ($content)
     {
 	# Leading dot and apostrophe protection.
-	s/\x84\./\x80/g;
-	s/\x84'/\x81/g;
-	s/\x84//g;
+	s/\x{E084}\./\x{E080}/g;
+	s/\x{E084}'/\x{E081}/g;
+	s/\x{E084}//g;
 
 	# Examples should be verbatim.
 	unless ($sect eq _('EXAMPLES'))
@@ -603,7 +607,7 @@ while (length)
 	}
 
 	# Escape remaining hyphens.
-	s/-/\x83/g;
+	s/-/\x{E083}/g;
 
 	if ($sect eq _('COPYRIGHT'))
 	{
@@ -675,6 +679,7 @@ while (my ($sect, $text) = each %replace)
 # Output header.
 print <<EOT;
 .\\" DO NOT MODIFY THIS FILE!  It was generated by $this_program $this_version.
+.\\" -*- coding: utf-8 -*-
 .TH $PROGRAM "$section" "$date" "$source" "$manual"
 EOT
 
@@ -699,14 +704,13 @@ for my $sect (@pre, (grep !$filter{$_}, @sections), @post)
 	{
 	    # Replace leading dot, apostrophe, backslash and hyphen
 	    # tokens.
-	    s/\x80/\\&./g;
-	    s/\x81/\\&'/g;
-	    s/\x82/\\e/g;
-	    s/\x83/\\-/g;
+	    s/\x{E080}/\\&./g;
+	    s/\x{E081}/\\&'/g;
+	    s/\x{E082}/\\e/g;
+	    s/\x{E083}/\\-/g;
 
 	    # Convert some latin1 chars to troff equivalents
-	    s/\xa0/\\ /g; # non-breaking space
-
+	    s/\xA0/\\ /g; # non-breaking space
 	    print enc $_;
 	}
     }
@@ -756,7 +760,7 @@ sub convert_option
 {
     local $_ = '\fB' . shift;
 
-    s/-/\x83/g;
+    s/-/\x{E083}/g;
     if (s/\[=(.*)\]$/\\fR[=\\fI$1\\fR]/)
     {
 	s/\|/\\fR|\\:\\fI/g;