From a17d8a05017123fc0faf911bcb57d204c61577aa Mon Sep 17 00:00:00 2001 From: Alexandre Duret-Lutz Date: Wed, 3 Apr 2024 17:47:18 +0200 Subject: [PATCH] help2man: work around some utf8 issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit help2man used characters in the range 0x80,...,0x84 to mark special sections/characters during its processing, but those bytes were also occurring in other utf-8 characters, breaking the output. For instance the character '₁' (a subscript 1), is encoded as "0xE2 0x82 0x81" in utf-8. * tools/help2man: Tell perl that input and output should be assumed to be utf-8. Also use "private-use codepoints" for those special characters to avoid any future conflict. --- tools/help2man | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/tools/help2man b/tools/help2man index bf1f075cb..82437df46 100755 --- a/tools/help2man +++ b/tools/help2man @@ -2,7 +2,8 @@ # Generate a short man page from --help and --version output. # Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2009, -# 2010, 2011, 2012, 2013, 2014, 2015, 2016 Free Software Foundation, Inc. +# 2010, 2011, 2012, 2013, 2014, 2015, 2016 Free Software Foundation, Inc. +# Later modified by the Spot authors. # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -26,6 +27,9 @@ use Getopt::Long; use Text::ParseWords qw(shellwords); use Text::Tabs qw(expand); use POSIX qw(strftime setlocale LC_ALL); +use utf8; +use open IN => ":encoding(UTF-8)"; +binmode STDOUT, ":encoding(UTF-8)"; my $this_program = 'help2man'; my $this_version = '1.47.4'; @@ -405,10 +409,10 @@ s/\n\n+/\n\n/g; s/([A-Za-z])-\n *([A-Za-z])/$1$2/g; # Temporarily exchange leading dots, apostrophes and backslashes for -# tokens. -s/^\./\x80/mg; -s/^'/\x81/mg; -s/\\/\x82/g; +# tokens. 
U+E000..U+F8FF are so-called "private-use codepoints". NOTE(review): an unbraced \xE080 is read by Perl as the character \xE0 followed by the literal digits "80", not as U+E080; only the braced form \x{E080} yields a single private-use codepoint -- confirm, and if switching, change every token (including wherever the tokens are converted back) in one go. +s/^\./\xE080/mg; +s/^'/\xE081/mg; +s/\\/\xE082/g; # Translators: patterns are used to match common program output. In the source # these strings are all of the form of "my $PAT_something = _('...');" and are @@ -524,7 +528,7 @@ while (length) { $matched .= $& if %append_match; $indent = length ($4 || "$1$3"); - $content = ".TP\n\x84$2\n\x84$5\n"; + $content = ".TP\n\xE084$2\n\xE084$5\n"; unless ($4) { # Indent may be different on second line. @@ -536,7 +540,7 @@ while (length) elsif (s/^ {1,10}([+-]\S.*)\n//) { $matched .= $& if %append_match; - $content = ".HP\n\x84$1\n"; + $content = ".HP\n\xE084$1\n"; $indent = 80; # not continued } @@ -545,7 +549,7 @@ while (length) { $matched .= $& if %append_match; $indent = length ($4 || "$1$3"); - $content = ".TP\n\x84$2\n\x84$5\n"; + $content = ".TP\n\xE084$2\n\xE084$5\n"; } # Indented paragraph. @@ -553,7 +557,7 @@ while (length) { $matched .= $& if %append_match; $indent = length $1; - $content = ".IP\n\x84$2\n"; + $content = ".IP\n\xE084$2\n"; } # Left justified paragraph. @@ -569,7 +573,7 @@ while (length) while ($indent ? s/^ {$indent}(\S.*)\n// : s/^(\S.*)\n//) { $matched .= $& if %append_match; - $content .= "\x84$1\n"; + $content .= "\xE084$1\n"; } # Move to next paragraph. @@ -578,9 +582,9 @@ while (length) for ($content) { # Leading dot and apostrophe protection. - s/\x84\./\x80/g; - s/\x84'/\x81/g; - s/\x84//g; + s/\xE084\./\xE080/g; + s/\xE084'/\xE081/g; + s/\xE084//g; # Examples should be verbatim. unless ($sect eq _('EXAMPLES')) @@ -603,7 +607,7 @@ while (length) } # Escape remaining hyphens. - s/-/\x83/g; + s/-/\xE083/g; if ($sect eq _('COPYRIGHT')) { @@ -675,6 +679,7 @@ while (my ($sect, $text) = each %replace) # Output header. print <