help2man: work around some utf8 issues
help2man used characters in the range 0x80,...,0x84 to mark special sections/characters during its processing, but those bytes were also occurring in other UTF-8 characters, breaking the output. For instance, the character '₁' (a subscript 1) is encoded as "0xE2 0x82 0x81" in UTF-8. * tools/help2man: Tell perl that input and output should be assumed to be UTF-8. Also use "private-use codepoints" for those special characters to avoid any future conflict.
This commit is contained in:
parent
27a0137208
commit
a17d8a0501
1 changed files with 25 additions and 21 deletions
|
|
@ -2,7 +2,8 @@
|
||||||
|
|
||||||
# Generate a short man page from --help and --version output.
|
# Generate a short man page from --help and --version output.
|
||||||
# Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2009,
|
# Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2009,
|
||||||
# 2010, 2011, 2012, 2013, 2014, 2015, 2016 Free Software Foundation, Inc.
|
# 2010, 2011, 2012, 2013, 2014, 2015, 2016 Free Software Foundation, Inc.
|
||||||
|
# Later modified by the Spot authors.
|
||||||
|
|
||||||
# This program is free software; you can redistribute it and/or modify
|
# This program is free software; you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU General Public License as published by
|
# it under the terms of the GNU General Public License as published by
|
||||||
|
|
@ -26,6 +27,9 @@ use Getopt::Long;
|
||||||
use Text::ParseWords qw(shellwords);
|
use Text::ParseWords qw(shellwords);
|
||||||
use Text::Tabs qw(expand);
|
use Text::Tabs qw(expand);
|
||||||
use POSIX qw(strftime setlocale LC_ALL);
|
use POSIX qw(strftime setlocale LC_ALL);
|
||||||
|
use utf8;
|
||||||
|
use open IN => ":encoding(UTF-8)";
|
||||||
|
binmode STDOUT, ":encoding(UTF-8)";
|
||||||
|
|
||||||
my $this_program = 'help2man';
|
my $this_program = 'help2man';
|
||||||
my $this_version = '1.47.4';
|
my $this_version = '1.47.4';
|
||||||
|
|
@ -405,10 +409,10 @@ s/\n\n+/\n\n/g;
|
||||||
s/([A-Za-z])-\n *([A-Za-z])/$1$2/g;
|
s/([A-Za-z])-\n *([A-Za-z])/$1$2/g;
|
||||||
|
|
||||||
# Temporarily exchange leading dots, apostrophes and backslashes for
|
# Temporarily exchange leading dots, apostrophes and backslashes for
|
||||||
# tokens.
|
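# tokens. U+E000..U+F8FF are the so-called "private-use codepoints". NOTE(review): in Perl a codepoint above 0xFF needs the brace form, e.g. \x{E080}; a bare \xE080 is read as \xE0 followed by the literal characters "80".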
|
||||||
s/^\./\x80/mg;
|
s/^\./\xE080/mg;
|
||||||
s/^'/\x81/mg;
|
s/^'/\xE081/mg;
|
||||||
s/\\/\x82/g;
|
s/\\/\xE082/g;
|
||||||
|
|
||||||
# Translators: patterns are used to match common program output. In the source
|
# Translators: patterns are used to match common program output. In the source
|
||||||
# these strings are all of the form of "my $PAT_something = _('...');" and are
|
# these strings are all of the form of "my $PAT_something = _('...');" and are
|
||||||
|
|
@ -524,7 +528,7 @@ while (length)
|
||||||
{
|
{
|
||||||
$matched .= $& if %append_match;
|
$matched .= $& if %append_match;
|
||||||
$indent = length ($4 || "$1$3");
|
$indent = length ($4 || "$1$3");
|
||||||
$content = ".TP\n\x84$2\n\x84$5\n";
|
$content = ".TP\n\xE084$2\n\xE084$5\n";
|
||||||
unless ($4)
|
unless ($4)
|
||||||
{
|
{
|
||||||
# Indent may be different on second line.
|
# Indent may be different on second line.
|
||||||
|
|
@ -536,7 +540,7 @@ while (length)
|
||||||
elsif (s/^ {1,10}([+-]\S.*)\n//)
|
elsif (s/^ {1,10}([+-]\S.*)\n//)
|
||||||
{
|
{
|
||||||
$matched .= $& if %append_match;
|
$matched .= $& if %append_match;
|
||||||
$content = ".HP\n\x84$1\n";
|
$content = ".HP\n\xE084$1\n";
|
||||||
$indent = 80; # not continued
|
$indent = 80; # not continued
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -545,7 +549,7 @@ while (length)
|
||||||
{
|
{
|
||||||
$matched .= $& if %append_match;
|
$matched .= $& if %append_match;
|
||||||
$indent = length ($4 || "$1$3");
|
$indent = length ($4 || "$1$3");
|
||||||
$content = ".TP\n\x84$2\n\x84$5\n";
|
$content = ".TP\n\xE084$2\n\xE084$5\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
# Indented paragraph.
|
# Indented paragraph.
|
||||||
|
|
@ -553,7 +557,7 @@ while (length)
|
||||||
{
|
{
|
||||||
$matched .= $& if %append_match;
|
$matched .= $& if %append_match;
|
||||||
$indent = length $1;
|
$indent = length $1;
|
||||||
$content = ".IP\n\x84$2\n";
|
$content = ".IP\n\xE084$2\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
# Left justified paragraph.
|
# Left justified paragraph.
|
||||||
|
|
@ -569,7 +573,7 @@ while (length)
|
||||||
while ($indent ? s/^ {$indent}(\S.*)\n// : s/^(\S.*)\n//)
|
while ($indent ? s/^ {$indent}(\S.*)\n// : s/^(\S.*)\n//)
|
||||||
{
|
{
|
||||||
$matched .= $& if %append_match;
|
$matched .= $& if %append_match;
|
||||||
$content .= "\x84$1\n";
|
$content .= "\xE084$1\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
# Move to next paragraph.
|
# Move to next paragraph.
|
||||||
|
|
@ -578,9 +582,9 @@ while (length)
|
||||||
for ($content)
|
for ($content)
|
||||||
{
|
{
|
||||||
# Leading dot and apostrophe protection.
|
# Leading dot and apostrophe protection.
|
||||||
s/\x84\./\x80/g;
|
s/\xE084\./\xE080/g;
|
||||||
s/\x84'/\x81/g;
|
s/\xE084'/\xE081/g;
|
||||||
s/\x84//g;
|
s/\xE084//g;
|
||||||
|
|
||||||
# Examples should be verbatim.
|
# Examples should be verbatim.
|
||||||
unless ($sect eq _('EXAMPLES'))
|
unless ($sect eq _('EXAMPLES'))
|
||||||
|
|
@ -603,7 +607,7 @@ while (length)
|
||||||
}
|
}
|
||||||
|
|
||||||
# Escape remaining hyphens.
|
# Escape remaining hyphens.
|
||||||
s/-/\x83/g;
|
s/-/\xE083/g;
|
||||||
|
|
||||||
if ($sect eq _('COPYRIGHT'))
|
if ($sect eq _('COPYRIGHT'))
|
||||||
{
|
{
|
||||||
|
|
@ -675,6 +679,7 @@ while (my ($sect, $text) = each %replace)
|
||||||
# Output header.
|
# Output header.
|
||||||
print <<EOT;
|
print <<EOT;
|
||||||
.\\" DO NOT MODIFY THIS FILE! It was generated by $this_program $this_version.
|
.\\" DO NOT MODIFY THIS FILE! It was generated by $this_program $this_version.
|
||||||
|
.\\" -*- coding: utf-8 -*-
|
||||||
.TH $PROGRAM "$section" "$date" "$source" "$manual"
|
.TH $PROGRAM "$section" "$date" "$source" "$manual"
|
||||||
EOT
|
EOT
|
||||||
|
|
||||||
|
|
@ -699,14 +704,13 @@ for my $sect (@pre, (grep !$filter{$_}, @sections), @post)
|
||||||
{
|
{
|
||||||
# Replace leading dot, apostrophe, backslash and hyphen
|
# Replace leading dot, apostrophe, backslash and hyphen
|
||||||
# tokens.
|
# tokens.
|
||||||
s/\x80/\\&./g;
|
s/\xE080/\\&./g;
|
||||||
s/\x81/\\&'/g;
|
s/\xE081/\\&'/g;
|
||||||
s/\x82/\\e/g;
|
s/\xE082/\\e/g;
|
||||||
s/\x83/\\-/g;
|
s/\xE083/\\-/g;
|
||||||
|
|
||||||
# Convert some latin1 chars to troff equivalents
|
# Convert some latin1 chars to troff equivalents
|
||||||
s/\xa0/\\ /g; # non-breaking space
|
s/\xA0/\\ /g; # non-breaking space
|
||||||
|
|
||||||
print enc $_;
|
print enc $_;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -756,7 +760,7 @@ sub convert_option
|
||||||
{
|
{
|
||||||
local $_ = '\fB' . shift;
|
local $_ = '\fB' . shift;
|
||||||
|
|
||||||
s/-/\x83/g;
|
s/-/\xE083/g;
|
||||||
if (s/\[=(.*)\]$/\\fR[=\\fI$1\\fR]/)
|
if (s/\[=(.*)\]$/\\fR[=\\fI$1\\fR]/)
|
||||||
{
|
{
|
||||||
s/\|/\\fR|\\:\\fI/g;
|
s/\|/\\fR|\\:\\fI/g;
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue