diff --git a/unicodedomino.sty b/unicodedomino.sty index f020c1c4eeb718a295e50940f61aa9840d318a92..a070a61fb7efdfb9cc9c46f078413736cf078b4f 100644 --- a/unicodedomino.sty +++ b/unicodedomino.sty @@ -28,7 +28,7 @@ % makes all UTF-8 available in listings. \NeedsTeXFormat{LaTeX2e}% -\ProvidesPackage{unicodedomino}[2018/08/05 1.4 Domino for unknown codepoints]% +\ProvidesPackage{unicodedomino}[2018/08/06 1.7 Domino for unknown codepoints]% \makeatletter% \ifx\numexpr\@undefined% @@ -37,16 +37,14 @@ \ifx\UTFviii@defined\@undefined% \PackageError{unicodedomino}{This package requires UTF-8 input encoding}% \fi% -\ifx\decode@UTFviii\@undefined% - \PackageError{unicodedomino}% - {Your utf8.def is too old, consider updating it}% - {You will need v1.1o 2015/08/28, Debian stretch or newer}% -\fi% % pull code from other files \ifx\UTFviii@check@three\@undefined% \input{unicodedomino_kernel_fixup_f4_and_checkseq.def}% \fi% +\ifx\UTFviii@decode\@undefined% + \input{unicodedomino_kernel_better_decode.def}% +\fi% \input{unicodedomino_kernel_cosmetics.def}% \input{unicodedomino_compat.def}% @@ -64,16 +62,31 @@ % collect octet tokens, trim them and pass them on to handler \long\def\UTFviii@two@octets#1#2{% - \edef\unicodedomino@tmp{#1\unicodedomino@last{\string#2}}% - \expandafter\unicodedomino@octets\unicodedomino@tmp\empty% + \ifx\protect\@typeset@protect% + \edef\unicodedomino@tmp{#1\unicodedomino@last{\string#2}}% + \expandafter\unicodedomino@octets\unicodedomino@tmp\empty% + \else% + % protected, just write the original multibyte character + \string#1\string#2% + \fi% }% \long\def\UTFviii@three@octets#1#2#3{% - \edef\unicodedomino@tmp{#1\unicodedomino@last{\string#2}\unicodedomino@last{\string#3}}% - \expandafter\unicodedomino@octets\unicodedomino@tmp\empty% + \ifx\protect\@typeset@protect% + \edef\unicodedomino@tmp{#1\unicodedomino@last{\string#2}\unicodedomino@last{\string#3}}% + \expandafter\unicodedomino@octets\unicodedomino@tmp\empty% + \else% + % protected, just write the 
original multibyte character + \string#1\string#2\string#3% + \fi% }% \long\def\UTFviii@four@octets#1#2#3#4{% - \edef\unicodedomino@tmp{#1\unicodedomino@last{\string#2}\unicodedomino@last{\string#3}\unicodedomino@last{\string#4}}% - \expandafter\unicodedomino@octets\unicodedomino@tmp\empty% + \ifx\protect\@typeset@protect% + \edef\unicodedomino@tmp{#1\unicodedomino@last{\string#2}\unicodedomino@last{\string#3}\unicodedomino@last{\string#4}}% + \expandafter\unicodedomino@octets\unicodedomino@tmp\empty% + \else% + % protected, just write the original multibyte character + \string#1\string#2\string#3\string#4% + \fi% }% % handle trimmed octets @@ -83,32 +96,26 @@ % main handler \def\UTFviii@defined#1{% - \ifx\protect\@typeset@protect% - % not protected - \ifx#1\relax% - % unknown character - \if\relax\expandafter\UTFviii@checkseq\string#1\empty\relax% - % valid codepoint / multibyte sequence - \PackageWarning{inputenc}{% - Unicode character \expandafter\UTFviii@splitcsname\string#1\relax\space% - not set up for use\MessageBreak with LaTeX, replacing% - }% - % note: warning is the same as in utf8ienc.dtx v1.2d except - % with the string “, replacing” added at the end - \expandafter\unicodedomino@domino\string#1\relax% - \else% - % invalid multibyte character - \PackageError{inputenc}{Invalid UTF-8 byte sequence:% - \expandafter\UTFviii@splitseq\string#1\relax}% - \UTFviii@invalid@help - \fi% + \ifx#1\relax% + % unknown character + \if\relax\expandafter\UTFviii@checkseq\string#1\empty\relax% + % valid codepoint / multibyte sequence + \PackageWarning{inputenc}{% + Unicode character \expandafter\UTFviii@splitcsname\string#1\relax\space% + not set up for use\MessageBreak with LaTeX, replacing% + }% + % note: warning is the same as in utf8ienc.dtx v1.2d except + % with the string “, replacing” added at the end + \expandafter\unicodedomino@domino\string#1\relax% \else% + % invalid multibyte character + 
\PackageError{inputenc}{Invalid UTF-8 byte sequence:% + \expandafter\UTFviii@splitseq\string#1\relax}% + \UTFviii@invalid@help \fi% \else% - % protected, just write the original multibyte character - \expandafter\@gobblefour\string#1% + % known character, expand + \expandafter#1% \fi% }% diff --git a/unicodedomino_compat.def b/unicodedomino_compat.def index a581c2a61a567e5900126404fb09f3996b667095..12e62ef939891c5f164098fd38f860d7c8c27909 100644 --- a/unicodedomino_compat.def +++ b/unicodedomino_compat.def @@ -20,6 +20,28 @@ %- % Compatibility code with older utf8.def versions +% added in release 2015/01/01 +\ifx\textsubscript\@undefined% + \RequirePackage{fixltx2e}% +\fi% + +% added in v1.1o 2015/08/28 +\ifx\UTFviii@hexnumber\@undefined% + % taken from utf8ienc.dtx, no claim on the code but it’s really trivial + \gdef\UTFviii@hexnumber#1{% + \ifnum#1>15 % + \expandafter\UTFviii@hexnumber\expandafter{\the\numexpr(#1-8)/16\relax}% + \fi% + \UTFviii@hexdigit{\numexpr#1\ifnum#1>0-((#1-8)/16)*16\fi\relax}% + }% + \gdef\UTFviii@hexdigit#1{% + \ifcase\numexpr#1\relax% + 0\or1\or2\or3\or4\or5\or6\or7\or8\or9\or% + A\or B\or C\or D\or E\or F% + \fi% + }% +\fi% + % added in v1.2a 2018/03/24 \ifx\UTFviii@invalid@help\@undefined% \def\UTFviii@invalid@help{% diff --git a/unicodedomino_kernel_better_decode.def b/unicodedomino_kernel_better_decode.def new file mode 100644 index 0000000000000000000000000000000000000000..aa0a72323275cf614d97700591fe9e03aadc8bae --- /dev/null +++ b/unicodedomino_kernel_better_decode.def @@ -0,0 +1,60 @@ +% -*- mode: tex -*- +%- +% Copyright © 2018 +% mirabilos <t.glaser@tarent.de> +% +% Provided that these terms and disclaimer and all copyright notices +% are retained or reproduced in an accompanying document, permission +% is granted to deal in this work without restriction, including un‐ +% limited rights to use, publicly perform, distribute, sell, modify, +% merge, give away, or sublicence. 
+% +% This work is provided “AS IS” and WITHOUT WARRANTY of any kind, to +% the utmost extent permitted by applicable law, neither express nor +% implied; without malicious intent or gross negligence. In no event +% may a licensor, author or contributor be held liable for indirect, +% direct, other damage, loss, or other issues arising in any way out +% of dealing in the work, even if advised of the possibility of such +% damage or existence of a defect, except proven that it results out +% of said person’s immediate fault when using the work as intended. +% +% This file is dual-licenced under the LPPL version 1.3c or later. +%- +% Improved Unicode decoding using the fixed-up checkseq code. + +% override stock function, calling safer decode below +\gdef\decode@UTFviii#1\relax{% + \the\numexpr(\UTFviii@decode0:#1\relax)% +}% + +% safer decode, returns 0x1FFFFF for illegal sequences +\gdef\UTFviii@decode#1\relax{% + \if\relax\expandafter\UTFviii@checkseq\string#1\empty\relax% + \UTFviii@dec@lead#1\relax% + \else% + 2097151% + \fi% +}% + +\gdef\UTFviii@dec@lead#1:#2#3\relax{% + % we know #2 is in 00..7F, C2..F4 + \ifnum`#2<"80 % + `#2% + \else% + \ifnum`#2<"E0 % + (`#2-"C0% + \else% + \ifnum`#2<"F0 % + ((`#2-"E0% + \else% + (((`#2-"F0% + \fi% + \fi% + \UTFviii@dec@trail#3\relax% + \fi% +}% + +\gdef\UTFviii@dec@trail#1#2\relax{% + )*64+(`#1-"80)% + \ifx\relax#2\else\UTFviii@dec@trail#2\relax\fi% +}% diff --git a/unicodedomino_kernel_cosmetics.def b/unicodedomino_kernel_cosmetics.def index a21165f73f5297315ef80c1b2891286d909f0ecb..23de9eee4e0dd8684de007cf9d3757c212ee7bc5 100644 --- a/unicodedomino_kernel_cosmetics.def +++ b/unicodedomino_kernel_cosmetics.def @@ -25,7 +25,7 @@ %: https://github.com/latex3/latex2e/pull/62 \ifx\UTFviii@hexbyte\@undefined% % format a number as two-digit hex - \def\UTFviii@hexbyte#1{% + \gdef\UTFviii@hexbyte#1{% \ifnum#1<16 0\fi% \UTFviii@hexnumber{#1}% }% @@ -40,7 +40,7 @@ %: https://github.com/latex3/latex2e/pull/63 
\ifx\UTFviii@hexcodepoint\@undefined% % format a number as Unicode codepoint hex - \def\UTFviii@hexcodepoint#1{% + \gdef\UTFviii@hexcodepoint#1{% \ifnum#1<16 U+000% \else\ifnum#1<256 U+00% \else\ifnum#1<4096 U+0% diff --git a/unicodedomino_kernel_fixup_f4_and_checkseq.def b/unicodedomino_kernel_fixup_f4_and_checkseq.def index 334a75c7804ee6dcefa443b0dde11cbe2b90a3ac..545deae4d197bd3378961af7f87cff78fca662c4 100644 --- a/unicodedomino_kernel_fixup_f4_and_checkseq.def +++ b/unicodedomino_kernel_fixup_f4_and_checkseq.def @@ -26,7 +26,7 @@ %: not forwarded yet, will only do so once PR#60 (see below) is in % new check for illegal sequences -\def\UTFviii@checkseq#1:#2#3\empty{% +\gdef\UTFviii@checkseq#1:#2#3\empty{% \ifnum`#2<"80 % \ifx\empty#3\empty% \else% @@ -70,7 +70,7 @@ \fi% \fi% }% -\def\UTFviii@check@one#1#2\empty{% +\gdef\UTFviii@check@one#1#2\empty{% \ifx\empty#2\empty% \ifnum`#1<"80 % 1% @@ -83,7 +83,7 @@ 1% \fi% }% -\def\UTFviii@check@two#1.#2#3\empty{% +\gdef\UTFviii@check@two#1.#2#3\empty{% \ifx\empty#3\empty% 1% \else% @@ -98,7 +98,7 @@ \fi% \fi% }% -\def\UTFviii@check@three#1.#2.#3#4\empty{% +\gdef\UTFviii@check@three#1.#2.#3#4\empty{% \ifx\empty#4\empty% 1% \else%