diff --git a/unicodedomino.sty b/unicodedomino.sty index d8d95ba2c3a12458f3477e11420b7bcc4e562ed6..f020c1c4eeb718a295e50940f61aa9840d318a92 100644 --- a/unicodedomino.sty +++ b/unicodedomino.sty @@ -28,7 +28,7 @@ % makes all UTF-8 available in listings. \NeedsTeXFormat{LaTeX2e}% -\ProvidesPackage{unicodedomino}[2018/08/05 1.3 Domino for unknown codepoints]% +\ProvidesPackage{unicodedomino}[2018/08/05 1.4 Domino for unknown codepoints]% \makeatletter% \ifx\numexpr\@undefined% @@ -40,127 +40,17 @@ \ifx\decode@UTFviii\@undefined% \PackageError{unicodedomino}% {Your utf8.def is too old, consider updating it}% - {You will need Debian stretch or newer}% + {You will need v1.1o 2015/08/28, Debian stretch or newer}% \fi% -% patch up bug in utf8.def that forbade 0xF4 lead byte -\begingroup% -\catcode`\~13 -\uccode`\~"F4 -\def\UTFviii@tmp{\xdef~{\noexpand\UTFviii@four@octets\string~}}% -\uppercase\expandafter{\UTFviii@tmp}% -\endgroup% - -% now fixup to disallow too large definitions -\let\unicodedomino@parse@XML@charref\parse@XML@charref% -\gdef\parse@XML@charref{% - \ifnum\count@>"10FFFF\relax% - \PackageError{inputenc}{% - Cannot define Unicode char value\space% - \unicodedomino@shex\the\count@\relax\space (too large)% - }% - \fi% - \unicodedomino@parse@XML@charref% -}% - -% nicer printing of codepoint hex numbers, not strictly necessary -\def\unicodedomino@codepoint#1{% - \ifnum#1>1048575% - U-00% - \else\ifnum#1>65535% - U-000% - \else\ifnum#1>4095% - U+% - \else\ifnum#1>255% - U+0% - \else\ifnum#1>15% - U+00% - \else% - U+000% - \fi\fi\fi\fi\fi% - \expandafter\UTFviii@hexnumber\expandafter{#1}% -}% - -% same for bytes -\def\unicodedomino@bytehex#1{% - \expandafter\UTFviii@hexdigit\expandafter{\the\numexpr(#1-8)/16\relax}% - \UTFviii@hexdigit{\numexpr#1\ifnum#1>0-((#1-8)/16)*16\fi\relax}% -}% - -% and arbitrary numbers -\def\unicodedomino@shex#1\relax{% - \ifnum#1>15 % - \expandafter\unicodedomino@shex\expandafter\the\numexpr(#1-8)/16\relax% - \fi% - 
\UTFviii@hexdigit{\numexpr#1\ifnum#1>0-((#1-8)/16)*16\fi\relax}% -}% - -% override to beautify the output, not strictly necessary but requested -\gdef\UTFviii@splitcsname#1:#2\relax{% - #2 (\expandafter\unicodedomino@codepoint\expandafter{% - \the\numexpr\decode@UTFviii#2\relax})% -}% -\def\UTFviii@invalid@err#1{% - \PackageError{inputenc}{% - Invalid UTF-8 byte 0x\unicodedomino@bytehex{\number`#1}% - }\UTFviii@invalid@help% -}% - -% for invalid encoding output -\gdef\unicodedomino@splith@x#1#2\relax{% - \space 0x\UTFviii@hexnumber{`#1}% - \ifx\relax#2\relax\else\unicodedomino@splith@x#2\relax\fi% -}% -\gdef\unicodedomino@splithex#1:#2\relax{% - \unicodedomino@splith@x#2\relax% -}% - -% render the actual domino piece -\def\unicodedomino@box#1#2{% - \begingroup% - \fboxsep=.1em% - \fboxrule=.4pt% - \texttt{\fbox{\makebox[0pt][l]{\textsuperscript{#1}}\textsubscript{#2}}}% - \endgroup% -}% - -% expand one hex nybble -\def\unicodedomino@hex@ne#1{% - \ifnum#1>15 % - \expandafter\unicodedomino@hex@ne\expandafter{\the\numexpr(#1-8)/16\relax}% - \fi% - \UTFviii@hexdigit{\numexpr#1\ifnum#1>0-((#1-8)/16)*16\fi\relax}.% -}% -% expand all hex nybbles, zero-padded -\def\unicodedomino@hex@ll#1{% - \ifnum#1<1048576 0.\fi% - \ifnum#1<65536 0.\fi% - \ifnum#1<4096 0.\fi% - \ifnum#1<256 0.\fi% - \ifnum#1<16 0.\fi% - \expandafter\unicodedomino@hex@ne\expandafter{#1}% -}% -% call the appropriate box function -\def\unicodedomino@hex@do#1.#2.#3.#4.#5.#6.{% - \ifnum"#1#2=0% - \unicodedomino@box{#3#4}{#5#6}% - \else% - \unicodedomino@box{#1#2#3}{#4#5#6}% - \fi% -}% -% split nybbles and pass on -\protected\def\unicodedomino@hex#1{% - \edef\unicodedomino@tmp{\expandafter\unicodedomino@hex@ll\expandafter{#1}}% - \expandafter\unicodedomino@hex@do\unicodedomino@tmp\relax% -}% -% split, decode and pass on -\def\unicodedomino@decode#1:#2\relax{% - \expandafter\unicodedomino@hex\expandafter{% - \the\numexpr\decode@UTFviii#2\relax% - }% -}% +% pull code from other files 
+\ifx\UTFviii@check@three\@undefined% + \input{unicodedomino_kernel_fixup_f4_and_checkseq.def}% +\fi% +\input{unicodedomino_kernel_cosmetics.def}% +\input{unicodedomino_compat.def}% -% retrieve the last octet (lstlistings compatibility) +% retrieve the last octet \def\unicodedomino@last#1{% \expandafter\unicodedomino@l@st#1\empty\empty\empty% }% @@ -172,12 +62,7 @@ \fi% }% -% handle trimmed octets -\def\unicodedomino@octets#1\empty{% - \expandafter\UTFviii@defined\csname u8:#1\endcsname% -}% - -% collect octet tokens, trim them, pass on to handler +% collect octet tokens, trim them and pass them on to handler \long\def\UTFviii@two@octets#1#2{% \edef\unicodedomino@tmp{#1\unicodedomino@last{\string#2}}% \expandafter\unicodedomino@octets\unicodedomino@tmp\empty% @@ -191,171 +76,78 @@ \expandafter\unicodedomino@octets\unicodedomino@tmp\empty% }% +% handle trimmed octets +\def\unicodedomino@octets#1\empty{% + \expandafter\UTFviii@defined\csname u8:#1\endcsname% +}% + % main handler \def\UTFviii@defined#1{% - \ifx#1\relax% - \if\relax\expandafter\UTFviii@chkseq\string#1\empty\relax% - % unknown char - \ifx\protect\@typeset@protect% - % not protected + \ifx\protect\@typeset@protect% + % not protected + \ifx#1\relax% + % unknown character + \if\relax\expandafter\UTFviii@checkseq\string#1\empty\relax% + % valid codepoint / multibyte sequence \PackageWarning{inputenc}{% - Unicode\space char\space\expandafter\UTFviii@splitcsname\string#1\relax% - \space not\space set\space up\space for\space use\MessageBreak with\space% - LaTeX, replacing% + Unicode character \expandafter\UTFviii@splitcsname\string#1\relax\space% + not set up for use\MessageBreak with LaTeX, replacing% }% - % note: same warning as in \UTFviii@undefined@err in utf8.def v1.2c, - % except adding “, replacing” at the end and not being an error - \expandafter\unicodedomino@decode\string#1\relax% + % note: warning is the same as in utf8ienc.dtx v1.2d except + % with the string “, replacing” added at the end +
\expandafter\unicodedomino@domino\string#1\relax% \else% - % protected, just write the original character - \expandafter\@gobblefour\string#1% + % invalid multibyte character + \PackageError{inputenc}{Invalid UTF-8 byte sequence:% + \expandafter\UTFviii@splitseq\string#1\relax}% + \UTFviii@invalid@help \fi% \else% - % invalid encoding - \PackageError{inputenc}{% - Invalid\space UTF-8\space byte\space sequence:% - \expandafter\unicodedomino@splithex\string#1\relax% - }{Do ensure the source document is saved in UTF-8 encoding}% + % known character, expand + \expandafter#1% \fi% \else% - % known char, expand - \expandafter#1% + % protected, just write the original multibyte character + \expandafter\@gobblefour\string#1% \fi% }% -% input validation, including security-relevant checks -\def\UTFviii@chkseq#1:#2#3\empty{% - \ifnum`#2<"80 % - \ifx\empty#3\empty% - \else% - 1% - \fi% - \else% - \ifnum`#2<"C2 % - 1% - \else% - \ifnum`#2<"E0 % - % one 80-BF - \UTFviii@chksq@onetrail#3\empty% - \else% - \ifnum`#2<"E1 % - % A0-BF + one 80-BF - \UTFviii@chksq@a@trail#3\empty% - \else% - \ifnum`#2<"F0 % - % two 80-BF - \UTFviii@chksq@twotrail#3\empty% - \else% - \ifnum`#2<"F1 % - % 90-BF + two 80-BF - \UTFviii@chksq@ninetytrails#3\empty% - \else% - \ifnum`#2<"F4 % - % three 80-BF - \UTFviii@chksq@threetrail#3\empty% - \else% - \ifnum`#2<"F5 % - % 80-8F + two 80-BF - \UTFviii@chksq@belowninetytrails#3\empty% - \else% - 1% - \fi% - \fi% - \fi% - \fi% - \fi% - \fi% - \fi% - \fi% -}% -\def\UTFviii@chksq@onetrail#1#2\empty{% - \ifx\empty#2\empty% - \ifnum`#1<"80 % - 1% - \else% - \ifnum`#1<"C0 % - \else% - 1% - \fi% - \fi% - \else% - 1% - \fi% -}% -\def\UTFviii@chksq@a@trail#1#2\empty{% - \ifx\empty#2\empty% - 1% - \else% - \ifnum`#1<"A0 % - 1% - \else% - \ifnum`#1<"C0 % - \UTFviii@chksq@onetrail#2\empty% - \else% - 1% - \fi% - \fi% - \fi% -}% -\def\UTFviii@chksq@twotrail#1#2\empty{% - \ifx\empty#2\empty% - 1% - \else% - \ifnum`#1<"80 % - 1% - \else% - \ifnum`#1<"C0 % -
\UTFviii@chksq@onetrail#2\empty% - \else% - 1% - \fi% - \fi% - \fi% +% prepare for outputting the domino block +\def\unicodedomino@domino#1:#2\relax{% + \edef\unicodedomino@tmp{% + \expandafter\unicodedomino@domino@hex\expandafter{% + \the\numexpr\decode@UTFviii#2\relax% + }% + }% + \expandafter\unicodedomino@domino@switch\unicodedomino@tmp% }% -\def\UTFviii@chksq@ninetytrails#1#2\empty{% - \ifx\empty#2\empty% - 1% - \else% - \ifnum`#1<"90 % - 1% - \else% - \ifnum`#1<"C0 % - \UTFviii@chksq@twotrail#2\empty% - \else% - 1% - \fi% - \fi% - \fi% + +% convert to six nybbles +\def\unicodedomino@domino@hex#1{% + \ifnum#1<1048576 0\fi% + \ifnum#1<65536 0\fi% + \ifnum#1<4096 0\fi% + \ifnum#1<256 0\fi% + \ifnum#1<16 0\fi% + \UTFviii@hexnumber{#1}% }% -\def\UTFviii@chksq@threetrail#1#2\empty{% - \ifx\empty#2\empty% - 1% + +% decide whether to output a BMP or astral planes block +\def\unicodedomino@domino@switch#1#2#3#4#5#6{% + \ifnum"#1#2=0% + \unicodedomino@box{#3#4}{#5#6}% \else% - \ifnum`#1<"80 % - 1% - \else% - \ifnum`#1<"C0 % - \UTFviii@chksq@twotrail#2\empty% - \else% - 1% - \fi% - \fi% + \unicodedomino@box{#1#2#3}{#4#5#6}% \fi% }% -\def\UTFviii@chksq@belowninetytrails#1#2\empty{% - \ifx\empty#2\empty% - 1% - \else% - \ifnum`#1<"80 % - 1% - \else% - \ifnum`#1<"90 % - \UTFviii@chksq@twotrail#2\empty% - \else% - 1% - \fi% - \fi% - \fi% + +% render the actual domino piece +\def\unicodedomino@box#1#2{% + \begingroup% + \fboxsep=.1em% + \fboxrule=.4pt% + \texttt{\fbox{\makebox[0pt][l]{\textsuperscript{#1}}\textsubscript{#2}}}% + \endgroup% }% % clean up after ourselves diff --git a/unicodedomino_compat.def b/unicodedomino_compat.def new file mode 100644 index 0000000000000000000000000000000000000000..a581c2a61a567e5900126404fb09f3996b667095 --- /dev/null +++ b/unicodedomino_compat.def @@ -0,0 +1,28 @@ +% -*- mode: tex -*- +%- +% Copyright © 2018 +% mirabilos <m@mirbsd.org> +% +% Provided that these terms and disclaimer and all copyright notices +% are retained or reproduced
in an accompanying document, permission +% is granted to deal in this work without restriction, including un‐ +% limited rights to use, publicly perform, distribute, sell, modify, +% merge, give away, or sublicence. +% +% This work is provided “AS IS” and WITHOUT WARRANTY of any kind, to +% the utmost extent permitted by applicable law, neither express nor +% implied; without malicious intent or gross negligence. In no event +% may a licensor, author or contributor be held liable for indirect, +% direct, other damage, loss, or other issues arising in any way out +% of dealing in the work, even if advised of the possibility of such +% damage or existence of a defect, except proven that it results out +% of said person’s immediate fault when using the work as intended. +%- +% Compatibility code with older utf8.def versions + +% added in v1.2a 2018/03/24 +\ifx\UTFviii@invalid@help\@undefined% + \def\UTFviii@invalid@help{% + Do ensure the source document is saved in UTF-8 encoding% + }% +\fi% diff --git a/unicodedomino_kernel_cosmetics.def b/unicodedomino_kernel_cosmetics.def new file mode 100644 index 0000000000000000000000000000000000000000..a21165f73f5297315ef80c1b2891286d909f0ecb --- /dev/null +++ b/unicodedomino_kernel_cosmetics.def @@ -0,0 +1,73 @@ +% -*- mode: tex -*- +%- +% Copyright © 2017, 2018 +% mirabilos <m@mirbsd.org> +% +% Provided that these terms and disclaimer and all copyright notices +% are retained or reproduced in an accompanying document, permission +% is granted to deal in this work without restriction, including un‐ +% limited rights to use, publicly perform, distribute, sell, modify, +% merge, give away, or sublicence. +% +% This work is provided “AS IS” and WITHOUT WARRANTY of any kind, to +% the utmost extent permitted by applicable law, neither express nor +% implied; without malicious intent or gross negligence.
In no event +% may a licensor, author or contributor be held liable for indirect, +% direct, other damage, loss, or other issues arising in any way out +% of dealing in the work, even if advised of the possibility of such +% damage or existence of a defect, except proven that it results out +% of said person’s immediate fault when using the work as intended. +% +% This file is dual-licenced under the LPPL version 1.3c or later. +%- +% Improve error and warning formatting. Fully submitted upstream. + +%: https://github.com/latex3/latex2e/pull/62 +\ifx\UTFviii@hexbyte\@undefined% + % format a number as two-digit hex + \def\UTFviii@hexbyte#1{% + \ifnum#1<16 0\fi% + \UTFviii@hexnumber{#1}% + }% + + % override message to give the byte in hex + \def\UTFviii@invalid@err#1{% + \PackageError{inputenc}{Invalid UTF-8 byte "\UTFviii@hexbyte{`#1}}% + \UTFviii@invalid@help% + }% +\fi% + +%: https://github.com/latex3/latex2e/pull/63 +\ifx\UTFviii@hexcodepoint\@undefined% + % format a number as Unicode codepoint hex + \def\UTFviii@hexcodepoint#1{% + \ifnum#1<16 U+000% + \else\ifnum#1<256 U+00% + \else\ifnum#1<4096 U+0% + \else\ifnum#1<65536 U+% + \else\ifnum#1<1048576 U-000% + \else U-00% + \fi\fi\fi\fi\fi% + \UTFviii@hexnumber{#1}% + }% + + % override message to format the codepoint correctly + \gdef\UTFviii@splitcsname#1:#2\relax{% + #2 (\expandafter\UTFviii@hexcodepoint\expandafter{% + \the\numexpr\decode@UTFviii#2\relax})% + }% +\fi% + +%: https://github.com/latex3/latex2e/pull/62 +% split an invalid byte sequence for output +\ifx\UTFviii@splitseq\@undefined% + \gdef\UTFviii@splitseq#1:#2\relax{% + \UTFviii@hexseq#2\relax% + }% + \gdef\UTFviii@hexseq#1#2\relax{% + % display first octet + \space "\UTFviii@hexbyte{`#1}% + % recursively handle remaining octets + \ifx\relax#2\relax\else\UTFviii@hexseq#2\relax\fi% + }% +\fi% diff --git a/unicodedomino_kernel_fixup_f4_and_checkseq.def b/unicodedomino_kernel_fixup_f4_and_checkseq.def new file mode 100644 index
0000000000000000000000000000000000000000..334a75c7804ee6dcefa443b0dde11cbe2b90a3ac --- /dev/null +++ b/unicodedomino_kernel_fixup_f4_and_checkseq.def @@ -0,0 +1,139 @@ +% -*- mode: tex -*- +%- +% Copyright © 2018 +% mirabilos <m@mirbsd.org> +% +% Provided that these terms and disclaimer and all copyright notices +% are retained or reproduced in an accompanying document, permission +% is granted to deal in this work without restriction, including un‐ +% limited rights to use, publicly perform, distribute, sell, modify, +% merge, give away, or sublicence. +% +% This work is provided “AS IS” and WITHOUT WARRANTY of any kind, to +% the utmost extent permitted by applicable law, neither express nor +% implied; without malicious intent or gross negligence. In no event +% may a licensor, author or contributor be held liable for indirect, +% direct, other damage, loss, or other issues arising in any way out +% of dealing in the work, even if advised of the possibility of such +% damage or existence of a defect, except proven that it results out +% of said person’s immediate fault when using the work as intended. +% +% This file is dual-licenced under the LPPL version 1.3c or later. +%- +% Fix check for illegal sequences to fail overlong encoded sequences +% as well as codepoints outside of the Unicode range [0;10FFFF]. Add +% "F4 to the list of permitted lead octets.
+ +%: not forwarded yet, will only do so once PR#60 (see below) is in +% new check for illegal sequences: #1 = csname prefix before the colon, #2 = lead octet, #3 = remaining octets; expands to nothing if the sequence is valid and to 1 otherwise +\def\UTFviii@checkseq#1:#2#3\empty{% + \ifnum`#2<"80 % + \ifx\empty#3\empty% + \else% + 1% + \fi% + \else% + \ifnum`#2<"C2 % + 1% + \else% + \ifnum`#2<"E0 % + % one 80-BF + \UTFviii@check@one#3\empty% + \else% + \ifnum`#2<"E1 % + % A0-BF + one 80-BF + \UTFviii@check@two"A0.#3\empty% + \else% + \ifnum`#2<"F0 % + % two 80-BF + \UTFviii@check@two"80.#3\empty% + \else% + \ifnum`#2<"F1 % + % 90-BF + two 80-BF + \UTFviii@check@three"90."BF.#3\empty% + \else% + \ifnum`#2<"F4 % + % three 80-BF + \UTFviii@check@three"80."BF.#3\empty% + \else% + \ifnum`#2<"F5 % + % 80-8F + two 80-BF + \UTFviii@check@three"80."8F.#3\empty% + \else% + 1% + \fi% + \fi% + \fi% + \fi% + \fi% + \fi% + \fi% + \fi% +}% +\def\UTFviii@check@one#1#2\empty{% valid (expands to nothing) only if exactly one octet remains and it lies in 80-BF + \ifx\empty#2\empty% + \ifnum`#1<"80 % + 1% + \else% + \ifnum`#1>"BF % + 1% + \fi% + \fi% + \else% + 1% + \fi% +}% +\def\UTFviii@check@two#1.#2#3\empty{% #1 = lower bound for octet #2 (upper bound is BF); the rest #3 goes to check@one + \ifx\empty#3\empty% + 1% + \else% + \ifnum`#2<#1 % + 1% + \else% + \ifnum`#2>"BF % + 1% + \else% + \UTFviii@check@one#3\empty% + \fi% + \fi% + \fi% +}% +\def\UTFviii@check@three#1.#2.#3#4\empty{% #1 and #2 = lower and upper bound for octet #3; the rest #4 goes to check@two with lower bound 80 + \ifx\empty#4\empty% + 1% + \else% + \ifnum`#3<#1 % + 1% + \else% + \ifnum`#3>#2 % + 1% + \else% + \UTFviii@check@two"80.#4\empty% + \fi% + \fi% + \fi% +}% + +%: https://github.com/latex3/latex2e/pull/60 +% bugfix: disallow too large definitions (raises an error when \count@ exceeds "10FFFF, then runs the saved original) +\let\unicodedomino@parse@XML@charref\parse@XML@charref% +\gdef\parse@XML@charref{% + \ifnum\count@>"10FFFF\relax% + \PackageError{inputenc}{% + Cannot define Unicode char value\space% + \UTFviii@hexnumber\count@\space% + (too large)% + }\@eha% \PackageError takes a third (help text) argument; without it the following \fi was swallowed as the help text + \fi% + \unicodedomino@parse@XML@charref% +}% + +%: https://github.com/latex3/latex2e/pull/60 +% bugfix: add "F4 to the list of permitted lead octets +% (needs the above fix) +\begingroup% +\catcode`\~13 +\uccode`\~"F4 +\def\UTFviii@tmp{\xdef~{\noexpand\UTFviii@four@octets\string~}}%
+\uppercase\expandafter{\UTFviii@tmp}% +\endgroup%