From dff7cc430763df9a0682bc83e4bb7982dd141eec Mon Sep 17 00:00:00 2001 From: mirabilos <m@mirbsd.org> Date: Tue, 12 Nov 2019 19:10:20 +0100 Subject: [PATCH] split unicodedomino_kernel_fixup_f4_and_checkseq.def: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit • rename merged part (10FFFF upper bound in \parse@XML@charref and fix "F4 lead octet) ⇒ unicodedomino_kernel_fixup_f4.def • move not merged part (better \UTFviii@checkseq) together with the remaining PR to unicodedomino_kernel_better_decode.def • reorder inclusion; for now, include fixup_f4 unconditionally --- unicodedomino.sty | 4 +- unicodedomino_kernel_better_decode.def | 94 ++++++++++++++++++++++++- unicodedomino_kernel_fixup_f4.def | 95 +------------------------- 3 files changed, 96 insertions(+), 97 deletions(-) diff --git a/unicodedomino.sty b/unicodedomino.sty index 82aebe1..ad98073 100644 --- a/unicodedomino.sty +++ b/unicodedomino.sty @@ -41,12 +41,10 @@ \input{unicodedomino_compat.def}% % pull code from other files -\ifx\UTFviii@check@three\@undefined% - \input{unicodedomino_kernel_fixup_f4_and_checkseq.def}% -\fi% \ifx\UTFviii@decode\@undefined% \input{unicodedomino_kernel_better_decode.def}% \fi% +\input{unicodedomino_kernel_fixup_f4.def}% \input{unicodedomino_kernel_cosmetics.def}% % retrieve the last octet diff --git a/unicodedomino_kernel_better_decode.def b/unicodedomino_kernel_better_decode.def index 9412650..3287ab4 100644 --- a/unicodedomino_kernel_better_decode.def +++ b/unicodedomino_kernel_better_decode.def @@ -3,7 +3,99 @@ % See unicodedomino.sty for copyright and licence terms. Furthermore % this file is dual-licenced under the LPPL version 1.3c or later. %- -% Improved Unicode decoding using the fixed-up checkseq code. +% Fix check for illegal sequences to fail overlong encoded sequences +% as well as codepoints outside of the Unicode range [0;10FFFF]. Use +% fixed-up check code to improve Unicode decoding. + +%: https://github.com/latex3/latex2e/pull/83 +% new check for illegal sequences +\gdef\UTFviii@checkseq#1:#2#3\empty{% + \ifnum`#2<"80 % + \ifx\empty#3\empty% + \else% + 1% + \fi% + \else% + \ifnum`#2<"C2 % + 1% + \else% + \ifnum`#2<"E0 % + % one 80-BF + \UTFviii@check@one#3\empty% + \else% + \ifnum`#2<"E1 % + % A0-BF + one 80-BF + \UTFviii@check@two"A0.#3\empty% + \else% + \ifnum`#2<"F0 % + % two 80-BF + \UTFviii@check@two"80.#3\empty% + \else% + \ifnum`#2<"F1 % + % 90-BF + two 80-BF + \UTFviii@check@three"90."BF.#3\empty% + \else% + \ifnum`#2<"F4 % + % three 80-BF + \UTFviii@check@three"80."BF.#3\empty% + \else% + \ifnum`#2<"F5 % + % 80-8F + two 80-BF + \UTFviii@check@three"80."8F.#3\empty% + \else% + 1% + \fi% + \fi% + \fi% + \fi% + \fi% + \fi% + \fi% + \fi% +}% +\gdef\UTFviii@check@one#1#2\empty{% + \ifx\empty#2\empty% + \ifnum`#1<"80 % + 1% + \else% + \ifnum`#1>"BF % + 1% + \fi% + \fi% + \else% + 1% + \fi% +}% +\gdef\UTFviii@check@two#1.#2#3\empty{% + \ifx\empty#3\empty% + 1% + \else% + \ifnum`#2<#1 % + 1% + \else% + \ifnum`#2>"BF % + 1% + \else% + \UTFviii@check@one#3\empty% + \fi% + \fi% + \fi% +}% +\gdef\UTFviii@check@three#1.#2.#3#4\empty{% + \ifx\empty#4\empty% + 1% + \else% + \ifnum`#3<#1 % + 1% + \else% + \ifnum`#3>#2 % + 1% + \else% + \UTFviii@check@two"80.#4\empty% + \fi% + \fi% + \fi% +}% %: https://github.com/latex3/latex2e/pull/83 % override stock function, calling safer decode below diff --git a/unicodedomino_kernel_fixup_f4.def b/unicodedomino_kernel_fixup_f4.def index f535b4b..bc028c1 100644 --- a/unicodedomino_kernel_fixup_f4.def +++ b/unicodedomino_kernel_fixup_f4.def @@ -3,99 +3,8 @@ % See unicodedomino.sty for copyright and licence terms. Furthermore % this file is dual-licenced under the LPPL version 1.3c or later. %- -% Fix check for illegal sequences to fail overlong encoded sequences -% as well as codepoints outside of the Unicode range [0;10FFFF]. Add -% "F4 to the list of permitted lead octets. - -%: https://github.com/latex3/latex2e/pull/83 -% new check for illegal sequences -\gdef\UTFviii@checkseq#1:#2#3\empty{% - \ifnum`#2<"80 % - \ifx\empty#3\empty% - \else% - 1% - \fi% - \else% - \ifnum`#2<"C2 % - 1% - \else% - \ifnum`#2<"E0 % - % one 80-BF - \UTFviii@check@one#3\empty% - \else% - \ifnum`#2<"E1 % - % A0-BF + one 80-BF - \UTFviii@check@two"A0.#3\empty% - \else% - \ifnum`#2<"F0 % - % two 80-BF - \UTFviii@check@two"80.#3\empty% - \else% - \ifnum`#2<"F1 % - % 90-BF + two 80-BF - \UTFviii@check@three"90."BF.#3\empty% - \else% - \ifnum`#2<"F4 % - % three 80-BF - \UTFviii@check@three"80."BF.#3\empty% - \else% - \ifnum`#2<"F5 % - % 80-8F + two 80-BF - \UTFviii@check@three"80."8F.#3\empty% - \else% - 1% - \fi% - \fi% - \fi% - \fi% - \fi% - \fi% - \fi% - \fi% -}% -\gdef\UTFviii@check@one#1#2\empty{% - \ifx\empty#2\empty% - \ifnum`#1<"80 % - 1% - \else% - \ifnum`#1>"BF % - 1% - \fi% - \fi% - \else% - 1% - \fi% -}% -\gdef\UTFviii@check@two#1.#2#3\empty{% - \ifx\empty#3\empty% - 1% - \else% - \ifnum`#2<#1 % - 1% - \else% - \ifnum`#2>"BF % - 1% - \else% - \UTFviii@check@one#3\empty% - \fi% - \fi% - \fi% -}% -\gdef\UTFviii@check@three#1.#2.#3#4\empty{% - \ifx\empty#4\empty% - 1% - \else% - \ifnum`#3<#1 % - 1% - \else% - \ifnum`#3>#2 % - 1% - \else% - \UTFviii@check@two"80.#4\empty% - \fi% - \fi% - \fi% -}% +% Disallow codepoints outside of the Unicode range [0;10FFFF]; allow +% "F4 as lead octet. %: fixed upstream % bugfix: disallow too large definitions -- GitLab