% \iffalse meta-comment % % Copyright (C) 2026 Alan J. Cain % % This file may be distributed and/or modified under the conditions of the LaTeX Project Public License, either version % 1.3c of this license or (at your option) any later version. The latest version of this license is in: % % http://www.latex-project.org/lppl.txt % % and version 1.3c or later is part of all distributions of LaTeX version 2008-05-04 or later. % % \fi % % \iffalse %<*driver> \PassOptionsToPackage{inline}{enumitem} \documentclass{l3doc} \usepackage{polyglossia} \setmainlanguage[variant=british]{english} \makeatletter \ExplSyntaxOn \cs_gset:Npn \l@subsection { \@dottedtocline{2}{2.5em}{2.8em} } % #2 = 1.5em \cs_gset:Npn \l@subsubsection { \@dottedtocline{3}{5.3em}{3.5em} } % #2 = 1.5em \cs_gset:Npn \l@paragraph { \@dottedtocline{4}{8.8em}{3.2em} } % #2 = 1.5em \ExplSyntaxOff \makeatother \usepackage{xcolor} \definecolor{linkcolor}{rgb}{0.0,0.4,0.7} \colorlet{citecolor}{linkcolor} \colorlet{urlcolor}{linkcolor} \hypersetup{ linkcolor=linkcolor,% citecolor=citecolor,% urlcolor=urlcolor,% } \usepackage{xurl} \renewcommand*\UrlBigBreaks{} \newcommand*\fullref[2]{% \hyperref[#2]{#1\penalty 200\ \ref*{#2}}% } \newcommand*\fullpageref[1]{% \hyperref[#1]{page\penalty 200\ \pageref*{#1}}% } \setcounter{tocdepth}{7} \numberwithin{figure}{section} \usepackage{lua-list-hyphen} \usepackage{lipsum} \usepackage{tikz} \newcommand*\key[1]{\texttt{#1}} \newcommand*\val[1]{\texttt{#1}} \newcommand*\keyvalue[2]{\texttt{#1=#2}} \newlist{vallist}{description}{1} \setlist[vallist]{ leftmargin=3em, style=unboxed, labelsep=1em, font=\descriptionitemcolon, nosep, } \newcommand*{\descriptionitemcolon}[1]{\kern 1em #1:} \NewDocumentCommand{\default}{ m }{(\textit{Default:}\nobreakspace #1)} \newcommand*\luafunc[1]{\texttt{#1}} \newcommand*\luavar[1]{\texttt{#1}} \newcommand*\prefixedurl[1]{\textsc{url}:~\url{#1}} \begin{document} \DocInput{lua-list-hyphen.dtx} \PrintIndex \end{document} % % \fi % % % % 
\GetFileInfo{lua-list-hyphen.sty} % % % % \title{^^A % \pkg{lua-list-hyphen} ^^A % --- Per-language listing of hyphenated words for Lua\LaTeX^^A % \footnote{This document describes \fileversion, last revised \filedate.}^^A % } % % \author{^^A % Alan J. Cain\footnote{\texttt{a.j.cain (AT) gmail.com}}^^A % } % % \date{Released \filedate} % % \maketitle % % % % \begin{abstract} % This Lua\LaTeX\ package writes each word that has been hyphenated across lines to a file, using a different file for % each language, for subsequent external checking. % \end{abstract} % % % % \tableofcontents % % % % \begin{documentation} % % % % \section{Introduction} % % \TeX's algorithm for finding points where a word can be hyphenated is good, but not perfect.\footnote{For a % description of the algorithm and its limitations, see Knuth's account in Appendix~H of \textit{The \TeX book} % (Addison-Wesley, 2021. ISBN:~\texttt{978-0-201-13447-6}).} The present author writes in British English, where the % valid division points can depend on both the pronunciation of a word and its internal structure (and hence its % etymology). Currently, \TeX's pattern-based approach produces \textit{bio-lo-gic}, \textit{bio-logy}, % \textit{bio-lo-gist}, rather than the standard \textit{bio-logic}, \textit{biol-ogy}, % \textit{biolo-gist}.\footnote{See the \textit{New Oxford Spelling Dictionary}, which is the authority for word % divisions in British English (Oxford University Press, 2005. ISBN:~\texttt{978-0-19-860881-3}).} To deal with such % cases, at least a substantially larger number of patterns would be required than are available at present. 
There are % also various words where the valid division points in British English cannot be deduced from their spelling alone: for % instance, the verbs \textit{at-trib-ute}, \textit{pre-sent}, \textit{pro-duce}, \textit{re-cord} have different % division points from the orthographically identical nouns \textit{at-tri-bute}, \textit{pres-ent}, \textit{prod-uce}, % \textit{rec-ord}. For another example, compare \textit{cur-ric-ulum vitae} and \textit{school cur-ricu-lum}. % % Easy checking of the chosen hyphenations is desirable. With Lua\TeX, it is possible to extract the hyphenated words. % The Lua\LaTeX\ package \pkg{lua-check-hyphen} offers this facility. It checks hyphenated words against a whitelist, % visually flags unknown hyphenations, and writes unknown hyphenations to a file. But it was first written in 2012, when % Lua\TeX\ was at an earlier stage of development, and so it has certain problems, such as with words containing % ligatures. It also lacks multi-language support. % % This Lua\LaTeX\ package, \pkg{lua-list-hyphen}, uses some ideas from \pkg{lua-check-hyphen} but was written from % scratch to work with a modern Lua\TeX. It simply writes hyphenated words from each language to a separate file, so % that they can be checked (manually or by an external program). % % [The author has written a simple Python application \texttt{hyphenassist}\footnote{\textsc{url}: % \url{https://codeberg.org/ajcain/hyphenassist}.} that checks the listed hyphenations against a dictionary of valid % divisions and allows the user to quickly choose to add entries to the division dictionary, add hyphenation exceptions, % or ignore particular hyphenations. He has used this program in conjunction with code incorporated into this package to % check hyphenations in his own books.\footnote{In particular, \textit{Form \& Number: A History of Mathematical % Beauty}. 
\textsc{url}: \url{https://archive.org/details/cain_formandnumber_ebook_large}.}] % % % % \paragraph*{Licence.} \noindent\pkg{lua-list-hyphen} is released under the \LaTeX\ Project Public Licence v1.3c or % later.\footnote{\textsc{url}: \url{https://www.latex-project.org/lppl.txt}} % % % % \paragraph*{Acknowledgements.} The author thanks Keno Wehr for corrections and comments on the documentation. % % % % \paragraph*{Feature requests and bug reports} % % The development code and issue tracker are hosted at Codeberg.\footnote{\textsc{url}: % \url{https://codeberg.org/ajcain/lua-list-hyphen}} % % % % \section{Requirements} % % \pkg{lua-list-hyphen} requires % \begin{enumerate}[label={(\arabic*)}] % \item Lua\LaTeX, % \item a recent \LaTeX\ kernel with \pkg{expl3} support (any kernel version since 2020-02-02 should suffice). % \end{enumerate} % It does not depend on any other packages, but will interface with \pkg{babel} or \pkg{polyglossia} (if one of them is % loaded) to determine language names. % % % % \section{Installation} % % To install \pkg{lua-list-hyphen} manually, run \texttt{luatex lua-list-hyphen.ins} and copy % \texttt{lua-list-hyphen.sty} and \texttt{lua-list-hyphen.lua} to somewhere Lua\LaTeX\ can find them. % % % % \section{Getting started} % % Simply load the package; the hyphenated words are by default written to the file % \cs{jobname}\file{-}\meta{lang-id}\file{.hyph}, without being sorted or having duplicates removed. The \meta{lang-id} % is either a Lua\TeX\ numerical language~ID, or a \pkg{babel} or \pkg{polyglossia} name of the language, if one of % these packages is in use. The prefix \cs{jobname}\file{-} and the extension \file{.hyph} can be customized; see % \fullref{Section}{sec:options}. % % % % \section{Package options} % \label{sec:options} % % \DescribeOption{verbose} % The boolean option \key{verbose} controls how much information is written to the file about each hyphenated word. 
% When \val{true}, for each hyphenated word, both the undivided original and the divided word are written out (on the % same line). When \val{false}, only the hyphenated word is written. \default{\val{false}} % % \DescribeOption{unique} % The option \key{unique} controls removal of duplicates from the list of hyphenated words written out. It can be % set to one of the following three values: % \begin{vallist} % \item[\val{none}] Duplicate hyphenations are not removed. % \item[\val{case}] Hyphenations that are duplicate (case-sensitively) are removed. In this case, the hyphenations % \texttt{geo-metry} and \texttt{Geo-metry} are considered to be distinct. % \item[\val{nocase}] Hyphenations that are duplicate (case-insensitively) are removed. In this case, the hyphenations % \texttt{geo-metry} and \texttt{Geo-metry} are considered to be duplicates. The case of each listed hyphenation % will be that of the first appearance of that hyphenation. % \end{vallist} % \default{\val{none}} % % \DescribeOption{sort} % The option \key{sort} controls sorting of the list of hyphenated words. It can be % set to one of the following three values: % \begin{vallist} % \item[\val{none}] Hyphenations appear in the same order as they occur in the document, or, if duplicates are removed, % in the order of first appearance in the document. % \item[\val{case}] Hyphenations are sorted case-sensitively. In this case, \texttt{Geo-metry} precedes % \texttt{geo-meter}. % \item[\val{nocase}] Hyphenations are sorted case-insensitively. In this case, \texttt{geo-meter} precedes % \texttt{Geo-metry}. % \end{vallist} % \default{\val{none}} % % \medskip % The two options \key{prefix} and \key{extension} specify the files to which hyphenations are written. Between the % prefix and the extension is either a Lua\TeX\ numerical language~ID, or a \pkg{babel} or \pkg{polyglossia} % name of the language, if one of these packages is in use. 
% % \DescribeOption{prefix} % The \key{prefix} is the part of the file name to which the list of hyphenated words is written, before the % language~ID. % \default{\cs{jobname}\file{-} (note the hyphen).} % % \DescribeOption{extension} % The extension of the file (including the \file{.}) to which the list of hyphenated words for each language is written. % \default{\file{.hyph}} % % \medskip % \DescribeOption{debug} % The boolean option \key{debug} controls whether debugging information is written to the terminal. % \default{\val{false}} % % % % \section{Usage notes} % % \subsection{Languages} % % To determine the language of a word, \pkg{lua-list-hyphen} looks at what language is applied at the first possible % hyphenation point, first considering the part of the word before it, then the part after it. In the (presumably rare) % case of a ‘mixed-language’ word like ‘near-Zugzwang’ being specified (using, for example, \pkg{babel}) with % \texttt{near-\cs{foreignlanguage}\{german\}\{Zugzwang\}}, it would be assigned to the language in which \hbox{‘near-’} % is set. % % Duplicates are removed within each language. If the same hyphenation occurs in two different languages, it will appear % in both files, regardless of the value of \key{unique}. % % % % \subsection{Limitations} % % \pkg{lua-list-hyphen} uses Lua\TeX's built-in functions for pattern matching and converting between upper and lower % case, which are based on the \texttt{slnunicode} library. This library has not been updated for some time and is based % on an out-of-date version of the Unicode standard. Thus there may be problems with languages added to Unicode more % recently. Hyphenated words from such languages should still be listed, but may contain extraneous characters and may % not be sorted correctly. Users may prefer to leave sorting and removal of duplicates to an external program that % adheres to the current Unicode standard. 
%
%
%
% \end{documentation}
%
%
%
% \clearpage
% \begin{implementation}
%
%
%
% \section{Implementation (\LaTeX\ package)}
%
%    \begin{macrocode}
%<*package>
%<@@=lualisthyphen>
%    \end{macrocode}
%
%
%
% \subsection{Initial set-up}
%
% Package identification/version information.
%    \begin{macrocode}
\NeedsTeXFormat{LaTeX2e}[2020-02-02]
\ProvidesExplPackage{lua-list-hyphen}{2026-04-28}{0.2.50}
  {Listing hyphenated words for LuaLaTeX}
%    \end{macrocode}
% Check that Lua\TeX\ is in use. If it is not, a critical message is issued, which stops the package from being loaded
% any further.
%    \begin{macrocode}
\sys_if_engine_luatex:F {
  \msg_new:nnn{ lua-list-hyphen }{ lualatex_required } {
    LuaLaTeX~required.~Package~loading~will~abort.
  }
  \msg_critical:nn{ lua-list-hyphen }{ lualatex_required }
}
%    \end{macrocode}
%
%
%
% \subsection{Options}
%
% \begin{macro}{\l_@@_verbose_bool}
% Boolean option to indicate whether lists of hyphenations should be written verbosely.
%    \begin{macrocode}
\keys_define:nn { lua-list-hyphen }{
  verbose .bool_set:N = \l_@@_verbose_bool,
}
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}{\l_@@_unique_int}
% Choice option to indicate whether lists of hyphenations should have duplicates removed, case-sensitively or
% case-insensitively. Since \cs{l_keys_choice_int} numbers the choices from~1, subtracting~1 stores the choices
% \val{none}, \val{case}, \val{nocase} as the integers 0, 1, 2 respectively.
%    \begin{macrocode}
\int_new:N\l_@@_unique_int
\keys_define:nn { lua-list-hyphen }{
  unique .choices:nn = { none, case, nocase }{
    \int_set:Nn\l_@@_unique_int{ \l_keys_choice_int - 1 }
  },
}
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}{\l_@@_sort_int}
% Choice option to indicate whether lists of hyphenations should be sorted, case-sensitively or case-insensitively.
% The choices \val{none}, \val{case}, \val{nocase} are stored as the integers 0, 1, 2 respectively, in the same way as
% for \key{unique}.
%    \begin{macrocode}
\int_new:N\l_@@_sort_int
\keys_define:nn { lua-list-hyphen }{
  sort .choices:nn = { none, case, nocase }{
    \int_set:Nn\l_@@_sort_int{ \l_keys_choice_int - 1 }
  },
}
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}{\l_@@_file_prefix_str}
% String option for the prefix of the names of the output files.
%    \begin{macrocode}
\keys_define:nn { lua-list-hyphen }{
  prefix .str_set:N = \l_@@_file_prefix_str,
  prefix .initial:e = { \c_sys_jobname_str- },
}
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}{\l_@@_file_extension_str}
% String option for the extension of the names of the output files.
%    \begin{macrocode}
\keys_define:nn { lua-list-hyphen }{
  extension .str_set:N = \l_@@_file_extension_str,
  extension .initial:n = { .hyph },
}
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}{
%   \l_@@_debug_int
% }
% Option to specify whether debug information is written to the terminal. Not intended for end users. The state is
% held in an integer (0~or~1) so that it can be read from Lua. The explicit value \val{false} switches debugging off;
% giving the key with no value, or with any other value, switches it on. (Without the check for \val{false}, the
% setting \keyvalue{debug}{false} would enable debugging, contrary to the documented behaviour of the option.)
%    \begin{macrocode}
\int_new:N\l_@@_debug_int
\keys_define:nn { lua-list-hyphen }{
  debug .code:n = {
    \str_if_eq:nnTF { #1 } { false }
      { \int_zero:N \l_@@_debug_int }
      { \int_set_eq:NN \l_@@_debug_int \c_one_int }
  }
}
%    \end{macrocode}
% \end{macro}
%
%
%
% Process package options.
%    \begin{macrocode}
\ProcessKeyOptions [ lua-list-hyphen ]
%    \end{macrocode}
%
%
%
% Convert boolean options to integers (which can be accessed from Lua).
%    \begin{macrocode}
\int_new:N\l_@@_verbose_int
\bool_if:NT\l_@@_verbose_bool {
  \int_set_eq:NN\l_@@_verbose_int\c_one_int
}
%    \end{macrocode}
%
%
%
% \subsection{Lua backend and interface}
%
% Load the Lua backend.
%    \begin{macrocode}
\lua_now:n{ lualisthyphen = require('lua-list-hyphen') }
%    \end{macrocode}
%
%
%
% \subsection{Saving language names}
%
% At \texttt{enddocument/afterlastpage}, if possible save \pkg{babel}'s language names. (\pkg{polyglossia}'s names can
% be found directly from Lua.)
%    \begin{macrocode}
\hook_gput_code:nnn{ enddocument/afterlastpage }{ lua-list-hyphen } {
  \@@_babel_save_language_names:
}
%    \end{macrocode}
%
% \begin{macro}{\@@_babel_save_language_names:}
% If \pkg{babel} is in use, get language names from \cs{bbl@languages}. This function opens a group and performs an
% assignment, so it is not expandable and is therefore defined as protected.
%    \begin{macrocode}
\cs_new_protected:Npn \@@_babel_save_language_names: {
  \cs_if_exist:NT\bbl@languages {
%    \end{macrocode}
% Iterate through \cs{bbl@languages} to get language names. Items stored in this macro are quadruples prefixed with
% \cs{bbl@elt}, so locally redefine this latter macro to an auxiliary function that passes language ID/name pairs to
% the Lua backend.
%    \begin{macrocode}
    \group_begin:
      \cs_set_eq:NN \bbl@elt \@@_babel_save_language_names_elt:nnnn
      \bbl@languages
    \group_end:
  }
}
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{\@@_babel_save_language_names_elt:nnnn}
% Auxiliary function that takes a quadruple stored in \cs{bbl@languages} and passes language ID/name pairs to the Lua
% backend. The first item of the quadruple is passed as the name and the second as the ID; the third and fourth items
% are ignored.
%    \begin{macrocode}
\cs_new:Npn \@@_babel_save_language_names_elt:nnnn #1#2#3#4 {
  \lua_now:n{ lualisthyphen.babel_save_language_name(#2,'#1') }
}
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Processing and writing hyphenation lists}
%
% At \texttt{enddocument/info}, write the list of hyphenations for each language.
%    \begin{macrocode}
\hook_gput_code:nnn{ enddocument/info }{ lua-list-hyphen } {
  \@@_process_write_hyphenation_lists:ee
    {\str_use:N\l_@@_file_prefix_str}
    {\str_use:N\l_@@_file_extension_str}
}
%    \end{macrocode}
%
%
%
% \begin{macro}{\@@_process_write_hyphenation_lists:nn}
% Write hyphenation lists to files with prefix given in the first parameter and extension in the second. Both
% parameters are escaped with \cs{luaescapestring} before being placed inside Lua string literals.
%    \begin{macrocode}
\cs_new:Npn \@@_process_write_hyphenation_lists:nn #1#2 {
  \lua_now:e{
    lualisthyphen.process_write_hyphenation_lists(
      '\luaescapestring{#1}',
      '\luaescapestring{#2}'
    )
  }
}
\cs_generate_variant:Nn \@@_process_write_hyphenation_lists:nn { ee }
%    \end{macrocode}
% \end{macro}
%
%
%
%    \begin{macrocode}
%</package>
%    \end{macrocode}
%
%
%
% \section{Implementation (Lua backend)}
%
%    \begin{macrocode}
%<*lua>
%    \end{macrocode}
%
%
%
% \subsection{Debugging function}
%
% \begin{macro}[int]{debug}
% Debugging function. Initially defined to do nothing, then overridden to become a function that actually writes
% debugging information if the package option was set.
%    \begin{macrocode}
local debug
if tex.count['l__lualisthyphen_debug_int'] ~= 0 then
  debug = function(s) print('lua-list-hyphen DEBUG: ' .. s) end
else
  debug = function(s) end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Table key constants}
%
% Keys for tables containing hyphenatable/hyphenated word data.
%    \begin{macrocode}
local KEY_WORD = 'word'
local KEY_LANG = 'lang'
local KEY_DIVISION = 'division'
local KEY_INDEX = 'index'
%    \end{macrocode}
%
%
%
% \subsection{Node ID and subtype constants}
%
% Define constants for the node IDs that need to be recognized.
%    \begin{macrocode}
local NODE_ID_HLIST = node.id('hlist')
local NODE_ID_DISC = node.id('disc')
local NODE_ID_GLUE = node.id('glue')
local NODE_ID_KERN = node.id('kern')
local NODE_ID_MARGIN_KERN = node.id('margin_kern')
local NODE_ID_GLYPH = node.id('glyph')
%    \end{macrocode}
% Define a constant for the kern node subtype that needs to be recognized. There seems to be no automatic way to get
% the numerical value for the subtype other than searching the \luavar{node.subtypes('kern')} table.
%    \begin{macrocode}
local NODE_KERN_SUBTYPE_FONTKERN
for subtype_id,subtype_name in pairs(node.subtypes('kern')) do
  if subtype_name == 'fontkern' then
    NODE_KERN_SUBTYPE_FONTKERN = subtype_id
    break
  end
end
%    \end{macrocode}
%
%
%
% \subsection{Utility functions}
%
% \begin{macro}[int]{list_filter}
% Filter the list \luavar{t} in place, keeping exactly those elements for which the function \luavar{f} returns a
% true value. Kept elements are moved down to the next free destination index, and every vacated or rejected slot is
% set to \luavar{nil}.\footnote{Code adapted from \url{https://stackoverflow.com/a/53038524}.}
%    \begin{macrocode}
local function list_filter(t, f)
  local dest = 1
  for src=1,#t do
    local value = t[src]
    if f(value) then
      if src ~= dest then
        t[dest] = value
        t[src] = nil
      end
      dest = dest + 1
    else
      t[src] = nil
    end
  end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{list_uniq}
% Take a list \luavar{t} and remove from it adjacent elements for which the function \luavar{f} returns true.
% (The index \luavar{j} is always the position of the last ‘kept’ element.) Each element is compared with the last
% kept element; if \luavar{f} returns false for the pair, the element is kept by moving it down to the position
% after the last kept one. The list therefore never contains holes while it is being processed, which matters because
% the length of a Lua table with holes is not well defined. The left-over tail is cleared at the end.
%    \begin{macrocode}
local function list_uniq(t, f)
  local n = #t
  local j = 1
  for i=2,n do
    if not f(t[i],t[j]) then
      j = j + 1
      t[j] = t[i]
    end
  end
  for i=j+1,n do
    t[i] = nil
  end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Getting text from nodes}
%
% Getting the components of the ligatures that have Unicode code points can be problematic, at least for some fonts,
% so define a lookup table for these cases.
%    \begin{macrocode}
local LIGATURE_COMPONENTS = {
  [0xfb00] = {'f','f'},
  [0xfb01] = {'f','i'},
  [0xfb02] = {'f','l'},
  [0xfb03] = {'f','f','i'},
  [0xfb04] = {'f','f','l'},
}
%    \end{macrocode}
%
%
%
% \begin{macro}[int]{tounicode_to_utf8}
% Convert a font's \luavar{tounicode} value \luavar{u} to a UTF-8 string, or return \luavar{nil} if this is not
% possible. The value is expected to be a string of hexadecimal digits. If its length is a multiple of four, it is
% read as a sequence of four-digit UTF-16 code units (the form used in PDF ToUnicode maps), so that a glyph standing
% for several characters (such as an ‘ft’ ligature) or for a character outside the Basic Multilingual Plane (stored
% as a surrogate pair) is decoded correctly. Otherwise the whole string is read as one hexadecimal code point.
%    \begin{macrocode}
local function tounicode_to_utf8(u)
  if type(u) ~= 'string' or u == '' then
    return nil
  end
  if #u % 4 ~= 0 then
    local code = tonumber(u,16)
    if code then
      return utf8.char(code)
    end
    return nil
  end
  local chars = {}
  local high = nil
  for unit in u:gmatch('%x%x%x%x') do
    local code = tonumber(unit,16)
    if code >= 0xD800 and code <= 0xDBFF then
      -- High surrogate: remember it until its low partner arrives.
      high = code
    elseif code >= 0xDC00 and code <= 0xDFFF and high then
      chars[#chars+1] = utf8.char(
        0x10000 + (high - 0xD800) * 0x400 + (code - 0xDC00)
      )
      high = nil
    else
      chars[#chars+1] = utf8.char(code)
      high = nil
    end
  end
  if #chars == 0 then
    return nil
  end
  return table.concat(chars)
end
%    \end{macrocode}
% \end{macro}
%
%
%
% Extracting text from nodes uses two functions that call each other, so the names have to be defined ahead of time.
%    \begin{macrocode}
local get_node_text
local get_nodelist_text
%    \end{macrocode}
%
%
%
% \begin{macro}[int]{get_node_text}
% Return the text content of the node \luavar{n}. For a glyph node (which might be a normal glyph, a ligature, etc.),
% the text is taken, in order of preference, from the ligature lookup table, from the node's \luavar{components}, or
% from the \luavar{tounicode} information of the font. If the font gives no usable \luavar{tounicode} information,
% the character code of the glyph itself is used rather than raising an error. (This fallback assumes that the
% character code is a Unicode code point, which should be confirmed for fonts using private slots.) For a disc node,
% the text of its \luavar{replace} list is returned. Any other node gives the empty string.
%    \begin{macrocode}
get_node_text = function(n)
  if n.id == NODE_ID_GLYPH then
    local ligature = LIGATURE_COMPONENTS[n.char]
    if ligature ~= nil then
      return table.concat(ligature)
    elseif n.components then
      return get_nodelist_text(n.components)
    else
      -- See [https://tug.org/pipermail/luatex/2018-March/006786.html]
      -- Every step of the lookup is guarded, since the font loader's tables
      -- or the tounicode entry may be absent.
      local identifiers = fonts and fonts.hashes and fonts.hashes.identifiers
      local font_data = identifiers and identifiers[n.font]
      local characters = font_data and font_data.characters
      local char_data = characters and characters[n.char]
      local text = tounicode_to_utf8(char_data and char_data.tounicode)
      if text then
        return text
      end
      return utf8.char(n.char)
    end
  elseif n.id == NODE_ID_DISC then
    if n.replace then
      return get_nodelist_text(n.replace)
    else
      return ''
    end
  else
    return ''
  end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{get_nodelist_text}
% Return the text content of the glyph nodes in the list starting at \luavar{head} up to and including the node
% \luavar{last}, or up to the end of the list if \luavar{last} is not specified.
%    \begin{macrocode}
get_nodelist_text = function (head,last)
  local pieces = {}
  for current in node.traverse(head) do
    pieces[#pieces+1] = get_node_text(current)
    if current == last then
      break
    end
  end
  return table.concat(pieces)
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{is_possible_word_node}
% Return a boolean indicating whether the node \luavar{n} could be part of a word. The nodes accepted are
% \luavar{glyph}, \luavar{disc}, and \luavar{margin_kern} nodes, together with \luavar{kern} nodes whose subtype is
% \luavar{fontkern}.
%    \begin{macrocode}
local function is_possible_word_node(n)
  local id = n.id
  if id == NODE_ID_GLYPH or id == NODE_ID_DISC then
    return true
  end
  if id == NODE_ID_KERN and n.subtype == NODE_KERN_SUBTYPE_FONTKERN then
    return true
  end
  return id == NODE_ID_MARGIN_KERN
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{String manipulation}
%
% \begin{macro}[int]{trim_nonletters_both}
% Strip the non-letter characters from both ends of the string \luavar{s} and return what is left.
%    \begin{macrocode}
local function trim_nonletters_both(s)
  local trimmed = unicode.utf8.match(s,'^%A*(.-)%A*$')
  return trimmed
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{trim_nonletters_start}
% Strip the non-letter characters from the start of the string \luavar{s} and return what is left.
%    \begin{macrocode}
local function trim_nonletters_start(s)
  local trimmed = unicode.utf8.match(s,'^%A*(.-)$')
  return trimmed
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{trim_nonletters_end}
% Strip the non-letter characters from the end of the string \luavar{s} and return what is left.
%    \begin{macrocode}
local function trim_nonletters_end(s)
  local trimmed = unicode.utf8.match(s,'^(.-)%A*$')
  return trimmed
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Pre-linebreak processing}
%
% Before each line has been broken, find all potential division points and store the words in which they occur,
% linking each potential break point to the corresponding word.
%
% Declare a new attribute, which will be used to store in each disc node the index of the corresponding word in the
% table \luavar{hlist_hyphenatable_word_list}.
%    \begin{macrocode}
local hyphen_attr = luatexbase.new_attribute('hyphen_attr')
%    \end{macrocode}
%
%
%
% Table to hold hyphenatable words found in the hlist that will be broken. This table will be cleared after the
% post-linebreak processing.
%    \begin{macrocode}
local hlist_hyphenatable_word_list = {}
%    \end{macrocode}
%
%
%
% \begin{macro}[int]{get_first_glyph_lang}
% Return the \luavar{lang} field of the first glyph in the part of the list starting at \luavar{n} that could be part
% of a word, or \luavar{nil} if there is no such glyph. (Currently unused, and therefore commented out; see the
% documentation of \luafunc{get_disc_lang}.)
%    \begin{macrocode}
-- local function get_first_glyph_lang(n)
--   local item = n
--   while item and is_possible_word_node(item) do
--     if item.id == NODE_ID_GLYPH then
--       return item.lang
--     end
--     item = item.next
--   end
--   return nil
-- end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{get_disc_lang}
% Try to find the language ID in force at a given disc node by looking at (1)~the last glyph in the word
% before the disc node; (2)~the first glyph in the word after the disc node. Default to language ID \luavar{0}.
%
% (Looking at \luavar{replace}, \luavar{pre}, \luavar{post} is possible, but is unreliable and so disabled for the
% present. The author has encountered the situation where an explicit hyphen results in the hyphen characters in
% \luavar{replace} and \luavar{pre} having different language IDs. He has not had time to investigate how this
% arises from the interaction of \pkg{babel}/\pkg{polyglossia} and Lua\LaTeX.)
%    \begin{macrocode}
local function get_disc_lang(n)
  -- lang = get_first_glyph_lang(n.replace)
  -- if lang then
  --   print(lang)
  --   return lang
  -- end
  -- lang = get_first_glyph_lang(n.pre)
  -- if lang then
  --   print(lang)
  --   return lang
  -- end
  -- lang = get_first_glyph_lang(n.post)
  -- if lang then
  --   return lang
  -- end
  local item
  -- (1) Walk backwards from the disc node through the nodes that can be part
  -- of a word, returning the language of the first glyph met.
  item = n
  while item and is_possible_word_node(item) do
    if item.id == NODE_ID_GLYPH then
      return item.lang
    end
    item = item.prev
  end
  -- (2) No glyph before the disc node: walk forwards in the same way.
  item = n
  while item and is_possible_word_node(item) do
    if item.id == NODE_ID_GLYPH then
      return item.lang
    end
    item = item.next
  end
  return 0
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{pre_linebreak}
% For every word containing a disc node (a potential division point) in the hlist at \luavar{hlist_head}, add the
% word in which it appears, along with the language of that word, to \luavar{hlist_hyphenatable_word_list}, and store
% its index in that table in the \luavar{hyphen_attr} attribute (declared above) of each disc node in the word. The
% function always returns \luavar{true}; the parameter \luavar{groupcode} is not used.
%    \begin{macrocode}
local function pre_linebreak(hlist_head,groupcode)
%    \end{macrocode}
% When non-\luavar{nil}, \luavar{word_start_node} is the first node of the ‘current’ word. When non-\luavar{nil},
% \luavar{hyphenatable_index} is the index in \luavar{hlist_hyphenatable_word_list} where the word will be added.
% \luavar{hyphenatable_count} is the number of potential hyphenatable words found so far, which is used to set
% \luavar{hyphenatable_index} when the first disc node is found in a new word.
%    \begin{macrocode}
  local word_start_node = nil
  local hyphenatable_index = nil
  local hyphenatable_count = 0
  local lang = nil
  debug('Pre-linebreak processing start')
  for item in node.traverse(hlist_head) do
%    \end{macrocode}
% If \luavar{item} is a glyph node, check for a new word start.
%    \begin{macrocode}
    if item.id == NODE_ID_GLYPH and not word_start_node then
      word_start_node = item
    end
%    \end{macrocode}
% If \luavar{item} is a disc node, check whether it is the first one found in the ‘current’ word (indicated by
% \luavar{hyphenatable_index} being \luavar{nil}). If so, set \luavar{hyphenatable_index} and determine the language
% currently being used. Set the attribute of the disc node.
%    \begin{macrocode}
    if item.id == NODE_ID_DISC then
      if not hyphenatable_index then
        hyphenatable_count = hyphenatable_count + 1
        hyphenatable_index = hyphenatable_count
        lang = get_disc_lang(item)
      end
      node.set_attribute(item,hyphen_attr,hyphenatable_index)
    end
%    \end{macrocode}
% If \luavar{item} is not a node that can appear in a word, assume that the word end has been reached.
%    \begin{macrocode}
    if not is_possible_word_node(item) then
      if word_start_node and hyphenatable_index then
%    \end{macrocode}
% Extract the text of the hyphenatable word. In fact, the ‘word’ might be something other than a genuine word, such
% as an ISBN (with hyphen separators). So only store the word in \luavar{hlist_hyphenatable_word_list} if something
% non-empty is left after trimming non-letters from both sides.
%    \begin{macrocode}
        local hyphenatable_word = trim_nonletters_both(
          get_nodelist_text(word_start_node,item.prev)
        )
        if hyphenatable_word ~= '' then
          debug(
            ' Hyphenatable word (index ' .. hyphenatable_index .. ') "' .. hyphenatable_word .. '"'
          )
          hlist_hyphenatable_word_list[hyphenatable_index] = {
            [KEY_WORD] = hyphenatable_word,
            [KEY_LANG] = lang,
          }
        end
      end
%    \end{macrocode}
% Reset \luavar{word_start_node} and \luavar{hyphenatable_index}, ready for the next word.
%    \begin{macrocode}
      word_start_node = nil
      hyphenatable_index = nil
    end
  end
  debug('Pre-linebreak processing finish')
  return true
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Post-linebreak processing}
%
% After linebreaking, look for a discretionary node at the end of each line, which indicates that a word has been
% divided between the end of that line and the start of the next. Extract the two word-pieces from the lines and store
% them in the appropriate language table.
%
% \begin{macro}[int]{get_used_disc}
% If at the tail of the hlist at \luavar{hlist_head} (which will be a line) there is a disc node not followed by a
% glyph node, return that disc node. Otherwise return \luavar{nil}. The search runs backwards from the tail of the
% list and stops at the first glyph node met.
%    \begin{macrocode}
local function get_used_disc(hlist_head)
  local item = node.tail(hlist_head)
  while item and item.id ~= NODE_ID_GLYPH do
    if item.id == NODE_ID_DISC then
      return item
    end
    item = item.prev
  end
  return nil
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{get_disc_word_start}
% Return the node starting the word that includes a given disc node \luavar{n}, or \luavar{nil} if there is no such
% node. The word start is found by stepping backwards from \luavar{n} for as long as the previous node could be part
% of a word. The parameter \luavar{hlist_head} is not used.
%    \begin{macrocode}
local function get_disc_word_start(hlist_head,n)
  local item = n
  while item do
    local prev = item.prev
    if not (prev and is_possible_word_node(prev)) then
      return item
    end
    item = prev
  end
  return nil
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{get_next_hlist}
% Return the next hlist in the list containing the given node \luavar{n}, or \luavar{nil} if there is no such hlist
% node. The search starts at the node after \luavar{n}.
%    \begin{macrocode}
local function get_next_hlist(n)
  local item = n.next
  while item do
    if item.id == NODE_ID_HLIST then
      return item
    end
    item = item.next
  end
  return nil
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{get_line_first_word}
% Return the first word in the hlist at \luavar{hlist_head}, or \luavar{nil} if there is no such word.
%    \begin{macrocode}
local function get_line_first_word(hlist_head)
%    \end{macrocode}
% \luavar{word_start_node} is either \luavar{nil} or the (glyph) node that starts the word.
%    \begin{macrocode}
  local word_start_node = nil
  for item in node.traverse(hlist_head) do
    if item.id == NODE_ID_GLYPH then
      if not word_start_node then
        word_start_node = item
      end
    end
    if not is_possible_word_node(item) then
      if word_start_node then
        return get_nodelist_text(word_start_node,item.prev)
      end
    end
  end
%    \end{macrocode}
% It is possible that the word ends at the end of the hlist, so check if a word has been started.
%    \begin{macrocode}
  if word_start_node then
    return get_nodelist_text(word_start_node,node.tail(hlist_head))
  else
    return nil
  end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% Table for lists for hyphenated words in various languages. This table will be indexed by (numerical) language IDs.
% Each value will be a list, and each entry in the list will be a table containing the original word, the hyphenation,
% and the index of the table in the list (which is needed later for stable sorting and sorting into the original
% order).
%    \begin{macrocode}
local hyphenation_table = {}
%    \end{macrocode}
%
%
%
% \begin{macro}[int]{check_line_hyphenation}
% Check whether there is a hyphenated word at the end of the given hlist; if so, save the word to the list for its
% language in \luavar{hyphenation_table}.
%    \begin{macrocode}
local function check_line_hyphenation(hlist)
%    \end{macrocode}
% First, is there a disc node at the end of the list? (‘End’ modulo certain other node types; see the documentation
% of \luafunc{get_used_disc}.)
%    \begin{macrocode}
  local last_disc = get_used_disc(hlist.head)
  if not last_disc then
    debug(' No disc node found at end of line')
    return
  end
%    \end{macrocode}
% The \luavar{hyphen_attr} may or may not contain the index of a word in \luavar{hlist_hyphenatable_word_list}. (See
% the documentation for \luafunc{pre_linebreak}.)
%    \begin{macrocode}
  local hyphenation_index = node.has_attribute(last_disc,hyphen_attr)
  local t = hlist_hyphenatable_word_list[hyphenation_index]
  if not t then
    debug(' Disc node not associated to a stored word')
    return
  end
  local word = t[KEY_WORD]
  local lang = t[KEY_LANG]
%    \end{macrocode}
% There should always be a next line, since there is a disc node at the end of \luavar{hlist}, but check anyway.
%    \begin{macrocode}
  local next_line = get_next_hlist(hlist)
  if not next_line then
    debug(' No following line found (which should not happen)')
    return
  end
%    \end{macrocode}
% Get the hyphenation list for the language of the word, ensuring that the list has been created. (The variable is
% declared \luavar{local} so that it does not leak into the global environment.)
%    \begin{macrocode}
  local lang_hyphenation_list = hyphenation_table[lang]
  if not lang_hyphenation_list then
    lang_hyphenation_list = {}
    hyphenation_table[lang] = lang_hyphenation_list
  end
%    \end{macrocode}
% For the pre-linebreak part of the word, get the word that ends the line, and trim any leading non-letters. This
% could leave an empty word; for example, if $n$-dimensional is broken at the hyphen, the word ending the line is
% just the hyphen. If an empty word is left, just use the non-trimmed result.
%    \begin{macrocode}
  local pre = get_nodelist_text(get_disc_word_start(hlist.head,last_disc))
  local pre_temp = trim_nonletters_start(pre)
  if pre_temp ~= '' then
    pre = pre_temp
  end
%    \end{macrocode}
% For the post-linebreak part, just get the word at the start of the next line, and trim any trailing non-letters.
% \luafunc{get_line_first_word} returns \luavar{nil} if the following hlist contains no word; in that case use an
% empty post-linebreak part instead of passing \luavar{nil} to the trimming function, which would raise an error.
%    \begin{macrocode}
  local first_word = get_line_first_word(next_line.head)
  if not first_word then
    debug(' No word found at start of following line')
    first_word = ''
  end
  local post = trim_nonletters_end(first_word)
  debug(
    ' Hyphenated word found: "' .. word .. '" -> "' .. pre .. '<>' .. post .. '"'
  )
%    \end{macrocode}
% Store everything in the language hyphenation list. The index is the length of the list before the new entry is
% inserted, so indices count from zero in order of insertion.
%    \begin{macrocode}
  table.insert(
    lang_hyphenation_list,
    {
      [KEY_WORD] = word,
      [KEY_DIVISION] = pre .. post,
      [KEY_INDEX] = #lang_hyphenation_list,
    }
  )
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{post_linebreak}
% For every line in the vlist at \luavar{vlist_head}, check whether there is a hyphenated word at the end; if so,
% save the word to \luavar{hyphenation_table}. Afterwards, clear \luavar{hlist_hyphenatable_word_list}, ready for the
% next paragraph. The function always returns \luavar{true}; the parameter \luavar{groupcode} is not used.
%    \begin{macrocode}
local function post_linebreak(vlist_head,groupcode)
  debug('Post-linebreak processing start')
  local line_no = 0
  for item in node.traverse(vlist_head) do
    if item.id == NODE_ID_HLIST then
      line_no = line_no + 1
      debug(' Line no.' .. line_no)
      check_line_hyphenation(item)
    end
  end
  hlist_hyphenatable_word_list = {}
  debug('Post-linebreak processing end')
  return true
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Callbacks}
%
% Add \luafunc{pre_linebreak} and \luafunc{post_linebreak} to the relevant callbacks.
%    \begin{macrocode}
local LUA_LIST_HYPHEN_PRE_LINEBREAK = 'LUA_LIST_HYPHEN_PRE_LINEBREAK'
luatexbase.add_to_callback(
  'pre_linebreak_filter',
  pre_linebreak,
  LUA_LIST_HYPHEN_PRE_LINEBREAK
)
%    \end{macrocode}
%
%
%
%    \begin{macrocode}
local LUA_LIST_HYPHEN_POST_LINEBREAK = 'LUA_LIST_HYPHEN_POST_LINEBREAK'
luatexbase.add_to_callback(
  'post_linebreak_filter',
  post_linebreak,
  LUA_LIST_HYPHEN_POST_LINEBREAK
)
%    \end{macrocode}
%
%
%
% \subsection{Language settings}
%
% Table mapping language IDs to textual names.
%    \begin{macrocode}
local language_table = {}
%    \end{macrocode}
%
% Populating \luavar{language_table} is done differently for \pkg{babel} and \pkg{polyglossia}. If \pkg{babel} is in
% use, the \LaTeX\ frontend iterates through \cs{bbl@languages} and calls \luafunc{babel_save_language_name}. If
% \pkg{polyglossia} is in use, \luavar{language_table} is populated by \luafunc{polyglossia_get_language_names}, which
% is called just before the hyphenation lists are written.
%
% \begin{macro}[int]{babel_save_language_name}
% Store the association of a language ID to \pkg{babel}'s textual name, if no name has been assigned to that ID
% already.
%    \begin{macrocode}
local function babel_save_language_name(lang_id,name)
    -- Keep an existing name; only an unassigned ID receives the new one.
    language_table[lang_id] = language_table[lang_id] or name
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{polyglossia_get_language_names}
% If polyglossia has been loaded, use it to build the table mapping language IDs to textual names. If the global
% \luavar{polyglossia} table does not exist, nothing is done.
%    \begin{macrocode}
local function polyglossia_get_language_names()
    if polyglossia then
        for lang_name,lang_object in pairs(polyglossia.newloader_loaded_languages) do
            language_table[lang.id(lang_object)] = lang_name
        end
    end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Processing hyphenation lists}
%
% Before writing out hyphenation lists, remove duplicates and/or perform sorting, in accordance with the set options.
%
%
%
% \subsubsection{Comparisons and equality checks}
%
% \begin{macro}[int]{equal_hyphenation_case_sensitive}
% Equality check for deduplicating the list of hyphenations case-sensitively: two entries are equal when both the
% word and the division coincide.
%    \begin{macrocode}
local function equal_hyphenation_case_sensitive(a,b)
    if a[KEY_WORD] ~= b[KEY_WORD] then
        return false
    end
    return a[KEY_DIVISION] == b[KEY_DIVISION]
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{equal_hyphenation_case_insensitive}
% Equality check for deduplicating the list of hyphenations case-insensitively: as above, but comparing the
% lower-cased word and division.
%    \begin{macrocode}
local function equal_hyphenation_case_insensitive(a,b)
    local lower = unicode.utf8.lower
    if lower(a[KEY_WORD]) ~= lower(b[KEY_WORD]) then
        return false
    end
    return lower(a[KEY_DIVISION]) == lower(b[KEY_DIVISION])
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{lessthan_hyphenation_case_sensitive}
% Comparison for sorting the list of hyphenations case-sensitively.
%
% The comparison of index keys ensures that the sorting is stable.
% Entries are ordered lexicographically by word, then by division, then by index. The final comparison must be a
% strict `less than': \luafunc{table.sort} requires that the comparison never returns \luavar{true} for two equal
% arguments (in particular, for an entry compared with itself).
%    \begin{macrocode}
local function lessthan_hyphenation_case_sensitive(a,b)
    if a[KEY_WORD] ~= b[KEY_WORD] then
        return a[KEY_WORD] < b[KEY_WORD]
    end
    if a[KEY_DIVISION] ~= b[KEY_DIVISION] then
        return a[KEY_DIVISION] < b[KEY_DIVISION]
    end
    -- Same word and division: fall back to order of appearance.
    return a[KEY_INDEX] < b[KEY_INDEX]
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{lessthan_hyphenation_case_insensitive}
% Comparison for sorting the list of hyphenations case-insensitively.
%
% The comparison of index keys ensures that the sorting is stable. As in the case-sensitive comparison, entries are
% ordered by (lower-cased) word, then by (lower-cased) division, then by index, and the final comparison is strict.
%    \begin{macrocode}
local function lessthan_hyphenation_case_insensitive(a,b)
    local lower = unicode.utf8.lower
    local a_word,b_word = lower(a[KEY_WORD]),lower(b[KEY_WORD])
    if a_word ~= b_word then
        return a_word < b_word
    end
    local a_division,b_division = lower(a[KEY_DIVISION]),lower(b[KEY_DIVISION])
    if a_division ~= b_division then
        return a_division < b_division
    end
    -- Same word and division up to case: fall back to order of appearance.
    return a[KEY_INDEX] < b[KEY_INDEX]
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsubsection{Sorting}
%
% \begin{macro}[int]{sort_lang_hyphenation_list_none}
% Sort \luavar{hyphenation_list} into its original order of appearance, that is, by increasing index.
%    \begin{macrocode}
local function sort_lang_hyphenation_list_none(hyphenation_list)
    table.sort(
        hyphenation_list,
        function(a,b)
            return a[KEY_INDEX] < b[KEY_INDEX]
        end
    )
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{sort_lang_hyphenation_list_case_sensitive}
% Sort \luavar{hyphenation_list} case-sensitively.
%    \begin{macrocode}
local function sort_lang_hyphenation_list_case_sensitive(hyphenation_list)
    table.sort(
        hyphenation_list,
        lessthan_hyphenation_case_sensitive
    )
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{sort_lang_hyphenation_list_case_insensitive}
% Sort \luavar{hyphenation_list} case-insensitively.
%    \begin{macrocode}
local function sort_lang_hyphenation_list_case_insensitive(hyphenation_list)
    table.sort(hyphenation_list,lessthan_hyphenation_case_insensitive)
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{sort_lang_hyphenation_list}
% Select the appropriate function for sorting, according to the value of the sorting option: 1 selects
% case-sensitive sorting, 2 selects case-insensitive sorting, and any other value selects the original order of
% appearance.
%    \begin{macrocode}
local sort_lang_hyphenation_list = ({
    [1] = sort_lang_hyphenation_list_case_sensitive,
    [2] = sort_lang_hyphenation_list_case_insensitive,
})[tex.count['l__lualisthyphen_sort_int']] or sort_lang_hyphenation_list_none
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsubsection{Deduplication}
%
% \begin{macro}[int]{deduplicate_lang_hyphenation_list_none}
% Dummy function; does not deduplicate \luavar{hyphenation_list}.
%    \begin{macrocode}
local function deduplicate_lang_hyphenation_list_none(hyphenation_list)
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{deduplicate_lang_hyphenation_list_case_sensitive}
% Remove duplicates from \luavar{hyphenation_list} case-sensitively. The list is first sorted with the matching
% comparison and then passed to \luafunc{list_uniq} with the matching equality check.
%    \begin{macrocode}
local function deduplicate_lang_hyphenation_list_case_sensitive(hyphenation_list)
    table.sort(hyphenation_list,lessthan_hyphenation_case_sensitive)
    list_uniq(hyphenation_list,equal_hyphenation_case_sensitive)
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{deduplicate_lang_hyphenation_list_case_insensitive}
% Remove duplicates from \luavar{hyphenation_list} case-insensitively. The list is first sorted with the matching
% comparison and then passed to \luafunc{list_uniq} with the matching equality check.
%    \begin{macrocode}
local function deduplicate_lang_hyphenation_list_case_insensitive(hyphenation_list)
    table.sort(hyphenation_list,lessthan_hyphenation_case_insensitive)
    list_uniq(hyphenation_list,equal_hyphenation_case_insensitive)
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{deduplicate_lang_hyphenation_list}
% Select the appropriate function according to whether duplicates should be removed.
% A value of 1 for the option selects case-sensitive deduplication, 2 selects case-insensitive deduplication, and
% any other value selects the dummy function.
%    \begin{macrocode}
local deduplicate_lang_hyphenation_list = ({
    [1] = deduplicate_lang_hyphenation_list_case_sensitive,
    [2] = deduplicate_lang_hyphenation_list_case_insensitive,
})[tex.count['l__lualisthyphen_unique_int']] or deduplicate_lang_hyphenation_list_none
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsubsection{Combined processing}
%
% \begin{macro}[int]{process_lang_hyphenation_list}
% Remove duplicates from \luavar{hyphenation_list} and then sort it, in that order, using the functions selected
% above.
%    \begin{macrocode}
local function process_lang_hyphenation_list(hyphenation_list)
    deduplicate_lang_hyphenation_list(hyphenation_list)
    sort_lang_hyphenation_list(hyphenation_list)
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Writing}
%
% \begin{macro}[int]{write_lang_hyphenation_list_standard}
% Write out just the hyphenated words, one per line, to the open file \luavar{f}.
%    \begin{macrocode}
local function write_lang_hyphenation_list_standard(f,hyphenation_list)
    for _,entry in ipairs(hyphenation_list) do
        if entry then
            f:write(entry[KEY_DIVISION] .. '\n')
        end
    end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{write_lang_hyphenation_list_verbose}
% Write out all hyphenation information, one entry per line, to the open file \luavar{f}: the original word followed
% by its division.
%    \begin{macrocode}
local function write_lang_hyphenation_list_verbose(f,hyphenation_list)
    for _,entry in ipairs(hyphenation_list) do
        if entry then
            f:write(entry[KEY_WORD] .. ' -> ' .. entry[KEY_DIVISION] .. '\n')
        end
    end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{write_lang_hyphenation_list}
% Set \luafunc{write_lang_hyphenation_list} to be either \luafunc{write_lang_hyphenation_list_standard} or
% \luafunc{write_lang_hyphenation_list_verbose}, depending on the
% package options.
%    \begin{macrocode}
local write_lang_hyphenation_list
if tex.count['l__lualisthyphen_verbose_int'] == 0 then
    write_lang_hyphenation_list = write_lang_hyphenation_list_standard
else
    write_lang_hyphenation_list = write_lang_hyphenation_list_verbose
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{get_hyphenation_file_path}
% Get the file to which the list of hyphenated words will be written, based on the given \luavar{prefix},
% \luavar{extension}, numerical \luavar{lang_id}, and taking into account any specified output directory for
% Lua\TeX. If no textual name is known for \luavar{lang_id}, the numerical ID itself is used in the file name.
%    \begin{macrocode}
local function get_hyphenation_file_path(prefix,extension,lang_id)
    local lang_name = language_table[lang_id] or lang_id
    local hyphenation_file_name = prefix .. tostring(lang_name) .. extension
%    \end{macrocode}
% Treat an empty output directory like an unset one, so that a separator is never prepended to the bare file name
% (which would place the file in the root directory).
%    \begin{macrocode}
    local output_directory = status.output_directory
    if not output_directory or output_directory == '' then
        return hyphenation_file_name
    end
    if string.sub(output_directory,-1,-1) ~= '/' then
        output_directory = output_directory .. '/'
    end
    return output_directory .. hyphenation_file_name
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{process_write_lang_hyphenation_list}
% Process and write out the \luavar{hyphenation_list} (which will be for the language with the numerical
% \luavar{lang_id}) to a file with the given \luavar{prefix} and \luavar{extension}.
%
% \luafunc{io.open} returns \luavar{nil} and a message when the file cannot be opened; in that case, raise an error
% that names the file and the reason, instead of failing later with an uninformative attempt to index \luavar{nil}.
%    \begin{macrocode}
local function process_write_lang_hyphenation_list(
    prefix,extension,lang_id,hyphenation_list
)
    process_lang_hyphenation_list(hyphenation_list)
    local hyphenation_file_path = get_hyphenation_file_path(prefix,extension,lang_id)
    local f,err = io.open(hyphenation_file_path,'w')
    if not f then
        error(
            'lua-list-hyphen: cannot open "' .. hyphenation_file_path
            .. '" for writing: ' .. tostring(err)
        )
    end
    write_lang_hyphenation_list(f,hyphenation_list)
    f:close()
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \begin{macro}[int]{process_write_hyphenation_lists}
% If polyglossia is in use, populate \luavar{language_table}. Then, for each language, process and write out the
% hyphenation lists to a file.
% One file is written per language ID present in \luavar{hyphenation_table}; the order in which the languages are
% visited is unspecified.
%    \begin{macrocode}
local function process_write_hyphenation_lists(prefix,extension)
    polyglossia_get_language_names()
    for lang_id,lang_hyphenation_list in pairs(hyphenation_table) do
        process_write_lang_hyphenation_list(
            prefix,extension,lang_id,lang_hyphenation_list
        )
    end
end
%    \end{macrocode}
% \end{macro}
%
%
%
% \subsection{Export public functions}
%
% Finally, make available the functions that will be called from the \LaTeX\ frontend using \cs{lua_now:n}.
%    \begin{macrocode}
return {
    process_write_hyphenation_lists = process_write_hyphenation_lists,
    babel_save_language_name = babel_save_language_name,
}
%    \end{macrocode}
%
%
%
%    \begin{macrocode}
%
%    \end{macrocode}
%
%
%
% \clearpage
% \end{implementation}
%
%
%
% \iffalse
%<*metadriver>
\input{lua-list-hyphen.dtx}
%
% \fi