"""
LaTeX Codec
~~~~~~~~~~~
The :mod:`latexcodec.codec` module
contains all classes and functions for LaTeX code
translation. For practical use,
you should only ever need to import the :mod:`latexcodec` module,
which will automatically register the codec
so it can be used by :meth:`str.encode`, :meth:`str.decode`,
and any of the functions defined in the :mod:`codecs` module
such as :func:`codecs.open` and so on.
The other functions and classes
are exposed in case someone would want to extend them.
.. autofunction:: register
.. autofunction:: find_latex
.. autoclass:: LatexIncrementalEncoder
:show-inheritance:
:members:
.. autoclass:: LatexIncrementalDecoder
:show-inheritance:
:members:
.. autoclass:: LatexCodec
:show-inheritance:
:members:
.. autoclass:: LatexUnicodeTable
:members:
"""
# Copyright (c) 2003, 2008 David Eppstein
# Copyright (c) 2011-2020 Matthias C. M. Troffaes
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
import codecs
import dataclasses
import importlib.resources as pkg_resources
import unicodedata
from codecs import CodecInfo
from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union
from latexcodec import lexer
[docs]
def register():
"""Register the :func:`find_latex` codec search function.
.. seealso:: :func:`codecs.register`
"""
codecs.register(find_latex)
# returns the codec search function
# this is used if latex_codec.py were to be placed in stdlib
def getregentry() -> Optional[CodecInfo]:
"""Encodings module API."""
return find_latex("latex")
@dataclasses.dataclass
class UnicodeLatexTranslation:
unicode: str
latex: str
encode: bool #: Suitable for unicode -> latex.
decode: bool #: Suitable for latex -> unicode.
text_mode: bool #: Latex works in text mode.
math_mode: bool #: Latex works in math mode.
def load_unicode_latex_table() -> Iterator[UnicodeLatexTranslation]:
with (pkg_resources.files("latexcodec") / "table.txt").open(
"r", encoding="utf-8", errors="strict"
) as datafile:
for line in datafile:
marker, unicode_names, latex = line.rstrip("\r\n").split("\u0009")
unicode = "".join(
unicodedata.lookup(name) for name in unicode_names.split(",")
)
yield UnicodeLatexTranslation(
unicode=unicode,
latex=latex,
encode=marker[1] in {"-", ">"},
decode=marker[1] in {"-", "<"},
text_mode=marker[0] in {"A", "T"},
math_mode=marker[0] in {"A", "M"},
)
[docs]
class LatexUnicodeTable:
"""Tabulates a translation between LaTeX and unicode."""
def __init__(self, lexer_):
self.lexer: lexer.LatexIncrementalLexer = lexer_
self.unicode_map: Dict[Tuple[lexer.Token, ...], str] = {}
self.max_length: int = 0
self.latex_map: Dict[str, Tuple[str, Tuple[lexer.Token, ...]]] = {}
self.register_all()
[docs]
def register_all(self):
"""Register all symbols and their LaTeX equivalents
(called by constructor).
"""
# register special symbols
self.register(
UnicodeLatexTranslation(
unicode="\n\n",
latex=" \\par",
encode=False,
decode=True,
text_mode=True,
math_mode=False,
)
)
self.register(
UnicodeLatexTranslation(
unicode="\n\n",
latex="\\par",
encode=False,
decode=True,
text_mode=True,
math_mode=False,
)
)
for trans in load_unicode_latex_table():
self.register(trans)
[docs]
def register(self, trans: UnicodeLatexTranslation):
"""Register a correspondence between *unicode_text* and *latex_text*.
:param UnicodeLatexTranslation trans: Description of translation.
"""
if trans.math_mode and not trans.text_mode:
# also register text version
self.register(
UnicodeLatexTranslation(
unicode=trans.unicode,
latex="$" + trans.latex + "$",
text_mode=True,
math_mode=False,
decode=trans.decode,
encode=trans.encode,
)
)
self.register(
UnicodeLatexTranslation(
unicode=trans.unicode,
latex=r"\(" + trans.latex + r"\)",
text_mode=True,
math_mode=False,
decode=trans.decode,
encode=trans.encode,
)
)
# for the time being, we do not perform in-math substitutions
return
# tokenize, and register unicode translation
self.lexer.reset()
self.lexer.state = "M"
tokens = tuple(self.lexer.get_tokens(trans.latex, final=True))
if trans.decode:
if tokens not in self.unicode_map:
self.max_length = max(self.max_length, len(tokens))
self.unicode_map[tokens] = trans.unicode
# also register token variant with brackets, if appropriate
# for instance, "\'{e}" for "\'e", "\c{c}" for "\c c", etc.
# note: we do not remove brackets (they sometimes matter,
# e.g. bibtex uses them to prevent lower case transformation)
if (
len(tokens) == 2
and tokens[0].name.startswith("control")
and tokens[1].name == "chars"
):
self.register(
UnicodeLatexTranslation(
unicode=f"{{{trans.unicode}}}",
latex=f"{tokens[0].text}{{{tokens[1].text}}}",
decode=True,
encode=False,
math_mode=trans.math_mode,
text_mode=trans.text_mode,
)
)
if (
len(tokens) == 4
and tokens[0].text in {"$", r"\("}
and tokens[1].name.startswith("control")
and tokens[2].name == "chars"
and tokens[3].text in {"$", r"\)"}
):
# drop brackets in this case, since it is math mode
self.register(
UnicodeLatexTranslation(
unicode=f"{trans.unicode}",
latex=f"{tokens[0].text}{tokens[1].text}"
f"{{{tokens[2].text}}}{tokens[3].text}",
decode=True,
encode=False,
math_mode=trans.math_mode,
text_mode=trans.text_mode,
)
)
if trans.encode and trans.unicode not in self.latex_map:
assert len(trans.unicode) == 1
self.latex_map[trans.unicode] = (trans.latex, tokens)
_LATEX_UNICODE_TABLE = LatexUnicodeTable(lexer.LatexIncrementalDecoder())
# incremental encoder does not need a buffer
# but decoder does
[docs]
class LatexIncrementalEncoder(lexer.LatexIncrementalEncoder):
"""Translating incremental encoder for latex. Maintains a state to
determine whether control spaces etc. need to be inserted.
"""
emptytoken = lexer.Token("unknown", "") #: The empty token.
table = _LATEX_UNICODE_TABLE #: Translation table.
state: str
def __init__(self, errors="strict"):
super().__init__(errors=errors)
self.reset()
[docs]
def reset(self):
super(LatexIncrementalEncoder, self).reset()
self.state = "M"
[docs]
def get_space_bytes(self, bytes_: str) -> Tuple[str, str]:
"""Inserts space bytes in space eating mode."""
if self.state == "S":
# in space eating mode
# control space needed?
if bytes_.startswith(" "):
# replace by control space
return "\\ ", bytes_[1:]
else:
# insert space (it is eaten, but needed for separation)
return " ", bytes_
else:
return "", bytes_
def _get_latex_chars_tokens_from_char(
self, c: str
) -> Tuple[str, Tuple[lexer.Token, ...]]:
# if ascii, try latex equivalents
# (this covers \, #, &, and other special LaTeX characters)
if ord(c) < 128:
try:
return self.table.latex_map[c]
except KeyError:
pass
# next, try input encoding
try:
c.encode(self.inputenc, "strict")
except UnicodeEncodeError:
pass
else:
return c, (lexer.Token(name="chars", text=c),)
# next, try latex equivalents of common unicode characters
try:
return self.table.latex_map[c]
except KeyError:
# translation failed
if self.errors == "strict":
raise UnicodeEncodeError(
"latex", # codec
c, # problematic input
0,
1, # location of problematic character
"don't know how to translate {0} into latex".format(repr(c)),
)
elif self.errors == "ignore":
return "", (self.emptytoken,)
elif self.errors == "replace":
# use the \\char command
# this assumes
# \usepackage[T1]{fontenc}
# \usepackage[utf8]{inputenc}
bytes_ = "{\\char" + str(ord(c)) + "}"
return bytes_, (lexer.Token(name="chars", text=bytes_),)
elif self.errors == "keep":
return c, (lexer.Token(name="chars", text=c),)
else:
raise ValueError(
"latex codec does not support {0} errors".format(self.errors)
)
[docs]
def get_latex_chars(self, unicode_: str, final: bool = False) -> Iterator[str]:
if not isinstance(unicode_, str):
raise TypeError(
"expected unicode for encode input, but got {0} instead".format(
unicode_.__class__.__name__
)
)
# convert character by character
for pos, c in enumerate(unicode_):
bytes_, tokens = self._get_latex_chars_tokens_from_char(c)
space, bytes_ = self.get_space_bytes(bytes_)
# update state
if tokens and tokens[-1].name == "control_word":
# we're eating spaces
self.state = "S"
elif tokens:
self.state = "M"
if space:
yield space
yield bytes_
[docs]
class LatexIncrementalDecoder(lexer.LatexIncrementalDecoder):
"""Translating incremental decoder for LaTeX."""
table = _LATEX_UNICODE_TABLE #: Translation table.
token_buffer: List[lexer.Token] #: The token buffer of this decoder.
def __init__(self, errors="strict"):
lexer.LatexIncrementalDecoder.__init__(self, errors=errors)
[docs]
def reset(self):
lexer.LatexIncrementalDecoder.reset(self)
self.token_buffer = []
# python codecs API does not support multibuffer incremental decoders
[docs]
def getstate(self) -> Any:
raise NotImplementedError
[docs]
def setstate(self, state: Any) -> None:
raise NotImplementedError
[docs]
def get_unicode_tokens(self, chars: str, final: bool = False) -> Iterator[str]:
for token in self.get_tokens(chars, final=final):
# at this point, token_buffer does not match anything
self.token_buffer.append(token)
# new token appended at the end, see if we have a match now
# note: match is only possible at the *end* of the buffer
# because all other positions have already been checked in
# earlier iterations
for i in range(len(self.token_buffer), 0, -1):
last_tokens = tuple(self.token_buffer[-i:]) # last i tokens
try:
unicode_text = self.table.unicode_map[last_tokens]
except KeyError:
# no match: continue
continue
else:
# match!! flush buffer, and translate last bit
# exclude last i tokens
for token2 in self.token_buffer[:-i]:
yield self.decode_token(token2)
yield unicode_text
self.token_buffer = []
break
# flush tokens that can no longer match
while len(self.token_buffer) >= self.table.max_length:
yield self.decode_token(self.token_buffer.pop(0))
# also flush the buffer at the end
if final:
for token in self.token_buffer:
yield self.decode_token(token)
self.token_buffer = []
[docs]
class LatexCodec(codecs.Codec):
IncrementalEncoder: Type[LatexIncrementalEncoder]
IncrementalDecoder: Type[LatexIncrementalDecoder]
[docs]
def encode(
self, unicode_: str, errors="strict" # type: ignore
) -> Tuple[Union[bytes, str], int]:
"""Convert unicode string to LaTeX bytes."""
encoder = self.IncrementalEncoder(errors=errors)
return encoder.encode(unicode_, final=True), len(unicode_)
[docs]
def decode(self, bytes_: Union[bytes, str], errors="strict") -> Tuple[str, int]:
"""Convert LaTeX bytes to unicode string."""
decoder = self.IncrementalDecoder(errors=errors)
return decoder.decode(bytes_, final=True), len(bytes_) # type: ignore
class UnicodeLatexIncrementalDecoder(LatexIncrementalDecoder):
def decode(self, bytes_: str, final: bool = False) -> str: # type: ignore
return self.udecode(bytes_, final)
class UnicodeLatexIncrementalEncoder(LatexIncrementalEncoder):
def encode(self, unicode_: str, final: bool = False) -> str: # type: ignore
return self.uencode(unicode_, final)
[docs]
def find_latex(encoding: str) -> Optional[CodecInfo]:
"""Return a :class:`codecs.CodecInfo` instance for the requested
LaTeX *encoding*, which must be equal to ``latex``,
or to ``latex+<encoding>``
where ``<encoding>`` describes another encoding.
"""
if "_" in encoding:
# Python 3.9 now normalizes "latex+latin1" to "latex_latin1"
# https://bugs.python.org/issue37751
encoding, _, inputenc_ = encoding.partition("_")
else:
encoding, _, inputenc_ = encoding.partition("+")
if not inputenc_:
inputenc_ = "ascii"
if encoding == "latex":
incremental_encoder = type(
"incremental_encoder", (LatexIncrementalEncoder,), dict(inputenc=inputenc_)
)
incremental_decoder = type(
"incremental_encoder", (LatexIncrementalDecoder,), dict(inputenc=inputenc_)
)
elif encoding == "ulatex":
incremental_encoder = type(
"incremental_encoder",
(UnicodeLatexIncrementalEncoder,),
dict(inputenc=inputenc_),
)
incremental_decoder = type(
"incremental_encoder",
(UnicodeLatexIncrementalDecoder,),
dict(inputenc=inputenc_),
)
else:
return None
class Codec(LatexCodec):
IncrementalEncoder = incremental_encoder
IncrementalDecoder = incremental_decoder
class StreamWriter(Codec, codecs.StreamWriter):
pass
class StreamReader(Codec, codecs.StreamReader):
pass
return codecs.CodecInfo(
encode=Codec().encode, # type: ignore
decode=Codec().decode, # type: ignore
incrementalencoder=Codec.IncrementalEncoder,
incrementaldecoder=Codec.IncrementalDecoder,
streamreader=StreamReader,
streamwriter=StreamWriter,
)