Current File : //usr/share/texlive/texmf-dist/doc/generic/enctex/unimap.py
#!/usr/bin/python3
###################################################################
# unimap.py
# Generates utf8raw.tex file containing math character definitions
# from modified Unicode character database unimap.txt.
#
# Copyright (C) 2003 David Necas (Yeti)
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
###################################################################
# Usage: ./unimap.py
# It takes no arguments, input and output file names are `unimap.txt' and
# `utf8raw.tex' and are hardcoded below (should be fixed once someone finds
# a reason for running it on different files).
#
# The source file unimap.txt is basically the Unicode character name list
# http://www.unicode.org/Public/UNIDATA/NamesList.txt
# with additional lines defining TeX expansions of particular characters.
# These lines have format similar to other character info lines, with
# backslash (\) as the line type mark:
# <Tab>\<space>\TeXcontrolsequence
# \TeXcontrolsequence is the control sequence the character should be
# mapped to.
#
# The NamesList.txt file is huge and the number of supported characters is
# still relatively small.  Thus only diff NamesList.txt -> unimap.txt is
# normally distributed (unimap.diff).  Once you have NamesList.txt, you can
# create unimap.txt with the following command:
# patch -o unimap.txt NamesList.txt unimap.diff

import re
from time import asctime, gmtime

database = 'unimap.txt'    # Input file
output = 'utf8raw.tex'   # Output file

# Compatibility with Python-2.1, which lacks the `file' and `dict' builtins.
# The bare names are probed instead of calling
# __builtins__.__dict__.has_key(): dict.has_key() does not exist on
# Python 3, and __builtins__ is a plain dict (no __dict__ attribute) when
# this module is imported rather than run as a script.
try:
    file
except NameError:
    file = open
try:
    dict
except NameError:
    def dict(l):
        """Build a dictionary from a sequence of (key, value) pairs."""
        d = {}
        for x in l:
            d[x[0]] = x[1]
        return d

# A character line: at least four uppercase hex digits followed by a Tab.
charline_re = re.compile(r'^[0-9A-F]{4,}\t')
# Leading run of `@' characters followed by a Tab (comment/section prefix).
comsect_re = re.compile(r'^@+\t')
# One output line; the fields are: TeX control sequence, ^^xx byte sequence,
# code point (for the trailing comment), character name.
line_template = '\\mubyte %s %s\\endmubyte %% U+%04X %s\n'

class LineType:
    """Kinds of NamesList.txt lines, used as plain class-level constants.

    Two sorts of values live here.  Empty and the two-character tokens are
    purely symbolic: linetype() returns them for end of input, `@' lines
    and code point lines.  The single-character values are the marker
    characters that open a Tab-indented character info line; linetype()
    returns the marker itself as the line type, so it compares equal to
    the matching constant.
    """
    # Symbolic types.
    Empty = 0
    Comment = '++'
    Section = '@@'
    Character = 'AA'
    # Marker characters of Tab-indented character info lines.
    IsNot = 'x'
    Alias = '='
    Note = '*'
    Combining = ':'
    Render = '#'
    TeX = '\\'    # the added lines carrying the TeX control sequence


# Reverse lookup, value -> constant name.  Only names starting with an
# uppercase letter are taken, which leaves out the class's dunder entries.
LineType.map = {
    value: label
    for label, value in vars(LineType).items()
    if label[0].isupper()
}

def linetype(line):
    """Determine the type of one NamesList.txt line and extract its text.

    Returns a (type, value) pair:

    - empty string (end of input): (LineType.Empty, None)
    - line starting with `@@' or `@+': (LineType.Comment, text)
    - any other line starting with `@': (LineType.Section, text)
    - code point line: (LineType.Character, (code, name)) where code is
      the integer code point and name the lowercased character name
    - Tab-indented line: (marker, text) where marker is the first
      non-blank character; a line holding only whitespace after the Tab
      gives (LineType.Empty, None)

    Raises ValueError for a line that starts with none of `@', a code
    point or a Tab (a bare newline included), and for a Tab-indented line
    whose marker is not a known LineType value.
    """
    if not line:
        return LineType.Empty, None
    if line.startswith('@'):
        text = comsect_re.sub('', line).strip()
        if line.startswith(('@@', '@+')):
            return LineType.Comment, text
        return LineType.Section, text
    if charline_re.match(line):
        # The regexp guarantees hex digits followed by a Tab, so the first
        # Tab cleanly separates the code point from the character name.
        code, name = line.split('\t', 1)
        return LineType.Character, (int(code, 16), name.strip().lower())
    if not line.startswith('\t'):
        raise ValueError('Queer line doesn\'t start with @ or Tab')
    line = line.strip()
    if not line:
        return LineType.Empty, None
    # `in' instead of dict.has_key(), which does not exist on Python 3.
    if line[0] not in LineType.map:
        raise ValueError('Queer character info line (marker %s)' % line[0])
    return line[0], line[1:].strip()

def utf8chars(u):
    """Format a Unicode character in a \\mubyte-friendly style.

    u is the character's ordinal value.  The result is the UTF-8 encoding
    of u with every byte written as ^^xx (two lowercase hex digits).
    Values up to 0x10FFFF are supported, using one to four bytes.  No
    special treatment is given to surrogate code points; they are encoded
    as ordinary three-byte values.

    Raises ValueError if u lies outside 0..0x10FFFF.
    """
    if not 0 <= u <= 0x10ffff:
        raise ValueError('U+%X is outside the Unicode range' % u)
    if u < 0x80:
        return '^^%02x' % u
    # Pick the lead byte and the number of continuation bytes.
    if u < 0x800:
        lead, ntrail = 0xc0 | (u >> 6), 1
    elif u < 0x10000:
        lead, ntrail = 0xe0 | (u >> 12), 2
    else:
        lead, ntrail = 0xf0 | (u >> 18), 3
    octets = [lead]
    # Each continuation byte carries six payload bits, highest group first.
    for shift in range(6 * (ntrail - 1), -1, -6):
        octets.append(0x80 | (0x3f & (u >> shift)))
    return ''.join(['^^%02x' % b for b in octets])

# Main script: scan the database and write one \mubyte definition for
# every TeX mapping line, grouped under the section it appears in.
sect = None    # section title not yet written to the output, if any
char = None    # (code point, name) of the most recent character line

with open(database, 'r') as fh:
    # Skip some initial noise: everything before the first section header,
    # including lines linetype() cannot classify.
    for line in fh:
        try:
            typ, val = linetype(line)
        except ValueError:
            continue
        if typ == LineType.Section:
            sect = val
            break
    else:
        # End of file without any section header: fail loudly instead of
        # searching forever.
        raise ValueError('%s contains no section header' % database)

    with open(output, 'w') as fw:
        fw.write('%% Generated from %s %s\n' % (database, asctime(gmtime())))
        # Iterating over the file ends the loop at end of file only, so a
        # whitespace-only line (which linetype() reports as Empty) is
        # skipped rather than mistaken for the end of input.
        for line in fh:
            typ, val = linetype(line)
            if typ == LineType.Section:
                sect = val
            elif typ == LineType.Character:
                char = val
            elif typ == LineType.TeX:
                if char is None:
                    raise ValueError('TeX mapping %s precedes any character'
                                     % val)
                if not val.startswith('\\'):
                    raise ValueError('%s is not a control seq (U%X)'
                                     % (val, char[0]))
                if sect:
                    # Emit the section title once, before its first entry;
                    # sections without any mapping leave no trace.
                    fw.write('\n%% %s\n' % sect)
                    sect = None
                fw.write(line_template
                         % (val, utf8chars(char[0]), char[0], char[1]))
        fw.write('\n\\endinput\n')