# gcc/gcc/rust/util/make-rust-unicode.py

# Copyright (C) 2020-2024 Free Software Foundation, Inc.

# This file is part of GCC.

# GCC is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 3, or (at your option) any later
# version.

# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.

# You should have received a copy of the GNU General Public License
# along with GCC; see the file COPYING3.  If not see
# <http://www.gnu.org/licenses/>.

# Run this program as
# 	python ./make-rust-unicode.py UnicodeData.txt \
#       DerivedNormalizationProps.txt DerivedCoreProperties.txt \
#       > rust-unicode-data.h

import sys
from typing import Tuple

# A Unicode codepoint, stored as a plain integer.
Codepoint = int
# A pair (start, end) of codepoints describing the half-open interval
# [start, end): `start` is included, `end` is not.
Range = Tuple[Codepoint, Codepoint]

# License header in C++ comment syntax. main() prints it verbatim as the
# first thing on stdout, i.e. at the top of the generated header.
COPYRIGHT = (
    "// Copyright (C) 2020-2024 Free Software Foundation, Inc.\n"
    "\n"
    "// This file is part of GCC.\n"
    "\n"
    "// GCC is free software; you can redistribute it and/or modify it under\n"
    "// the terms of the GNU General Public License as published by the Free\n"
    "// Software Foundation; either version 3, or (at your option) any later\n"
    "// version.\n"
    "\n"
    "// GCC is distributed in the hope that it will be useful, but WITHOUT ANY\n"
    "// WARRANTY; without even the implied warranty of MERCHANTABILITY or\n"
    "// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License\n"
    "// for more details.\n"
    "\n"
    "// You should have received a copy of the GNU General Public License\n"
    "// along with GCC; see the file COPYING3.  If not see\n"
    "// <http://www.gnu.org/licenses/>."
)

# Decomposition_Mapping table. Only mappings without a "<tag>" prefix are
# stored, and each entry holds at most two codepoints.
decomposition_map: dict[Codepoint, list[Codepoint]] = {}
# Canonical_Combining_Class table. Only non-zero classes are stored.
ccc_table: dict[Codepoint, int] = {}
# Ranges of codepoints with the Full_Composition_Exclusion property
composition_exclusion_ranges: list[Range] = []
# Ranges of codepoints with the Alphabetic property
alphabetic_ranges: list[Range] = []
# Ranges of codepoints with NFC_QC=No
nfc_qc_no_ranges: list[Range] = []
# Ranges of codepoints with NFC_QC=Maybe
nfc_qc_maybe_ranges: list[Range] = []
# Codepoints whose General_Category is Nd, Nl or No
numeric_codepoints: list[Codepoint] = []

# Note that a Range `(m, n)` (a tuple in Python) represents the half-open
# interval [m, n): m is included and n is excluded.


def binary_search_ranges(ranges: list[Range], target: Codepoint) -> int:
    """Return the index of the range containing `target`, or -1 if none does.

    Every element of `ranges` is a half-open interval [start, end). The
    lookup is a bisection, so it only gives a meaningful answer when `ranges`
    is sorted in ascending order and its intervals do not overlap.
    """
    lo, hi = 0, len(ranges) - 1
    while lo <= hi:
        pivot = (lo + hi) // 2
        first, past_last = ranges[pivot]
        if target < first:
            # target lies before this interval: keep the left half.
            hi = pivot - 1
            continue
        if target >= past_last:
            # target lies at or after the excluded end: keep the right half.
            lo = pivot + 1
            continue
        # first <= target < past_last: found it.
        return pivot
    return -1


def parse_codepoint_range(range_str: str) -> Range:
    """Parse '<codepoint>..<codepoint>' or '<codepoint>' into a Range.

    Codepoints are hexadecimal. The result is a half-open interval:

      * "m"    => [m, m+1)
      * "m..n" => [m, n+1)

    Raises:
        ValueError: if `range_str` has more than one ".." separator, or if a
            bound is not a hexadecimal number.
    """
    bounds = range_str.split("..")
    # Raise explicitly instead of asserting: assert statements are removed
    # when Python runs with -O, which would let "a..b..c" through silently.
    if len(bounds) not in (1, 2):
        raise ValueError("Invalid format: {!r}".format(range_str))
    start_cp = int(bounds[0], 16)
    # For a single codepoint bounds[-1] is bounds[0], which yields [m, m+1).
    end_cp = int(bounds[-1], 16) + 1
    return start_cp, end_cp


def read_unicode_data_txt(filepath: str) -> None:
    """Read a UnicodeData.txt-style file and fill the module-level tables.

    Every line is split on ";". Lines that do not have exactly 15 fields are
    skipped. For the remaining lines:

      * field 0 is the codepoint (hexadecimal);
      * field 2 is the general category; Nd, Nl and No codepoints are
        appended to `numeric_codepoints`;
      * field 3 is the canonical combining class (decimal); non-zero values
        are stored in `ccc_table`;
      * field 5 is the decomposition mapping; it is stored in
        `decomposition_map` unless it is empty or starts with "<".
    """
    numeric_categories = ("Nd", "Nl", "No")

    def process_line(line: str) -> None:
        fields = line.split(";")
        if len(fields) != 15:
            return
        codepoint = int(fields[0], 16)

        # General category
        if fields[2] in numeric_categories:
            numeric_codepoints.append(codepoint)

        # Canonical combining class
        combining_class = int(fields[3], 10)
        if combining_class != 0:
            ccc_table[codepoint] = combining_class

        # Decomposition mapping. Mappings starting with "<" are compatibility
        # decompositions, which are not required for **NFC** normalization.
        mapping = fields[5]
        if mapping.startswith("<"):
            return
        targets = [int(tok, 16) for tok in mapping.split(" ") if tok != ""]
        assert (
            len(targets) <= 2
        ), "Decomposition_Mapping must not contain more than 2 characters."
        if targets:
            decomposition_map[codepoint] = targets

    with open(filepath, "r", encoding="UTF-8") as file:
        for raw_line in file:
            process_line(raw_line.rstrip())


def read_derived_norm_props_txt(filepath: str) -> None:
    """Read a DerivedNormalizationProps.txt-style file into the range tables.

    Everything after "#" on a line is a comment. The remaining text is split
    on ";" into `<codepoint or range> ; <property> [; <value>]`; lines with
    fewer than two fields are skipped. Ranges are collected for:

      * Full_Composition_Exclusion -> `composition_exclusion_ranges`
      * NFC_QC with value N        -> `nfc_qc_no_ranges`
      * NFC_QC with value M        -> `nfc_qc_maybe_ranges`

    Raises:
        RuntimeError: if an NFC_QC line carries a value other than N or M.
    """

    def process_line(line) -> None:
        # Drop the trailing comment, if any.
        data = line.partition("#")[0]
        fields = data.split(";")
        if len(fields) < 2:
            # Too few fields. Skipped.
            return
        # The range is parsed for every data line, whatever its property.
        cp_range = parse_codepoint_range(fields[0].strip())
        prop = fields[1].strip()
        if prop == "Full_Composition_Exclusion":
            composition_exclusion_ranges.append(cp_range)
            return
        if prop != "NFC_QC":
            return
        assert len(fields) >= 3, "Too few rows for NFC_QC"
        destinations = {"N": nfc_qc_no_ranges, "M": nfc_qc_maybe_ranges}
        value = fields[2].strip()
        if value not in destinations:
            raise RuntimeError("Value of NFC_QC must be N or M")
        destinations[value].append(cp_range)

    with open(filepath, "r", encoding="UTF-8") as file:
        for raw_line in file:
            process_line(raw_line.rstrip())


def read_derived_core_props_txt(filepath: str) -> None:
    """Read a DerivedCoreProperties.txt-style file into `alphabetic_ranges`.

    Everything after "#" on a line is a comment. The remaining text is split
    on ";" into `<codepoint or range> ; <property>`. Only lines whose
    property is "Alphabetic" are used; all other lines are skipped without
    parsing their range.
    """

    def process_line(line: str) -> None:
        # Drop the trailing comment, if any.
        data = line.partition("#")[0]
        fields = data.split(";")
        if len(fields) < 2:
            # Too few fields. Skipped.
            return
        if fields[1].strip() != "Alphabetic":
            return
        alphabetic_ranges.append(parse_codepoint_range(fields[0].strip()))

    with open(filepath, "r", encoding="UTF-8") as file:
        for raw_line in file:
            process_line(raw_line.rstrip())


def write_decomposition() -> None:
    """Print `decomposition_map` as the C++ table DECOMPOSITION_MAP.

    Entries are emitted in ascending codepoint order, one per line, in the
    form `  {0x00c0, {0x0041, 0x0300, }},`.
    """
    print("const std::map<uint32_t, std::vector<uint32_t>> DECOMPOSITION_MAP = {")
    print("  // clang-format off")
    for cp in sorted(decomposition_map):
        # Every element keeps its trailing ", ", including the last one.
        members = "".join("{:#06x}, ".format(d) for d in decomposition_map[cp])
        print("  {" + "{:#06x}".format(cp) + ", {" + members + "}},")
    print("  // clang-format on")
    print("};")


def write_recomposition() -> None:
    """Print the C++ table RECOMPOSITION_MAP, the inverse of the decompositions.

    Each entry of `decomposition_map` whose codepoint is not inside one of
    `composition_exclusion_ranges` is emitted as
    `  {{first, second}, composed},`. A one-element decomposition uses 0 as
    its second element.
    """
    print(
        "const std::map<std::pair<uint32_t, uint32_t>, uint32_t> RECOMPOSITION_MAP = {{"
    )
    print("  // clang-format off")
    for composed, parts in decomposition_map.items():
        # NOTE(review): the exclusion lookup is a bisection, so it relies on
        # composition_exclusion_ranges being in ascending order - confirm
        # against the input file.
        excluded = binary_search_ranges(composition_exclusion_ranges, composed) != -1
        if excluded:
            continue
        first: Codepoint = parts[0]
        second: Codepoint = 0 if len(parts) == 1 else parts[1]
        pair = "{:#06x}, {:#06x}".format(first, second)
        print("  {{" + pair + "}, " + "{:#06x}".format(composed) + "},")
    print("  // clang-format on")
    print("}};")


def write_ccc() -> None:
    """Print `ccc_table` as the C++ table CCC_TABLE.

    Entries are emitted in the table's insertion order, one per line, in the
    form `  {0x0300, 230},` (codepoint in hex, class in decimal).
    """
    print("const std::map<uint32_t, int32_t> CCC_TABLE = {")
    print("  // clang-format off")
    for codepoint, combining_class in ccc_table.items():
        print(f"  {{{codepoint:#06x}, {combining_class}}},")
    print("  // clang-format on")
    print("};")


def write_alphabetic() -> None:
    """Print `alphabetic_ranges` as the C++ array ALPHABETIC_RANGES.

    Each range is emitted on its own line as `  {start, end},` in hex, in
    the order the ranges were collected. The array size is the C++ constant
    NUM_ALPHABETIC_RANGES, which this function does not print itself.
    """
    print(
        "const std::array<std::pair<uint32_t, uint32_t>, NUM_ALPHABETIC_RANGES> ALPHABETIC_RANGES = {{"
    )
    print("  // clang-format off")
    for start, end in alphabetic_ranges:
        print(f"  {{{start:#06x}, {end:#06x}}},")
    print("  // clang-format on")
    print("}};")


def write_numeric() -> None:
    """Print `numeric_codepoints` as the C++ array NUMERIC_CODEPOINTS.

    Codepoints are emitted in hex, 16 per line. Each line is indented by two
    spaces and every codepoint is followed by ", " (including the last one on
    a line). An empty table produces no rows at all. The array size is the
    C++ constant NUM_NUMERIC_CODEPOINTS, which this function does not print
    itself.
    """
    print("const std::array<uint32_t, NUM_NUMERIC_CODEPOINTS> NUMERIC_CODEPOINTS = {{")
    print("  // clang-format off")
    per_row = 16
    # Emit whole rows instead of tracking the loop index: this needs no
    # "finish the last partial row" step after the loop, so it also works
    # when there are no codepoints (no loop variable is read after the loop).
    for offset in range(0, len(numeric_codepoints), per_row):
        row = numeric_codepoints[offset : offset + per_row]
        print("  " + "".join("{:#06x}, ".format(cp) for cp in row))
    print("  // clang-format on")
    print("}};")


def write_nfc_qc():
    """Print the NFC_QC range tables as two C++ arrays.

    NFC_QC_NO_RANGES is emitted first, from `nfc_qc_no_ranges`, then
    NFC_QC_MAYBE_RANGES, from `nfc_qc_maybe_ranges`. The array size in each
    declaration is the length of the corresponding list. Every range is
    emitted on its own line as `  {start, end},` in hex. No blank line is
    printed between the two arrays.
    """
    tables = (
        ("NFC_QC_NO_RANGES", nfc_qc_no_ranges),
        ("NFC_QC_MAYBE_RANGES", nfc_qc_maybe_ranges),
    )
    for array_name, ranges in tables:
        print(
            "const std::array<std::pair<uint32_t, uint32_t>, "
            + str(len(ranges))
            + "> "
            + array_name
            + " = {{"
        )
        print("  // clang-format off")
        for start, end in ranges:
            print(f"  {{{start:#06x}, {end:#06x}}},")
        print("  // clang-format on")
        print("}};")


def main() -> None:
    """Read the three UCD input files and print the generated C++ header.

    The command line must carry exactly three arguments, in this order:
    the UnicodeData.txt path, the DerivedNormalizationProps.txt path and the
    DerivedCoreProperties.txt path. With any other argument count a usage
    message is written to stderr and the process exits with status -1.

    The header is written to stdout: the license text, the include of
    "rust-system.h", then, inside `namespace Rust`, the two size constants
    followed by the decomposition, recomposition, CCC, alphabetic, numeric
    and NFC_QC tables.
    """
    if len(sys.argv) != 4:
        # The count can be wrong in either direction, so print the expected
        # invocation rather than claiming there are too few arguments.
        print(
            "usage: {} UnicodeData.txt DerivedNormalizationProps.txt "
            "DerivedCoreProperties.txt > rust-unicode-data.h".format(sys.argv[0]),
            file=sys.stderr,
        )
        # sys.exit is always available; the bare exit() helper is only
        # installed by the `site` module.
        sys.exit(-1)
    unicode_txt_path: str = sys.argv[1]
    norm_props_txt_path: str = sys.argv[2]
    core_props_txt_path: str = sys.argv[3]

    read_unicode_data_txt(unicode_txt_path)
    read_derived_norm_props_txt(norm_props_txt_path)
    read_derived_core_props_txt(core_props_txt_path)

    print(COPYRIGHT)
    print()

    print('#include "rust-system.h"\n')
    print("namespace Rust {\n")
    # The array sizes are emitted first because the ALPHABETIC_RANGES and
    # NUMERIC_CODEPOINTS declarations refer to them by name.
    print("const uint32_t NUM_ALPHABETIC_RANGES = {};".format(len(alphabetic_ranges)))
    print(
        "const uint32_t NUM_NUMERIC_CODEPOINTS = {};\n".format(len(numeric_codepoints))
    )

    write_decomposition()
    print()
    write_recomposition()
    print()
    write_ccc()
    print()
    write_alphabetic()
    print()
    write_numeric()
    print()
    write_nfc_qc()
    print()

    print("} // namespace Rust")


if __name__ == "__main__":
    main()