123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106 |
- #!/usr/bin/env python3
- #
- # Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py.
- #
- # This file is part of GCC.
- #
- # GCC is free software; you can redistribute it and/or modify it under
- # the terms of the GNU General Public License as published by the Free
- # Software Foundation; either version 3, or (at your option) any later
- # version.
- #
- # GCC is distributed in the hope that it will be useful, but WITHOUT ANY
- # WARRANTY; without even the implied warranty of MERCHANTABILITY or
- # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
- # for more details.
- #
- # You should have received a copy of the GNU General Public License
- # along with GCC; see the file COPYING3. If not see
- # <http://www.gnu.org/licenses/>. */
- import sys
- import os
# Exactly one argument is required: the Unicode version to generate tables for.
if len(sys.argv) != 2:
    # Substitute the program name; previously the literal "%s" was printed.
    print("usage: %s <unicode version>" % sys.argv[0], file=sys.stderr)
    sys.exit(1)
unicode_version = sys.argv[1]
# Parse a codepoint in the format output by glibc tools, e.g. "<UA8FF>".
# Returns the codepoint as an int.  Raises ValueError when the "<U" prefix or
# ">" suffix is missing, or when the text between them is not hexadecimal.
def parse_ucn(s):
    has_markers = s.startswith("<U") and s.endswith(">")
    if has_markers:
        # Drop the leading "<U" and the trailing ">" to leave the hex digits.
        hex_digits = s[2:-1]
        return int(hex_digits, base=16)
    raise ValueError
# Process a line of width output from utf8_gen.py and update global array.
# widths[c] holds the width recorded for codepoint c; every codepoint starts
# out at width 1 and keeps it unless a processed line says otherwise.
widths = [1] * (1 + 0x10FFFF)
def process_width(line):
    # Example lines:
    # <UA8FF> 0
    # <UA926>...<UA92D> 0
    #
    # Raises IndexError if the line has fewer than two fields, and ValueError
    # if the width or a codepoint does not parse, or if the range is malformed,
    # empty/reversed, or not fully inside the table.
    s = line.split()
    width = int(s[1])
    r = s[0].split("...")
    if len(r) == 1:
        # Single codepoint: a half-open range of length one.
        begin = parse_ucn(r[0])
        end = begin + 1
    elif len(r) == 2:
        # Inclusive range in the input; convert to half-open.
        begin = parse_ucn(r[0])
        end = parse_ucn(r[1]) + 1
    else:
        raise ValueError("malformed codepoint range")
    # Slice assignment beyond the end of a list silently extends it, and
    # negative or reversed bounds select unintended slices.  Reject all of
    # those so the table always keeps exactly one entry per codepoint.
    if not 0 <= begin < end <= len(widths):
        raise ValueError("codepoint range outside the width table")
    widths[begin:end] = [width] * (end - begin)
# To keep things simple, we use glibc utf8_gen.py as-is. It only outputs to a
# file named UTF-8, which is not configurable. Then we parse this into the form
# we want it.
status = os.system(
    "from_glibc/utf8_gen.py --unicode_version %s" % unicode_version)
if status != 0:
    # Do not go on to parse a UTF-8 file left over from some earlier run.
    print("error: from_glibc/utf8_gen.py failed (status %d)" % status,
          file=sys.stderr)
    sys.exit(1)

# Only the lines between the "WIDTH" and "END WIDTH" marker lines are handed
# to process_width; everything else in the file is skipped.
processing = False
with open("UTF-8", "r") as charmap:
    for line in charmap:
        if processing:
            if line == "END WIDTH\n":
                processing = False
            else:
                try:
                    process_width(line)
                except (ValueError, IndexError) as e:
                    # Bind the exception so it can be reported; the handler
                    # used to reference an undefined name and crash here.
                    print(e, "warning: ignored unexpected line: %s" % line,
                          file=sys.stderr, end="")
        elif line == "WIDTH\n":
            processing = True
# All bytes < 256 we treat as width 1.  The slice end is exclusive, so it
# must be 256 for codepoint 255 to be covered as well.
widths[0:256] = [1] * 256
# Condense the list to contiguous ranges.  Each entry of all_ranges is
# [last codepoint of the run, width shared by the run].
# NOTE(review): the run still open when the loop ends is never appended, so
# the table stops before the trailing run -- presumably the consumer applies
# a default width past the last entry; confirm before changing this.
all_ranges = []
cur_range = [-1, 1]
for i, width in enumerate(widths):
    if width != cur_range[1]:
        # Width changed: close the current run and open a new one here.
        all_ranges.append(cur_range)
        cur_range = [i, width]
        continue
    # Same width as the open run: extend its end to this codepoint.
    cur_range[0] = i
# Output the arrays for generated_cpp_wcwidth.h
# First the two-line C comment recording how the tables were produced.
banner_rows = (
    ("/* Generated by contrib/unicode/gen_wcwidth.py,",
     "with the help of glibc's"),
    (" utf8_gen.py, using version %s" % unicode_version,
     "of the Unicode standard. */"),
)
for pieces in banner_rows:
    # print joins the pieces with a single space and ends the line.
    print(*pieces)
# Emit the last codepoint of every run in hex, eight entries per row.
print("\nstatic const cppchar_t wcwidth_range_ends[] = {", end="")
end_cells = []
for idx, rng in enumerate(all_ranges):
    # A new row starts at every eighth entry; otherwise separate by a space.
    lead = " " if idx % 8 else "\n "
    end_cells.append(lead + "0x%x," % (rng[0]))
print("".join(end_cells), end="")
print("\n};\n")
# Emit the width of every run in decimal, twenty-four entries per row.
print("static const unsigned char wcwidth_widths[] = {", end="")
width_cells = []
for idx, rng in enumerate(all_ranges):
    # A new row starts at every 24th entry; otherwise separate by a space.
    lead = " " if idx % 24 else "\n "
    width_cells.append(lead + "%d," % rng[1])
print("".join(width_cells), end="")
print("\n};")
|