gen_wcwidth.py 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. #!/usr/bin/env python3
  2. #
  3. # Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py.
  4. #
  5. # This file is part of GCC.
  6. #
  7. # GCC is free software; you can redistribute it and/or modify it under
  8. # the terms of the GNU General Public License as published by the Free
  9. # Software Foundation; either version 3, or (at your option) any later
  10. # version.
  11. #
  12. # GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  13. # WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14. # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
  15. # for more details.
  16. #
  17. # You should have received a copy of the GNU General Public License
  18. # along with GCC; see the file COPYING3. If not see
  19. # <http://www.gnu.org/licenses/>.
  20. import sys
  21. import os
  22. if len(sys.argv) != 2:
  23. print("usage: %s <unicode version>", file=sys.stderr)
  24. sys.exit(1)
  25. unicode_version = sys.argv[1]
  26. # Parse a codepoint in the format output by glibc tools.
  27. def parse_ucn(s):
  28. if not (s.startswith("<U") and s.endswith(">")):
  29. raise ValueError
  30. return int(s[2:-1], base=16)
  31. # Process a line of width output from utf_gen.py and update global array.
  32. widths = [1] * (1 + 0x10FFFF)
  33. def process_width(line):
  34. # Example lines:
  35. # <UA8FF> 0
  36. # <UA926>...<UA92D> 0
  37. s = line.split()
  38. width = int(s[1])
  39. r = s[0].split("...")
  40. if len(r) == 1:
  41. begin = parse_ucn(r[0])
  42. end = begin + 1
  43. elif len(r) == 2:
  44. begin = parse_ucn(r[0])
  45. end = parse_ucn(r[1]) + 1
  46. else:
  47. raise ValueError
  48. widths[begin:end] = [width] * (end - begin)
  49. # To keep things simple, we use glibc utf8_gen.py as-is. It only outputs to a
  50. # file named UTF-8, which is not configurable. Then we parse this into the form
  51. # we want it.
  52. os.system("from_glibc/utf8_gen.py --unicode_version %s" % unicode_version)
  53. processing = False
  54. for line in open("UTF-8", "r"):
  55. if processing:
  56. if line == "END WIDTH\n":
  57. processing = False
  58. else:
  59. try:
  60. process_width(line)
  61. except (ValueError, IndexError):
  62. print(e, "warning: ignored unexpected line: %s" % line,
  63. file=sys.stderr, end="")
  64. elif line == "WIDTH\n":
  65. processing = True
  66. # All bytes < 256 we treat as width 1.
  67. widths[0:255] = [1] * 255
  68. # Condense the list to contiguous ranges.
  69. cur_range = [-1, 1]
  70. all_ranges = []
  71. for i, width in enumerate(widths):
  72. if width == cur_range[1]:
  73. cur_range[0] = i
  74. else:
  75. all_ranges.append(cur_range)
  76. cur_range = [i, width]
  77. # Output the arrays for generated_cpp_wcwidth.h
  78. print("/* Generated by contrib/unicode/gen_wcwidth.py,",
  79. "with the help of glibc's")
  80. print(" utf8_gen.py, using version %s" % unicode_version,
  81. "of the Unicode standard. */")
  82. print("\nstatic const cppchar_t wcwidth_range_ends[] = {", end="")
  83. for i, r in enumerate(all_ranges):
  84. if i % 8:
  85. print(" ", end="")
  86. else:
  87. print("\n ", end="")
  88. print("0x%x," % (r[0]), end="")
  89. print("\n};\n")
  90. print("static const unsigned char wcwidth_widths[] = {", end="")
  91. for i, r in enumerate(all_ranges):
  92. if i % 24:
  93. print(" ", end="")
  94. else:
  95. print("\n ", end="")
  96. print("%d," % r[1], end="")
  97. print("\n};")