#!/usr/bin/env python3
#
# Generate Unicode case-folding table for Ada.
#
# Copyright (C) 2022 Free Software Foundation, Inc.
#
# This file is part of GDB.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# This generates the ada-casefold.h header.
# Usage:
#   python ada-unicode.py
  18. import gdbcopyright
  19. # The start of the current range of case-conversions we are
  20. # processing. If RANGE_START is None, then we're outside of a range.
  21. range_start = None
  22. # End of the current range.
  23. range_end = None
  24. # The delta between RANGE_START and the upper-case variant of that
  25. # character.
  26. upper_delta = None
  27. # The delta between RANGE_START and the lower-case variant of that
  28. # character.
  29. lower_delta = None
  30. # All the ranges found and completed so far.
  31. # Each entry is a tuple of the form (START, END, UPPER_DELTA, LOWER_DELTA).
  32. all_ranges = []
  33. def finish_range():
  34. global range_start
  35. global range_end
  36. global upper_delta
  37. global lower_delta
  38. if range_start is not None:
  39. all_ranges.append((range_start, range_end, upper_delta, lower_delta))
  40. range_start = None
  41. range_end = None
  42. upper_delta = None
  43. lower_delta = None
  44. def process_codepoint(val):
  45. global range_start
  46. global range_end
  47. global upper_delta
  48. global lower_delta
  49. c = chr(val)
  50. low = c.lower()
  51. up = c.upper()
  52. # U+00DF ("LATIN SMALL LETTER SHARP S", aka eszsett) traditionally
  53. # upper-cases to the two-character string "SS" (the capital form
  54. # is a relatively recent addition -- 2017). Our simple scheme
  55. # can't handle this, so we skip it. Also, because our approach
  56. # just represents runs of characters with identical folding
  57. # deltas, this change must terminate the current run.
  58. if (c == low and c == up) or len(low) != 1 or len(up) != 1:
  59. finish_range()
  60. return
  61. updelta = ord(up) - val
  62. lowdelta = ord(low) - val
  63. if range_start is not None and (updelta != upper_delta or lowdelta != lower_delta):
  64. finish_range()
  65. if range_start is None:
  66. range_start = val
  67. upper_delta = updelta
  68. lower_delta = lowdelta
  69. range_end = val
  70. for c in range(0, 0x10FFFF):
  71. process_codepoint(c)
  72. with open("ada-casefold.h", "w") as f:
  73. print(
  74. gdbcopyright.copyright("ada-unicode.py", "UTF-32 case-folding for GDB"),
  75. file=f,
  76. )
  77. for r in all_ranges:
  78. print(f" {{{r[0]}, {r[1]}, {r[2]}, {r[3]}}},", file=f)