#!/usr/bin/env python3
#
# Script to dump a UTF-8 file as a list of numbered lines (mimicking GCC's
# diagnostic output format), interleaved with lines per character showing
# the Unicode codepoints, the UTF-8 encoding bytes, the name of the
# character, and, where printable, the characters themselves.
# The lines are printed in logical order, which may help the reader to grok
# the relationship between visual and logical ordering in bi-di files.
#
# SPDX-License-Identifier: MIT
#
# Copyright (C) 2021 David Malcolm <dmalcolm@redhat.com>.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
# OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import sys
import unicodedata
  33. def get_name(ch):
  34. try:
  35. return unicodedata.name(ch)
  36. except ValueError:
  37. if ch == '\n':
  38. return 'LINE FEED (LF)'
  39. return '(unknown)'
  40. def get_printable(ch):
  41. cat = unicodedata.category(ch)
  42. if cat == 'Cc':
  43. return '(control character)'
  44. elif cat == 'Cf':
  45. return '(format control)'
  46. elif cat[0] == 'Z':
  47. return '(separator)'
  48. return ch
  49. def dump_file(f_in):
  50. line_num = 1
  51. for line in f_in:
  52. print('%4i | %s' % (line_num, line.rstrip()))
  53. for ch in line:
  54. utf8_desc = '%15s' % (' '.join(['0x%02x' % b
  55. for b in ch.encode('utf-8')]))
  56. print('%4s | U+%04X %s %40s %s'
  57. % ('', ord(ch), utf8_desc, get_name(ch), get_printable(ch)))
  58. line_num += 1
  59. with open(sys.argv[1], mode='r') as f_in:
  60. dump_file(f_in)