search.go 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package strings
  5. // stringFinder efficiently finds strings in a source text. It's implemented
  6. // using the Boyer-Moore string search algorithm:
  7. // https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
  8. // https://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged
  9. // document uses 1-based indexing)
  10. type stringFinder struct {
  11. // pattern is the string that we are searching for in the text.
  12. pattern string
  13. // badCharSkip[b] contains the distance between the last byte of pattern
  14. // and the rightmost occurrence of b in pattern. If b is not in pattern,
  15. // badCharSkip[b] is len(pattern).
  16. //
  17. // Whenever a mismatch is found with byte b in the text, we can safely
  18. // shift the matching frame at least badCharSkip[b] until the next time
  19. // the matching char could be in alignment.
  20. badCharSkip [256]int
  21. // goodSuffixSkip[i] defines how far we can shift the matching frame given
  22. // that the suffix pattern[i+1:] matches, but the byte pattern[i] does
  23. // not. There are two cases to consider:
  24. //
  25. // 1. The matched suffix occurs elsewhere in pattern (with a different
  26. // byte preceding it that we might possibly match). In this case, we can
  27. // shift the matching frame to align with the next suffix chunk. For
  28. // example, the pattern "mississi" has the suffix "issi" next occurring
  29. // (in right-to-left order) at index 1, so goodSuffixSkip[3] ==
  30. // shift+len(suffix) == 3+4 == 7.
  31. //
  32. // 2. If the matched suffix does not occur elsewhere in pattern, then the
  33. // matching frame may share part of its prefix with the end of the
  34. // matching suffix. In this case, goodSuffixSkip[i] will contain how far
  35. // to shift the frame to align this portion of the prefix to the
  36. // suffix. For example, in the pattern "abcxxxabc", when the first
  37. // mismatch from the back is found to be in position 3, the matching
  38. // suffix "xxabc" is not found elsewhere in the pattern. However, its
  39. // rightmost "abc" (at position 6) is a prefix of the whole pattern, so
  40. // goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11.
  41. goodSuffixSkip []int
  42. }
  43. func makeStringFinder(pattern string) *stringFinder {
  44. f := &stringFinder{
  45. pattern: pattern,
  46. goodSuffixSkip: make([]int, len(pattern)),
  47. }
  48. // last is the index of the last character in the pattern.
  49. last := len(pattern) - 1
  50. // Build bad character table.
  51. // Bytes not in the pattern can skip one pattern's length.
  52. for i := range f.badCharSkip {
  53. f.badCharSkip[i] = len(pattern)
  54. }
  55. // The loop condition is < instead of <= so that the last byte does not
  56. // have a zero distance to itself. Finding this byte out of place implies
  57. // that it is not in the last position.
  58. for i := 0; i < last; i++ {
  59. f.badCharSkip[pattern[i]] = last - i
  60. }
  61. // Build good suffix table.
  62. // First pass: set each value to the next index which starts a prefix of
  63. // pattern.
  64. lastPrefix := last
  65. for i := last; i >= 0; i-- {
  66. if HasPrefix(pattern, pattern[i+1:]) {
  67. lastPrefix = i + 1
  68. }
  69. // lastPrefix is the shift, and (last-i) is len(suffix).
  70. f.goodSuffixSkip[i] = lastPrefix + last - i
  71. }
  72. // Second pass: find repeats of pattern's suffix starting from the front.
  73. for i := 0; i < last; i++ {
  74. lenSuffix := longestCommonSuffix(pattern, pattern[1:i+1])
  75. if pattern[i-lenSuffix] != pattern[last-lenSuffix] {
  76. // (last-i) is the shift, and lenSuffix is len(suffix).
  77. f.goodSuffixSkip[last-lenSuffix] = lenSuffix + last - i
  78. }
  79. }
  80. return f
  81. }
  82. func longestCommonSuffix(a, b string) (i int) {
  83. for ; i < len(a) && i < len(b); i++ {
  84. if a[len(a)-1-i] != b[len(b)-1-i] {
  85. break
  86. }
  87. }
  88. return
  89. }
  90. // next returns the index in text of the first occurrence of the pattern. If
  91. // the pattern is not found, it returns -1.
  92. func (f *stringFinder) next(text string) int {
  93. i := len(f.pattern) - 1
  94. for i < len(text) {
  95. // Compare backwards from the end until the first unmatching character.
  96. j := len(f.pattern) - 1
  97. for j >= 0 && text[i] == f.pattern[j] {
  98. i--
  99. j--
  100. }
  101. if j < 0 {
  102. return i + 1 // match
  103. }
  104. i += max(f.badCharSkip[text[i]], f.goodSuffixSkip[j])
  105. }
  106. return -1
  107. }
  108. func max(a, b int) int {
  109. if a > b {
  110. return a
  111. }
  112. return b
  113. }