onepass.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507
  1. // Copyright 2014 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package regexp
  5. import (
  6. "regexp/syntax"
  7. "sort"
  8. "strings"
  9. "unicode"
  10. "unicode/utf8"
  11. )
  12. // "One-pass" regexp execution.
  13. // Some regexps can be analyzed to determine that they never need
  14. // backtracking: they are guaranteed to run in one pass over the string
  15. // without bothering to save all the usual NFA state.
  16. // Detect those and execute them more quickly.
  17. // A onePassProg is a compiled one-pass regular expression program.
  18. // It is the same as syntax.Prog except for the use of onePassInst.
  19. type onePassProg struct {
  20. Inst []onePassInst
  21. Start int // index of start instruction
  22. NumCap int // number of InstCapture insts in re
  23. }
  24. // A onePassInst is a single instruction in a one-pass regular expression program.
  25. // It is the same as syntax.Inst except for the new 'Next' field.
  26. type onePassInst struct {
  27. syntax.Inst
  28. Next []uint32
  29. }
  30. // OnePassPrefix returns a literal string that all matches for the
  31. // regexp must start with. Complete is true if the prefix
  32. // is the entire match. Pc is the index of the last rune instruction
  33. // in the string. The OnePassPrefix skips over the mandatory
  34. // EmptyBeginText
  35. func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
  36. i := &p.Inst[p.Start]
  37. if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
  38. return "", i.Op == syntax.InstMatch, uint32(p.Start)
  39. }
  40. pc = i.Out
  41. i = &p.Inst[pc]
  42. for i.Op == syntax.InstNop {
  43. pc = i.Out
  44. i = &p.Inst[pc]
  45. }
  46. // Avoid allocation of buffer if prefix is empty.
  47. if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
  48. return "", i.Op == syntax.InstMatch, uint32(p.Start)
  49. }
  50. // Have prefix; gather characters.
  51. var buf strings.Builder
  52. for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 && i.Rune[0] != utf8.RuneError {
  53. buf.WriteRune(i.Rune[0])
  54. pc, i = i.Out, &p.Inst[i.Out]
  55. }
  56. if i.Op == syntax.InstEmptyWidth &&
  57. syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
  58. p.Inst[i.Out].Op == syntax.InstMatch {
  59. complete = true
  60. }
  61. return buf.String(), complete, pc
  62. }
  63. // OnePassNext selects the next actionable state of the prog, based on the input character.
  64. // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
  65. // One of the alternates may ultimately lead without input to end of line. If the instruction
  66. // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
  67. func onePassNext(i *onePassInst, r rune) uint32 {
  68. next := i.MatchRunePos(r)
  69. if next >= 0 {
  70. return i.Next[next]
  71. }
  72. if i.Op == syntax.InstAltMatch {
  73. return i.Out
  74. }
  75. return 0
  76. }
  77. func iop(i *syntax.Inst) syntax.InstOp {
  78. op := i.Op
  79. switch op {
  80. case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
  81. op = syntax.InstRune
  82. }
  83. return op
  84. }
  85. // Sparse Array implementation is used as a queueOnePass.
  86. type queueOnePass struct {
  87. sparse []uint32
  88. dense []uint32
  89. size, nextIndex uint32
  90. }
  91. func (q *queueOnePass) empty() bool {
  92. return q.nextIndex >= q.size
  93. }
  94. func (q *queueOnePass) next() (n uint32) {
  95. n = q.dense[q.nextIndex]
  96. q.nextIndex++
  97. return
  98. }
  99. func (q *queueOnePass) clear() {
  100. q.size = 0
  101. q.nextIndex = 0
  102. }
  103. func (q *queueOnePass) contains(u uint32) bool {
  104. if u >= uint32(len(q.sparse)) {
  105. return false
  106. }
  107. return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
  108. }
  109. func (q *queueOnePass) insert(u uint32) {
  110. if !q.contains(u) {
  111. q.insertNew(u)
  112. }
  113. }
  114. func (q *queueOnePass) insertNew(u uint32) {
  115. if u >= uint32(len(q.sparse)) {
  116. return
  117. }
  118. q.sparse[u] = q.size
  119. q.dense[q.size] = u
  120. q.size++
  121. }
  122. func newQueue(size int) (q *queueOnePass) {
  123. return &queueOnePass{
  124. sparse: make([]uint32, size),
  125. dense: make([]uint32, size),
  126. }
  127. }
  128. // mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
  129. // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
  130. // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
  131. // NextIp array with the single element mergeFailed is returned.
  132. // The code assumes that both inputs contain ordered and non-intersecting rune pairs.
  133. const mergeFailed = uint32(0xffffffff)
  134. var (
  135. noRune = []rune{}
  136. noNext = []uint32{mergeFailed}
  137. )
  138. func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
  139. leftLen := len(*leftRunes)
  140. rightLen := len(*rightRunes)
  141. if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
  142. panic("mergeRuneSets odd length []rune")
  143. }
  144. var (
  145. lx, rx int
  146. )
  147. merged := make([]rune, 0)
  148. next := make([]uint32, 0)
  149. ok := true
  150. defer func() {
  151. if !ok {
  152. merged = nil
  153. next = nil
  154. }
  155. }()
  156. ix := -1
  157. extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
  158. if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
  159. return false
  160. }
  161. merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
  162. *newLow += 2
  163. ix += 2
  164. next = append(next, pc)
  165. return true
  166. }
  167. for lx < leftLen || rx < rightLen {
  168. switch {
  169. case rx >= rightLen:
  170. ok = extend(&lx, leftRunes, leftPC)
  171. case lx >= leftLen:
  172. ok = extend(&rx, rightRunes, rightPC)
  173. case (*rightRunes)[rx] < (*leftRunes)[lx]:
  174. ok = extend(&rx, rightRunes, rightPC)
  175. default:
  176. ok = extend(&lx, leftRunes, leftPC)
  177. }
  178. if !ok {
  179. return noRune, noNext
  180. }
  181. }
  182. return merged, next
  183. }
  184. // cleanupOnePass drops working memory, and restores certain shortcut instructions.
  185. func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
  186. for ix, instOriginal := range original.Inst {
  187. switch instOriginal.Op {
  188. case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
  189. case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
  190. prog.Inst[ix].Next = nil
  191. case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
  192. prog.Inst[ix].Next = nil
  193. prog.Inst[ix] = onePassInst{Inst: instOriginal}
  194. }
  195. }
  196. }
  197. // onePassCopy creates a copy of the original Prog, as we'll be modifying it
  198. func onePassCopy(prog *syntax.Prog) *onePassProg {
  199. p := &onePassProg{
  200. Start: prog.Start,
  201. NumCap: prog.NumCap,
  202. Inst: make([]onePassInst, len(prog.Inst)),
  203. }
  204. for i, inst := range prog.Inst {
  205. p.Inst[i] = onePassInst{Inst: inst}
  206. }
  207. // rewrites one or more common Prog constructs that enable some otherwise
  208. // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
  209. // ip A, that points to ips B & C.
  210. // A:BC + B:DA => A:BC + B:CD
  211. // A:BC + B:DC => A:DC + B:DC
  212. for pc := range p.Inst {
  213. switch p.Inst[pc].Op {
  214. default:
  215. continue
  216. case syntax.InstAlt, syntax.InstAltMatch:
  217. // A:Bx + B:Ay
  218. p_A_Other := &p.Inst[pc].Out
  219. p_A_Alt := &p.Inst[pc].Arg
  220. // make sure a target is another Alt
  221. instAlt := p.Inst[*p_A_Alt]
  222. if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
  223. p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
  224. instAlt = p.Inst[*p_A_Alt]
  225. if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
  226. continue
  227. }
  228. }
  229. instOther := p.Inst[*p_A_Other]
  230. // Analyzing both legs pointing to Alts is for another day
  231. if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
  232. // too complicated
  233. continue
  234. }
  235. // simple empty transition loop
  236. // A:BC + B:DA => A:BC + B:DC
  237. p_B_Alt := &p.Inst[*p_A_Alt].Out
  238. p_B_Other := &p.Inst[*p_A_Alt].Arg
  239. patch := false
  240. if instAlt.Out == uint32(pc) {
  241. patch = true
  242. } else if instAlt.Arg == uint32(pc) {
  243. patch = true
  244. p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
  245. }
  246. if patch {
  247. *p_B_Alt = *p_A_Other
  248. }
  249. // empty transition to common target
  250. // A:BC + B:DC => A:DC + B:DC
  251. if *p_A_Other == *p_B_Alt {
  252. *p_A_Alt = *p_B_Other
  253. }
  254. }
  255. }
  256. return p
  257. }
  258. // runeSlice exists to permit sorting the case-folded rune sets.
  259. type runeSlice []rune
  260. func (p runeSlice) Len() int { return len(p) }
  261. func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
  262. func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
  263. var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
  264. var anyRune = []rune{0, unicode.MaxRune}
  265. // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
  266. // the match engine can always tell which branch to take. The routine may modify
  267. // p if it is turned into a onepass Prog. If it isn't possible for this to be a
  268. // onepass Prog, the Prog nil is returned. makeOnePass is recursive
  269. // to the size of the Prog.
  270. func makeOnePass(p *onePassProg) *onePassProg {
  271. // If the machine is very long, it's not worth the time to check if we can use one pass.
  272. if len(p.Inst) >= 1000 {
  273. return nil
  274. }
  275. var (
  276. instQueue = newQueue(len(p.Inst))
  277. visitQueue = newQueue(len(p.Inst))
  278. check func(uint32, []bool) bool
  279. onePassRunes = make([][]rune, len(p.Inst))
  280. )
  281. // check that paths from Alt instructions are unambiguous, and rebuild the new
  282. // program as a onepass program
  283. check = func(pc uint32, m []bool) (ok bool) {
  284. ok = true
  285. inst := &p.Inst[pc]
  286. if visitQueue.contains(pc) {
  287. return
  288. }
  289. visitQueue.insert(pc)
  290. switch inst.Op {
  291. case syntax.InstAlt, syntax.InstAltMatch:
  292. ok = check(inst.Out, m) && check(inst.Arg, m)
  293. // check no-input paths to InstMatch
  294. matchOut := m[inst.Out]
  295. matchArg := m[inst.Arg]
  296. if matchOut && matchArg {
  297. ok = false
  298. break
  299. }
  300. // Match on empty goes in inst.Out
  301. if matchArg {
  302. inst.Out, inst.Arg = inst.Arg, inst.Out
  303. matchOut, matchArg = matchArg, matchOut
  304. }
  305. if matchOut {
  306. m[pc] = true
  307. inst.Op = syntax.InstAltMatch
  308. }
  309. // build a dispatch operator from the two legs of the alt.
  310. onePassRunes[pc], inst.Next = mergeRuneSets(
  311. &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
  312. if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
  313. ok = false
  314. break
  315. }
  316. case syntax.InstCapture, syntax.InstNop:
  317. ok = check(inst.Out, m)
  318. m[pc] = m[inst.Out]
  319. // pass matching runes back through these no-ops.
  320. onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
  321. inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
  322. for i := range inst.Next {
  323. inst.Next[i] = inst.Out
  324. }
  325. case syntax.InstEmptyWidth:
  326. ok = check(inst.Out, m)
  327. m[pc] = m[inst.Out]
  328. onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
  329. inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
  330. for i := range inst.Next {
  331. inst.Next[i] = inst.Out
  332. }
  333. case syntax.InstMatch, syntax.InstFail:
  334. m[pc] = inst.Op == syntax.InstMatch
  335. case syntax.InstRune:
  336. m[pc] = false
  337. if len(inst.Next) > 0 {
  338. break
  339. }
  340. instQueue.insert(inst.Out)
  341. if len(inst.Rune) == 0 {
  342. onePassRunes[pc] = []rune{}
  343. inst.Next = []uint32{inst.Out}
  344. break
  345. }
  346. runes := make([]rune, 0)
  347. if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
  348. r0 := inst.Rune[0]
  349. runes = append(runes, r0, r0)
  350. for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
  351. runes = append(runes, r1, r1)
  352. }
  353. sort.Sort(runeSlice(runes))
  354. } else {
  355. runes = append(runes, inst.Rune...)
  356. }
  357. onePassRunes[pc] = runes
  358. inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
  359. for i := range inst.Next {
  360. inst.Next[i] = inst.Out
  361. }
  362. inst.Op = syntax.InstRune
  363. case syntax.InstRune1:
  364. m[pc] = false
  365. if len(inst.Next) > 0 {
  366. break
  367. }
  368. instQueue.insert(inst.Out)
  369. runes := []rune{}
  370. // expand case-folded runes
  371. if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
  372. r0 := inst.Rune[0]
  373. runes = append(runes, r0, r0)
  374. for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
  375. runes = append(runes, r1, r1)
  376. }
  377. sort.Sort(runeSlice(runes))
  378. } else {
  379. runes = append(runes, inst.Rune[0], inst.Rune[0])
  380. }
  381. onePassRunes[pc] = runes
  382. inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
  383. for i := range inst.Next {
  384. inst.Next[i] = inst.Out
  385. }
  386. inst.Op = syntax.InstRune
  387. case syntax.InstRuneAny:
  388. m[pc] = false
  389. if len(inst.Next) > 0 {
  390. break
  391. }
  392. instQueue.insert(inst.Out)
  393. onePassRunes[pc] = append([]rune{}, anyRune...)
  394. inst.Next = []uint32{inst.Out}
  395. case syntax.InstRuneAnyNotNL:
  396. m[pc] = false
  397. if len(inst.Next) > 0 {
  398. break
  399. }
  400. instQueue.insert(inst.Out)
  401. onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
  402. inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
  403. for i := range inst.Next {
  404. inst.Next[i] = inst.Out
  405. }
  406. }
  407. return
  408. }
  409. instQueue.clear()
  410. instQueue.insert(uint32(p.Start))
  411. m := make([]bool, len(p.Inst))
  412. for !instQueue.empty() {
  413. visitQueue.clear()
  414. pc := instQueue.next()
  415. if !check(pc, m) {
  416. p = nil
  417. break
  418. }
  419. }
  420. if p != nil {
  421. for i := range p.Inst {
  422. p.Inst[i].Rune = onePassRunes[i]
  423. }
  424. }
  425. return p
  426. }
  427. // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
  428. // can be recharacterized as a one-pass regexp program, or syntax.nil if the
  429. // Prog cannot be converted. For a one pass prog, the fundamental condition that must
  430. // be true is: at any InstAlt, there must be no ambiguity about what branch to take.
  431. func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
  432. if prog.Start == 0 {
  433. return nil
  434. }
  435. // onepass regexp is anchored
  436. if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
  437. syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
  438. return nil
  439. }
  440. // every instruction leading to InstMatch must be EmptyEndText
  441. for _, inst := range prog.Inst {
  442. opOut := prog.Inst[inst.Out].Op
  443. switch inst.Op {
  444. default:
  445. if opOut == syntax.InstMatch {
  446. return nil
  447. }
  448. case syntax.InstAlt, syntax.InstAltMatch:
  449. if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
  450. return nil
  451. }
  452. case syntax.InstEmptyWidth:
  453. if opOut == syntax.InstMatch {
  454. if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
  455. continue
  456. }
  457. return nil
  458. }
  459. }
  460. }
  461. // Creates a slightly optimized copy of the original Prog
  462. // that cleans up some Prog idioms that block valid onepass programs
  463. p = onePassCopy(prog)
  464. // checkAmbiguity on InstAlts, build onepass Prog if possible
  465. p = makeOnePass(p)
  466. if p != nil {
  467. cleanupOnePass(p, prog)
  468. }
  469. return p
  470. }