exec.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554
  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package regexp
  5. import (
  6. "io"
  7. "regexp/syntax"
  8. "sync"
  9. )
  10. // A queue is a 'sparse array' holding pending threads of execution.
  11. // See https://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
  12. type queue struct {
  13. sparse []uint32
  14. dense []entry
  15. }
  16. // An entry is an entry on a queue.
  17. // It holds both the instruction pc and the actual thread.
  18. // Some queue entries are just place holders so that the machine
  19. // knows it has considered that pc. Such entries have t == nil.
  20. type entry struct {
  21. pc uint32
  22. t *thread
  23. }
  24. // A thread is the state of a single path through the machine:
  25. // an instruction and a corresponding capture array.
  26. // See https://swtch.com/~rsc/regexp/regexp2.html
  27. type thread struct {
  28. inst *syntax.Inst
  29. cap []int
  30. }
  31. // A machine holds all the state during an NFA simulation for p.
  32. type machine struct {
  33. re *Regexp // corresponding Regexp
  34. p *syntax.Prog // compiled program
  35. q0, q1 queue // two queues for runq, nextq
  36. pool []*thread // pool of available threads
  37. matched bool // whether a match was found
  38. matchcap []int // capture information for the match
  39. inputs inputs
  40. }
  41. type inputs struct {
  42. // cached inputs, to avoid allocation
  43. bytes inputBytes
  44. string inputString
  45. reader inputReader
  46. }
  47. func (i *inputs) newBytes(b []byte) input {
  48. i.bytes.str = b
  49. return &i.bytes
  50. }
  51. func (i *inputs) newString(s string) input {
  52. i.string.str = s
  53. return &i.string
  54. }
  55. func (i *inputs) newReader(r io.RuneReader) input {
  56. i.reader.r = r
  57. i.reader.atEOT = false
  58. i.reader.pos = 0
  59. return &i.reader
  60. }
  61. func (i *inputs) clear() {
  62. // We need to clear 1 of these.
  63. // Avoid the expense of clearing the others (pointer write barrier).
  64. if i.bytes.str != nil {
  65. i.bytes.str = nil
  66. } else if i.reader.r != nil {
  67. i.reader.r = nil
  68. } else {
  69. i.string.str = ""
  70. }
  71. }
  72. func (i *inputs) init(r io.RuneReader, b []byte, s string) (input, int) {
  73. if r != nil {
  74. return i.newReader(r), 0
  75. }
  76. if b != nil {
  77. return i.newBytes(b), len(b)
  78. }
  79. return i.newString(s), len(s)
  80. }
  81. func (m *machine) init(ncap int) {
  82. for _, t := range m.pool {
  83. t.cap = t.cap[:ncap]
  84. }
  85. m.matchcap = m.matchcap[:ncap]
  86. }
  87. // alloc allocates a new thread with the given instruction.
  88. // It uses the free pool if possible.
  89. func (m *machine) alloc(i *syntax.Inst) *thread {
  90. var t *thread
  91. if n := len(m.pool); n > 0 {
  92. t = m.pool[n-1]
  93. m.pool = m.pool[:n-1]
  94. } else {
  95. t = new(thread)
  96. t.cap = make([]int, len(m.matchcap), cap(m.matchcap))
  97. }
  98. t.inst = i
  99. return t
  100. }
  101. // A lazyFlag is a lazily-evaluated syntax.EmptyOp,
  102. // for checking zero-width flags like ^ $ \A \z \B \b.
  103. // It records the pair of relevant runes and does not
  104. // determine the implied flags until absolutely necessary
  105. // (most of the time, that means never).
  106. type lazyFlag uint64
  107. func newLazyFlag(r1, r2 rune) lazyFlag {
  108. return lazyFlag(uint64(r1)<<32 | uint64(uint32(r2)))
  109. }
  110. func (f lazyFlag) match(op syntax.EmptyOp) bool {
  111. if op == 0 {
  112. return true
  113. }
  114. r1 := rune(f >> 32)
  115. if op&syntax.EmptyBeginLine != 0 {
  116. if r1 != '\n' && r1 >= 0 {
  117. return false
  118. }
  119. op &^= syntax.EmptyBeginLine
  120. }
  121. if op&syntax.EmptyBeginText != 0 {
  122. if r1 >= 0 {
  123. return false
  124. }
  125. op &^= syntax.EmptyBeginText
  126. }
  127. if op == 0 {
  128. return true
  129. }
  130. r2 := rune(f)
  131. if op&syntax.EmptyEndLine != 0 {
  132. if r2 != '\n' && r2 >= 0 {
  133. return false
  134. }
  135. op &^= syntax.EmptyEndLine
  136. }
  137. if op&syntax.EmptyEndText != 0 {
  138. if r2 >= 0 {
  139. return false
  140. }
  141. op &^= syntax.EmptyEndText
  142. }
  143. if op == 0 {
  144. return true
  145. }
  146. if syntax.IsWordChar(r1) != syntax.IsWordChar(r2) {
  147. op &^= syntax.EmptyWordBoundary
  148. } else {
  149. op &^= syntax.EmptyNoWordBoundary
  150. }
  151. return op == 0
  152. }
  153. // match runs the machine over the input starting at pos.
  154. // It reports whether a match was found.
  155. // If so, m.matchcap holds the submatch information.
  156. func (m *machine) match(i input, pos int) bool {
  157. startCond := m.re.cond
  158. if startCond == ^syntax.EmptyOp(0) { // impossible
  159. return false
  160. }
  161. m.matched = false
  162. for i := range m.matchcap {
  163. m.matchcap[i] = -1
  164. }
  165. runq, nextq := &m.q0, &m.q1
  166. r, r1 := endOfText, endOfText
  167. width, width1 := 0, 0
  168. r, width = i.step(pos)
  169. if r != endOfText {
  170. r1, width1 = i.step(pos + width)
  171. }
  172. var flag lazyFlag
  173. if pos == 0 {
  174. flag = newLazyFlag(-1, r)
  175. } else {
  176. flag = i.context(pos)
  177. }
  178. for {
  179. if len(runq.dense) == 0 {
  180. if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
  181. // Anchored match, past beginning of text.
  182. break
  183. }
  184. if m.matched {
  185. // Have match; finished exploring alternatives.
  186. break
  187. }
  188. if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() {
  189. // Match requires literal prefix; fast search for it.
  190. advance := i.index(m.re, pos)
  191. if advance < 0 {
  192. break
  193. }
  194. pos += advance
  195. r, width = i.step(pos)
  196. r1, width1 = i.step(pos + width)
  197. }
  198. }
  199. if !m.matched {
  200. if len(m.matchcap) > 0 {
  201. m.matchcap[0] = pos
  202. }
  203. m.add(runq, uint32(m.p.Start), pos, m.matchcap, &flag, nil)
  204. }
  205. flag = newLazyFlag(r, r1)
  206. m.step(runq, nextq, pos, pos+width, r, &flag)
  207. if width == 0 {
  208. break
  209. }
  210. if len(m.matchcap) == 0 && m.matched {
  211. // Found a match and not paying attention
  212. // to where it is, so any match will do.
  213. break
  214. }
  215. pos += width
  216. r, width = r1, width1
  217. if r != endOfText {
  218. r1, width1 = i.step(pos + width)
  219. }
  220. runq, nextq = nextq, runq
  221. }
  222. m.clear(nextq)
  223. return m.matched
  224. }
  225. // clear frees all threads on the thread queue.
  226. func (m *machine) clear(q *queue) {
  227. for _, d := range q.dense {
  228. if d.t != nil {
  229. m.pool = append(m.pool, d.t)
  230. }
  231. }
  232. q.dense = q.dense[:0]
  233. }
  234. // step executes one step of the machine, running each of the threads
  235. // on runq and appending new threads to nextq.
  236. // The step processes the rune c (which may be endOfText),
  237. // which starts at position pos and ends at nextPos.
  238. // nextCond gives the setting for the empty-width flags after c.
  239. func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond *lazyFlag) {
  240. longest := m.re.longest
  241. for j := 0; j < len(runq.dense); j++ {
  242. d := &runq.dense[j]
  243. t := d.t
  244. if t == nil {
  245. continue
  246. }
  247. if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] {
  248. m.pool = append(m.pool, t)
  249. continue
  250. }
  251. i := t.inst
  252. add := false
  253. switch i.Op {
  254. default:
  255. panic("bad inst")
  256. case syntax.InstMatch:
  257. if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) {
  258. t.cap[1] = pos
  259. copy(m.matchcap, t.cap)
  260. }
  261. if !longest {
  262. // First-match mode: cut off all lower-priority threads.
  263. for _, d := range runq.dense[j+1:] {
  264. if d.t != nil {
  265. m.pool = append(m.pool, d.t)
  266. }
  267. }
  268. runq.dense = runq.dense[:0]
  269. }
  270. m.matched = true
  271. case syntax.InstRune:
  272. add = i.MatchRune(c)
  273. case syntax.InstRune1:
  274. add = c == i.Rune[0]
  275. case syntax.InstRuneAny:
  276. add = true
  277. case syntax.InstRuneAnyNotNL:
  278. add = c != '\n'
  279. }
  280. if add {
  281. t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t)
  282. }
  283. if t != nil {
  284. m.pool = append(m.pool, t)
  285. }
  286. }
  287. runq.dense = runq.dense[:0]
  288. }
  289. // add adds an entry to q for pc, unless the q already has such an entry.
  290. // It also recursively adds an entry for all instructions reachable from pc by following
  291. // empty-width conditions satisfied by cond. pos gives the current position
  292. // in the input.
  293. func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond *lazyFlag, t *thread) *thread {
  294. Again:
  295. if pc == 0 {
  296. return t
  297. }
  298. if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc {
  299. return t
  300. }
  301. j := len(q.dense)
  302. q.dense = q.dense[:j+1]
  303. d := &q.dense[j]
  304. d.t = nil
  305. d.pc = pc
  306. q.sparse[pc] = uint32(j)
  307. i := &m.p.Inst[pc]
  308. switch i.Op {
  309. default:
  310. panic("unhandled")
  311. case syntax.InstFail:
  312. // nothing
  313. case syntax.InstAlt, syntax.InstAltMatch:
  314. t = m.add(q, i.Out, pos, cap, cond, t)
  315. pc = i.Arg
  316. goto Again
  317. case syntax.InstEmptyWidth:
  318. if cond.match(syntax.EmptyOp(i.Arg)) {
  319. pc = i.Out
  320. goto Again
  321. }
  322. case syntax.InstNop:
  323. pc = i.Out
  324. goto Again
  325. case syntax.InstCapture:
  326. if int(i.Arg) < len(cap) {
  327. opos := cap[i.Arg]
  328. cap[i.Arg] = pos
  329. m.add(q, i.Out, pos, cap, cond, nil)
  330. cap[i.Arg] = opos
  331. } else {
  332. pc = i.Out
  333. goto Again
  334. }
  335. case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
  336. if t == nil {
  337. t = m.alloc(i)
  338. } else {
  339. t.inst = i
  340. }
  341. if len(cap) > 0 && &t.cap[0] != &cap[0] {
  342. copy(t.cap, cap)
  343. }
  344. d.t = t
  345. t = nil
  346. }
  347. return t
  348. }
  349. type onePassMachine struct {
  350. inputs inputs
  351. matchcap []int
  352. }
  353. var onePassPool sync.Pool
  354. func newOnePassMachine() *onePassMachine {
  355. m, ok := onePassPool.Get().(*onePassMachine)
  356. if !ok {
  357. m = new(onePassMachine)
  358. }
  359. return m
  360. }
  361. func freeOnePassMachine(m *onePassMachine) {
  362. m.inputs.clear()
  363. onePassPool.Put(m)
  364. }
  365. // doOnePass implements r.doExecute using the one-pass execution engine.
  366. func (re *Regexp) doOnePass(ir io.RuneReader, ib []byte, is string, pos, ncap int, dstCap []int) []int {
  367. startCond := re.cond
  368. if startCond == ^syntax.EmptyOp(0) { // impossible
  369. return nil
  370. }
  371. m := newOnePassMachine()
  372. if cap(m.matchcap) < ncap {
  373. m.matchcap = make([]int, ncap)
  374. } else {
  375. m.matchcap = m.matchcap[:ncap]
  376. }
  377. matched := false
  378. for i := range m.matchcap {
  379. m.matchcap[i] = -1
  380. }
  381. i, _ := m.inputs.init(ir, ib, is)
  382. r, r1 := endOfText, endOfText
  383. width, width1 := 0, 0
  384. r, width = i.step(pos)
  385. if r != endOfText {
  386. r1, width1 = i.step(pos + width)
  387. }
  388. var flag lazyFlag
  389. if pos == 0 {
  390. flag = newLazyFlag(-1, r)
  391. } else {
  392. flag = i.context(pos)
  393. }
  394. pc := re.onepass.Start
  395. inst := re.onepass.Inst[pc]
  396. // If there is a simple literal prefix, skip over it.
  397. if pos == 0 && flag.match(syntax.EmptyOp(inst.Arg)) &&
  398. len(re.prefix) > 0 && i.canCheckPrefix() {
  399. // Match requires literal prefix; fast search for it.
  400. if !i.hasPrefix(re) {
  401. goto Return
  402. }
  403. pos += len(re.prefix)
  404. r, width = i.step(pos)
  405. r1, width1 = i.step(pos + width)
  406. flag = i.context(pos)
  407. pc = int(re.prefixEnd)
  408. }
  409. for {
  410. inst = re.onepass.Inst[pc]
  411. pc = int(inst.Out)
  412. switch inst.Op {
  413. default:
  414. panic("bad inst")
  415. case syntax.InstMatch:
  416. matched = true
  417. if len(m.matchcap) > 0 {
  418. m.matchcap[0] = 0
  419. m.matchcap[1] = pos
  420. }
  421. goto Return
  422. case syntax.InstRune:
  423. if !inst.MatchRune(r) {
  424. goto Return
  425. }
  426. case syntax.InstRune1:
  427. if r != inst.Rune[0] {
  428. goto Return
  429. }
  430. case syntax.InstRuneAny:
  431. // Nothing
  432. case syntax.InstRuneAnyNotNL:
  433. if r == '\n' {
  434. goto Return
  435. }
  436. // peek at the input rune to see which branch of the Alt to take
  437. case syntax.InstAlt, syntax.InstAltMatch:
  438. pc = int(onePassNext(&inst, r))
  439. continue
  440. case syntax.InstFail:
  441. goto Return
  442. case syntax.InstNop:
  443. continue
  444. case syntax.InstEmptyWidth:
  445. if !flag.match(syntax.EmptyOp(inst.Arg)) {
  446. goto Return
  447. }
  448. continue
  449. case syntax.InstCapture:
  450. if int(inst.Arg) < len(m.matchcap) {
  451. m.matchcap[inst.Arg] = pos
  452. }
  453. continue
  454. }
  455. if width == 0 {
  456. break
  457. }
  458. flag = newLazyFlag(r, r1)
  459. pos += width
  460. r, width = r1, width1
  461. if r != endOfText {
  462. r1, width1 = i.step(pos + width)
  463. }
  464. }
  465. Return:
  466. if !matched {
  467. freeOnePassMachine(m)
  468. return nil
  469. }
  470. dstCap = append(dstCap, m.matchcap...)
  471. freeOnePassMachine(m)
  472. return dstCap
  473. }
  474. // doMatch reports whether either r, b or s match the regexp.
  475. func (re *Regexp) doMatch(r io.RuneReader, b []byte, s string) bool {
  476. return re.doExecute(r, b, s, 0, 0, nil) != nil
  477. }
  478. // doExecute finds the leftmost match in the input, appends the position
  479. // of its subexpressions to dstCap and returns dstCap.
  480. //
  481. // nil is returned if no matches are found and non-nil if matches are found.
  482. func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int, dstCap []int) []int {
  483. if dstCap == nil {
  484. // Make sure 'return dstCap' is non-nil.
  485. dstCap = arrayNoInts[:0:0]
  486. }
  487. if r == nil && len(b)+len(s) < re.minInputLen {
  488. return nil
  489. }
  490. if re.onepass != nil {
  491. return re.doOnePass(r, b, s, pos, ncap, dstCap)
  492. }
  493. if r == nil && len(b)+len(s) < re.maxBitStateLen {
  494. return re.backtrack(b, s, pos, ncap, dstCap)
  495. }
  496. m := re.get()
  497. i, _ := m.inputs.init(r, b, s)
  498. m.init(ncap)
  499. if !m.match(i, pos) {
  500. re.put(m)
  501. return nil
  502. }
  503. dstCap = append(dstCap, m.matchcap...)
  504. re.put(m)
  505. return dstCap
  506. }
  507. // arrayNoInts is returned by doExecute match if nil dstCap is passed
  508. // to it with ncap=0.
  509. var arrayNoInts [0]int