scan_test.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package bufio_test
  5. import (
  6. . "bufio"
  7. "bytes"
  8. "errors"
  9. "io"
  10. "strings"
  11. "testing"
  12. "unicode"
  13. "unicode/utf8"
  14. )
// smallMaxTokenSize is the token-size limit the tests in this file pass to
// Scanner.MaxTokenSize.
const smallMaxTokenSize = 256 // Much smaller for more efficient testing.
  16. // Test white space table matches the Unicode definition.
  17. func TestSpace(t *testing.T) {
  18. for r := rune(0); r <= utf8.MaxRune; r++ {
  19. if IsSpace(r) != unicode.IsSpace(r) {
  20. t.Fatalf("white space property disagrees: %#U should be %t", r, unicode.IsSpace(r))
  21. }
  22. }
  23. }
// scanTests holds the inputs shared by TestScanByte and TestScanRune. It mixes
// empty, ASCII, and multi-byte inputs with invalid UTF-8 bytes.
var scanTests = []string{
	"",
	"a",
	"¼",
	"☹",
	"\x81",   // UTF-8 error
	"\uFFFD", // correctly encoded RuneError
	"abcdefgh",
	"abc def\n\t\tgh ",
	"abc¼☹\x81\uFFFD日本語\x82abc",
}
  35. func TestScanByte(t *testing.T) {
  36. for n, test := range scanTests {
  37. buf := strings.NewReader(test)
  38. s := NewScanner(buf)
  39. s.Split(ScanBytes)
  40. var i int
  41. for i = 0; s.Scan(); i++ {
  42. if b := s.Bytes(); len(b) != 1 || b[0] != test[i] {
  43. t.Errorf("#%d: %d: expected %q got %q", n, i, test, b)
  44. }
  45. }
  46. if i != len(test) {
  47. t.Errorf("#%d: termination expected at %d; got %d", n, len(test), i)
  48. }
  49. err := s.Err()
  50. if err != nil {
  51. t.Errorf("#%d: %v", n, err)
  52. }
  53. }
  54. }
  55. // Test that the rune splitter returns same sequence of runes (not bytes) as for range string.
  56. func TestScanRune(t *testing.T) {
  57. for n, test := range scanTests {
  58. buf := strings.NewReader(test)
  59. s := NewScanner(buf)
  60. s.Split(ScanRunes)
  61. var i, runeCount int
  62. var expect rune
  63. // Use a string range loop to validate the sequence of runes.
  64. for i, expect = range string(test) {
  65. if !s.Scan() {
  66. break
  67. }
  68. runeCount++
  69. got, _ := utf8.DecodeRune(s.Bytes())
  70. if got != expect {
  71. t.Errorf("#%d: %d: expected %q got %q", n, i, expect, got)
  72. }
  73. }
  74. if s.Scan() {
  75. t.Errorf("#%d: scan ran too long, got %q", n, s.Text())
  76. }
  77. testRuneCount := utf8.RuneCountInString(test)
  78. if runeCount != testRuneCount {
  79. t.Errorf("#%d: termination expected at %d; got %d", n, testRuneCount, runeCount)
  80. }
  81. err := s.Err()
  82. if err != nil {
  83. t.Errorf("#%d: %v", n, err)
  84. }
  85. }
  86. }
// wordScanTests holds the inputs for TestScanWords: empty and blank inputs,
// plus words separated by various white space characters.
var wordScanTests = []string{
	"",
	" ",
	"\n",
	"a",
	" a ",
	"abc def",
	" abc def ",
	" abc\tdef\nghi\rjkl\fmno\vpqr\u0085stu\u00a0\n",
}
  97. // Test that the word splitter returns the same data as strings.Fields.
  98. func TestScanWords(t *testing.T) {
  99. for n, test := range wordScanTests {
  100. buf := strings.NewReader(test)
  101. s := NewScanner(buf)
  102. s.Split(ScanWords)
  103. words := strings.Fields(test)
  104. var wordCount int
  105. for wordCount = 0; wordCount < len(words); wordCount++ {
  106. if !s.Scan() {
  107. break
  108. }
  109. got := s.Text()
  110. if got != words[wordCount] {
  111. t.Errorf("#%d: %d: expected %q got %q", n, wordCount, words[wordCount], got)
  112. }
  113. }
  114. if s.Scan() {
  115. t.Errorf("#%d: scan ran too long, got %q", n, s.Text())
  116. }
  117. if wordCount != len(words) {
  118. t.Errorf("#%d: termination expected at %d; got %d", n, len(words), wordCount)
  119. }
  120. err := s.Err()
  121. if err != nil {
  122. t.Errorf("#%d: %v", n, err)
  123. }
  124. }
  125. }
  126. // slowReader is a reader that returns only a few bytes at a time, to test the incremental
  127. // reads in Scanner.Scan.
  128. type slowReader struct {
  129. max int
  130. buf io.Reader
  131. }
  132. func (sr *slowReader) Read(p []byte) (n int, err error) {
  133. if len(p) > sr.max {
  134. p = p[0:sr.max]
  135. }
  136. return sr.buf.Read(p)
  137. }
  138. // genLine writes to buf a predictable but non-trivial line of text of length
  139. // n, including the terminal newline and an occasional carriage return.
  140. // If addNewline is false, the \r and \n are not emitted.
  141. func genLine(buf *bytes.Buffer, lineNum, n int, addNewline bool) {
  142. buf.Reset()
  143. doCR := lineNum%5 == 0
  144. if doCR {
  145. n--
  146. }
  147. for i := 0; i < n-1; i++ { // Stop early for \n.
  148. c := 'a' + byte(lineNum+i)
  149. if c == '\n' || c == '\r' { // Don't confuse us.
  150. c = 'N'
  151. }
  152. buf.WriteByte(c)
  153. }
  154. if addNewline {
  155. if doCR {
  156. buf.WriteByte('\r')
  157. }
  158. buf.WriteByte('\n')
  159. }
  160. }
  161. // Test the line splitter, including some carriage returns but no long lines.
  162. func TestScanLongLines(t *testing.T) {
  163. // Build a buffer of lots of line lengths up to but not exceeding smallMaxTokenSize.
  164. tmp := new(bytes.Buffer)
  165. buf := new(bytes.Buffer)
  166. lineNum := 0
  167. j := 0
  168. for i := 0; i < 2*smallMaxTokenSize; i++ {
  169. genLine(tmp, lineNum, j, true)
  170. if j < smallMaxTokenSize {
  171. j++
  172. } else {
  173. j--
  174. }
  175. buf.Write(tmp.Bytes())
  176. lineNum++
  177. }
  178. s := NewScanner(&slowReader{1, buf})
  179. s.Split(ScanLines)
  180. s.MaxTokenSize(smallMaxTokenSize)
  181. j = 0
  182. for lineNum := 0; s.Scan(); lineNum++ {
  183. genLine(tmp, lineNum, j, false)
  184. if j < smallMaxTokenSize {
  185. j++
  186. } else {
  187. j--
  188. }
  189. line := tmp.String() // We use the string-valued token here, for variety.
  190. if s.Text() != line {
  191. t.Errorf("%d: bad line: %d %d\n%.100q\n%.100q\n", lineNum, len(s.Bytes()), len(line), s.Text(), line)
  192. }
  193. }
  194. err := s.Err()
  195. if err != nil {
  196. t.Fatal(err)
  197. }
  198. }
  199. // Test that the line splitter errors out on a long line.
  200. func TestScanLineTooLong(t *testing.T) {
  201. const smallMaxTokenSize = 256 // Much smaller for more efficient testing.
  202. // Build a buffer of lots of line lengths up to but not exceeding smallMaxTokenSize.
  203. tmp := new(bytes.Buffer)
  204. buf := new(bytes.Buffer)
  205. lineNum := 0
  206. j := 0
  207. for i := 0; i < 2*smallMaxTokenSize; i++ {
  208. genLine(tmp, lineNum, j, true)
  209. j++
  210. buf.Write(tmp.Bytes())
  211. lineNum++
  212. }
  213. s := NewScanner(&slowReader{3, buf})
  214. s.Split(ScanLines)
  215. s.MaxTokenSize(smallMaxTokenSize)
  216. j = 0
  217. for lineNum := 0; s.Scan(); lineNum++ {
  218. genLine(tmp, lineNum, j, false)
  219. if j < smallMaxTokenSize {
  220. j++
  221. } else {
  222. j--
  223. }
  224. line := tmp.Bytes()
  225. if !bytes.Equal(s.Bytes(), line) {
  226. t.Errorf("%d: bad line: %d %d\n%.100q\n%.100q\n", lineNum, len(s.Bytes()), len(line), s.Bytes(), line)
  227. }
  228. }
  229. err := s.Err()
  230. if err != ErrTooLong {
  231. t.Fatalf("expected ErrTooLong; got %s", err)
  232. }
  233. }
  234. // Test that the line splitter handles a final line without a newline.
  235. func testNoNewline(text string, lines []string, t *testing.T) {
  236. buf := strings.NewReader(text)
  237. s := NewScanner(&slowReader{7, buf})
  238. s.Split(ScanLines)
  239. for lineNum := 0; s.Scan(); lineNum++ {
  240. line := lines[lineNum]
  241. if s.Text() != line {
  242. t.Errorf("%d: bad line: %d %d\n%.100q\n%.100q\n", lineNum, len(s.Bytes()), len(line), s.Bytes(), line)
  243. }
  244. }
  245. err := s.Err()
  246. if err != nil {
  247. t.Fatal(err)
  248. }
  249. }
  250. // Test that the line splitter handles a final line without a newline.
  251. func TestScanLineNoNewline(t *testing.T) {
  252. const text = "abcdefghijklmn\nopqrstuvwxyz"
  253. lines := []string{
  254. "abcdefghijklmn",
  255. "opqrstuvwxyz",
  256. }
  257. testNoNewline(text, lines, t)
  258. }
  259. // Test that the line splitter handles a final line with a carriage return but no newline.
  260. func TestScanLineReturnButNoNewline(t *testing.T) {
  261. const text = "abcdefghijklmn\nopqrstuvwxyz\r"
  262. lines := []string{
  263. "abcdefghijklmn",
  264. "opqrstuvwxyz",
  265. }
  266. testNoNewline(text, lines, t)
  267. }
  268. // Test that the line splitter handles a final empty line.
  269. func TestScanLineEmptyFinalLine(t *testing.T) {
  270. const text = "abcdefghijklmn\nopqrstuvwxyz\n\n"
  271. lines := []string{
  272. "abcdefghijklmn",
  273. "opqrstuvwxyz",
  274. "",
  275. }
  276. testNoNewline(text, lines, t)
  277. }
  278. // Test that the line splitter handles a final empty line with a carriage return but no newline.
  279. func TestScanLineEmptyFinalLineWithCR(t *testing.T) {
  280. const text = "abcdefghijklmn\nopqrstuvwxyz\n\r"
  281. lines := []string{
  282. "abcdefghijklmn",
  283. "opqrstuvwxyz",
  284. "",
  285. }
  286. testNoNewline(text, lines, t)
  287. }
// testError is the sentinel error returned by the custom split functions in
// TestSplitError and TestErrAtEOF.
var testError = errors.New("testError")
  289. // Test the correct error is returned when the split function errors out.
  290. func TestSplitError(t *testing.T) {
  291. // Create a split function that delivers a little data, then a predictable error.
  292. numSplits := 0
  293. const okCount = 7
  294. errorSplit := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
  295. if atEOF {
  296. panic("didn't get enough data")
  297. }
  298. if numSplits >= okCount {
  299. return 0, nil, testError
  300. }
  301. numSplits++
  302. return 1, data[0:1], nil
  303. }
  304. // Read the data.
  305. const text = "abcdefghijklmnopqrstuvwxyz"
  306. buf := strings.NewReader(text)
  307. s := NewScanner(&slowReader{1, buf})
  308. s.Split(errorSplit)
  309. var i int
  310. for i = 0; s.Scan(); i++ {
  311. if len(s.Bytes()) != 1 || text[i] != s.Bytes()[0] {
  312. t.Errorf("#%d: expected %q got %q", i, text[i], s.Bytes()[0])
  313. }
  314. }
  315. // Check correct termination location and error.
  316. if i != okCount {
  317. t.Errorf("unexpected termination; expected %d tokens got %d", okCount, i)
  318. }
  319. err := s.Err()
  320. if err != testError {
  321. t.Fatalf("expected %q got %v", testError, err)
  322. }
  323. }
  324. // Test that an EOF is overridden by a user-generated scan error.
  325. func TestErrAtEOF(t *testing.T) {
  326. s := NewScanner(strings.NewReader("1 2 33"))
  327. // This splitter will fail on last entry, after s.err==EOF.
  328. split := func(data []byte, atEOF bool) (advance int, token []byte, err error) {
  329. advance, token, err = ScanWords(data, atEOF)
  330. if len(token) > 1 {
  331. if s.ErrOrEOF() != io.EOF {
  332. t.Fatal("not testing EOF")
  333. }
  334. err = testError
  335. }
  336. return
  337. }
  338. s.Split(split)
  339. for s.Scan() {
  340. }
  341. if s.Err() != testError {
  342. t.Fatal("wrong error:", s.Err())
  343. }
  344. }
  345. // Test for issue 5268.
  346. type alwaysError struct{}
  347. func (alwaysError) Read(p []byte) (int, error) {
  348. return 0, io.ErrUnexpectedEOF
  349. }
  350. func TestNonEOFWithEmptyRead(t *testing.T) {
  351. scanner := NewScanner(alwaysError{})
  352. for scanner.Scan() {
  353. t.Fatal("read should fail")
  354. }
  355. err := scanner.Err()
  356. if err != io.ErrUnexpectedEOF {
  357. t.Errorf("unexpected error: %v", err)
  358. }
  359. }
  360. // Test that Scan finishes if we have endless empty reads.
  361. type endlessZeros struct{}
  362. func (endlessZeros) Read(p []byte) (int, error) {
  363. return 0, nil
  364. }
  365. func TestBadReader(t *testing.T) {
  366. scanner := NewScanner(endlessZeros{})
  367. for scanner.Scan() {
  368. t.Fatal("read should fail")
  369. }
  370. err := scanner.Err()
  371. if err != io.ErrNoProgress {
  372. t.Errorf("unexpected error: %v", err)
  373. }
  374. }
  375. func TestScanWordsExcessiveWhiteSpace(t *testing.T) {
  376. const word = "ipsum"
  377. s := strings.Repeat(" ", 4*smallMaxTokenSize) + word
  378. scanner := NewScanner(strings.NewReader(s))
  379. scanner.MaxTokenSize(smallMaxTokenSize)
  380. scanner.Split(ScanWords)
  381. if !scanner.Scan() {
  382. t.Fatalf("scan failed: %v", scanner.Err())
  383. }
  384. if token := scanner.Text(); token != word {
  385. t.Fatalf("unexpected token: %v", token)
  386. }
  387. }
  388. // Test that empty tokens, including at end of line or end of file, are found by the scanner.
  389. // Issue 8672: Could miss final empty token.
  390. func commaSplit(data []byte, atEOF bool) (advance int, token []byte, err error) {
  391. for i := 0; i < len(data); i++ {
  392. if data[i] == ',' {
  393. return i + 1, data[:i], nil
  394. }
  395. }
  396. return 0, data, ErrFinalToken
  397. }
  398. func testEmptyTokens(t *testing.T, text string, values []string) {
  399. s := NewScanner(strings.NewReader(text))
  400. s.Split(commaSplit)
  401. var i int
  402. for i = 0; s.Scan(); i++ {
  403. if i >= len(values) {
  404. t.Fatalf("got %d fields, expected %d", i+1, len(values))
  405. }
  406. if s.Text() != values[i] {
  407. t.Errorf("%d: expected %q got %q", i, values[i], s.Text())
  408. }
  409. }
  410. if i != len(values) {
  411. t.Fatalf("got %d fields, expected %d", i, len(values))
  412. }
  413. if err := s.Err(); err != nil {
  414. t.Fatal(err)
  415. }
  416. }
  417. func TestEmptyTokens(t *testing.T) {
  418. testEmptyTokens(t, "1,2,3,", []string{"1", "2", "3", ""})
  419. }
  420. func TestWithNoEmptyTokens(t *testing.T) {
  421. testEmptyTokens(t, "1,2,3", []string{"1", "2", "3"})
  422. }
  423. func loopAtEOFSplit(data []byte, atEOF bool) (advance int, token []byte, err error) {
  424. if len(data) > 0 {
  425. return 1, data[:1], nil
  426. }
  427. return 0, data, nil
  428. }
  429. func TestDontLoopForever(t *testing.T) {
  430. s := NewScanner(strings.NewReader("abc"))
  431. s.Split(loopAtEOFSplit)
  432. // Expect a panic
  433. defer func() {
  434. err := recover()
  435. if err == nil {
  436. t.Fatal("should have panicked")
  437. }
  438. if msg, ok := err.(string); !ok || !strings.Contains(msg, "empty tokens") {
  439. panic(err)
  440. }
  441. }()
  442. for count := 0; s.Scan(); count++ {
  443. if count > 1000 {
  444. t.Fatal("looping")
  445. }
  446. }
  447. if s.Err() != nil {
  448. t.Fatal("after scan:", s.Err())
  449. }
  450. }
  451. func TestBlankLines(t *testing.T) {
  452. s := NewScanner(strings.NewReader(strings.Repeat("\n", 1000)))
  453. for count := 0; s.Scan(); count++ {
  454. if count > 2000 {
  455. t.Fatal("looping")
  456. }
  457. }
  458. if s.Err() != nil {
  459. t.Fatal("after scan:", s.Err())
  460. }
  461. }
  462. type countdown int
  463. func (c *countdown) split(data []byte, atEOF bool) (advance int, token []byte, err error) {
  464. if *c > 0 {
  465. *c--
  466. return 1, data[:1], nil
  467. }
  468. return 0, nil, nil
  469. }
  470. // Check that the looping-at-EOF check doesn't trigger for merely empty tokens.
  471. func TestEmptyLinesOK(t *testing.T) {
  472. c := countdown(10000)
  473. s := NewScanner(strings.NewReader(strings.Repeat("\n", 10000)))
  474. s.Split(c.split)
  475. for s.Scan() {
  476. }
  477. if s.Err() != nil {
  478. t.Fatal("after scan:", s.Err())
  479. }
  480. if c != 0 {
  481. t.Fatalf("stopped with %d left to process", c)
  482. }
  483. }
  484. // Make sure we can read a huge token if a big enough buffer is provided.
  485. func TestHugeBuffer(t *testing.T) {
  486. text := strings.Repeat("x", 2*MaxScanTokenSize)
  487. s := NewScanner(strings.NewReader(text + "\n"))
  488. s.Buffer(make([]byte, 100), 3*MaxScanTokenSize)
  489. for s.Scan() {
  490. token := s.Text()
  491. if token != text {
  492. t.Errorf("scan got incorrect token of length %d", len(token))
  493. }
  494. }
  495. if s.Err() != nil {
  496. t.Fatal("after scan:", s.Err())
  497. }
  498. }
  499. // negativeEOFReader returns an invalid -1 at the end, as though it
  500. // were wrapping the read system call.
  501. type negativeEOFReader int
  502. func (r *negativeEOFReader) Read(p []byte) (int, error) {
  503. if *r > 0 {
  504. c := int(*r)
  505. if c > len(p) {
  506. c = len(p)
  507. }
  508. for i := 0; i < c; i++ {
  509. p[i] = 'a'
  510. }
  511. p[c-1] = '\n'
  512. *r -= negativeEOFReader(c)
  513. return c, nil
  514. }
  515. return -1, io.EOF
  516. }
  517. // Test that the scanner doesn't panic and returns ErrBadReadCount
  518. // on a reader that returns a negative count of bytes read (issue 38053).
  519. func TestNegativeEOFReader(t *testing.T) {
  520. r := negativeEOFReader(10)
  521. scanner := NewScanner(&r)
  522. c := 0
  523. for scanner.Scan() {
  524. c++
  525. if c > 1 {
  526. t.Error("read too many lines")
  527. break
  528. }
  529. }
  530. if got, want := scanner.Err(), ErrBadReadCount; got != want {
  531. t.Errorf("scanner.Err: got %v, want %v", got, want)
  532. }
  533. }
  534. // largeReader returns an invalid count that is larger than the number
  535. // of bytes requested.
  536. type largeReader struct{}
  537. func (largeReader) Read(p []byte) (int, error) {
  538. return len(p) + 1, nil
  539. }
  540. // Test that the scanner doesn't panic and returns ErrBadReadCount
  541. // on a reader that returns an impossibly large count of bytes read (issue 38053).
  542. func TestLargeReader(t *testing.T) {
  543. scanner := NewScanner(largeReader{})
  544. for scanner.Scan() {
  545. }
  546. if got, want := scanner.Err(), ErrBadReadCount; got != want {
  547. t.Errorf("scanner.Err: got %v, want %v", got, want)
  548. }
  549. }