bzip2.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502
  1. // Copyright 2011 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package bzip2 implements bzip2 decompression.
  5. package bzip2
  6. import "io"
  7. // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
  8. // of guessing: https://en.wikipedia.org/wiki/Bzip2
  9. // The source code to pyflate was useful for debugging:
  10. // http://www.paul.sladen.org/projects/pyflate
  11. // A StructuralError is returned when the bzip2 data is found to be
  12. // syntactically invalid.
  13. type StructuralError string
  14. func (s StructuralError) Error() string {
  15. return "bzip2 data invalid: " + string(s)
  16. }
  17. // A reader decompresses bzip2 compressed data.
  18. type reader struct {
  19. br bitReader
  20. fileCRC uint32
  21. blockCRC uint32
  22. wantBlockCRC uint32
  23. setupDone bool // true if we have parsed the bzip2 header.
  24. blockSize int // blockSize in bytes, i.e. 900 * 1000.
  25. eof bool
  26. c [256]uint // the ``C'' array for the inverse BWT.
  27. tt []uint32 // mirrors the ``tt'' array in the bzip2 source and contains the P array in the upper 24 bits.
  28. tPos uint32 // Index of the next output byte in tt.
  29. preRLE []uint32 // contains the RLE data still to be processed.
  30. preRLEUsed int // number of entries of preRLE used.
  31. lastByte int // the last byte value seen.
  32. byteRepeats uint // the number of repeats of lastByte seen.
  33. repeats uint // the number of copies of lastByte to output.
  34. }
  35. // NewReader returns an io.Reader which decompresses bzip2 data from r.
  36. // If r does not also implement io.ByteReader,
  37. // the decompressor may read more data than necessary from r.
  38. func NewReader(r io.Reader) io.Reader {
  39. bz2 := new(reader)
  40. bz2.br = newBitReader(r)
  41. return bz2
  42. }
  43. const bzip2FileMagic = 0x425a // "BZ"
  44. const bzip2BlockMagic = 0x314159265359
  45. const bzip2FinalMagic = 0x177245385090
  46. // setup parses the bzip2 header.
  47. func (bz2 *reader) setup(needMagic bool) error {
  48. br := &bz2.br
  49. if needMagic {
  50. magic := br.ReadBits(16)
  51. if magic != bzip2FileMagic {
  52. return StructuralError("bad magic value")
  53. }
  54. }
  55. t := br.ReadBits(8)
  56. if t != 'h' {
  57. return StructuralError("non-Huffman entropy encoding")
  58. }
  59. level := br.ReadBits(8)
  60. if level < '1' || level > '9' {
  61. return StructuralError("invalid compression level")
  62. }
  63. bz2.fileCRC = 0
  64. bz2.blockSize = 100 * 1000 * (level - '0')
  65. if bz2.blockSize > len(bz2.tt) {
  66. bz2.tt = make([]uint32, bz2.blockSize)
  67. }
  68. return nil
  69. }
  70. func (bz2 *reader) Read(buf []byte) (n int, err error) {
  71. if bz2.eof {
  72. return 0, io.EOF
  73. }
  74. if !bz2.setupDone {
  75. err = bz2.setup(true)
  76. brErr := bz2.br.Err()
  77. if brErr != nil {
  78. err = brErr
  79. }
  80. if err != nil {
  81. return 0, err
  82. }
  83. bz2.setupDone = true
  84. }
  85. n, err = bz2.read(buf)
  86. brErr := bz2.br.Err()
  87. if brErr != nil {
  88. err = brErr
  89. }
  90. return
  91. }
  92. func (bz2 *reader) readFromBlock(buf []byte) int {
  93. // bzip2 is a block based compressor, except that it has a run-length
  94. // preprocessing step. The block based nature means that we can
  95. // preallocate fixed-size buffers and reuse them. However, the RLE
  96. // preprocessing would require allocating huge buffers to store the
  97. // maximum expansion. Thus we process blocks all at once, except for
  98. // the RLE which we decompress as required.
  99. n := 0
  100. for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
  101. // We have RLE data pending.
  102. // The run-length encoding works like this:
  103. // Any sequence of four equal bytes is followed by a length
  104. // byte which contains the number of repeats of that byte to
  105. // include. (The number of repeats can be zero.) Because we are
  106. // decompressing on-demand our state is kept in the reader
  107. // object.
  108. if bz2.repeats > 0 {
  109. buf[n] = byte(bz2.lastByte)
  110. n++
  111. bz2.repeats--
  112. if bz2.repeats == 0 {
  113. bz2.lastByte = -1
  114. }
  115. continue
  116. }
  117. bz2.tPos = bz2.preRLE[bz2.tPos]
  118. b := byte(bz2.tPos)
  119. bz2.tPos >>= 8
  120. bz2.preRLEUsed++
  121. if bz2.byteRepeats == 3 {
  122. bz2.repeats = uint(b)
  123. bz2.byteRepeats = 0
  124. continue
  125. }
  126. if bz2.lastByte == int(b) {
  127. bz2.byteRepeats++
  128. } else {
  129. bz2.byteRepeats = 0
  130. }
  131. bz2.lastByte = int(b)
  132. buf[n] = b
  133. n++
  134. }
  135. return n
  136. }
  137. func (bz2 *reader) read(buf []byte) (int, error) {
  138. for {
  139. n := bz2.readFromBlock(buf)
  140. if n > 0 || len(buf) == 0 {
  141. bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n])
  142. return n, nil
  143. }
  144. // End of block. Check CRC.
  145. if bz2.blockCRC != bz2.wantBlockCRC {
  146. bz2.br.err = StructuralError("block checksum mismatch")
  147. return 0, bz2.br.err
  148. }
  149. // Find next block.
  150. br := &bz2.br
  151. switch br.ReadBits64(48) {
  152. default:
  153. return 0, StructuralError("bad magic value found")
  154. case bzip2BlockMagic:
  155. // Start of block.
  156. err := bz2.readBlock()
  157. if err != nil {
  158. return 0, err
  159. }
  160. case bzip2FinalMagic:
  161. // Check end-of-file CRC.
  162. wantFileCRC := uint32(br.ReadBits64(32))
  163. if br.err != nil {
  164. return 0, br.err
  165. }
  166. if bz2.fileCRC != wantFileCRC {
  167. br.err = StructuralError("file checksum mismatch")
  168. return 0, br.err
  169. }
  170. // Skip ahead to byte boundary.
  171. // Is there a file concatenated to this one?
  172. // It would start with BZ.
  173. if br.bits%8 != 0 {
  174. br.ReadBits(br.bits % 8)
  175. }
  176. b, err := br.r.ReadByte()
  177. if err == io.EOF {
  178. br.err = io.EOF
  179. bz2.eof = true
  180. return 0, io.EOF
  181. }
  182. if err != nil {
  183. br.err = err
  184. return 0, err
  185. }
  186. z, err := br.r.ReadByte()
  187. if err != nil {
  188. if err == io.EOF {
  189. err = io.ErrUnexpectedEOF
  190. }
  191. br.err = err
  192. return 0, err
  193. }
  194. if b != 'B' || z != 'Z' {
  195. return 0, StructuralError("bad magic value in continuation file")
  196. }
  197. if err := bz2.setup(false); err != nil {
  198. return 0, err
  199. }
  200. }
  201. }
  202. }
  203. // readBlock reads a bzip2 block. The magic number should already have been consumed.
  204. func (bz2 *reader) readBlock() (err error) {
  205. br := &bz2.br
  206. bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is.
  207. bz2.blockCRC = 0
  208. bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC
  209. randomized := br.ReadBits(1)
  210. if randomized != 0 {
  211. return StructuralError("deprecated randomized files")
  212. }
  213. origPtr := uint(br.ReadBits(24))
  214. // If not every byte value is used in the block (i.e., it's text) then
  215. // the symbol set is reduced. The symbols used are stored as a
  216. // two-level, 16x16 bitmap.
  217. symbolRangeUsedBitmap := br.ReadBits(16)
  218. symbolPresent := make([]bool, 256)
  219. numSymbols := 0
  220. for symRange := uint(0); symRange < 16; symRange++ {
  221. if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
  222. bits := br.ReadBits(16)
  223. for symbol := uint(0); symbol < 16; symbol++ {
  224. if bits&(1<<(15-symbol)) != 0 {
  225. symbolPresent[16*symRange+symbol] = true
  226. numSymbols++
  227. }
  228. }
  229. }
  230. }
  231. if numSymbols == 0 {
  232. // There must be an EOF symbol.
  233. return StructuralError("no symbols in input")
  234. }
  235. // A block uses between two and six different Huffman trees.
  236. numHuffmanTrees := br.ReadBits(3)
  237. if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
  238. return StructuralError("invalid number of Huffman trees")
  239. }
  240. // The Huffman tree can switch every 50 symbols so there's a list of
  241. // tree indexes telling us which tree to use for each 50 symbol block.
  242. numSelectors := br.ReadBits(15)
  243. treeIndexes := make([]uint8, numSelectors)
  244. // The tree indexes are move-to-front transformed and stored as unary
  245. // numbers.
  246. mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
  247. for i := range treeIndexes {
  248. c := 0
  249. for {
  250. inc := br.ReadBits(1)
  251. if inc == 0 {
  252. break
  253. }
  254. c++
  255. }
  256. if c >= numHuffmanTrees {
  257. return StructuralError("tree index too large")
  258. }
  259. treeIndexes[i] = mtfTreeDecoder.Decode(c)
  260. }
  261. // The list of symbols for the move-to-front transform is taken from
  262. // the previously decoded symbol bitmap.
  263. symbols := make([]byte, numSymbols)
  264. nextSymbol := 0
  265. for i := 0; i < 256; i++ {
  266. if symbolPresent[i] {
  267. symbols[nextSymbol] = byte(i)
  268. nextSymbol++
  269. }
  270. }
  271. mtf := newMTFDecoder(symbols)
  272. numSymbols += 2 // to account for RUNA and RUNB symbols
  273. huffmanTrees := make([]huffmanTree, numHuffmanTrees)
  274. // Now we decode the arrays of code-lengths for each tree.
  275. lengths := make([]uint8, numSymbols)
  276. for i := range huffmanTrees {
  277. // The code lengths are delta encoded from a 5-bit base value.
  278. length := br.ReadBits(5)
  279. for j := range lengths {
  280. for {
  281. if length < 1 || length > 20 {
  282. return StructuralError("Huffman length out of range")
  283. }
  284. if !br.ReadBit() {
  285. break
  286. }
  287. if br.ReadBit() {
  288. length--
  289. } else {
  290. length++
  291. }
  292. }
  293. lengths[j] = uint8(length)
  294. }
  295. huffmanTrees[i], err = newHuffmanTree(lengths)
  296. if err != nil {
  297. return err
  298. }
  299. }
  300. selectorIndex := 1 // the next tree index to use
  301. if len(treeIndexes) == 0 {
  302. return StructuralError("no tree selectors given")
  303. }
  304. if int(treeIndexes[0]) >= len(huffmanTrees) {
  305. return StructuralError("tree selector out of range")
  306. }
  307. currentHuffmanTree := huffmanTrees[treeIndexes[0]]
  308. bufIndex := 0 // indexes bz2.buf, the output buffer.
  309. // The output of the move-to-front transform is run-length encoded and
  310. // we merge the decoding into the Huffman parsing loop. These two
  311. // variables accumulate the repeat count. See the Wikipedia page for
  312. // details.
  313. repeat := 0
  314. repeatPower := 0
  315. // The `C' array (used by the inverse BWT) needs to be zero initialized.
  316. for i := range bz2.c {
  317. bz2.c[i] = 0
  318. }
  319. decoded := 0 // counts the number of symbols decoded by the current tree.
  320. for {
  321. if decoded == 50 {
  322. if selectorIndex >= numSelectors {
  323. return StructuralError("insufficient selector indices for number of symbols")
  324. }
  325. if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) {
  326. return StructuralError("tree selector out of range")
  327. }
  328. currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
  329. selectorIndex++
  330. decoded = 0
  331. }
  332. v := currentHuffmanTree.Decode(br)
  333. decoded++
  334. if v < 2 {
  335. // This is either the RUNA or RUNB symbol.
  336. if repeat == 0 {
  337. repeatPower = 1
  338. }
  339. repeat += repeatPower << v
  340. repeatPower <<= 1
  341. // This limit of 2 million comes from the bzip2 source
  342. // code. It prevents repeat from overflowing.
  343. if repeat > 2*1024*1024 {
  344. return StructuralError("repeat count too large")
  345. }
  346. continue
  347. }
  348. if repeat > 0 {
  349. // We have decoded a complete run-length so we need to
  350. // replicate the last output symbol.
  351. if repeat > bz2.blockSize-bufIndex {
  352. return StructuralError("repeats past end of block")
  353. }
  354. for i := 0; i < repeat; i++ {
  355. b := mtf.First()
  356. bz2.tt[bufIndex] = uint32(b)
  357. bz2.c[b]++
  358. bufIndex++
  359. }
  360. repeat = 0
  361. }
  362. if int(v) == numSymbols-1 {
  363. // This is the EOF symbol. Because it's always at the
  364. // end of the move-to-front list, and never gets moved
  365. // to the front, it has this unique value.
  366. break
  367. }
  368. // Since two metasymbols (RUNA and RUNB) have values 0 and 1,
  369. // one would expect |v-2| to be passed to the MTF decoder.
  370. // However, the front of the MTF list is never referenced as 0,
  371. // it's always referenced with a run-length of 1. Thus 0
  372. // doesn't need to be encoded and we have |v-1| in the next
  373. // line.
  374. b := mtf.Decode(int(v - 1))
  375. if bufIndex >= bz2.blockSize {
  376. return StructuralError("data exceeds block size")
  377. }
  378. bz2.tt[bufIndex] = uint32(b)
  379. bz2.c[b]++
  380. bufIndex++
  381. }
  382. if origPtr >= uint(bufIndex) {
  383. return StructuralError("origPtr out of bounds")
  384. }
  385. // We have completed the entropy decoding. Now we can perform the
  386. // inverse BWT and setup the RLE buffer.
  387. bz2.preRLE = bz2.tt[:bufIndex]
  388. bz2.preRLEUsed = 0
  389. bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
  390. bz2.lastByte = -1
  391. bz2.byteRepeats = 0
  392. bz2.repeats = 0
  393. return nil
  394. }
  395. // inverseBWT implements the inverse Burrows-Wheeler transform as described in
  396. // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
  397. // In that document, origPtr is called ``I'' and c is the ``C'' array after the
  398. // first pass over the data. It's an argument here because we merge the first
  399. // pass with the Huffman decoding.
  400. //
  401. // This also implements the ``single array'' method from the bzip2 source code
  402. // which leaves the output, still shuffled, in the bottom 8 bits of tt with the
  403. // index of the next byte in the top 24-bits. The index of the first byte is
  404. // returned.
  405. func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
  406. sum := uint(0)
  407. for i := 0; i < 256; i++ {
  408. sum += c[i]
  409. c[i] = sum - c[i]
  410. }
  411. for i := range tt {
  412. b := tt[i] & 0xff
  413. tt[c[b]] |= uint32(i) << 8
  414. c[b]++
  415. }
  416. return tt[origPtr] >> 8
  417. }
  418. // This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed,
  419. // causing the bits in the input to be processed in the reverse of the usual order.
  420. var crctab [256]uint32
  421. func init() {
  422. const poly = 0x04C11DB7
  423. for i := range crctab {
  424. crc := uint32(i) << 24
  425. for j := 0; j < 8; j++ {
  426. if crc&0x80000000 != 0 {
  427. crc = (crc << 1) ^ poly
  428. } else {
  429. crc <<= 1
  430. }
  431. }
  432. crctab[i] = crc
  433. }
  434. }
  435. // updateCRC updates the crc value to incorporate the data in b.
  436. // The initial value is 0.
  437. func updateCRC(val uint32, b []byte) uint32 {
  438. crc := ^val
  439. for _, v := range b {
  440. crc = crctab[byte(crc>>24)^v] ^ (crc << 8)
  441. }
  442. return ^crc
  443. }