123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502 |
- // Copyright 2011 The Go Authors. All rights reserved.
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file.
- // Package bzip2 implements bzip2 decompression.
- package bzip2
- import "io"
- // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
- // of guessing: https://en.wikipedia.org/wiki/Bzip2
- // The source code to pyflate was useful for debugging:
- // http://www.paul.sladen.org/projects/pyflate
- // A StructuralError is returned when the bzip2 data is found to be
- // syntactically invalid.
- type StructuralError string
- func (s StructuralError) Error() string {
- return "bzip2 data invalid: " + string(s)
- }
- // A reader decompresses bzip2 compressed data.
- type reader struct {
- br bitReader
- fileCRC uint32
- blockCRC uint32
- wantBlockCRC uint32
- setupDone bool // true if we have parsed the bzip2 header.
- blockSize int // blockSize in bytes, i.e. 900 * 1000.
- eof bool
- c [256]uint // the ``C'' array for the inverse BWT.
- tt []uint32 // mirrors the ``tt'' array in the bzip2 source and contains the P array in the upper 24 bits.
- tPos uint32 // Index of the next output byte in tt.
- preRLE []uint32 // contains the RLE data still to be processed.
- preRLEUsed int // number of entries of preRLE used.
- lastByte int // the last byte value seen.
- byteRepeats uint // the number of repeats of lastByte seen.
- repeats uint // the number of copies of lastByte to output.
- }
- // NewReader returns an io.Reader which decompresses bzip2 data from r.
- // If r does not also implement io.ByteReader,
- // the decompressor may read more data than necessary from r.
- func NewReader(r io.Reader) io.Reader {
- bz2 := new(reader)
- bz2.br = newBitReader(r)
- return bz2
- }
- const bzip2FileMagic = 0x425a // "BZ"
- const bzip2BlockMagic = 0x314159265359
- const bzip2FinalMagic = 0x177245385090
- // setup parses the bzip2 header.
- func (bz2 *reader) setup(needMagic bool) error {
- br := &bz2.br
- if needMagic {
- magic := br.ReadBits(16)
- if magic != bzip2FileMagic {
- return StructuralError("bad magic value")
- }
- }
- t := br.ReadBits(8)
- if t != 'h' {
- return StructuralError("non-Huffman entropy encoding")
- }
- level := br.ReadBits(8)
- if level < '1' || level > '9' {
- return StructuralError("invalid compression level")
- }
- bz2.fileCRC = 0
- bz2.blockSize = 100 * 1000 * (level - '0')
- if bz2.blockSize > len(bz2.tt) {
- bz2.tt = make([]uint32, bz2.blockSize)
- }
- return nil
- }
- func (bz2 *reader) Read(buf []byte) (n int, err error) {
- if bz2.eof {
- return 0, io.EOF
- }
- if !bz2.setupDone {
- err = bz2.setup(true)
- brErr := bz2.br.Err()
- if brErr != nil {
- err = brErr
- }
- if err != nil {
- return 0, err
- }
- bz2.setupDone = true
- }
- n, err = bz2.read(buf)
- brErr := bz2.br.Err()
- if brErr != nil {
- err = brErr
- }
- return
- }
- func (bz2 *reader) readFromBlock(buf []byte) int {
- // bzip2 is a block based compressor, except that it has a run-length
- // preprocessing step. The block based nature means that we can
- // preallocate fixed-size buffers and reuse them. However, the RLE
- // preprocessing would require allocating huge buffers to store the
- // maximum expansion. Thus we process blocks all at once, except for
- // the RLE which we decompress as required.
- n := 0
- for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
- // We have RLE data pending.
- // The run-length encoding works like this:
- // Any sequence of four equal bytes is followed by a length
- // byte which contains the number of repeats of that byte to
- // include. (The number of repeats can be zero.) Because we are
- // decompressing on-demand our state is kept in the reader
- // object.
- if bz2.repeats > 0 {
- buf[n] = byte(bz2.lastByte)
- n++
- bz2.repeats--
- if bz2.repeats == 0 {
- bz2.lastByte = -1
- }
- continue
- }
- bz2.tPos = bz2.preRLE[bz2.tPos]
- b := byte(bz2.tPos)
- bz2.tPos >>= 8
- bz2.preRLEUsed++
- if bz2.byteRepeats == 3 {
- bz2.repeats = uint(b)
- bz2.byteRepeats = 0
- continue
- }
- if bz2.lastByte == int(b) {
- bz2.byteRepeats++
- } else {
- bz2.byteRepeats = 0
- }
- bz2.lastByte = int(b)
- buf[n] = b
- n++
- }
- return n
- }
- func (bz2 *reader) read(buf []byte) (int, error) {
- for {
- n := bz2.readFromBlock(buf)
- if n > 0 || len(buf) == 0 {
- bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n])
- return n, nil
- }
- // End of block. Check CRC.
- if bz2.blockCRC != bz2.wantBlockCRC {
- bz2.br.err = StructuralError("block checksum mismatch")
- return 0, bz2.br.err
- }
- // Find next block.
- br := &bz2.br
- switch br.ReadBits64(48) {
- default:
- return 0, StructuralError("bad magic value found")
- case bzip2BlockMagic:
- // Start of block.
- err := bz2.readBlock()
- if err != nil {
- return 0, err
- }
- case bzip2FinalMagic:
- // Check end-of-file CRC.
- wantFileCRC := uint32(br.ReadBits64(32))
- if br.err != nil {
- return 0, br.err
- }
- if bz2.fileCRC != wantFileCRC {
- br.err = StructuralError("file checksum mismatch")
- return 0, br.err
- }
- // Skip ahead to byte boundary.
- // Is there a file concatenated to this one?
- // It would start with BZ.
- if br.bits%8 != 0 {
- br.ReadBits(br.bits % 8)
- }
- b, err := br.r.ReadByte()
- if err == io.EOF {
- br.err = io.EOF
- bz2.eof = true
- return 0, io.EOF
- }
- if err != nil {
- br.err = err
- return 0, err
- }
- z, err := br.r.ReadByte()
- if err != nil {
- if err == io.EOF {
- err = io.ErrUnexpectedEOF
- }
- br.err = err
- return 0, err
- }
- if b != 'B' || z != 'Z' {
- return 0, StructuralError("bad magic value in continuation file")
- }
- if err := bz2.setup(false); err != nil {
- return 0, err
- }
- }
- }
- }
- // readBlock reads a bzip2 block. The magic number should already have been consumed.
- func (bz2 *reader) readBlock() (err error) {
- br := &bz2.br
- bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is.
- bz2.blockCRC = 0
- bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC
- randomized := br.ReadBits(1)
- if randomized != 0 {
- return StructuralError("deprecated randomized files")
- }
- origPtr := uint(br.ReadBits(24))
- // If not every byte value is used in the block (i.e., it's text) then
- // the symbol set is reduced. The symbols used are stored as a
- // two-level, 16x16 bitmap.
- symbolRangeUsedBitmap := br.ReadBits(16)
- symbolPresent := make([]bool, 256)
- numSymbols := 0
- for symRange := uint(0); symRange < 16; symRange++ {
- if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
- bits := br.ReadBits(16)
- for symbol := uint(0); symbol < 16; symbol++ {
- if bits&(1<<(15-symbol)) != 0 {
- symbolPresent[16*symRange+symbol] = true
- numSymbols++
- }
- }
- }
- }
- if numSymbols == 0 {
- // There must be an EOF symbol.
- return StructuralError("no symbols in input")
- }
- // A block uses between two and six different Huffman trees.
- numHuffmanTrees := br.ReadBits(3)
- if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
- return StructuralError("invalid number of Huffman trees")
- }
- // The Huffman tree can switch every 50 symbols so there's a list of
- // tree indexes telling us which tree to use for each 50 symbol block.
- numSelectors := br.ReadBits(15)
- treeIndexes := make([]uint8, numSelectors)
- // The tree indexes are move-to-front transformed and stored as unary
- // numbers.
- mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
- for i := range treeIndexes {
- c := 0
- for {
- inc := br.ReadBits(1)
- if inc == 0 {
- break
- }
- c++
- }
- if c >= numHuffmanTrees {
- return StructuralError("tree index too large")
- }
- treeIndexes[i] = mtfTreeDecoder.Decode(c)
- }
- // The list of symbols for the move-to-front transform is taken from
- // the previously decoded symbol bitmap.
- symbols := make([]byte, numSymbols)
- nextSymbol := 0
- for i := 0; i < 256; i++ {
- if symbolPresent[i] {
- symbols[nextSymbol] = byte(i)
- nextSymbol++
- }
- }
- mtf := newMTFDecoder(symbols)
- numSymbols += 2 // to account for RUNA and RUNB symbols
- huffmanTrees := make([]huffmanTree, numHuffmanTrees)
- // Now we decode the arrays of code-lengths for each tree.
- lengths := make([]uint8, numSymbols)
- for i := range huffmanTrees {
- // The code lengths are delta encoded from a 5-bit base value.
- length := br.ReadBits(5)
- for j := range lengths {
- for {
- if length < 1 || length > 20 {
- return StructuralError("Huffman length out of range")
- }
- if !br.ReadBit() {
- break
- }
- if br.ReadBit() {
- length--
- } else {
- length++
- }
- }
- lengths[j] = uint8(length)
- }
- huffmanTrees[i], err = newHuffmanTree(lengths)
- if err != nil {
- return err
- }
- }
- selectorIndex := 1 // the next tree index to use
- if len(treeIndexes) == 0 {
- return StructuralError("no tree selectors given")
- }
- if int(treeIndexes[0]) >= len(huffmanTrees) {
- return StructuralError("tree selector out of range")
- }
- currentHuffmanTree := huffmanTrees[treeIndexes[0]]
- bufIndex := 0 // indexes bz2.buf, the output buffer.
- // The output of the move-to-front transform is run-length encoded and
- // we merge the decoding into the Huffman parsing loop. These two
- // variables accumulate the repeat count. See the Wikipedia page for
- // details.
- repeat := 0
- repeatPower := 0
- // The `C' array (used by the inverse BWT) needs to be zero initialized.
- for i := range bz2.c {
- bz2.c[i] = 0
- }
- decoded := 0 // counts the number of symbols decoded by the current tree.
- for {
- if decoded == 50 {
- if selectorIndex >= numSelectors {
- return StructuralError("insufficient selector indices for number of symbols")
- }
- if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) {
- return StructuralError("tree selector out of range")
- }
- currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
- selectorIndex++
- decoded = 0
- }
- v := currentHuffmanTree.Decode(br)
- decoded++
- if v < 2 {
- // This is either the RUNA or RUNB symbol.
- if repeat == 0 {
- repeatPower = 1
- }
- repeat += repeatPower << v
- repeatPower <<= 1
- // This limit of 2 million comes from the bzip2 source
- // code. It prevents repeat from overflowing.
- if repeat > 2*1024*1024 {
- return StructuralError("repeat count too large")
- }
- continue
- }
- if repeat > 0 {
- // We have decoded a complete run-length so we need to
- // replicate the last output symbol.
- if repeat > bz2.blockSize-bufIndex {
- return StructuralError("repeats past end of block")
- }
- for i := 0; i < repeat; i++ {
- b := mtf.First()
- bz2.tt[bufIndex] = uint32(b)
- bz2.c[b]++
- bufIndex++
- }
- repeat = 0
- }
- if int(v) == numSymbols-1 {
- // This is the EOF symbol. Because it's always at the
- // end of the move-to-front list, and never gets moved
- // to the front, it has this unique value.
- break
- }
- // Since two metasymbols (RUNA and RUNB) have values 0 and 1,
- // one would expect |v-2| to be passed to the MTF decoder.
- // However, the front of the MTF list is never referenced as 0,
- // it's always referenced with a run-length of 1. Thus 0
- // doesn't need to be encoded and we have |v-1| in the next
- // line.
- b := mtf.Decode(int(v - 1))
- if bufIndex >= bz2.blockSize {
- return StructuralError("data exceeds block size")
- }
- bz2.tt[bufIndex] = uint32(b)
- bz2.c[b]++
- bufIndex++
- }
- if origPtr >= uint(bufIndex) {
- return StructuralError("origPtr out of bounds")
- }
- // We have completed the entropy decoding. Now we can perform the
- // inverse BWT and setup the RLE buffer.
- bz2.preRLE = bz2.tt[:bufIndex]
- bz2.preRLEUsed = 0
- bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
- bz2.lastByte = -1
- bz2.byteRepeats = 0
- bz2.repeats = 0
- return nil
- }
- // inverseBWT implements the inverse Burrows-Wheeler transform as described in
- // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
- // In that document, origPtr is called ``I'' and c is the ``C'' array after the
- // first pass over the data. It's an argument here because we merge the first
- // pass with the Huffman decoding.
- //
- // This also implements the ``single array'' method from the bzip2 source code
- // which leaves the output, still shuffled, in the bottom 8 bits of tt with the
- // index of the next byte in the top 24-bits. The index of the first byte is
- // returned.
- func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
- sum := uint(0)
- for i := 0; i < 256; i++ {
- sum += c[i]
- c[i] = sum - c[i]
- }
- for i := range tt {
- b := tt[i] & 0xff
- tt[c[b]] |= uint32(i) << 8
- c[b]++
- }
- return tt[origPtr] >> 8
- }
- // This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed,
- // causing the bits in the input to be processed in the reverse of the usual order.
- var crctab [256]uint32
- func init() {
- const poly = 0x04C11DB7
- for i := range crctab {
- crc := uint32(i) << 24
- for j := 0; j < 8; j++ {
- if crc&0x80000000 != 0 {
- crc = (crc << 1) ^ poly
- } else {
- crc <<= 1
- }
- }
- crctab[i] = crc
- }
- }
- // updateCRC updates the crc value to incorporate the data in b.
- // The initial value is 0.
- func updateCRC(val uint32, b []byte) uint32 {
- crc := ^val
- for _, v := range b {
- crc = crctab[byte(crc>>24)^v] ^ (crc << 8)
- }
- return ^crc
- }
|