/* Copyright (C) Teemu Suutari */ #include #include #include "BZIP2Decompressor.hpp" #include "HuffmanDecoder.hpp" #include "InputStream.hpp" #include "OutputStream.hpp" #include "common/MemoryBuffer.hpp" #include "common/CRC32.hpp" #include "common/Common.hpp" namespace ancient::internal { bool BZIP2Decompressor::detectHeader(uint32_t hdr) noexcept { return ((hdr&0xffff'ff00U)==FourCC("BZh\0") && (hdr&0xffU)>='1' && (hdr&0xffU)<='9'); } bool BZIP2Decompressor::detectHeaderXPK(uint32_t hdr) noexcept { return (hdr==FourCC("BZP2")); } std::shared_ptr BZIP2Decompressor::create(const Buffer &packedData,bool exactSizeKnown,bool verify) { return std::make_shared(packedData,exactSizeKnown,verify); } std::shared_ptr BZIP2Decompressor::create(uint32_t hdr,uint32_t recursionLevel,const Buffer &packedData,std::shared_ptr &state,bool verify) { return std::make_shared(hdr,recursionLevel,packedData,state,verify); } BZIP2Decompressor::BZIP2Decompressor(const Buffer &packedData,bool exactSizeKnown,bool verify) : _packedData(packedData), _packedSize(0) { uint32_t hdr=packedData.readBE32(0); if (!detectHeader(hdr)) throw Decompressor::InvalidFormatError();; _blockSize=((hdr&0xffU)-'0')*100'000; } BZIP2Decompressor::BZIP2Decompressor(uint32_t hdr,uint32_t recursionLevel,const Buffer &packedData,std::shared_ptr &state,bool verify) : XPKDecompressor(recursionLevel), _packedData(packedData), _packedSize(_packedData.size()) { uint32_t blockHdr=packedData.readBE32(0); if (!detectHeader(blockHdr)) throw Decompressor::InvalidFormatError();; _blockSize=((blockHdr&0xffU)-'0')*100'000; } BZIP2Decompressor::~BZIP2Decompressor() { // nothing needed } const std::string &BZIP2Decompressor::getName() const noexcept { static std::string name="bz2: bzip2"; return name; } const std::string &BZIP2Decompressor::getSubName() const noexcept { static std::string name="XPK-BZP2: bzip2"; return name; } size_t BZIP2Decompressor::getPackedSize() const noexcept { // no way to know before decompressing return _packedSize; } size_t BZIP2Decompressor::getRawSize() const noexcept { // same thing, decompression needed first return _rawSize; } void BZIP2Decompressor::decompressImpl(Buffer &rawData,bool verify) { size_t packedSize=_packedSize?_packedSize:_packedData.size(); size_t rawSize=_rawSize?_rawSize:rawData.size(); ForwardInputStream inputStream(_packedData,4,packedSize); MSBBitReader bitReader(inputStream); auto readBits=[&](uint32_t count)->uint32_t { return bitReader.readBits8(count); }; auto readBit=[&]()->uint32_t { return bitReader.readBits8(1); }; ForwardOutputStream outputStream(rawData,0,rawSize); // stream verification // // there is so much wrong in bzip2 CRC-calculation :( // 1. The bit ordering is opposite what everyone else does with CRC32 // 2. The block CRCs are calculated separately, no way of calculating a complete // CRC without knowing the block layout // 3. The CRC is the end of the stream and the stream is bit aligned. You // can't read CRC without decompressing the stream. uint32_t crc=0; auto calculateBlockCRC=[&](size_t blockPos,size_t blockSize) { crc=(crc<<1)|(crc>>31); crc^=CRC32Rev(rawData,blockPos,blockSize,0); }; HuffmanDecoder selectorDecoder { // incomplete Huffman table. errors possible HuffmanCode{1,0b000000,0}, HuffmanCode{2,0b000010,1}, HuffmanCode{3,0b000110,2}, HuffmanCode{4,0b001110,3}, HuffmanCode{5,0b011110,4}, HuffmanCode{6,0b111110,5} }; HuffmanDecoder deltaDecoder { HuffmanCode{1,0b00,0}, HuffmanCode{2,0b10,1}, HuffmanCode{2,0b11,-1} }; MemoryBuffer tmpBuffer(_blockSize); uint8_t *tmpBufferPtr=tmpBuffer.data(); // This is the dark, ancient secret of bzip2. // versions before 0.9.5 had a data randomization for "too regular" // data problematic for the bwt-implementation at that time. // although it is never utilized anymore, the support is still there // And this is exactly the kind of ancient stuff we want to support :) // // On this specific part (since it is a table of magic numbers) // we have no way other than copying it from the original reference // Table has a separate copyright, lets have it as a separate file as well #include "BZIP2Table.hpp" for (;;) { uint32_t blockHdrHigh=readBits(32); uint32_t blockHdrLow=readBits(16); if (blockHdrHigh==0x31415926U && blockHdrLow==0x5359U) { // a block // this is rather spaghetti... readBits(32); // block crc, not interested bool randomized=readBit(); // basically the random inserted is one LSB after n-th bytes // per defined in the table. uint32_t randomPos=1; uint32_t randomCounter=randomTable[0]-1; auto randomBit=[&]()->bool { // Beauty is in the eye of the beholder: this is smallest form to hide the ugliness return (!randomCounter--)?randomCounter=randomTable[randomPos++&511]:false; }; uint32_t currentPtr=readBits(24); uint32_t currentBlockSize=0; { uint32_t numHuffmanItems=2; uint32_t huffmanValues[256]; { // this is just a little bit inefficient but still we reading bit by bit since // reference does it. (bitsream format details do not spill over) std::vector usedMap(16); for (uint32_t i=0;i<16;i++) usedMap[i]=readBit(); std::vector huffmanMap(256); for (uint32_t i=0;i<16;i++) { for (uint32_t j=0;j<16;j++) huffmanMap[i*16+j]=(usedMap[i])?readBit():false; } for (uint32_t i=0;i<256;i++) if (huffmanMap[i]) numHuffmanItems++; if (numHuffmanItems==2) throw DecompressionError(); for (uint32_t currentValue=0,i=0;i<256;i++) if (huffmanMap[i]) huffmanValues[currentValue++]=i; } uint32_t huffmanGroups=readBits(3); if (huffmanGroups<2 || huffmanGroups>6) throw DecompressionError(); uint32_t selectorsUsed=readBits(15); if (!selectorsUsed) throw DecompressionError(); MemoryBuffer huffmanSelectorList(selectorsUsed); auto unMTF=[](uint8_t value,uint8_t map[])->uint8_t { uint8_t ret=map[value]; if (value) { uint8_t tmp=map[value]; for (uint32_t i=value;i;i--) map[i]=map[i-1]; map[0]=tmp; } return ret; }; // create Huffman selectors uint8_t selectorMTFMap[6]={0,1,2,3,4,5}; for (uint32_t i=0;i=huffmanGroups) throw DecompressionError(); huffmanSelectorList[i]=item; } typedef HuffmanDecoder BZIP2Decoder; std::vector dataDecoders(huffmanGroups); // Create all tables for (uint32_t i=0;i20) throw DecompressionError(); bitLengths[j]=currentBits; } dataDecoders[i].createOrderlyHuffmanTable(bitLengths,numHuffmanItems); } // Huffman decode + unRLE + unMTF BZIP2Decoder *currentHuffmanDecoder=nullptr; uint32_t currentHuffmanIndex=0; uint8_t dataMTFMap[256]; for (uint32_t i=0;i_blockSize) throw DecompressionError(); for (uint32_t i=0;i=selectorsUsed) throw DecompressionError(); currentHuffmanDecoder=&dataDecoders[huffmanSelectorList[currentHuffmanIndex++]]; } uint32_t symbolMTF=currentHuffmanDecoder->decode(readBit); // stop marker is referenced only once, and it is the last one // This means we do no have to un-MTF it for detection if (symbolMTF==numHuffmanItems-1) break; if (currentBlockSize>=_blockSize) throw DecompressionError(); if (symbolMTF<2) { currentRunLength+=currentRLEWeight<=_blockSize) throw DecompressionError(); tmpBufferPtr[currentBlockSize++]=huffmanValues[symbol]; } } decodeRLE(); if (currentPtr>=currentBlockSize) throw DecompressionError(); } // inverse BWT + final RLE decoding. // there are a few dark corners here as well // 1. Can the stream end at 4 literals without count? I assume it is a valid optimization (and that this does not spillover to next block) // 2. Can the RLE-step include counts 252 to 255 even if reference does not do them? I assume yes here as here as well // 3. Can the stream be empty? We do not take issue here about that (that should be culled out earlier already) uint32_t sums[256]; for (uint32_t i=0;i<256;i++) sums[i]=0; for (uint32_t i=0;i(); for (uint32_t i=0;i