Home | History | Annotate | Download | only in courgette
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // Streams classes.
      6 //
      7 // These memory-resident streams are used for serializing data into a sequential
      8 // region of memory.
      9 //
     10 // Streams are divided into SourceStreams for reading and SinkStreams for
     11 // writing.  Streams are aggregated into Sets which allows several streams to be
     12 // used at once.  Example: we can write A1, B1, A2, B2 but achieve the memory
     13 // layout A1 A2 B1 B2 by writing 'A's to one stream and 'B's to another.
     14 //
     15 // The aggregated streams are important to Courgette's compression efficiency,
     16 // we use it to cluster similar kinds of data which helps to generate longer
     17 // common subsequences and repeated sequences.
     18 
     19 #include "courgette/streams.h"
     20 
     21 #include <memory.h>
     22 
     23 #include "base/basictypes.h"
     24 #include "base/logging.h"
     25 
     26 namespace courgette {
     27 
     28 // Update this version number if the serialization format of a StreamSet
     29 // changes.
     30 static const unsigned int kStreamsSerializationFormatVersion = 20090218;
     31 
     32 //
     33 // This is a cut down Varint implementation, implementing only what we use for
     34 // streams.
     35 //
     36 class Varint {
     37  public:
     38   // Maximum lengths of varint encoding of uint32
     39   static const int kMax32 = 5;
     40 
     41   // Parses a Varint32 encoded value from |source| and stores it in |output|,
     42   // and returns a pointer to the following byte.  Returns NULL if a valid
     43   // varint value was not found before |limit|.
     44   static const uint8* Parse32WithLimit(const uint8* source, const uint8* limit,
     45                                        uint32* output);
     46 
     47   // Writes the Varint32 encoded representation of |value| to buffer
     48   // |destination|.  |destination| must have sufficient length to hold kMax32
     49   // bytes.  Returns a pointer to the byte just past the last encoded byte.
     50   static uint8* Encode32(uint8* destination, uint32 value);
     51 };
     52 
     53 // Parses a Varint32 encoded unsigned number from |source|.  The Varint32
     54 // encoding is a little-endian sequence of bytes containing base-128 digits,
     55 // with the high order bit set to indicate if there are more digits.
     56 //
     57 // For each byte, we mask out the digit and 'or' it into the right place in the
     58 // result.
     59 //
     60 // The digit loop is unrolled for performance.  It usually exits after the first
     61 // one or two digits.
     62 const uint8* Varint::Parse32WithLimit(const uint8* source,
     63                                       const uint8* limit,
     64                                       uint32* output) {
     65   uint32 digit, result;
     66   if (source >= limit)
     67     return NULL;
     68   digit = *(source++);
     69   result = digit & 127;
     70   if (digit < 128) {
     71     *output = result;
     72     return source;
     73   }
     74 
     75   if (source >= limit)
     76     return NULL;
     77   digit = *(source++);
     78   result |= (digit & 127) <<  7;
     79   if (digit < 128) {
     80     *output = result;
     81     return source;
     82   }
     83 
     84   if (source >= limit)
     85     return NULL;
     86   digit = *(source++);
     87   result |= (digit & 127) << 14;
     88   if (digit < 128) {
     89     *output = result;
     90     return source;
     91   }
     92 
     93   if (source >= limit)
     94     return NULL;
     95   digit = *(source++);
     96   result |= (digit & 127) << 21;
     97   if (digit < 128) {
     98     *output = result;
     99     return source;
    100   }
    101 
    102   if (source >= limit)
    103     return NULL;
    104   digit = *(source++);
    105   result |= (digit & 127) << 28;
    106   if (digit < 128) {
    107     *output = result;
    108     return source;
    109   }
    110 
    111   return NULL;  // Value is too long to be a Varint32.
    112 }
    113 
    114 // Write the base-128 digits in little-endian order.  All except the last digit
    115 // have the high bit set to indicate more digits.
    116 inline uint8* Varint::Encode32(uint8* destination, uint32 value) {
    117   while (value >= 128) {
    118     *(destination++) = value | 128;
    119     value = value >> 7;
    120   }
    121   *(destination++) = value;
    122   return destination;
    123 }
    124 
    125 void SourceStream::Init(const SinkStream& sink) {
    126   Init(sink.Buffer(), sink.Length());
    127 }
    128 
    129 bool SourceStream::Read(void* destination, size_t count) {
    130   if (current_ + count > end_)
    131     return false;
    132   memcpy(destination, current_, count);
    133   current_ += count;
    134   return true;
    135 }
    136 
    137 bool SourceStream::ReadVarint32(uint32* output_value) {
    138   const uint8* after = Varint::Parse32WithLimit(current_, end_, output_value);
    139   if (!after)
    140     return false;
    141   current_ = after;
    142   return true;
    143 }
    144 
    145 bool SourceStream::ReadVarint32Signed(int32* output_value) {
    146   // Signed numbers are encoded as unsigned numbers so that numbers nearer zero
    147   // have shorter varint encoding.
    148   //  0000xxxx encoded as 000xxxx0.
    149   //  1111xxxx encoded as 000yyyy1 where yyyy is complement of xxxx.
    150   uint32 unsigned_value;
    151   if (!ReadVarint32(&unsigned_value))
    152     return false;
    153   if (unsigned_value & 1)
    154     *output_value = ~static_cast<int32>(unsigned_value >> 1);
    155   else
    156     *output_value = (unsigned_value >> 1);
    157   return true;
    158 }
    159 
    160 bool SourceStream::ShareSubstream(size_t offset, size_t length,
    161                                   SourceStream* substream) {
    162   if (offset > Remaining())
    163     return false;
    164   if (length > Remaining() - offset)
    165     return false;
    166   substream->Init(current_ + offset, length);
    167   return true;
    168 }
    169 
    170 bool SourceStream::ReadSubstream(size_t length, SourceStream* substream) {
    171   if (!ShareSubstream(0, length, substream))
    172     return false;
    173   current_ += length;
    174   return true;
    175 }
    176 
    177 bool SourceStream::Skip(size_t byte_count) {
    178   if (current_ + byte_count > end_)
    179     return false;
    180   current_ += byte_count;
    181   return true;
    182 }
    183 
    184 CheckBool SinkStream::Write(const void* data, size_t byte_count) {
    185   return buffer_.append(static_cast<const char*>(data), byte_count);
    186 }
    187 
    188 CheckBool SinkStream::WriteVarint32(uint32 value) {
    189   uint8 buffer[Varint::kMax32];
    190   uint8* end = Varint::Encode32(buffer, value);
    191   return Write(buffer, end - buffer);
    192 }
    193 
    194 CheckBool SinkStream::WriteVarint32Signed(int32 value) {
    195   // Encode signed numbers so that numbers nearer zero have shorter
    196   // varint encoding.
    197   //  0000xxxx encoded as 000xxxx0.
    198   //  1111xxxx encoded as 000yyyy1 where yyyy is complement of xxxx.
    199   bool ret;
    200   if (value < 0)
    201     ret = WriteVarint32(~value * 2 + 1);
    202   else
    203     ret = WriteVarint32(value * 2);
    204   return ret;
    205 }
    206 
    207 CheckBool SinkStream::WriteSizeVarint32(size_t value) {
    208   uint32 narrowed_value = static_cast<uint32>(value);
    209   // On 32-bit, the compiler should figure out this test always fails.
    210   LOG_ASSERT(value == narrowed_value);
    211   return WriteVarint32(narrowed_value);
    212 }
    213 
    214 CheckBool SinkStream::Append(SinkStream* other) {
    215   bool ret = Write(other->buffer_.data(), other->buffer_.size());
    216   if (ret)
    217     other->Retire();
    218   return ret;
    219 }
    220 
    221 void SinkStream::Retire() {
    222   buffer_.clear();
    223 }
    224 
    225 ////////////////////////////////////////////////////////////////////////////////
    226 
    227 SourceStreamSet::SourceStreamSet()
    228     : count_(kMaxStreams) {
    229 }
    230 
    231 SourceStreamSet::~SourceStreamSet() {
    232 }
    233 
    234 
    235 // Initializes from |source|.
    236 // The stream set for N streams is serialized as a header
    237 //   <version><N><length1><length2>...<lengthN>
    238 // followed by the stream contents
    239 //   <bytes1><bytes2>...<bytesN>
    240 //
    241 bool SourceStreamSet::Init(const void* source, size_t byte_count) {
    242   const uint8* start = static_cast<const uint8*>(source);
    243   const uint8* end = start + byte_count;
    244 
    245   unsigned int version;
    246   const uint8* finger = Varint::Parse32WithLimit(start, end, &version);
    247   if (finger == NULL)
    248     return false;
    249   if (version != kStreamsSerializationFormatVersion)
    250     return false;
    251 
    252   unsigned int count;
    253   finger = Varint::Parse32WithLimit(finger, end, &count);
    254   if (finger == NULL)
    255     return false;
    256   if (count > kMaxStreams)
    257     return false;
    258 
    259   count_ = count;
    260 
    261   unsigned int lengths[kMaxStreams];
    262   size_t accumulated_length = 0;
    263 
    264   for (size_t i = 0; i < count_; ++i) {
    265     finger = Varint::Parse32WithLimit(finger, end, &lengths[i]);
    266     if (finger == NULL)
    267       return false;
    268     accumulated_length += lengths[i];
    269   }
    270 
    271   // Remaining bytes should add up to sum of lengths.
    272   if (static_cast<size_t>(end - finger) != accumulated_length)
    273     return false;
    274 
    275   accumulated_length = finger - start;
    276   for (size_t i = 0; i < count_; ++i) {
    277     stream(i)->Init(start + accumulated_length, lengths[i]);
    278     accumulated_length += lengths[i];
    279   }
    280 
    281   return true;
    282 }
    283 
    284 bool SourceStreamSet::Init(SourceStream* source) {
    285   // TODO(sra): consume the rest of |source|.
    286   return Init(source->Buffer(), source->Remaining());
    287 }
    288 
    289 bool SourceStreamSet::ReadSet(SourceStreamSet* set) {
    290   uint32 stream_count = 0;
    291   SourceStream* control_stream = this->stream(0);
    292   if (!control_stream->ReadVarint32(&stream_count))
    293     return false;
    294 
    295   uint32 lengths[kMaxStreams] = {};  // i.e. all zero.
    296 
    297   for (size_t i = 0; i < stream_count; ++i) {
    298     if (!control_stream->ReadVarint32(&lengths[i]))
    299       return false;
    300   }
    301 
    302   for (size_t i = 0; i < stream_count; ++i) {
    303     if (!this->stream(i)->ReadSubstream(lengths[i], set->stream(i)))
    304       return false;
    305   }
    306   return true;
    307 }
    308 
    309 bool SourceStreamSet::Empty() const {
    310   for (size_t i = 0; i < count_; ++i) {
    311     if (streams_[i].Remaining() != 0)
    312       return false;
    313   }
    314   return true;
    315 }
    316 
    317 ////////////////////////////////////////////////////////////////////////////////
    318 
    319 SinkStreamSet::SinkStreamSet()
    320   : count_(kMaxStreams) {
    321 }
    322 
    323 SinkStreamSet::~SinkStreamSet() {
    324 }
    325 
    326 void SinkStreamSet::Init(size_t stream_index_limit) {
    327   count_ = stream_index_limit;
    328 }
    329 
    330 // The header for a stream set for N streams is serialized as
    331 //   <version><N><length1><length2>...<lengthN>
    332 CheckBool SinkStreamSet::CopyHeaderTo(SinkStream* header) {
    333   bool ret = header->WriteVarint32(kStreamsSerializationFormatVersion);
    334   if (ret) {
    335     ret = header->WriteSizeVarint32(count_);
    336     for (size_t i = 0; ret && i < count_; ++i) {
    337       ret = header->WriteSizeVarint32(stream(i)->Length());
    338     }
    339   }
    340   return ret;
    341 }
    342 
    343 // Writes |this| to |combined_stream|.  See SourceStreamSet::Init for the layout
    344 // of the stream metadata and contents.
    345 CheckBool SinkStreamSet::CopyTo(SinkStream *combined_stream) {
    346   SinkStream header;
    347   bool ret = CopyHeaderTo(&header);
    348   if (!ret)
    349     return ret;
    350 
    351   // Reserve the correct amount of storage.
    352   size_t length = header.Length();
    353   for (size_t i = 0; i < count_; ++i) {
    354     length += stream(i)->Length();
    355   }
    356   ret = combined_stream->Reserve(length);
    357   if (ret) {
    358     ret = combined_stream->Append(&header);
    359     for (size_t i = 0; ret && i < count_; ++i) {
    360       ret = combined_stream->Append(stream(i));
    361     }
    362   }
    363   return ret;
    364 }
    365 
    366 CheckBool SinkStreamSet::WriteSet(SinkStreamSet* set) {
    367   uint32 lengths[kMaxStreams];
    368   // 'stream_count' includes all non-empty streams and all empty stream numbered
    369   // lower than a non-empty stream.
    370   size_t stream_count = 0;
    371   for (size_t i = 0; i < kMaxStreams; ++i) {
    372     SinkStream* stream = set->stream(i);
    373     lengths[i] = static_cast<uint32>(stream->Length());
    374     if (lengths[i] > 0)
    375       stream_count = i + 1;
    376   }
    377 
    378   SinkStream* control_stream = this->stream(0);
    379   bool ret = control_stream->WriteSizeVarint32(stream_count);
    380   for (size_t i = 0; ret && i < stream_count; ++i) {
    381     ret = control_stream->WriteSizeVarint32(lengths[i]);
    382   }
    383 
    384   for (size_t i = 0; ret && i < stream_count; ++i) {
    385     ret = this->stream(i)->Append(set->stream(i));
    386   }
    387   return ret;
    388 }
    389 
    390 }  // namespace
    391