Home | History | Annotate | Download | only in zip
      1 /*
      2  * Copyright 2017 Google Inc. All Rights Reserved.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.turbine.zip;
     18 
     19 import static java.nio.charset.StandardCharsets.UTF_8;
     20 
     21 import com.google.common.io.ByteStreams;
     22 import com.google.common.primitives.UnsignedInts;
     23 import java.io.ByteArrayInputStream;
     24 import java.io.Closeable;
     25 import java.io.IOError;
     26 import java.io.IOException;
     27 import java.nio.ByteBuffer;
     28 import java.nio.ByteOrder;
     29 import java.nio.MappedByteBuffer;
     30 import java.nio.channels.FileChannel;
     31 import java.nio.channels.FileChannel.MapMode;
     32 import java.nio.charset.CharacterCodingException;
     33 import java.nio.charset.CharsetDecoder;
     34 import java.nio.file.Path;
     35 import java.nio.file.StandardOpenOption;
     36 import java.util.Iterator;
     37 import java.util.zip.Inflater;
     38 import java.util.zip.InflaterInputStream;
     39 import java.util.zip.ZipException;
     40 
     41 /**
     42  * A fast, minimal, and somewhat garbage zip implementation. This exists because graal <a
     43  * href="http://mail.openjdk.java.net/pipermail/graal-dev/2017-August/005039.html">doesn't yet
     44  * support</a> {@link java.util.zip.ZipFile}, and {@link java.util.zip.ZipInputStream} doesn't have
     45  * the performance we'd like (*). If you're reading this, you almost certainly want {@code ZipFile}
     46  * instead.
     47  *
     48  * <p>If you're reading this because you're fixing a bug, sorry.
     49  *
     50  * <p>(*) A benchmark that iterates over all of the entries in rt.jar takes 6.97ms to run with this
     51  * implementation and 202.99ms with ZipInputStream. (Those are averages across 100 reps, and I
     52  * verified they're doing the same work.) This is likely largely due to ZipInputStream reading the
     53  * entire file from the beginning to scan the local headers, whereas this implementation (and
     54  * ZipFile) only read the central directory. Iterating over the entries (but not reading the data)
     55  * is an interesting benchmark because we typically only read ~10% of the compile-time classpath, so
     56  * most time is spent just scanning entry names. And rt.jar is an interesting test case because
     57  * every compilation has to read it, and it dominates the size of the classpath for small
     58  * compilations.
     59  *
     60  * <p>Implementation notes:
     61  *
     62  * <ul>
     63  *   <li>Leading garbage may be supported, since the archive is read backwards using the central
     64  *       directory. Archives modified with zip -A may not be supported. Trailing garbage is not
     65  *       supported.
     66  *   <li>UTF-8 is the only supported encoding.
     67  *   <li>STORED and DEFLATE are the only supported compression methods.
     68  *   <li>zip64 extensible data sectors are not supported.
     69  *   <li>Zip files larger than Integer.MAX_VALUE bytes are not supported.
     70  *   <li>The only supported ZIP64 field is ENDTOT. This implementation assumes that the ZIP64 end
     71  *       header is present only if ENDTOT in EOCD header is 0xFFFF.
     72  * </ul>
     73  */
     74 public class Zip {
     75 
  // Signature that the first four bytes of a candidate ZIP64 end header are compared against,
  // read as a little-endian int.
  static final int ZIP64_ENDSIG = 0x06064b50;

  // Fixed sizes, in bytes, of the zip headers this implementation reads.
  static final int LOCHDR = 30; // LOC header size
  static final int CENHDR = 46; // CEN header size
  static final int ENDHDR = 22; // END header size
  static final int ZIP64_LOCHDR = 20; // ZIP64 end locator header size
  static final int ZIP64_ENDHDR = 56; // ZIP64 end header size

  // Field offsets, in bytes, relative to the start of the END header.
  static final int ENDTOT = 10; // total number of entries
  static final int ENDSIZ = 12; // central directory size in bytes
  static final int ENDCOM = 20; // zip file comment length

  // Field offsets, in bytes, relative to the start of a CEN header.
  static final int CENHOW = 10; // compression method
  static final int CENLEN = 24; // uncompressed size
  static final int CENSIZ = 20; // compressed size
  static final int CENNAM = 28; // filename length
  static final int CENEXT = 30; // extra field length
  static final int CENCOM = 32; // comment length
  static final int CENOFF = 42; // LOC header offset

  // Field offset, in bytes, relative to the start of a LOC header.
  static final int LOCEXT = 28; // extra field length

  // Field offset, in bytes, relative to the start of the ZIP64 end header.
  static final int ZIP64_ENDSIZ = 40; // central directory size in bytes

  // Value of ENDTOT that triggers the lookup of a ZIP64 end header.
  static final int ZIP64_MAGICCOUNT = 0xFFFF;
    101 
    102   /** Iterates over a zip archive. */
    103   static class ZipIterator implements Iterator<Entry> {
    104 
    105     /** A reader for the backing storage. */
    106     private final FileChannel chan;
    107 
    108     private final Path path;
    109     private int cdindex = 0;
    110     private final MappedByteBuffer cd;
    111     private final CharsetDecoder decoder = UTF_8.newDecoder();
    112 
    113     ZipIterator(Path path, FileChannel chan, MappedByteBuffer cd) {
    114       this.path = path;
    115       this.chan = chan;
    116       this.cd = cd;
    117     }
    118 
    119     @Override
    120     public boolean hasNext() {
    121       return cdindex < cd.limit();
    122     }
    123 
    124     /* Returns a {@link Entry} for the current CEN entry. */
    125     @Override
    126     public Entry next() {
    127       // TODO(cushon): technically we're supposed to throw NSEE
    128       checkSignature(path, cd, cdindex, 1, 2, "CENSIG");
    129       int nameLength = cd.getChar(cdindex + CENNAM);
    130       int extLength = cd.getChar(cdindex + CENEXT);
    131       int commentLength = cd.getChar(cdindex + CENCOM);
    132       Entry entry = new Entry(path, chan, string(cd, cdindex + CENHDR, nameLength), cd, cdindex);
    133       cdindex += CENHDR + nameLength + extLength + commentLength;
    134       return entry;
    135     }
    136 
    137     public String string(ByteBuffer buf, int offset, int length) {
    138       buf = buf.duplicate();
    139       buf.position(offset);
    140       buf.limit(offset + length);
    141       decoder.reset();
    142       try {
    143         return decoder.decode(buf).toString();
    144       } catch (CharacterCodingException e) {
    145         throw new IOError(e);
    146       }
    147     }
    148   }
    149 
    150   /** Provides an {@link Iterable} of {@link Entry} over a zip archive. */
    151   public static class ZipIterable implements Iterable<Entry>, Closeable {
    152 
    153     private final Path path;
    154     private final FileChannel chan;
    155     private final MappedByteBuffer cd;
    156 
    157     public ZipIterable(Path path) throws IOException {
    158       this.path = path;
    159       this.chan = FileChannel.open(path, StandardOpenOption.READ);
    160       // Locate the EOCD
    161       long size = chan.size();
    162       if (size < ENDHDR) {
    163         throw new ZipException("invalid zip archive");
    164       }
    165       long eocdOffset = size - ENDHDR;
    166       MappedByteBuffer eocd = chan.map(MapMode.READ_ONLY, eocdOffset, ENDHDR);
    167       eocd.order(ByteOrder.LITTLE_ENDIAN);
    168       int index = 0;
    169       int commentSize = 0;
    170       if (!isSignature(eocd, 0, 5, 6)) {
    171         // The archive may contain a zip file comment; keep looking for the EOCD.
    172         long start = Math.max(0, size - ENDHDR - 0xFFFF);
    173         eocd = chan.map(MapMode.READ_ONLY, start, (size - start));
    174         eocd.order(ByteOrder.LITTLE_ENDIAN);
    175         index = (int) ((size - start) - ENDHDR);
    176         while (index > 0) {
    177           index--;
    178           eocd.position(index);
    179           if (isSignature(eocd, index, 5, 6)) {
    180             commentSize = (int) ((size - start) - ENDHDR) - index;
    181             eocdOffset = start + index;
    182             break;
    183           }
    184         }
    185       }
    186       checkSignature(path, eocd, index, 5, 6, "ENDSIG");
    187       int totalEntries = eocd.getChar(index + ENDTOT);
    188       long cdsize = UnsignedInts.toLong(eocd.getInt(index + ENDSIZ));
    189       int actualCommentSize = eocd.getChar(index + ENDCOM);
    190       if (commentSize != actualCommentSize) {
    191         throw new ZipException(
    192             String.format(
    193                 "zip file comment length was %d, expected %d", commentSize, actualCommentSize));
    194       }
    195       // If the number of entries is 0xffff, check if the archive has a zip64 EOCD locator.
    196       if (totalEntries == ZIP64_MAGICCOUNT) {
    197         // Assume the zip64 EOCD has the usual size; we don't support zip64 extensible data sectors.
    198         long zip64eocdOffset = size - ENDHDR - ZIP64_LOCHDR - ZIP64_ENDHDR;
    199         MappedByteBuffer zip64eocd = chan.map(MapMode.READ_ONLY, zip64eocdOffset, ZIP64_ENDHDR);
    200         zip64eocd.order(ByteOrder.LITTLE_ENDIAN);
    201         // Note that zip reading is necessarily best-effort, since an archive could contain 0xFFFF
    202         // entries and the last entry's data could contain a ZIP64_ENDSIG. Some implementations
    203         // read the full EOCD records and compare them.
    204         if (zip64eocd.getInt(0) == ZIP64_ENDSIG) {
    205           cdsize = zip64eocd.getLong(ZIP64_ENDSIZ);
    206           eocdOffset = zip64eocdOffset;
    207         }
    208       }
    209       this.cd = chan.map(MapMode.READ_ONLY, eocdOffset - cdsize, cdsize);
    210       cd.order(ByteOrder.LITTLE_ENDIAN);
    211     }
    212 
    213     @Override
    214     public Iterator<Entry> iterator() {
    215       return new ZipIterator(path, chan, cd);
    216     }
    217 
    218     @Override
    219     public void close() throws IOException {
    220       chan.close();
    221     }
    222   }
    223 
    224   /** An entry in a zip archive. */
    225   public static class Entry {
    226 
    227     private final Path path;
    228     private final FileChannel chan;
    229     private final String name;
    230     private final ByteBuffer cd;
    231     private final int cdindex;
    232 
    233     public Entry(Path path, FileChannel chan, String name, ByteBuffer cd, int cdindex) {
    234       this.path = path;
    235       this.chan = chan;
    236       this.name = name;
    237       this.cd = cd;
    238       this.cdindex = cdindex;
    239     }
    240 
    241     /** The entry name. */
    242     public String name() {
    243       return name;
    244     }
    245 
    246     /** The entry data. */
    247     public byte[] data() {
    248       // Read the offset and variable lengths from the central directory and then try to map in the
    249       // data section in one shot.
    250       long offset = UnsignedInts.toLong(cd.getInt(cdindex + CENOFF));
    251       int nameLength = cd.getChar(cdindex + CENNAM);
    252       int extLength = cd.getChar(cdindex + CENEXT);
    253       int compression = cd.getChar(cdindex + CENHOW);
    254       switch (compression) {
    255         case 0x8:
    256           return getBytes(
    257               offset,
    258               nameLength,
    259               extLength,
    260               UnsignedInts.toLong(cd.getInt(cdindex + CENSIZ)),
    261               /*deflate=*/ true);
    262         case 0x0:
    263           return getBytes(
    264               offset,
    265               nameLength,
    266               extLength,
    267               UnsignedInts.toLong(cd.getInt(cdindex + CENLEN)),
    268               /*deflate=*/ false);
    269         default:
    270           throw new AssertionError(
    271               String.format("unsupported compression mode: 0x%x", compression));
    272       }
    273     }
    274 
    275     /**
    276      * Number of extra bytes to read for each file, to avoid re-mapping the data if the local header
    277      * reports more extra field data than the central directory.
    278      */
    279     static final int EXTRA_FIELD_SLACK = 128;
    280 
    281     private byte[] getBytes(
    282         long offset, int nameLength, int cenExtLength, long size, boolean deflate) {
    283       if (size > Integer.MAX_VALUE) {
    284         throw new IllegalArgumentException("unsupported zip entry size: " + size);
    285       }
    286       try {
    287         MappedByteBuffer fc =
    288             chan.map(
    289                 MapMode.READ_ONLY,
    290                 offset,
    291                 Math.min(
    292                     LOCHDR + nameLength + cenExtLength + size + EXTRA_FIELD_SLACK,
    293                     chan.size() - offset));
    294         fc.order(ByteOrder.LITTLE_ENDIAN);
    295         checkSignature(path, fc, /* index= */ 0, 3, 4, "LOCSIG");
    296         int locExtLength = fc.getChar(LOCEXT);
    297         if (locExtLength > cenExtLength + EXTRA_FIELD_SLACK) {
    298           // If the local header's extra fields don't match the central directory and we didn't
    299           // leave enough slac, re-map the data section with the correct extra field length.
    300           fc = chan.map(MapMode.READ_ONLY, offset + LOCHDR + nameLength + locExtLength, size);
    301           fc.order(ByteOrder.LITTLE_ENDIAN);
    302         } else {
    303           // Otherwise seek past the local header, name, and extra fields to the data.
    304           fc.position(LOCHDR + nameLength + locExtLength);
    305           fc.limit((int) (LOCHDR + nameLength + locExtLength + size));
    306         }
    307         byte[] bytes = new byte[(int) size];
    308         fc.get(bytes);
    309         if (deflate) {
    310           bytes =
    311               ByteStreams.toByteArray(
    312                   new InflaterInputStream(
    313                       new ByteArrayInputStream(bytes), new Inflater(/*nowrap=*/ true)));
    314         }
    315         return bytes;
    316       } catch (IOException e) {
    317         throw new IOError(e);
    318       }
    319     }
    320   }
    321 
    322   static void checkSignature(
    323       Path path, MappedByteBuffer buf, int index, int i, int j, String name) {
    324     if (!isSignature(buf, index, i, j)) {
    325       throw new AssertionError(
    326           String.format(
    327               "%s: bad %s (expected: 0x%02x%02x%02x%02x, actual: 0x%08x)",
    328               path, name, i, j, (int) 'K', (int) 'P', buf.getInt(index)));
    329     }
    330   }
    331 
    332   static boolean isSignature(MappedByteBuffer buf, int index, int i, int j) {
    333     return (buf.get(index) == 'P')
    334         && (buf.get(index + 1) == 'K')
    335         && (buf.get(index + 2) == i)
    336         && (buf.get(index + 3) == j);
    337   }
    338 }
    339