Home | History | Annotate | Download | only in rjsmin
      1 /*
      2  * Copyright 2011 - 2015
      3  * Andr\xe9 Malo or his licensors, as applicable
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at
      8  *
      9  *     http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  */
     17 
     18 #include "cext.h"
     19 EXT_INIT_FUNC;  /* NOTE(review): macro from cext.h, presumably the init function prototype - confirm */
     20
        /*
         * Classification flags stored per ASCII character in rjsmin_charmask.
         * Each flag is queried through the RJSMIN_IS_* macro of the matching
         * name; the trailing notes describe how rjsmin() uses the result.
         */
     21 #define RJSMIN_DULL_BIT           (1 << 0)  /* outside literals: copied as-is */
     22 #define RJSMIN_PRE_REGEX_BIT      (1 << 1)  /* a '/' after this char may open a regex */
     23 #define RJSMIN_REGEX_DULL_BIT     (1 << 2)  /* inside a regex literal: copied as-is */
     24 #define RJSMIN_REGEX_CC_DULL_BIT  (1 << 3)  /* inside a regex [...] class: copied as-is */
     25 #define RJSMIN_ID_LIT_BIT         (1 << 4)  /* tested on both sides of a collapsed space run */
     26 #define RJSMIN_ID_LIT_O_BIT       (1 << 5)  /* tested on the char after a collapsed newline run */
     27 #define RJSMIN_ID_LIT_C_BIT       (1 << 6)  /* tested on the char before a collapsed newline run */
     28 #define RJSMIN_STRING_DULL_BIT    (1 << 7)  /* inside a string literal: copied as-is */
     29 #define RJSMIN_SPACE_BIT          (1 << 8)  /* skipped by the whitespace scanner */
     30 #define RJSMIN_POST_REGEX_OFF_BIT (1 << 9)  /* after a regex, a newline before this char is kept */
     31
        /*
         * Character unit processed by the minifier: Py_UNICODE when EXT3 is
         * defined, unsigned char otherwise.
         */
     32 #ifdef EXT3
     33 typedef Py_UNICODE rchar;
     34 #else
     35 typedef unsigned char rchar;
     36 #endif
        /* Cast a value (typically a character constant) to the active unit type. */
     37 #define U(c) ((rchar)(c))
     38
        /*
         * Character classification helpers. Each one looks up the low seven bits
         * of the character in rjsmin_charmask and tests a single flag.
         *
         * The macros in this first group report every character above 127 as a
         * match without consulting the table.
         *
         * All helpers expand their argument more than once, so the argument must
         * be free of side effects.
         */
     39 #define RJSMIN_IS_DULL(c) ((U(c) > 127) || \
     40     (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_DULL_BIT))
     41
     42 #define RJSMIN_IS_REGEX_DULL(c) ((U(c) > 127) || \
     43     (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_REGEX_DULL_BIT))
     44
     45 #define RJSMIN_IS_REGEX_CC_DULL(c) ((U(c) > 127) || \
     46     (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_REGEX_CC_DULL_BIT))
     47
     48 #define RJSMIN_IS_STRING_DULL(c) ((U(c) > 127) || \
     49     (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_STRING_DULL_BIT))
     50
     51 #define RJSMIN_IS_ID_LITERAL(c) ((U(c) > 127) || \
     52     (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_ID_LIT_BIT))
     53
     54 #define RJSMIN_IS_ID_LITERAL_OPEN(c) ((U(c) > 127) || \
     55     (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_ID_LIT_O_BIT))
     56
     57 #define RJSMIN_IS_ID_LITERAL_CLOSE(c) ((U(c) > 127) || \
     58     (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_ID_LIT_C_BIT))
     59
     60 #define RJSMIN_IS_POST_REGEX_OFF(c) ((U(c) > 127) || \
     61     (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_POST_REGEX_OFF_BIT))
     62
        /*
         * The two helpers below are the opposite: a character above 127 never
         * matches, only table entries can.
         */
     63 #define RJSMIN_IS_SPACE(c) ((U(c) <= 127) && \
     64     (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_SPACE_BIT))
     65
     66 #define RJSMIN_IS_PRE_REGEX_1(c) ((U(c) <= 127) && \
     67     (rjsmin_charmask[U(c) & 0x7F] & RJSMIN_PRE_REGEX_BIT))
     68 
     69
        /*
         * Classification table indexed by ASCII code (0..127), eight entries per
         * row. Every entry is the bitwise OR of the RJSMIN_*_BIT flags that apply
         * to that character. Examples of how the numbers decompose:
         *
         *   396 = SPACE | STRING_DULL | REGEX_CC_DULL | REGEX_DULL
         *     2 = PRE_REGEX               (the only flag on LF and CR)
         *   765 = POST_REGEX_OFF | STRING_DULL | ID_LIT_C | ID_LIT_O | ID_LIT
         *         | REGEX_CC_DULL | REGEX_DULL | DULL   (digits, letters, '$', '_')
         *   588 = POST_REGEX_OFF | ID_LIT_C | REGEX_CC_DULL | REGEX_DULL (quotes)
         *   648 = POST_REGEX_OFF | STRING_DULL | REGEX_CC_DULL        (slash)
         */
     70 static const unsigned short rjsmin_charmask[128] = {
     71     396, 396, 396, 396, 396, 396, 396, 396,  /* 0x00 - 0x07 */
     72     396, 396,   2, 396, 396,   2, 396, 396,  /* 0x08 - 0x0F (0x0A LF, 0x0D CR) */
     73     396, 396, 396, 396, 396, 396, 396, 396,  /* 0x10 - 0x17 */
     74     396, 396, 396, 396, 396, 396, 396, 396,  /* 0x18 - 0x1F */
     75     396, 687, 588, 653, 765, 653, 143, 588,  /* 0x20 - 0x27 (space first) */
     76     687, 205, 653, 237, 143, 237, 141, 648,  /* 0x28 - 0x2F (slash last) */
     77     765, 765, 765, 765, 765, 765, 765, 765,  /* 0x30 - 0x37 ('0' - '7') */
     78     765, 765, 143, 143, 653, 143, 653, 143,  /* 0x38 - 0x3F ('8', '9', punctuation) */
     79     653, 765, 765, 765, 765, 765, 765, 765,  /* 0x40 - 0x47 ('@', 'A' - 'G') */
     80     765, 765, 765, 765, 765, 765, 765, 765,  /* 0x48 - 0x4F ('H' - 'O') */
     81     765, 765, 765, 765, 765, 765, 765, 765,  /* 0x50 - 0x57 ('P' - 'W') */
     82     765, 765, 765, 683, 513, 197, 653, 765,  /* 0x58 - 0x5F ('X' - 'Z', brackets, '_') */
     83     653, 765, 765, 765, 765, 765, 765, 765,  /* 0x60 - 0x67 (backtick, 'a' - 'g') */
     84     765, 765, 765, 765, 765, 765, 765, 765,  /* 0x68 - 0x6F ('h' - 'o') */
     85     765, 765, 765, 765, 765, 765, 765, 765,  /* 0x70 - 0x77 ('p' - 'w') */
     86     765, 765, 765, 687, 143, 207, 653, 765   /* 0x78 - 0x7F ('x' - 'z', braces, DEL) */
     87 };
     88
        /*
         * Minify `length` characters of JavaScript from `source` into `target`.
         *
         * Parameters:
         *   source             - input characters, read from source[0] up to (not
         *                        including) source[length].
         *   target             - output buffer; the caller in this file allocates
         *                        it with `length` elements.
         *   length             - number of rchar elements in `source`.
         *   keep_bang_comments - when non-zero, block comments whose first
         *                        character after the opener is '!' are copied to
         *                        the output instead of being stripped.
         *
         * Returns the number of rchar elements written to `target`.
         *
         * Scanner state:
         *   post_regex - set right after a regex literal was closed, cleared by
         *                the next emitted token; consulted when a newline run
         *                is collapsed.
         *   pctoken    - position in `target` where the first pending bang
         *                comment was written. While set, "previous character"
         *                look-behinds use the character in front of that
         *                position rather than the end of the comment.
         *   pcreset    - source position recorded together with pctoken; used to
         *                drop pctoken again if that comment is unterminated.
         *   spaced     - kind (' ' or '\n') of the most recent collapsed
         *                whitespace run; reset to ' ' by the next token.
         *
         * Fix relative to the previous revision: the start-of-output test in the
         * regex detection compared only `target` with `tstart`. When the output
         * so far consisted solely of kept bang comments, pctoken was equal to
         * tstart and the following look-behind read the element in front of the
         * target buffer. The test now uses the same position as the look-behind.
         */
     89 static Py_ssize_t
     90 rjsmin(const rchar *source, rchar *target, Py_ssize_t length,
     91        int keep_bang_comments)
     92 {
     93     const rchar *reset, *pcreset = NULL, *pctoken = NULL, *xtarget,
     94                 *sentinel = source + length;
     95     rchar *tstart = target;
     96     int post_regex = 0;
     97     rchar c, quote, spaced = U(' ');
     98
     99     while (source < sentinel) {
    100         c = *source++;
                /* Ordinary character: ends any pending state and is copied through. */
    101         if (RJSMIN_IS_DULL(c)) {
    102             if (post_regex) post_regex = 0;
    103             if (pctoken) pctoken = NULL;
    104             if (spaced == U('\n')) spaced = U(' ');
    105
    106             *target++ = c;
    107             continue;
    108         }
    109         switch (c) {
    110
    111         /* String */
    112         case U('\''): case U('"'):
    113             if (post_regex) post_regex = 0;
    114             if (pctoken) pctoken = NULL;
    115             if (spaced == U('\n')) spaced = U(' ');
    116
                    /* Restart point in case the literal turns out unterminated. */
    117             reset = source;
    118             *target++ = quote = c;
    119             while (source < sentinel) {
    120                 c = *source++;
    121                 *target++ = c;
    122                 if (RJSMIN_IS_STRING_DULL(c))
    123                     continue;
    124                 switch (c) {
    125                 case U('\''): case U('"'):
    126                     if (c == quote)
    127                         goto cont;
    128                     continue;
    129                 case U('\\'):
                            /* Copy the escaped character; CR LF counts as one. */
    130                     if (source < sentinel) {
    131                         c = *source++;
    132                         *target++ = c;
    133                         if (c == U('\r') && source < sentinel
    134                             && *source == U('\n'))
    135                             *target++ = *source++;
    136                     }
    137                     continue;
    138                 }
    139                 break;
    140             }
                    /*
                     * Reached at end of input or on a character that is neither
                     * string-dull, a quote nor a backslash: undo what was copied
                     * after the opening quote and rescan from there.
                     */
    141             target -= source - reset;
    142             source = reset;
    143             continue;
    144
    145         /* Comment or Regex or something else entirely */
    146         case U('/'):
    147             if (!(source < sentinel)) {
                        /* Slash is the last input character: plain copy. */
    148                 if (post_regex) post_regex = 0;
    149                 if (pctoken) pctoken = NULL;
    150                 if (spaced == U('\n')) spaced = U(' ');
    151
    152                 *target++ = c;
    153             }
    154             else {
    155                 switch (*source) {
    156             /* Comment */
    157                 case U('*'): case U('/'):
    158                     goto skip_or_copy_ws;
    159
    160                 default:
    161                     xtarget = NULL;
                            /*
                             * A regex literal may start here when
                             *  - nothing but kept bang comments has been written,
                             *  - the previous token character carries PRE_REGEX, or
                             *  - the previous token is the word "return".
                             * The first test guards the look-behind of the second,
                             * so both must refer to the same position.
                             */
    162                     if (   (pctoken ? pctoken : target) == tstart
    163                         || RJSMIN_IS_PRE_REGEX_1(*((pctoken ? pctoken : target)
    164                                                    - 1))
    165                         || (
    166                             (xtarget = pctoken ? pctoken : target)
    167                             && (xtarget - tstart >= 6)
    168                             && *(xtarget - 1) == U('n')
    169                             && *(xtarget - 2) == U('r')
    170                             && *(xtarget - 3) == U('u')
    171                             && *(xtarget - 4) == U('t')
    172                             && *(xtarget - 5) == U('e')
    173                             && *(xtarget - 6) == U('r')
    174                             && (
    175                                    xtarget - tstart == 6
    176                                 || !RJSMIN_IS_ID_LITERAL(*(xtarget - 7))
    177                             )
    178                         )) {
    179
    180             /* Regex */
    181                         if (post_regex) post_regex = 0;
    182                         if (pctoken) pctoken = NULL;
    183
    184                         reset = source;
                                /*
                                 * xtarget is non-NULL only when the "return" test
                                 * was the one that matched; a newline collapsed
                                 * just before is written back in that case.
                                 */
    185                         if (spaced == U('\n')) {
    186                             spaced = U(' ');
    187                             if (xtarget)
    188                                 *target++ = U('\n');
    189                         }
    190
    191                         *target++ = U('/');
    192                         while (source < sentinel) {
    193                             c = *source++;
    194                             *target++ = c;
    195                             if (RJSMIN_IS_REGEX_DULL(c))
    196                                 continue;
    197                             switch (c) {
    198                             case U('/'):
    199                                 post_regex = 1;
    200                                 goto cont;
    201                             case U('\\'):
    202                                 if (source < sentinel) {
    203                                     c = *source++;
    204                                     *target++ = c;
                                            /* Escaped line break: not a regex. */
    205                                     if (c == U('\r') || c == U('\n'))
    206                                         break;
    207                                 }
    208                                 continue;
    209                             case U('['):
                                        /* Character class: scan up to the closing bracket. */
    210                                 while (source < sentinel) {
    211                                     c = *source++;
    212                                     *target++ = c;
    213                                     if (RJSMIN_IS_REGEX_CC_DULL(c))
    214                                         continue;
    215                                     switch (c) {
    216                                     case U('\\'):
    217                                         if (source < sentinel) {
    218                                             c = *source++;
    219                                             *target++ = c;
                                                    /*
                                                     * NOTE(review): this break only
                                                     * leaves the inner switch, so the
                                                     * class scan keeps going.
                                                     */
    220                                             if (c == U('\r') || c == U('\n'))
    221                                                 break;
    222                                         }
    223                                         continue;
    224                                     case U(']'):
    225                                         goto cont_regex;
    226                                     }
    227                                 }
    228                                 break;
    229                             }
    230                             break;
    231                         cont_regex:
    232                             continue;
    233                         }
                                /* No closing slash found: undo the copy and rescan. */
    234                         target -= source - reset;
    235                         source = reset;
    236                     }
    237                     else {
    238             /* Just a slash */
    239                         if (post_regex) post_regex = 0;
    240                         if (pctoken) pctoken = NULL;
    241                         if (spaced == U('\n')) spaced = U(' ');
    242
    243                         *target++ = c;
    244                     }
    245                     continue;
    246                 }
    247             }
    248             continue;
    249
    250         /* Whitespace */
    251         default:
    252         skip_or_copy_ws:
                    /* `quote` is reused to record the kind of run: ' ' or '\n'. */
    253             quote = U(' ');
                    /* Step back so the loop below starts at the current character. */
    254             --source;
    255             while (source < sentinel) {
    256                 c = *source++;
    257                 if (RJSMIN_IS_SPACE(c))
    258                     continue;
    259                 switch (c) {
    260                 case U('\r'): case U('\n'):
    261                     quote = U('\n');
    262                     continue;
    263                 case U('/'):
    264                     if (source < sentinel) {
    265                         switch (*source) {
    266                         case U('*'):
                                    /*
                                     * reset marks the star of the opener and doubles
                                     * as an "unterminated" flag: it is cleared once
                                     * the closing sequence has been found.
                                     */
    267                             reset = source++;
    268                             /* copy bang comment, if requested */
    269                             if (   keep_bang_comments && source < sentinel
    270                                 && *source == U('!')) {
    271                                 if (!pctoken) {
    272                                     pctoken = target;
    273                                     pcreset = reset;
    274                                 }
    275
    276                                 *target++ = U('/');
    277                                 *target++ = U('*');
    278                                 *target++ = *source++;
    279                                 while (source < sentinel) {
    280                                     c = *source++;
    281                                     *target++ = c;
    282                                     if (c == U('*') && source < sentinel
    283                                         && *source == U('/')) {
    284                                         *target++ = *source++;
    285                                         reset = NULL;
    286                                         break;
    287                                     }
    288                                 }
    289                                 if (!reset)
    290                                     continue;
    291
                                        /*
                                         * Unterminated: keep only the slash in the
                                         * output and resume at the star.
                                         */
    292                                 target -= source - reset;
    293                                 source = reset;
    294                                 if (pcreset == reset) {
    295                                     pctoken = NULL;
    296                                     pcreset = NULL;
    297                                 }
    298
    299                             }
    300                             /* strip regular comment */
    301                             else {
    302                                 while (source < sentinel) {
    303                                     c = *source++;
    304                                     if (c == U('*') && source < sentinel
    305                                         && *source == U('/')) {
    306                                         ++source;
    307                                         reset = NULL;
    308                                         break;
    309                                     }
    310                                 }
    311                                 if (!reset)
    312                                     continue;
                                        /* Unterminated: emit the slash, resume at the star. */
    313                                 source = reset;
    314                                 *target++ = U('/');
    315                             }
    316                             goto cont;
    317                         case U('/'):
                                    /* Line comment: skip to the end of the line. */
    318                             ++source;
    319                             while (source < sentinel) {
    320                                 c = *source++;
    321                                 switch (c) {
    322                                 case U('\n'):
    323                                     break;
    324                                 case U('\r'):
    325                                     if (source < sentinel
    326                                         && *source == U('\n'))
    327                                         ++source;
    328                                     break;
    329                                 default:
    330                                     continue;
    331                                 }
    332                                 break;
    333                             }
    334                             quote = U('\n');
    335                             continue;
    336                         }
    337                     }
    338                 }
                        /* Not whitespace or a comment: hand the character back. */
    339                 --source;
    340                 break;
    341             }
    342
                    /*
                     * Decide whether the collapsed run must leave one character
                     * behind. Nothing is written at the very start of the output
                     * or at the end of the input. A newline is kept between an
                     * ID_LIT_C and an ID_LIT_O character, or after a regex when
                     * the next character carries POST_REGEX_OFF. A space is kept
                     * (only without a pending bang comment) between two ID_LIT
                     * characters or between two '+' or two '-'.
                     */
    343             if ((tstart < (pctoken ? pctoken : target) && source < sentinel)
    344                 && ((quote == U('\n')
    345                      && ((RJSMIN_IS_ID_LITERAL_CLOSE(*((pctoken ?
    346                                                         pctoken : target) - 1))
    347                           && RJSMIN_IS_ID_LITERAL_OPEN(*source))
    348                          || (post_regex
    349                              && RJSMIN_IS_POST_REGEX_OFF(*source)
    350                              && !(post_regex = 0))))
    351                     ||
    352                     (quote == U(' ') && !pctoken
    353                      && ((RJSMIN_IS_ID_LITERAL(*(target - 1))
    354                           && RJSMIN_IS_ID_LITERAL(*source))
    355                          || (source < sentinel
    356                              && ((*(target - 1) == U('+')
    357                                   && *source == U('+'))
    358                                  || (*(target - 1) == U('-')
    359                                      && *source == U('-')))))))) {
    360                 *target++ = quote;
    361             }
    362
    363             pcreset = NULL;
    364             spaced = quote;
    365         }
    366     cont:
    367         continue;
    368     }
    369     return (Py_ssize_t)(target - tstart);
    370 }
    371 
    372
        /*
         * Docstring attached to the Python-level jsmin() function through the
         * method table below. The text is runtime data.
         */
    373 PyDoc_STRVAR(rjsmin_jsmin__doc__,
    374 "jsmin(script, keep_bang_comments=False)\n\
    375 \n\
    376 Minify javascript based on `jsmin.c by Douglas Crockford`_\\.\n\
    377 \n\
    378 Instead of parsing the stream char by char, it uses a regular\n\
    379 expression approach which minifies the whole script with one big\n\
    380 substitution regex.\n\
    381 \n\
    382 .. _jsmin.c by Douglas Crockford:\n\
    383    http://www.crockford.com/javascript/jsmin.c\n\
    384 \n\
    385 :Note: This is a hand crafted C implementation built on the regex\n\
    386        semantics.\n\
    387 \n\
    388 :Parameters:\n\
    389   `script` : ``str``\n\
    390     Script to minify\n\
    391 \n\
    392   `keep_bang_comments` : ``bool``\n\
    393     Keep comments starting with an exclamation mark? (``/*!...*/``)\n\
    394 \n\
    395 :Return: Minified script\n\
    396 :Rtype: ``str``");
    397
        /*
         * Python entry point: jsmin(script, keep_bang_comments=False).
         *
         * Parses the arguments, obtains a string object for the script, allocates
         * a result object of the same length, runs rjsmin() on the raw buffers and
         * shrinks the result to the number of characters actually written.
         *
         * Returns a new reference to the minified script, or NULL on failure.
         *
         * NOTE(review): UOBJ is only defined when EXT2 or EXT3 is defined;
         * presumably cext.h always defines exactly one of them - confirm.
         */
    398 static PyObject *
    399 rjsmin_jsmin(PyObject *self, PyObject *args, PyObject *kwds)
    400 {
    401     PyObject *script, *keep_bang_comments_ = NULL, *result;
    402     static char *kwlist[] = {"script", "keep_bang_comments", NULL};
    403     Py_ssize_t slength, length;
    404     int keep_bang_comments;
            /* Argument format for `script`: any object (EXT2) or unicode only (EXT3). */
    405 #ifdef EXT2
    406     int uni;
    407 #define UOBJ "O"
    408 #endif
    409 #ifdef EXT3
    410 #define UOBJ "U"
    411 #endif
    412
    413     if (!PyArg_ParseTupleAndKeywords(args, kwds, UOBJ "|O", kwlist,
    414                                      &script, &keep_bang_comments_))
    415         return NULL;
    416
            /* The flag is optional; any object is accepted and tested for truth. */
    417     if (!keep_bang_comments_)
    418         keep_bang_comments = 0;
    419     else {
    420         keep_bang_comments = PyObject_IsTrue(keep_bang_comments_);
    421         if (keep_bang_comments == -1)
    422             return NULL;
    423     }
    424
            /*
             * EXT2: work on a byte string. Unicode input is encoded as UTF-8 and
             * `uni` remembers that the result has to be decoded again; anything
             * else is passed through PyObject_Str(). `script` is replaced by the
             * object returned from either call and released further down.
             */
    425 #ifdef EXT2
    426     if (PyUnicode_Check(script)) {
    427         if (!(script = PyUnicode_AsUTF8String(script)))
    428             return NULL;
    429         uni = 1;
    430     }
    431     else {
    432         if (!(script = PyObject_Str(script)))
    433             return NULL;
    434         uni = 0;
    435     }
    436 #endif
    437
            /*
             * EXT3: take an own reference so the Py_DECREF(script) calls below are
             * balanced, and map the PyString_* names used in the shared code onto
             * their PyUnicode_* counterparts. These #defines are preprocessor
             * directives and therefore stay in effect for the rest of the file.
             */
    438 #ifdef EXT3
    439     Py_INCREF(script);
    440 #define PyString_GET_SIZE PyUnicode_GET_SIZE
    441 #define PyString_AS_STRING PyUnicode_AS_UNICODE
    442 #define _PyString_Resize PyUnicode_Resize
    443 #define PyString_FromStringAndSize PyUnicode_FromUnicode
    444 #endif
    445
            /* The output buffer is allocated with the same length as the input. */
    446     slength = PyString_GET_SIZE(script);
    447     if (!(result = PyString_FromStringAndSize(NULL, slength))) {
    448         Py_DECREF(script);
    449         return NULL;
    450     }
            /* rjsmin() only touches the raw buffers, so it runs between the
             * Py_BEGIN_ALLOW_THREADS / Py_END_ALLOW_THREADS pair. */
    451     Py_BEGIN_ALLOW_THREADS
    452     length = rjsmin((rchar *)PyString_AS_STRING(script),
    453                     (rchar *)PyString_AS_STRING(result),
    454                     slength, keep_bang_comments);
    455     Py_END_ALLOW_THREADS
    456
    457     Py_DECREF(script);
            /*
             * NOTE(review): rjsmin() as defined in this file returns a pointer
             * difference that is never negative, and this path returns NULL
             * without setting a Python exception - confirm whether it is needed.
             */
    458     if (length < 0) {
    459         Py_DECREF(result);
    460         return NULL;
    461     }
            /* Trim the result to the number of characters actually produced. */
    462     if (length != slength && _PyString_Resize(&result, length) == -1)
    463         return NULL;
    464
            /* EXT2: hand unicode callers a unicode object back. */
    465 #ifdef EXT2
    466     if (uni) {
    467         script = PyUnicode_DecodeUTF8(PyString_AS_STRING(result),
    468                                       PyString_GET_SIZE(result), "strict");
    469         Py_DECREF(result);
    470         if (!script)
    471             return NULL;
    472         result = script;
    473     }
    474 #endif
    475     return result;
    476 }
    477 
    478 /* ------------------------ BEGIN MODULE DEFINITION ------------------------ */
    479
        /*
         * Method table: exposes rjsmin_jsmin as "jsmin", callable with positional
         * and keyword arguments, with rjsmin_jsmin__doc__ as its docstring.
         * NOTE(review): EXT_METHODS is a macro from cext.h, presumably expanding
         * to the PyMethodDef array declaration - confirm.
         */
    480 EXT_METHODS = {
    481     {"jsmin",
    482         (PyCFunction)rjsmin_jsmin, METH_VARARGS | METH_KEYWORDS,
    483         rjsmin_jsmin__doc__},
    484
    485     {NULL}  /* Sentinel */
    486 };
    487
        /* Module-level docstring; passed to EXT_DEFINE below. */
    488 PyDoc_STRVAR(EXT_DOCS_VAR,
    489 "C implementation of rjsmin\n\
    490 ==========================\n\
    491 \n\
    492 C implementation of rjsmin.");
    493 
    494
        /*
         * Ties module name, method table and module docstring together.
         * NOTE(review): EXT_DEFINE comes from cext.h; presumably it emits the
         * module definition referenced as EXT_DEFINE_VAR in the init function -
         * confirm.
         */
    495 EXT_DEFINE(EXT_MODULE_NAME, EXT_METHODS_VAR, EXT_DOCS_VAR);
    496
        /*
         * Module initialisation: creates the module object, attaches the
         * __author__ and __docformat__ attributes and hands the module back
         * through EXT_INIT_RETURN. If creation fails, EXT_INIT_ERROR(NULL) is
         * invoked instead.
         * NOTE(review): all EXT_* names are macros from cext.h; their exact
         * expansion is not visible here.
         */
    497 EXT_INIT_FUNC {
    498     PyObject *m;
    499
    500     /* Create the module and populate stuff */
    501     if (!(m = EXT_CREATE(&EXT_DEFINE_VAR)))
    502         EXT_INIT_ERROR(NULL);
    503
            /* The author name is given as latin-1 bytes (0xE9 is e-acute). */
    504     EXT_ADD_UNICODE(m, "__author__", "Andr\xe9 Malo", "latin-1");
    505     EXT_ADD_STRING(m, "__docformat__", "restructuredtext en");
    506
    507     EXT_INIT_RETURN(m);
    508 }
    509 
    510 /* ------------------------- END MODULE DEFINITION ------------------------- */
    511