Home | History | Annotate | Download | only in plugins
      1 /*
      2  * Copyright (C) 2016 The Android Open Source Project
      3  * Copyright (C) 2016 Mopria Alliance, Inc.
      4  * Copyright (C) 2013 Hewlett-Packard Development Company, L.P.
      5  *
      6  * Licensed under the Apache License, Version 2.0 (the "License");
      7  * you may not use this file except in compliance with the License.
      8  * You may obtain a copy of the License at
      9  *
     10  *      http://www.apache.org/licenses/LICENSE-2.0
     11  *
     12  * Unless required by applicable law or agreed to in writing, software
     13  * distributed under the License is distributed on an "AS IS" BASIS,
     14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     15  * See the License for the specific language governing permissions and
     16  * limitations under the License.
     17  */
     18 
     19 #include "wprint_scaler.h"
     20 #include <assert.h>
     21 #include <stdio.h>
     22 
// Rounds x down to a multiple of 4 by clearing its two low bits.
#define ROUND_4_DOWN(x) ((x) & ~3)
// Rounds x up to a multiple of 4 (adds 3, then rounds down).
#define ROUND_4_UP(x)   (ROUND_4_DOWN((x) + 3))
// Number of fraction bits that _scaler_fraction_part() computes.
#define PSCALER_FRACT_BITS_COUNT 24

// Rounding mode applied by _scaler_fraction_part() to the computed fraction.
typedef enum {
    FRACTION_ROUND_UP,  // add one unit to the fraction when a remainder is left
    FRACTION_TRUNCATE   // return the computed bits unchanged
} pscaler_fraction_t;

// Returns the fractional part of iNum / iDen as PSCALER_FRACT_BITS_COUNT bits, shifted left by 8
// so that they occupy the upper bits of the 32-bit result. *overflow is set (and 0 returned)
// when FRACTION_ROUND_UP would carry out of the fraction.
static uint32
        _scaler_fraction_part(uint32 iNum, uint32 iDen, pscaler_fraction_t mode, bool_t *overflow);

// Runs one scaling pass over the buffers/rows currently described by pscaler_config.
// Invoked from scaler_scale_image_data(); its definition lies further down in this file.
static void _hw_scale_image_plane(scaler_config_t *pscaler_config, scaler_mode_t scaleMode);

// Fills in the X/Y scale factors (and their inverses) of pscaler_config from its
// source/output width and height fields.
static void _calculate_factors(scaler_config_t *pscaler_config, scaler_mode_t scaleMode);
     38 
     39 void scaler_make_image_scaler_tables(uint16 image_input_width, uint16 image_input_buf_width,
     40         uint16 image_output_width, uint16 image_output_buf_width, uint16 image_input_height,
     41         uint16 image_output_height, scaler_config_t *pscaler_config) {
     42     pscaler_config->iSrcWidth = image_input_width;
     43     pscaler_config->iSrcHeight = image_input_height;
     44     pscaler_config->iOutWidth = image_output_width;
     45     pscaler_config->iOutHeight = image_output_height;
     46 
     47     if ((image_input_width >= image_output_width) &&
     48             (image_input_height >= image_output_height)) { // scale DOWN
     49         pscaler_config->scaleMode = PSCALER_SCALE_DOWN;
     50     } else if ((image_input_width <= image_output_width) &&
     51             (image_input_height <= image_output_height)) { // scale UP
     52         pscaler_config->scaleMode = PSCALER_SCALE_UP;
     53     } else if (image_input_width > image_output_width) { // mixed scale Y-axis first
     54         pscaler_config->scaleMode = PSCALER_SCALE_MIXED_YUP;
     55     } else { // mixed scale X-axis first
     56         pscaler_config->scaleMode = PSCALER_SCALE_MIXED_XUP;
     57     }
     58 
     59     // Setup scale factors
     60     _calculate_factors(pscaler_config, pscaler_config->scaleMode);
     61 
     62     // calculates initial buffer sizes for scaling whole image
     63     //  start rows    == 0
     64     //  end_rows      == image height
     65     //  buffer widths == image widths
     66     pscaler_config->fSrcStartRow.decimal = 0;
     67     pscaler_config->fSrcStartRow.fraction = 0;
     68     pscaler_config->iSrcStartRow = 0;
     69     pscaler_config->iSrcEndRow = pscaler_config->iSrcHeight;
     70     pscaler_config->iSrcBufWidth = image_input_buf_width;
     71     pscaler_config->iOutStartRow = 0;
     72     pscaler_config->iOutEndRow = pscaler_config->iOutHeight;
     73     pscaler_config->iOutBufWidth = image_output_buf_width;
     74     pscaler_config->pSrcBuf = NULL;
     75     pscaler_config->pOutBuf = NULL;
     76     pscaler_config->pTmpBuf = NULL;
     77 }
     78 
     79 void scaler_calculate_scaling_rows(uint16 start_output_row_number, uint16 end_output_row_number,
     80         void *tables_ptr, uint16 *start_input_row_number, uint16 *end_input_row_number,
     81         uint16 *num_output_rows_generated, uint16 *num_rows_offset_to_start_output_row,
     82         uint32 *mixed_axis_temp_buffer_size_needed) {
     83     float64_t fSrcEndRow;
     84     bool_t overflow;
     85     scaler_config_t *pscaler_config;
     86 
     87     pscaler_config = (scaler_config_t *) tables_ptr;
     88     assert (start_output_row_number < pscaler_config->iOutHeight);
     89 
     90     // copy the output start and end rows
     91     // Don't ever attempt to output a single row from the scaler.
     92     if (end_output_row_number == start_output_row_number) {
     93         if (start_output_row_number == 0) {
     94             pscaler_config->iOutStartRow = start_output_row_number;
     95             pscaler_config->iOutEndRow = end_output_row_number + 1;
     96             *num_rows_offset_to_start_output_row = 0;
     97         } else {
     98             pscaler_config->iOutStartRow = start_output_row_number - 1;
     99             pscaler_config->iOutEndRow = end_output_row_number;
    100             *num_rows_offset_to_start_output_row = 1;
    101         }
    102     } else {
    103         pscaler_config->iOutStartRow = start_output_row_number;
    104         pscaler_config->iOutEndRow = end_output_row_number;
    105         *num_rows_offset_to_start_output_row = 0;
    106     }
    107 
    108     if (pscaler_config->iOutEndRow >= pscaler_config->iOutHeight) { // last stripe
    109         pscaler_config->iOutEndRow = pscaler_config->iOutHeight - 1;
    110     }
    111 
    112     if (pscaler_config->scaleMode == PSCALER_SCALE_UP ||
    113             pscaler_config->scaleMode == PSCALER_SCALE_MIXED_YUP) {
    114         // scale factors are calculated as dim-1/dim-1
    115         pscaler_config->iSrcHeight--;
    116         pscaler_config->iOutHeight--;
    117     }
    118 
    119     pscaler_config->fSrcStartRow.decimal = (uint32) pscaler_config->iOutStartRow *
    120             (uint32) pscaler_config->iSrcHeight / (uint32) pscaler_config->iOutHeight;
    121 
    122     pscaler_config->fSrcStartRow.fraction = _scaler_fraction_part(
    123             (uint32) pscaler_config->iOutStartRow * (uint32) pscaler_config->iSrcHeight,
    124             (uint32) pscaler_config->iOutHeight, FRACTION_ROUND_UP, &overflow);
    125 
    126     if (overflow) {
    127         pscaler_config->fSrcStartRow.decimal++;
    128     }
    129 
    130     pscaler_config->iSrcStartRow = pscaler_config->fSrcStartRow.decimal;
    131 
    132     if (pscaler_config->scaleMode == PSCALER_SCALE_UP ||
    133             pscaler_config->scaleMode == PSCALER_SCALE_MIXED_YUP) {
    134         fSrcEndRow.decimal = (uint32) pscaler_config->iOutEndRow *
    135                 (uint32) pscaler_config->iSrcHeight / (uint32) pscaler_config->iOutHeight;
    136         fSrcEndRow.fraction = _scaler_fraction_part(
    137                 (uint32) pscaler_config->iOutEndRow * (uint32) pscaler_config->iSrcHeight,
    138                 (uint32) pscaler_config->iOutHeight, FRACTION_TRUNCATE, &overflow);
    139 
    140         pscaler_config->iSrcEndRow = (uint16) fSrcEndRow.decimal;
    141 
    142         if (0 != fSrcEndRow.fraction) {
    143             // will cause an extra output row to be created...
    144             pscaler_config->iSrcEndRow++;
    145             pscaler_config->iOutEndRow++;
    146         }
    147 
    148         // restore dimensions
    149         pscaler_config->iSrcHeight++;
    150         pscaler_config->iOutHeight++;
    151     } else {
    152         fSrcEndRow.decimal = (uint32) (pscaler_config->iOutEndRow + 1) *
    153                 (uint32) pscaler_config->iSrcHeight /
    154                 (uint32) pscaler_config->iOutHeight;
    155 
    156         fSrcEndRow.fraction = _scaler_fraction_part(
    157                 (uint32) (pscaler_config->iOutEndRow + 1) * (uint32) pscaler_config->iSrcHeight,
    158                 (uint32) pscaler_config->iOutHeight, FRACTION_TRUNCATE, &overflow);
    159 
    160         pscaler_config->iSrcEndRow = (uint16) fSrcEndRow.decimal;
    161 
    162         if (0 == fSrcEndRow.fraction) {
    163             pscaler_config->iSrcEndRow--;
    164         }
    165     }
    166 
    167     // check to be sure we're not going beyond the source image
    168     if (pscaler_config->iSrcEndRow >= pscaler_config->iSrcHeight) { // last stripe
    169         pscaler_config->iSrcEndRow = pscaler_config->iSrcHeight - 1;
    170     }
    171 
    172     *start_input_row_number = pscaler_config->iSrcStartRow;
    173     *end_input_row_number = pscaler_config->iSrcEndRow;
    174     *num_output_rows_generated = (pscaler_config->iOutEndRow - pscaler_config->iOutStartRow + 1);
    175 
    176     // Calculate the 2nd pass buffer size if mixed scaling is done
    177     if (pscaler_config->scaleMode == PSCALER_SCALE_MIXED_XUP) {
    178         *mixed_axis_temp_buffer_size_needed =
    179                 ROUND_4_UP(pscaler_config->iOutWidth + 1) *
    180                         (*end_input_row_number - *start_input_row_number + 1);
    181     } else if (pscaler_config->scaleMode == PSCALER_SCALE_MIXED_YUP) {
    182         *mixed_axis_temp_buffer_size_needed =
    183                 ROUND_4_UP(pscaler_config->iSrcWidth) * (*num_output_rows_generated + 1);
    184     } else {
    185         *mixed_axis_temp_buffer_size_needed = 0;
    186     }
    187 
    188     (*num_output_rows_generated)++;
    189 }
    190 
    191 void scaler_scale_image_data(uint8 *input_plane, void *tables_ptr, uint8 *scaled_output_plane,
    192         uint8 *temp_buffer_for_mixed_axis_scaling) {
    193     uint16 iOrigWidth, iOrigHeight, iOrigOutBufWidth, iOrigSrcBufWidth;
    194     uint16 iOrigOutStartRow, iOrigOutEndRow, iOrigSrcStartRow, iOrigSrcEndRow;
    195     float64_t fOrigSrcStartRow;
    196     uint8 *pOrigBuf;
    197     scaler_config_t *pscaler_config;
    198 
    199     pscaler_config = (scaler_config_t *) tables_ptr;
    200     pscaler_config->pSrcBuf = input_plane;
    201     pscaler_config->pOutBuf = scaled_output_plane;
    202 
    203     if ((PSCALER_SCALE_MIXED_XUP == pscaler_config->scaleMode) ||
    204             (PSCALER_SCALE_MIXED_YUP == pscaler_config->scaleMode)) {
    205         pscaler_config->pTmpBuf = temp_buffer_for_mixed_axis_scaling;
    206 
    207         // save the output buffer
    208         pOrigBuf = pscaler_config->pOutBuf;
    209 
    210         // use the temp buff as the output buff for pass 1
    211         pscaler_config->pOutBuf = pscaler_config->pTmpBuf;
    212 
    213         if (PSCALER_SCALE_MIXED_YUP == pscaler_config->scaleMode) {
    214             // save the original output widths
    215             iOrigWidth = pscaler_config->iOutWidth;
    216             iOrigOutBufWidth = pscaler_config->iOutBufWidth;
    217 
    218             // set output widths to input widths (1::1)
    219             pscaler_config->iOutWidth = pscaler_config->iSrcWidth;
    220             pscaler_config->iOutBufWidth = pscaler_config->iSrcBufWidth;
    221 
    222             // calculate the new scaler factors
    223             _calculate_factors(pscaler_config, PSCALER_SCALE_UP);
    224 
    225             // Run the photo scaler hardware
    226             _hw_scale_image_plane(pscaler_config, PSCALER_SCALE_UP);
    227 
    228             // reset the output widths
    229             pscaler_config->iOutWidth = iOrigWidth;
    230             pscaler_config->iOutBufWidth = iOrigOutBufWidth;
    231         } else {
    232             // save the original output height and row info
    233             iOrigHeight = pscaler_config->iOutHeight;
    234             iOrigOutStartRow = pscaler_config->iOutStartRow;
    235             iOrigOutEndRow = pscaler_config->iOutEndRow;
    236             fOrigSrcStartRow.fraction = pscaler_config->fSrcStartRow.fraction;
    237 
    238             // set output height and rows to input height and rows(1::1)
    239             pscaler_config->iOutHeight = pscaler_config->iSrcHeight;
    240             pscaler_config->iOutStartRow = pscaler_config->iSrcStartRow;
    241             pscaler_config->iOutEndRow = pscaler_config->iSrcEndRow;
    242             pscaler_config->fSrcStartRow.fraction = 0;
    243 
    244             // calculate the new scaler factors
    245             _calculate_factors(pscaler_config, PSCALER_SCALE_UP);
    246 
    247             // Run the photo scaler hardware
    248             _hw_scale_image_plane(pscaler_config, PSCALER_SCALE_UP);
    249 
    250             // reset the output height and rows
    251             pscaler_config->iOutHeight = iOrigHeight;
    252             pscaler_config->iOutStartRow = iOrigOutStartRow;
    253             pscaler_config->iOutEndRow = iOrigOutEndRow;
    254             pscaler_config->fSrcStartRow.fraction = fOrigSrcStartRow.fraction;
    255         }
    256         // restore the original output buffer
    257         pscaler_config->pOutBuf = pOrigBuf;
    258 
    259         // save the original input buffer
    260         pOrigBuf = pscaler_config->pSrcBuf;
    261 
    262         // use the previous output (temp) buffer as the new input buffer
    263         pscaler_config->pSrcBuf = pscaler_config->pTmpBuf;
    264 
    265         if (PSCALER_SCALE_MIXED_YUP == pscaler_config->scaleMode) {
    266             // save the original input height and rows
    267             iOrigHeight = pscaler_config->iSrcHeight;
    268             iOrigSrcStartRow = pscaler_config->iSrcStartRow;
    269             iOrigSrcEndRow = pscaler_config->iSrcEndRow;
    270             fOrigSrcStartRow.decimal = pscaler_config->fSrcStartRow.decimal;
    271             fOrigSrcStartRow.fraction = pscaler_config->fSrcStartRow.fraction;
    272 
    273             // set the height and rows to 1::1 for the second pass
    274             pscaler_config->iSrcHeight = pscaler_config->iOutHeight;
    275             pscaler_config->iSrcStartRow = pscaler_config->iOutStartRow;
    276             pscaler_config->iSrcEndRow = pscaler_config->iOutEndRow;
    277             pscaler_config->fSrcStartRow.decimal = pscaler_config->iOutStartRow;
    278             pscaler_config->fSrcStartRow.fraction = 0;
    279 
    280             // calculate new scale factors
    281             _calculate_factors(pscaler_config, PSCALER_SCALE_DOWN);
    282 
    283             // Run the photo scaler hardware
    284             _hw_scale_image_plane(pscaler_config, PSCALER_SCALE_DOWN);
    285 
    286             // restore original input height and rows
    287             pscaler_config->iSrcHeight = iOrigHeight;
    288             pscaler_config->iSrcStartRow = iOrigSrcStartRow;
    289             pscaler_config->iSrcEndRow = iOrigSrcEndRow;
    290             pscaler_config->fSrcStartRow.decimal = fOrigSrcStartRow.decimal;
    291             pscaler_config->fSrcStartRow.fraction = fOrigSrcStartRow.fraction;
    292         } else {
    293             // save the original input widths
    294             iOrigWidth = pscaler_config->iSrcWidth;
    295             iOrigSrcBufWidth = pscaler_config->iSrcBufWidth;
    296 
    297             // set the widths to 1::1 for the second pass
    298             pscaler_config->iSrcWidth = pscaler_config->iOutWidth;
    299             pscaler_config->iSrcBufWidth = pscaler_config->iOutBufWidth;
    300 
    301             // calculate new scale factors
    302             _calculate_factors(pscaler_config, PSCALER_SCALE_DOWN);
    303 
    304             // Run the photo scaler hardware
    305             _hw_scale_image_plane(pscaler_config, PSCALER_SCALE_DOWN);
    306 
    307             // restore original input widths
    308             pscaler_config->iSrcWidth = iOrigWidth;
    309             pscaler_config->iSrcBufWidth = iOrigSrcBufWidth;
    310         }
    311 
    312         // restore the input buffer
    313         pscaler_config->pTmpBuf = pscaler_config->pSrcBuf;
    314         pscaler_config->pSrcBuf = pOrigBuf;
    315 
    316         // release the temp buffer
    317         pscaler_config->pTmpBuf = NULL;
    318     } else {
    319         // Run the photo scaler hardware
    320         _hw_scale_image_plane(pscaler_config, pscaler_config->scaleMode);
    321     }
    322 }
    323 
    324 static void _calculate_factors(scaler_config_t *pscaler_config, scaler_mode_t scaleMode) {
    325     bool_t overflow;
    326     if ((pscaler_config->scaleMode == PSCALER_SCALE_UP) ||
    327             (pscaler_config->scaleMode == PSCALER_SCALE_MIXED_YUP)) {
    328         // scale up factors are computed as (dim-1)/(dim-1)
    329         pscaler_config->iSrcHeight--;
    330         pscaler_config->iOutHeight--;
    331     }
    332     if ((pscaler_config->scaleMode == PSCALER_SCALE_UP) ||
    333             (pscaler_config->scaleMode == PSCALER_SCALE_MIXED_XUP)) {
    334         pscaler_config->iSrcWidth--;
    335         pscaler_config->iOutWidth--;
    336     }
    337 
    338     pscaler_config->fXfactor.decimal = (uint32) pscaler_config->iOutWidth /
    339             (uint32) pscaler_config->iSrcWidth;
    340     pscaler_config->fXfactor.fraction = _scaler_fraction_part(
    341             (uint32) pscaler_config->iOutWidth,
    342             (uint32) pscaler_config->iSrcWidth,
    343             FRACTION_TRUNCATE,
    344             &overflow);
    345 
    346     pscaler_config->fXfactorInv.decimal = (uint32) pscaler_config->iSrcWidth /
    347             (uint32) pscaler_config->iOutWidth;
    348     pscaler_config->fXfactorInv.fraction = _scaler_fraction_part(
    349             (uint32) pscaler_config->iSrcWidth, (uint32) pscaler_config->iOutWidth,
    350             FRACTION_ROUND_UP, &overflow);
    351 
    352     if (overflow) {
    353         pscaler_config->fXfactorInv.decimal++;
    354     }
    355 
    356     pscaler_config->fYfactor.decimal = (uint32) pscaler_config->iOutHeight /
    357             (uint32) pscaler_config->iSrcHeight;
    358     pscaler_config->fYfactor.fraction = _scaler_fraction_part(
    359             (uint32) pscaler_config->iOutHeight, (uint32) pscaler_config->iSrcHeight,
    360             FRACTION_TRUNCATE, &overflow);
    361 
    362     pscaler_config->fYfactorInv.decimal = (uint32) pscaler_config->iSrcHeight /
    363             (uint32) pscaler_config->iOutHeight;
    364     pscaler_config->fYfactorInv.fraction = _scaler_fraction_part(
    365             (uint32) pscaler_config->iSrcHeight, (uint32) pscaler_config->iOutHeight,
    366             FRACTION_ROUND_UP, &overflow);
    367 
    368     if (overflow) {
    369         pscaler_config->fYfactorInv.decimal++;
    370     }
    371 
    372     if ((pscaler_config->scaleMode == PSCALER_SCALE_UP) ||
    373             (pscaler_config->scaleMode == PSCALER_SCALE_MIXED_YUP)) {
    374         // restore original dimensions
    375         pscaler_config->iSrcHeight++;
    376         pscaler_config->iOutHeight++;
    377     }
    378     if ((pscaler_config->scaleMode == PSCALER_SCALE_UP) ||
    379             (pscaler_config->scaleMode == PSCALER_SCALE_MIXED_XUP)) {
    380         pscaler_config->iSrcWidth++;
    381         pscaler_config->iOutWidth++;
    382     }
    383 }
    384 
    385 static uint32 _scaler_fraction_part(uint32 iNum, uint32 iDen, pscaler_fraction_t mode,
    386         bool_t *overflow) {
    387     uint32 iFract;     // fractional part
    388     uint32 iRem;       // remainder part
    389     int i;          // loop counter
    390 
    391     *overflow = 0;
    392     iFract = 0;
    393     iRem = iNum % iDen;
    394 
    395     if (iRem == 0) {
    396         return (0);
    397     }
    398 
    399     for (i = PSCALER_FRACT_BITS_COUNT - 1; i >= 0; i--) {
    400         iRem <<= 1;
    401 
    402         if (iRem == iDen) {
    403             iFract |= (1 << i);
    404             break;
    405         } else if (iRem > iDen) {
    406             iFract |= (1 << i);
    407             iRem -= iDen;
    408         }
    409     }
    410 
    411     if (mode == FRACTION_TRUNCATE) {
    412         return (iFract << 8);
    413     } else {
    414         if (iRem == 0) {
    415             return (iFract << 8);
    416         } else {
    417             if (iFract < 0x00ffffff) {
    418                 iFract++;
    419                 return (iFract << 8);
    420             } else {
    421                 *overflow = 1;
    422                 return (0);
    423             }
    424         }
    425     }
    426 }
    427 
    428 #define _RESTRICT_ __restrict__
    429 
    430 static inline void _scale_row_down_9in(uint8 *_RESTRICT_ in0, uint8 *_RESTRICT_ in1,
    431         uint8 *_RESTRICT_ in2, uint8 *_RESTRICT_ in3, uint8 *_RESTRICT_ in4, uint8 *_RESTRICT_ in5,
    432         uint8 *_RESTRICT_ in6, uint8 *_RESTRICT_ in7, uint8 *_RESTRICT_ in8, uint8 *_RESTRICT_ out,
    433         uint64 position_x, uint64 x_factor_inv, uint32 top_weight, uint32 bot_weight,
    434         uint32 weight_reciprocal, int out_width) {
    435     int x;
    436     uint32 in_col;
    437     sint32 total_weight;
    438 
    439     for (x = 0; x < out_width; x++) {
    440         uint32 acc_r = 0;
    441         uint32 acc_g = 0;
    442         uint32 acc_b = 0;
    443         uint32 curr_weight = 256 - ((position_x >> 24) & 0xff);
    444         total_weight = x_factor_inv >> 24;
    445 
    446         in_col = position_x >> 32;
    447 
    448         while (total_weight > 0) {
    449             acc_r += (uint32) in0[(in_col * 3) + 0] * curr_weight * top_weight;
    450             acc_r += (uint32) in1[(in_col * 3) + 0] * curr_weight << 8;
    451             acc_r += (uint32) in2[(in_col * 3) + 0] * curr_weight << 8;
    452             acc_r += (uint32) in3[(in_col * 3) + 0] * curr_weight << 8;
    453             acc_r += (uint32) in4[(in_col * 3) + 0] * curr_weight << 8;
    454             acc_r += (uint32) in5[(in_col * 3) + 0] * curr_weight << 8;
    455             acc_r += (uint32) in6[(in_col * 3) + 0] * curr_weight << 8;
    456             acc_r += (uint32) in7[(in_col * 3) + 0] * curr_weight << 8;
    457             acc_r += (uint32) in8[(in_col * 3) + 0] * curr_weight * bot_weight;
    458 
    459             acc_g += (uint32) in0[(in_col * 3) + 1] * curr_weight * top_weight;
    460             acc_g += (uint32) in1[(in_col * 3) + 1] * curr_weight << 8;
    461             acc_g += (uint32) in2[(in_col * 3) + 1] * curr_weight << 8;
    462             acc_g += (uint32) in3[(in_col * 3) + 1] * curr_weight << 8;
    463             acc_g += (uint32) in4[(in_col * 3) + 1] * curr_weight << 8;
    464             acc_g += (uint32) in5[(in_col * 3) + 1] * curr_weight << 8;
    465             acc_g += (uint32) in6[(in_col * 3) + 1] * curr_weight << 8;
    466             acc_g += (uint32) in7[(in_col * 3) + 1] * curr_weight << 8;
    467             acc_g += (uint32) in8[(in_col * 3) + 1] * curr_weight * bot_weight;
    468 
    469             acc_b += (uint32) in0[(in_col * 3) + 2] * curr_weight * top_weight;
    470             acc_b += (uint32) in1[(in_col * 3) + 2] * curr_weight << 8;
    471             acc_b += (uint32) in2[(in_col * 3) + 2] * curr_weight << 8;
    472             acc_b += (uint32) in3[(in_col * 3) + 2] * curr_weight << 8;
    473             acc_b += (uint32) in4[(in_col * 3) + 2] * curr_weight << 8;
    474             acc_b += (uint32) in5[(in_col * 3) + 2] * curr_weight << 8;
    475             acc_b += (uint32) in6[(in_col * 3) + 2] * curr_weight << 8;
    476             acc_b += (uint32) in7[(in_col * 3) + 2] * curr_weight << 8;
    477             acc_b += (uint32) in8[(in_col * 3) + 2] * curr_weight * bot_weight;
    478 
    479             in_col++;
    480 
    481             total_weight -= curr_weight;
    482             curr_weight = total_weight > 256 ? 256 : total_weight;
    483         }
    484 
    485         position_x += x_factor_inv;
    486 
    487         out[(x * 3) + 0] = ((uint64) acc_r * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    488         out[(x * 3) + 0] = ((uint64) acc_g * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    489         out[(x * 3) + 0] = ((uint64) acc_b * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    490     }
    491 }
    492 
    493 static inline void _scale_row_down_8in(uint8 *_RESTRICT_ in0, uint8 *_RESTRICT_ in1,
    494         uint8 *_RESTRICT_ in2, uint8 *_RESTRICT_ in3, uint8 *_RESTRICT_ in4, uint8 *_RESTRICT_ in5,
    495         uint8 *_RESTRICT_ in6, uint8 *_RESTRICT_ in7, uint8 *_RESTRICT_ out, uint64 position_x,
    496         uint64 x_factor_inv, uint32 top_weight,
    497         uint32 bot_weight, uint32 weight_reciprocal,
    498         int out_width) {
    499     int x;
    500     uint32 in_col;
    501     sint32 total_weight;
    502 
    503     for (x = 0; x < out_width; x++) {
    504         uint32 acc_r = 0;
    505         uint32 acc_g = 0;
    506         uint32 acc_b = 0;
    507         uint32 curr_weight = 256 - ((position_x >> 24) & 0xff);
    508         total_weight = x_factor_inv >> 24;
    509 
    510         in_col = position_x >> 32;
    511 
    512         while (total_weight > 0) {
    513             acc_r += (uint32) in0[(in_col * 3) + 0] * curr_weight * top_weight;
    514             acc_r += (uint32) in1[(in_col * 3) + 0] * curr_weight << 8;
    515             acc_r += (uint32) in2[(in_col * 3) + 0] * curr_weight << 8;
    516             acc_r += (uint32) in3[(in_col * 3) + 0] * curr_weight << 8;
    517             acc_r += (uint32) in4[(in_col * 3) + 0] * curr_weight << 8;
    518             acc_r += (uint32) in5[(in_col * 3) + 0] * curr_weight << 8;
    519             acc_r += (uint32) in6[(in_col * 3) + 0] * curr_weight << 8;
    520             acc_r += (uint32) in7[(in_col * 3) + 0] * curr_weight * bot_weight;
    521 
    522             acc_g += (uint32) in0[(in_col * 3) + 1] * curr_weight * top_weight;
    523             acc_g += (uint32) in1[(in_col * 3) + 1] * curr_weight << 8;
    524             acc_g += (uint32) in2[(in_col * 3) + 1] * curr_weight << 8;
    525             acc_g += (uint32) in3[(in_col * 3) + 1] * curr_weight << 8;
    526             acc_g += (uint32) in4[(in_col * 3) + 1] * curr_weight << 8;
    527             acc_g += (uint32) in5[(in_col * 3) + 1] * curr_weight << 8;
    528             acc_g += (uint32) in6[(in_col * 3) + 1] * curr_weight << 8;
    529             acc_g += (uint32) in7[(in_col * 3) + 1] * curr_weight * bot_weight;
    530 
    531             acc_b += (uint32) in0[(in_col * 3) + 2] * curr_weight * top_weight;
    532             acc_b += (uint32) in1[(in_col * 3) + 2] * curr_weight << 8;
    533             acc_b += (uint32) in2[(in_col * 3) + 2] * curr_weight << 8;
    534             acc_b += (uint32) in3[(in_col * 3) + 2] * curr_weight << 8;
    535             acc_b += (uint32) in4[(in_col * 3) + 2] * curr_weight << 8;
    536             acc_b += (uint32) in5[(in_col * 3) + 2] * curr_weight << 8;
    537             acc_b += (uint32) in6[(in_col * 3) + 2] * curr_weight << 8;
    538             acc_b += (uint32) in7[(in_col * 3) + 2] * curr_weight * bot_weight;
    539 
    540             in_col++;
    541 
    542             total_weight -= curr_weight;
    543             curr_weight = total_weight > 256 ? 256 : total_weight;
    544         }
    545 
    546         position_x += x_factor_inv;
    547 
    548         out[(x * 3) + 0] = ((uint64) acc_r * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    549         out[(x * 3) + 1] = ((uint64) acc_g * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    550         out[(x * 3) + 2] = ((uint64) acc_b * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    551     }
    552 }
    553 
    554 static inline void _scale_row_down_7in(uint8 *_RESTRICT_ in0, uint8 *_RESTRICT_ in1,
    555         uint8 *_RESTRICT_ in2, uint8 *_RESTRICT_ in3, uint8 *_RESTRICT_ in4, uint8 *_RESTRICT_ in5,
    556         uint8 *_RESTRICT_ in6, uint8 *_RESTRICT_ out, uint64 position_x, uint64 x_factor_inv,
    557         uint32 top_weight, uint32 bot_weight, uint32 weight_reciprocal, int out_width) {
    558     int x;
    559     uint32 in_col;
    560     sint32 total_weight;
    561 
    562     for (x = 0; x < out_width; x++) {
    563         uint32 acc_r = 0;
    564         uint32 acc_g = 0;
    565         uint32 acc_b = 0;
    566         uint32 curr_weight = 256 - ((position_x >> 24) & 0xff);
    567         total_weight = x_factor_inv >> 24;
    568 
    569         in_col = position_x >> 32;
    570 
    571         while (total_weight > 0) {
    572             acc_r += (uint32) in0[(in_col * 3) + 0] * curr_weight * top_weight;
    573             acc_r += (uint32) in1[(in_col * 3) + 0] * curr_weight << 8;
    574             acc_r += (uint32) in2[(in_col * 3) + 0] * curr_weight << 8;
    575             acc_r += (uint32) in3[(in_col * 3) + 0] * curr_weight << 8;
    576             acc_r += (uint32) in4[(in_col * 3) + 0] * curr_weight << 8;
    577             acc_r += (uint32) in5[(in_col * 3) + 0] * curr_weight << 8;
    578             acc_r += (uint32) in6[(in_col * 3) + 0] * curr_weight * bot_weight;
    579 
    580             acc_g += (uint32) in0[(in_col * 3) + 1] * curr_weight * top_weight;
    581             acc_g += (uint32) in1[(in_col * 3) + 1] * curr_weight << 8;
    582             acc_g += (uint32) in2[(in_col * 3) + 1] * curr_weight << 8;
    583             acc_g += (uint32) in3[(in_col * 3) + 1] * curr_weight << 8;
    584             acc_g += (uint32) in4[(in_col * 3) + 1] * curr_weight << 8;
    585             acc_g += (uint32) in5[(in_col * 3) + 1] * curr_weight << 8;
    586             acc_g += (uint32) in6[(in_col * 3) + 1] * curr_weight * bot_weight;
    587 
    588             acc_b += (uint32) in0[(in_col * 3) + 2] * curr_weight * top_weight;
    589             acc_b += (uint32) in1[(in_col * 3) + 2] * curr_weight << 8;
    590             acc_b += (uint32) in2[(in_col * 3) + 2] * curr_weight << 8;
    591             acc_b += (uint32) in3[(in_col * 3) + 2] * curr_weight << 8;
    592             acc_b += (uint32) in4[(in_col * 3) + 2] * curr_weight << 8;
    593             acc_b += (uint32) in5[(in_col * 3) + 2] * curr_weight << 8;
    594             acc_b += (uint32) in6[(in_col * 3) + 2] * curr_weight * bot_weight;
    595 
    596             in_col++;
    597 
    598             total_weight -= curr_weight;
    599             curr_weight = total_weight > 256 ? 256 : total_weight;
    600         }
    601 
    602         position_x += x_factor_inv;
    603 
    604         out[(x * 3) + 0] = ((uint64) acc_r * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    605         out[(x * 3) + 1] = ((uint64) acc_g * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    606         out[(x * 3) + 2] = ((uint64) acc_b * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    607     }
    608 }
    609 
    610 static inline void _scale_row_down_6in(uint8 *_RESTRICT_ in0, uint8 *_RESTRICT_ in1,
    611         uint8 *_RESTRICT_ in2, uint8 *_RESTRICT_ in3, uint8 *_RESTRICT_ in4, uint8 *_RESTRICT_ in5,
    612         uint8 *_RESTRICT_ out, uint64 position_x, uint64 x_factor_inv, uint32 top_weight,
    613         uint32 bot_weight, uint32 weight_reciprocal, int out_width) {
    614     int x;
    615     uint32 in_col;
    616     sint32 total_weight;
    617 
    618     for (x = 0; x < out_width; x++) {
    619         uint32 acc_r = 0;
    620         uint32 acc_g = 0;
    621         uint32 acc_b = 0;
    622         uint32 curr_weight = 256 - ((position_x >> 24) & 0xff);
    623         total_weight = x_factor_inv >> 24;
    624 
    625         in_col = position_x >> 32;
    626 
    627         while (total_weight > 0) {
    628             acc_r += (uint32) in0[(in_col * 3) + 0] * curr_weight * top_weight;
    629             acc_r += (uint32) in1[(in_col * 3) + 0] * curr_weight << 8;
    630             acc_r += (uint32) in2[(in_col * 3) + 0] * curr_weight << 8;
    631             acc_r += (uint32) in3[(in_col * 3) + 0] * curr_weight << 8;
    632             acc_r += (uint32) in4[(in_col * 3) + 0] * curr_weight << 8;
    633             acc_r += (uint32) in5[(in_col * 3) + 0] * curr_weight * bot_weight;
    634 
    635             acc_g += (uint32) in0[(in_col * 3) + 1] * curr_weight * top_weight;
    636             acc_g += (uint32) in1[(in_col * 3) + 1] * curr_weight << 8;
    637             acc_g += (uint32) in2[(in_col * 3) + 1] * curr_weight << 8;
    638             acc_g += (uint32) in3[(in_col * 3) + 1] * curr_weight << 8;
    639             acc_g += (uint32) in4[(in_col * 3) + 1] * curr_weight << 8;
    640             acc_g += (uint32) in5[(in_col * 3) + 1] * curr_weight * bot_weight;
    641 
    642             acc_b += (uint32) in0[(in_col * 3) + 2] * curr_weight * top_weight;
    643             acc_b += (uint32) in1[(in_col * 3) + 2] * curr_weight << 8;
    644             acc_b += (uint32) in2[(in_col * 3) + 2] * curr_weight << 8;
    645             acc_b += (uint32) in3[(in_col * 3) + 2] * curr_weight << 8;
    646             acc_b += (uint32) in4[(in_col * 3) + 2] * curr_weight << 8;
    647             acc_b += (uint32) in5[(in_col * 3) + 2] * curr_weight * bot_weight;
    648 
    649             in_col++;
    650 
    651             total_weight -= curr_weight;
    652             curr_weight = total_weight > 256 ? 256 : total_weight;
    653         }
    654 
    655         position_x += x_factor_inv;
    656 
    657         out[(x * 3) + 0] = ((uint64) acc_r * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    658         out[(x * 3) + 1] = ((uint64) acc_g * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    659         out[(x * 3) + 2] = ((uint64) acc_b * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    660     }
    661 }
    662 
    663 static inline void _scale_row_down_5in(uint8 *_RESTRICT_ in0, uint8 *_RESTRICT_ in1,
    664         uint8 *_RESTRICT_ in2, uint8 *_RESTRICT_ in3, uint8 *_RESTRICT_ in4, uint8 *_RESTRICT_ out,
    665         uint64 position_x, uint64 x_factor_inv, uint32 top_weight, uint32 bot_weight,
    666         uint32 weight_reciprocal, int out_width) {
    667     int x;
    668     uint32 in_col;
    669     sint32 total_weight;
    670 
    671     for (x = 0; x < out_width; x++) {
    672         uint32 acc_r = 0;
    673         uint32 acc_g = 0;
    674         uint32 acc_b = 0;
    675         uint32 curr_weight = 256 - ((position_x >> 24) & 0xff);
    676         total_weight = x_factor_inv >> 24;
    677 
    678         in_col = position_x >> 32;
    679 
    680         while (total_weight > 0) {
    681             acc_r += (uint32) in0[(in_col * 3) + 0] * curr_weight * top_weight;
    682             acc_r += (uint32) in1[(in_col * 3) + 0] * curr_weight << 8;
    683             acc_r += (uint32) in2[(in_col * 3) + 0] * curr_weight << 8;
    684             acc_r += (uint32) in3[(in_col * 3) + 0] * curr_weight << 8;
    685             acc_r += (uint32) in4[(in_col * 3) + 0] * curr_weight * bot_weight;
    686 
    687             acc_g += (uint32) in0[(in_col * 3) + 1] * curr_weight * top_weight;
    688             acc_g += (uint32) in1[(in_col * 3) + 1] * curr_weight << 8;
    689             acc_g += (uint32) in2[(in_col * 3) + 1] * curr_weight << 8;
    690             acc_g += (uint32) in3[(in_col * 3) + 1] * curr_weight << 8;
    691             acc_g += (uint32) in4[(in_col * 3) + 1] * curr_weight * bot_weight;
    692 
    693             acc_b += (uint32) in0[(in_col * 3) + 2] * curr_weight * top_weight;
    694             acc_b += (uint32) in1[(in_col * 3) + 2] * curr_weight << 8;
    695             acc_b += (uint32) in2[(in_col * 3) + 2] * curr_weight << 8;
    696             acc_b += (uint32) in3[(in_col * 3) + 2] * curr_weight << 8;
    697             acc_b += (uint32) in4[(in_col * 3) + 2] * curr_weight * bot_weight;
    698 
    699             in_col++;
    700 
    701             total_weight -= curr_weight;
    702             curr_weight = total_weight > 256 ? 256 : total_weight;
    703         }
    704 
    705         position_x += x_factor_inv;
    706 
    707         out[(x * 3) + 0] = ((uint64) acc_r * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    708         out[(x * 3) + 1] = ((uint64) acc_g * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    709         out[(x * 3) + 2] = ((uint64) acc_b * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    710     }
    711 }
    712 
    713 static inline void _scale_row_down_4in(uint8 *_RESTRICT_ in0, uint8 *_RESTRICT_ in1,
    714         uint8 *_RESTRICT_ in2, uint8 *_RESTRICT_ in3, uint8 *_RESTRICT_ out, uint64 position_x,
    715         uint64 x_factor_inv, uint32 top_weight, uint32 bot_weight, uint32 weight_reciprocal,
    716         int out_width) {
    717     int x;
    718     uint32 in_col;
    719     sint32 total_weight;
    720 
    721     for (x = 0; x < out_width; x++) {
    722         uint32 acc_r = 0;
    723         uint32 acc_g = 0;
    724         uint32 acc_b = 0;
    725         uint32 curr_weight = 256 - ((position_x >> 24) & 0xff);
    726         total_weight = x_factor_inv >> 24;
    727 
    728         in_col = position_x >> 32;
    729 
    730         while (total_weight > 0) {
    731             acc_r += (uint32) in0[(in_col * 3) + 0] * curr_weight * top_weight;
    732             acc_r += (uint32) in1[(in_col * 3) + 0] * curr_weight << 8;
    733             acc_r += (uint32) in2[(in_col * 3) + 0] * curr_weight << 8;
    734             acc_r += (uint32) in3[(in_col * 3) + 0] * curr_weight * bot_weight;
    735 
    736             acc_g += (uint32) in0[(in_col * 3) + 1] * curr_weight * top_weight;
    737             acc_g += (uint32) in1[(in_col * 3) + 1] * curr_weight << 8;
    738             acc_g += (uint32) in2[(in_col * 3) + 1] * curr_weight << 8;
    739             acc_g += (uint32) in3[(in_col * 3) + 1] * curr_weight * bot_weight;
    740 
    741             acc_b += (uint32) in0[(in_col * 3) + 2] * curr_weight * top_weight;
    742             acc_b += (uint32) in1[(in_col * 3) + 2] * curr_weight << 8;
    743             acc_b += (uint32) in2[(in_col * 3) + 2] * curr_weight << 8;
    744             acc_b += (uint32) in3[(in_col * 3) + 2] * curr_weight * bot_weight;
    745 
    746             in_col++;
    747 
    748             total_weight -= curr_weight;
    749             curr_weight = total_weight > 256 ? 256 : total_weight;
    750         }
    751 
    752         position_x += x_factor_inv;
    753 
    754         out[(x * 3) + 0] = ((uint64) acc_r * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    755         out[(x * 3) + 1] = ((uint64) acc_g * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    756         out[(x * 3) + 2] = ((uint64) acc_b * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    757     }
    758 }
    759 
    760 static inline void _scale_row_down_3in(uint8 *_RESTRICT_ in0, uint8 *_RESTRICT_ in1,
    761         uint8 *_RESTRICT_ in2, uint8 *_RESTRICT_ out, uint64 position_x, uint64 x_factor_inv,
    762         uint32 top_weight, uint32 bot_weight, uint32 weight_reciprocal, int out_width) {
    763     int x;
    764     uint32 in_col;
    765     sint32 total_weight;
    766 
    767     for (x = 0; x < out_width; x++) {
    768         uint32 acc_r = 0;
    769         uint32 acc_g = 0;
    770         uint32 acc_b = 0;
    771         uint32 curr_weight = 256 - ((position_x >> 24) & 0xff);
    772         total_weight = x_factor_inv >> 24;
    773 
    774         in_col = position_x >> 32;
    775 
    776         while (total_weight > 0) {
    777             acc_r += (uint32) in0[(in_col * 3) + 0] * curr_weight * top_weight;
    778             acc_r += (uint32) in1[(in_col * 3) + 0] * curr_weight << 8;
    779             acc_r += (uint32) in2[(in_col * 3) + 0] * curr_weight * bot_weight;
    780 
    781             acc_g += (uint32) in0[(in_col * 3) + 1] * curr_weight * top_weight;
    782             acc_g += (uint32) in1[(in_col * 3) + 1] * curr_weight << 8;
    783             acc_g += (uint32) in2[(in_col * 3) + 1] * curr_weight * bot_weight;
    784 
    785             acc_b += (uint32) in0[(in_col * 3) + 2] * curr_weight * top_weight;
    786             acc_b += (uint32) in1[(in_col * 3) + 2] * curr_weight << 8;
    787             acc_b += (uint32) in2[(in_col * 3) + 2] * curr_weight * bot_weight;
    788 
    789             in_col++;
    790 
    791             total_weight -= curr_weight;
    792             curr_weight = total_weight > 256 ? 256 : total_weight;
    793         }
    794 
    795         position_x += x_factor_inv;
    796 
    797         out[(x * 3) + 0] = ((uint64) acc_r * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    798         out[(x * 3) + 1] = ((uint64) acc_g * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    799         out[(x * 3) + 2] = ((uint64) acc_b * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    800     }
    801 }
    802 
    803 static inline void _scale_row_down_2in(uint8 *_RESTRICT_ in0, uint8 *_RESTRICT_ in1,
    804         uint8 *_RESTRICT_ out, uint64 position_x, uint64 x_factor_inv, uint32 top_weight,
    805         uint32 bot_weight, uint32 weight_reciprocal, int out_width) {
    806     int x;
    807     uint32 in_col;
    808     sint32 total_weight;
    809 
    810     for (x = 0; x < out_width; x++) {
    811         uint32 acc_r = 0;
    812         uint32 acc_g = 0;
    813         uint32 acc_b = 0;
    814         uint32 curr_weight = 256 - ((position_x >> 24) & 0xff);
    815         total_weight = x_factor_inv >> 24;
    816 
    817         in_col = position_x >> 32;
    818 
    819         while (total_weight > 0) {
    820             acc_r += (uint32) in0[(in_col * 3) + 0] * curr_weight * top_weight;
    821             acc_r += (uint32) in1[(in_col * 3) + 0] * curr_weight * bot_weight;
    822 
    823             acc_g += (uint32) in0[(in_col * 3) + 1] * curr_weight * top_weight;
    824             acc_g += (uint32) in1[(in_col * 3) + 1] * curr_weight * bot_weight;
    825 
    826             acc_b += (uint32) in0[(in_col * 3) + 2] * curr_weight * top_weight;
    827             acc_b += (uint32) in1[(in_col * 3) + 2] * curr_weight * bot_weight;
    828 
    829             in_col++;
    830 
    831             total_weight -= curr_weight;
    832             curr_weight = total_weight > 256 ? 256 : total_weight;
    833         }
    834 
    835         position_x += x_factor_inv;
    836 
    837         out[(x * 3) + 0] = ((uint64) acc_r * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    838         out[(x * 3) + 1] = ((uint64) acc_g * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    839         out[(x * 3) + 2] = ((uint64) acc_b * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    840     }
    841 }
    842 
    843 static inline void _scale_row_down(uint8 *in, uint8 *_RESTRICT_ out, uint32 in_row_ofs,
    844         uint64 position_x, uint64 position_y, uint64 x_factor_inv, uint64 y_factor_inv,
    845         uint32 weight_reciprocal, int out_width) {
    846     int x;
    847     uint32 y, in_col, in_rows, top_weight, bot_weight;
    848     sint32 total_weight;
    849 
    850     total_weight = y_factor_inv >> 24;
    851 
    852     top_weight = (uint32) 256 - ((position_y >> 24) & 0xff);
    853 
    854     if ((sint32) top_weight > total_weight) {
    855         top_weight = total_weight;
    856     }
    857     total_weight -= top_weight;
    858 
    859     if (total_weight & 0xff) {
    860         bot_weight = total_weight & 0xff;
    861     } else if (total_weight > 255) {
    862         bot_weight = 256;
    863     } else {
    864         bot_weight = 0;
    865     }
    866 
    867     total_weight -= bot_weight;
    868 
    869     assert(total_weight >= 0);
    870     assert((total_weight & 0xff) == 0);
    871 
    872     in_rows = 2 + (total_weight >> 8);
    873 
    874     if (in_rows == 2) {
    875         _scale_row_down_2in(in, in + in_row_ofs,
    876                 out, position_x, x_factor_inv, top_weight, bot_weight, weight_reciprocal,
    877                 out_width);
    878     } else if (in_rows == 3) {
    879         _scale_row_down_3in(in, in + in_row_ofs, in + 2 * in_row_ofs,
    880                 out, position_x, x_factor_inv, top_weight, bot_weight, weight_reciprocal,
    881                 out_width);
    882     } else if (in_rows == 4) {
    883         _scale_row_down_4in(in, in + in_row_ofs, in + 2 * in_row_ofs, in + 3 * in_row_ofs,
    884                 out, position_x, x_factor_inv, top_weight, bot_weight, weight_reciprocal,
    885                 out_width);
    886     } else if (in_rows == 5) {
    887         _scale_row_down_5in(in, in + in_row_ofs, in + 2 * in_row_ofs, in + 3 * in_row_ofs,
    888                 in + 4 * in_row_ofs,
    889                 out, position_x, x_factor_inv,
    890                 top_weight, bot_weight, weight_reciprocal,
    891                 out_width);
    892     } else if (in_rows == 6) {
    893         _scale_row_down_6in(in, in + in_row_ofs, in + 2 * in_row_ofs, in + 3 * in_row_ofs,
    894                 in + 4 * in_row_ofs, in + 5 * in_row_ofs,
    895                 out, position_x, x_factor_inv, top_weight, bot_weight, weight_reciprocal,
    896                 out_width);
    897     } else if (in_rows == 7) {
    898         _scale_row_down_7in(in, in + in_row_ofs, in + 2 * in_row_ofs, in + 3 * in_row_ofs,
    899                 in + 4 * in_row_ofs, in + 5 * in_row_ofs, in + 6 * in_row_ofs,
    900                 out, position_x, x_factor_inv, top_weight, bot_weight, weight_reciprocal,
    901                 out_width);
    902     } else if (in_rows == 8) {
    903         _scale_row_down_8in(in, in + in_row_ofs, in + 2 * in_row_ofs, in + 3 * in_row_ofs,
    904                 in + 4 * in_row_ofs, in + 5 * in_row_ofs, in + 6 * in_row_ofs,
    905                 in + 7 * in_row_ofs,
    906                 out, position_x, x_factor_inv, top_weight, bot_weight, weight_reciprocal,
    907                 out_width);
    908     } else if (in_rows == 9) {
    909         _scale_row_down_9in(in, in + in_row_ofs, in + 2 * in_row_ofs, in + 3 * in_row_ofs,
    910                 in + 4 * in_row_ofs, in + 5 * in_row_ofs, in + 6 * in_row_ofs,
    911                 in + 7 * in_row_ofs, in + 8 * in_row_ofs,
    912                 out, position_x, x_factor_inv, top_weight, bot_weight, weight_reciprocal,
    913                 out_width);
    914     } else {
    915         for (x = 0; x < out_width; x++) {
    916             uint32 acc_r = 0;
    917             uint32 acc_g = 0;
    918             uint32 acc_b = 0;
    919             uint32 curr_weight = 256 - ((position_x >> 24) & 0xff);
    920             total_weight = x_factor_inv >> 24;
    921 
    922             in_col = position_x >> 32;
    923 
    924             while (total_weight > 0) {
    925                 acc_r += (uint32) in[(in_col * 3) + 0] * curr_weight * top_weight;
    926                 acc_g += (uint32) in[(in_col * 3) + 1] * curr_weight * top_weight;
    927                 acc_b += (uint32) in[(in_col * 3) + 2] * curr_weight * top_weight;
    928 
    929                 for (y = 1; y < in_rows - 1; y++) {
    930                     acc_r += (uint32) in[y * in_row_ofs + ((in_col * 3) + 0)] * curr_weight * 256;
    931                     acc_g += (uint32) in[y * in_row_ofs + ((in_col * 3) + 1)] * curr_weight * 256;
    932                     acc_b += (uint32) in[y * in_row_ofs + ((in_col * 3) + 2)] * curr_weight * 256;
    933                 }
    934 
    935                 acc_r +=
    936                         (uint32) in[y * in_row_ofs + ((in_col * 3) + 0)] * curr_weight * bot_weight;
    937                 acc_g +=
    938                         (uint32) in[y * in_row_ofs + ((in_col * 3) + 1)] * curr_weight * bot_weight;
    939                 acc_b +=
    940                         (uint32) in[y * in_row_ofs + ((in_col * 3) + 2)] * curr_weight * bot_weight;
    941 
    942                 in_col++;
    943                 total_weight -= curr_weight;
    944                 curr_weight = total_weight > 256 ? 256 : total_weight;
    945             }
    946 
    947             position_x += x_factor_inv;
    948 
    949             out[(x * 3) + 0] = ((uint64) acc_r * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    950             out[(x * 3) + 1] = ((uint64) acc_g * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    951             out[(x * 3) + 2] = ((uint64) acc_b * weight_reciprocal + ((uint64) 1 << 31)) >> 32;
    952         }
    953     }
    954 }
    955 
    956 static void _scale_row_up(uint8 *_RESTRICT_ in0, uint8 *_RESTRICT_ in1, uint8 *_RESTRICT_ out,
    957         sint32 weight_y, uint64 position_x, uint64 increment_x, int out_width) {
    958     int x;
    959     for (x = 0; x < out_width; x++) {
    960         sint32 top_val_r, bot_val_r;
    961         sint32 top_val_g, bot_val_g;
    962         sint32 top_val_b, bot_val_b;
    963 
    964         // Position is tracked with 32 bits of precision, but interpolation is
    965         // only guided by 10. REVISIT - Check ASM and make sure the compiler
    966         // handled the second part here optimally.
    967         uint32 pix_x = position_x >> 32;
    968 
    969         sint32 weight_x = (position_x & 0xffffffff) >> 22;
    970 
    971         // top_val and bot_val become 18-bit values here
    972         top_val_r = (in0[(pix_x * 3) + 0] << 10) +
    973                 weight_x * ((sint32) in0[((pix_x + 1) * 3) + 0] - in0[(pix_x * 3) + 0]);
    974         bot_val_r = (in1[(pix_x * 3) + 0] << 10) +
    975                 weight_x * ((sint32) in1[((pix_x + 1) * 3) + 0] - in1[(pix_x * 3) + 0]);
    976 
    977         top_val_g = (in0[(pix_x * 3) + 1] << 10) +
    978                 weight_x * ((sint32) in0[((pix_x + 1) * 3) + 1] - in0[(pix_x * 3) + 1]);
    979         bot_val_g = (in1[(pix_x * 3) + 1] << 10) +
    980                 weight_x * ((sint32) in1[((pix_x + 1) * 3) + 1] - in1[(pix_x * 3) + 1]);
    981 
    982         top_val_b = (in0[(pix_x * 3) + 2] << 10) +
    983                 weight_x * ((sint32) in0[((pix_x + 1) * 3) + 2] - in0[(pix_x * 3) + 2]);
    984         bot_val_b = (in1[(pix_x * 3) + 2] << 10) +
    985                 weight_x * ((sint32) in1[((pix_x + 1) * 3) + 2] - in1[(pix_x * 3) + 2]);
    986 
    987         // out is an 8-bit value. We do not need to range-check, as overflow
    988         // is mathematically impossible.
    989         out[(x * 3) + 0] = ((top_val_r << 10) + weight_y * (bot_val_r - top_val_r)) >> 20;
    990         out[(x * 3) + 1] = ((top_val_g << 10) + weight_y * (bot_val_g - top_val_g)) >> 20;
    991         out[(x * 3) + 2] = ((top_val_b << 10) + weight_y * (bot_val_b - top_val_b)) >> 20;
    992 
    993         position_x += increment_x;
    994     }
    995 }
    996 
    997 static void _hw_scale_image_plane(scaler_config_t *pscaler_config, scaler_mode_t scaleMode) {
    998     // These pointers duplicate h/w regs
    999     uint64 x_factor, y_factor, x_factor_inv, y_factor_inv;
   1000     uint32 x_output_width, y_output_width;
   1001     uint32 input_pixel_ptr_offset, output_pixel_ptr_offset;
   1002     uint32 first_xi;
   1003     uint64 first_y_src, first_x_src, weight_reciprocal;
   1004 
   1005     // These are internal state
   1006     uint32 r;
   1007     uint8 *outp;
   1008 
   1009     x_output_width = pscaler_config->iOutWidth;
   1010     y_output_width = pscaler_config->iOutEndRow -
   1011             pscaler_config->iOutStartRow + 1;
   1012 
   1013     input_pixel_ptr_offset = pscaler_config->iSrcBufWidth;
   1014     output_pixel_ptr_offset = pscaler_config->iOutBufWidth;
   1015 
   1016     x_factor = (uint64) pscaler_config->fXfactor.decimal << 32;
   1017     x_factor |= pscaler_config->fXfactor.fraction;
   1018 
   1019     y_factor = (uint64) pscaler_config->fYfactor.decimal << 32;
   1020     y_factor |= pscaler_config->fYfactor.fraction;
   1021 
   1022     x_factor_inv = (uint64) pscaler_config->fXfactorInv.decimal << 32;
   1023     x_factor_inv |= pscaler_config->fXfactorInv.fraction;
   1024 
   1025     y_factor_inv = (uint64) pscaler_config->fYfactorInv.decimal << 32;
   1026     y_factor_inv |= pscaler_config->fYfactorInv.fraction;
   1027 
   1028     first_y_src = (uint64) pscaler_config->fSrcStartRow.decimal << 32;
   1029     first_y_src |= pscaler_config->fSrcStartRow.fraction;
   1030 
   1031     // PC REVISIT - The HW has config registers for these, but they aren't being
   1032     // used by lib_photo_scaler do I don't want to use them, either. For now
   1033     // just print them so I can figure out what's going on and then clear the
   1034     // associated variables. Maybe we're always running the scaler from the
   1035     // left edge of the source so they're implicitly zero?
   1036     first_xi = pscaler_config->iOutStartColumn;
   1037 
   1038     first_x_src = (uint64) pscaler_config->fSrcStartColumn.decimal << 32;
   1039     first_x_src |= pscaler_config->fSrcStartColumn.fraction;
   1040 
   1041     first_xi = first_x_src = 0;
   1042 
   1043     weight_reciprocal = ((uint64) 1 << 32);
   1044     weight_reciprocal /= (x_factor_inv >> 24) * (y_factor_inv >> 24);
   1045 
   1046     outp = (pscaler_config->pOutBuf) + (first_xi * 3);
   1047 
   1048     // PC - Assume pSrcBuf is already aligned to "true" base of input,
   1049     // so ignore whole-number part of first_y_src.
   1050     first_y_src = first_y_src & 0xffffffff;
   1051 
   1052     for (r = 0; r < y_output_width; r++) {
   1053         uint8 *inp = (pscaler_config->pSrcBuf) +
   1054                 (first_y_src >> 32) * input_pixel_ptr_offset;
   1055         {
   1056             if (scaleMode == PSCALER_SCALE_UP) {
   1057                 _scale_row_up(inp, inp + input_pixel_ptr_offset, outp,
   1058                         (first_y_src & 0xffffffff) >> 22, first_x_src,
   1059                         x_factor_inv, x_output_width);
   1060             } else {
   1061                 _scale_row_down(inp, outp, input_pixel_ptr_offset,
   1062                         first_x_src, first_y_src, x_factor_inv, y_factor_inv,
   1063                         weight_reciprocal, x_output_width);
   1064             }
   1065         }
   1066         first_y_src += y_factor_inv;
   1067         outp += output_pixel_ptr_offset;
   1068     }
   1069 }