Home | History | Annotate | Download | only in common
      1 /*
      2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
      3  *
      4  * This source code is subject to the terms of the BSD 2 Clause License and
      5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6  * was not distributed with this source code in the LICENSE file, you can
      7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8  * Media Patent License 1.0 was not distributed with this source code in the
      9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10  */
     11 
     12 #include <stdio.h>
     13 #include <stdlib.h>
     14 #include <memory.h>
     15 #include <math.h>
     16 #include <assert.h>
     17 
     18 #include "config/av1_rtcd.h"
     19 
     20 #include "av1/common/warped_motion.h"
     21 #include "av1/common/scale.h"
     22 
     23 #define WARP_ERROR_BLOCK 32
     24 
/* clang-format off */
// Perceptual error-weighting LUT, symmetric about index 255 (zero error).
// Entry [255 + e] approximates 16384 * (|e|/256)^0.7 (the "pow 0.7" curve),
// e.g. entry 256 is 339 ~= 16384 / 256^0.7.  highbd_error_measure() reads
// entries 255..511 via abs(err) and linearly interpolates between adjacent
// entries for bit depths above 8; the duplicated trailing 16384 lets that
// interpolation read one entry past the maximum index without overflow.
static const int error_measure_lut[512] = {
  // pow 0.7
  16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
  16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
  15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
  15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
  14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
  14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
  14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
  13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
  13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
  12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
  12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
  12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
  11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
  11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
  10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
  10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
  10058, 10002,  9947,  9891,  9835,  9779,  9723,  9666,
  9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
  9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
  8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
  8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
  7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
  7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
  6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
  6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
  5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
  5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
  4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
  3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
  3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
  2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
  1323, 1187, 1045,  894,  731,  550,  339,    0,
  339,  550,  731,  894, 1045, 1187, 1323, 1452,
  1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
  2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
  3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
  3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
  4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
  5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
  5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
  6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
  6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
  7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
  7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
  8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
  8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
  9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
  9723,  9779,  9835,  9891,  9947, 10002, 10058, 10113,
  10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
  10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
  11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
  11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
  11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
  12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
  12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
  13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
  13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
  13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
  14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
  14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
  15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
  15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
  15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
  16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
};
/* clang-format on */
     94 
// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
// at a time. The zoom/rotation/shear in the model are applied to the
// "fractional" position of each pixel, which therefore varies within
// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
// We need an extra 2 taps to fit this in, for a total of 8 taps.
//
// Rows are indexed by (fractional position + WARPEDPIXEL_PREC_SHIFTS), so
// the three sections below cover the sub-ranges [-1, 0), [0, 1) and [1, 2).
// The final "dummy" row duplicates the last real row so that an index of
// exactly WARPEDPIXEL_PREC_SHIFTS * 3 (see the asserts in the warp kernels)
// stays in bounds.  All rows sum to 128, i.e. unity gain at FILTER_BITS
// precision -- NOTE(review): inferred from inspection of the rows; confirm
// against the generator script if modifying.
/* clang-format off */
const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
#if WARPEDPIXEL_PREC_BITS == 6
  // [-1, 0)
  { 0,   0, 127,   1,   0, 0, 0, 0 }, { 0, - 1, 127,   2,   0, 0, 0, 0 },
  { 1, - 3, 127,   4, - 1, 0, 0, 0 }, { 1, - 4, 126,   6, - 2, 1, 0, 0 },
  { 1, - 5, 126,   8, - 3, 1, 0, 0 }, { 1, - 6, 125,  11, - 4, 1, 0, 0 },
  { 1, - 7, 124,  13, - 4, 1, 0, 0 }, { 2, - 8, 123,  15, - 5, 1, 0, 0 },
  { 2, - 9, 122,  18, - 6, 1, 0, 0 }, { 2, -10, 121,  20, - 6, 1, 0, 0 },
  { 2, -11, 120,  22, - 7, 2, 0, 0 }, { 2, -12, 119,  25, - 8, 2, 0, 0 },
  { 3, -13, 117,  27, - 8, 2, 0, 0 }, { 3, -13, 116,  29, - 9, 2, 0, 0 },
  { 3, -14, 114,  32, -10, 3, 0, 0 }, { 3, -15, 113,  35, -10, 2, 0, 0 },
  { 3, -15, 111,  37, -11, 3, 0, 0 }, { 3, -16, 109,  40, -11, 3, 0, 0 },
  { 3, -16, 108,  42, -12, 3, 0, 0 }, { 4, -17, 106,  45, -13, 3, 0, 0 },
  { 4, -17, 104,  47, -13, 3, 0, 0 }, { 4, -17, 102,  50, -14, 3, 0, 0 },
  { 4, -17, 100,  52, -14, 3, 0, 0 }, { 4, -18,  98,  55, -15, 4, 0, 0 },
  { 4, -18,  96,  58, -15, 3, 0, 0 }, { 4, -18,  94,  60, -16, 4, 0, 0 },
  { 4, -18,  91,  63, -16, 4, 0, 0 }, { 4, -18,  89,  65, -16, 4, 0, 0 },
  { 4, -18,  87,  68, -17, 4, 0, 0 }, { 4, -18,  85,  70, -17, 4, 0, 0 },
  { 4, -18,  82,  73, -17, 4, 0, 0 }, { 4, -18,  80,  75, -17, 4, 0, 0 },
  { 4, -18,  78,  78, -18, 4, 0, 0 }, { 4, -17,  75,  80, -18, 4, 0, 0 },
  { 4, -17,  73,  82, -18, 4, 0, 0 }, { 4, -17,  70,  85, -18, 4, 0, 0 },
  { 4, -17,  68,  87, -18, 4, 0, 0 }, { 4, -16,  65,  89, -18, 4, 0, 0 },
  { 4, -16,  63,  91, -18, 4, 0, 0 }, { 4, -16,  60,  94, -18, 4, 0, 0 },
  { 3, -15,  58,  96, -18, 4, 0, 0 }, { 4, -15,  55,  98, -18, 4, 0, 0 },
  { 3, -14,  52, 100, -17, 4, 0, 0 }, { 3, -14,  50, 102, -17, 4, 0, 0 },
  { 3, -13,  47, 104, -17, 4, 0, 0 }, { 3, -13,  45, 106, -17, 4, 0, 0 },
  { 3, -12,  42, 108, -16, 3, 0, 0 }, { 3, -11,  40, 109, -16, 3, 0, 0 },
  { 3, -11,  37, 111, -15, 3, 0, 0 }, { 2, -10,  35, 113, -15, 3, 0, 0 },
  { 3, -10,  32, 114, -14, 3, 0, 0 }, { 2, - 9,  29, 116, -13, 3, 0, 0 },
  { 2, - 8,  27, 117, -13, 3, 0, 0 }, { 2, - 8,  25, 119, -12, 2, 0, 0 },
  { 2, - 7,  22, 120, -11, 2, 0, 0 }, { 1, - 6,  20, 121, -10, 2, 0, 0 },
  { 1, - 6,  18, 122, - 9, 2, 0, 0 }, { 1, - 5,  15, 123, - 8, 2, 0, 0 },
  { 1, - 4,  13, 124, - 7, 1, 0, 0 }, { 1, - 4,  11, 125, - 6, 1, 0, 0 },
  { 1, - 3,   8, 126, - 5, 1, 0, 0 }, { 1, - 2,   6, 126, - 4, 1, 0, 0 },
  { 0, - 1,   4, 127, - 3, 1, 0, 0 }, { 0,   0,   2, 127, - 1, 0, 0, 0 },

  // [0, 1)
  { 0,  0,   0, 127,   1,   0,  0,  0}, { 0,  0,  -1, 127,   2,   0,  0,  0},
  { 0,  1,  -3, 127,   4,  -2,  1,  0}, { 0,  1,  -5, 127,   6,  -2,  1,  0},
  { 0,  2,  -6, 126,   8,  -3,  1,  0}, {-1,  2,  -7, 126,  11,  -4,  2, -1},
  {-1,  3,  -8, 125,  13,  -5,  2, -1}, {-1,  3, -10, 124,  16,  -6,  3, -1},
  {-1,  4, -11, 123,  18,  -7,  3, -1}, {-1,  4, -12, 122,  20,  -7,  3, -1},
  {-1,  4, -13, 121,  23,  -8,  3, -1}, {-2,  5, -14, 120,  25,  -9,  4, -1},
  {-1,  5, -15, 119,  27, -10,  4, -1}, {-1,  5, -16, 118,  30, -11,  4, -1},
  {-2,  6, -17, 116,  33, -12,  5, -1}, {-2,  6, -17, 114,  35, -12,  5, -1},
  {-2,  6, -18, 113,  38, -13,  5, -1}, {-2,  7, -19, 111,  41, -14,  6, -2},
  {-2,  7, -19, 110,  43, -15,  6, -2}, {-2,  7, -20, 108,  46, -15,  6, -2},
  {-2,  7, -20, 106,  49, -16,  6, -2}, {-2,  7, -21, 104,  51, -16,  7, -2},
  {-2,  7, -21, 102,  54, -17,  7, -2}, {-2,  8, -21, 100,  56, -18,  7, -2},
  {-2,  8, -22,  98,  59, -18,  7, -2}, {-2,  8, -22,  96,  62, -19,  7, -2},
  {-2,  8, -22,  94,  64, -19,  7, -2}, {-2,  8, -22,  91,  67, -20,  8, -2},
  {-2,  8, -22,  89,  69, -20,  8, -2}, {-2,  8, -22,  87,  72, -21,  8, -2},
  {-2,  8, -21,  84,  74, -21,  8, -2}, {-2,  8, -22,  82,  77, -21,  8, -2},
  {-2,  8, -21,  79,  79, -21,  8, -2}, {-2,  8, -21,  77,  82, -22,  8, -2},
  {-2,  8, -21,  74,  84, -21,  8, -2}, {-2,  8, -21,  72,  87, -22,  8, -2},
  {-2,  8, -20,  69,  89, -22,  8, -2}, {-2,  8, -20,  67,  91, -22,  8, -2},
  {-2,  7, -19,  64,  94, -22,  8, -2}, {-2,  7, -19,  62,  96, -22,  8, -2},
  {-2,  7, -18,  59,  98, -22,  8, -2}, {-2,  7, -18,  56, 100, -21,  8, -2},
  {-2,  7, -17,  54, 102, -21,  7, -2}, {-2,  7, -16,  51, 104, -21,  7, -2},
  {-2,  6, -16,  49, 106, -20,  7, -2}, {-2,  6, -15,  46, 108, -20,  7, -2},
  {-2,  6, -15,  43, 110, -19,  7, -2}, {-2,  6, -14,  41, 111, -19,  7, -2},
  {-1,  5, -13,  38, 113, -18,  6, -2}, {-1,  5, -12,  35, 114, -17,  6, -2},
  {-1,  5, -12,  33, 116, -17,  6, -2}, {-1,  4, -11,  30, 118, -16,  5, -1},
  {-1,  4, -10,  27, 119, -15,  5, -1}, {-1,  4,  -9,  25, 120, -14,  5, -2},
  {-1,  3,  -8,  23, 121, -13,  4, -1}, {-1,  3,  -7,  20, 122, -12,  4, -1},
  {-1,  3,  -7,  18, 123, -11,  4, -1}, {-1,  3,  -6,  16, 124, -10,  3, -1},
  {-1,  2,  -5,  13, 125,  -8,  3, -1}, {-1,  2,  -4,  11, 126,  -7,  2, -1},
  { 0,  1,  -3,   8, 126,  -6,  2,  0}, { 0,  1,  -2,   6, 127,  -5,  1,  0},
  { 0,  1,  -2,   4, 127,  -3,  1,  0}, { 0,  0,   0,   2, 127,  -1,  0,  0},

  // [1, 2)
  { 0, 0, 0,   1, 127,   0,   0, 0 }, { 0, 0, 0, - 1, 127,   2,   0, 0 },
  { 0, 0, 1, - 3, 127,   4, - 1, 0 }, { 0, 0, 1, - 4, 126,   6, - 2, 1 },
  { 0, 0, 1, - 5, 126,   8, - 3, 1 }, { 0, 0, 1, - 6, 125,  11, - 4, 1 },
  { 0, 0, 1, - 7, 124,  13, - 4, 1 }, { 0, 0, 2, - 8, 123,  15, - 5, 1 },
  { 0, 0, 2, - 9, 122,  18, - 6, 1 }, { 0, 0, 2, -10, 121,  20, - 6, 1 },
  { 0, 0, 2, -11, 120,  22, - 7, 2 }, { 0, 0, 2, -12, 119,  25, - 8, 2 },
  { 0, 0, 3, -13, 117,  27, - 8, 2 }, { 0, 0, 3, -13, 116,  29, - 9, 2 },
  { 0, 0, 3, -14, 114,  32, -10, 3 }, { 0, 0, 3, -15, 113,  35, -10, 2 },
  { 0, 0, 3, -15, 111,  37, -11, 3 }, { 0, 0, 3, -16, 109,  40, -11, 3 },
  { 0, 0, 3, -16, 108,  42, -12, 3 }, { 0, 0, 4, -17, 106,  45, -13, 3 },
  { 0, 0, 4, -17, 104,  47, -13, 3 }, { 0, 0, 4, -17, 102,  50, -14, 3 },
  { 0, 0, 4, -17, 100,  52, -14, 3 }, { 0, 0, 4, -18,  98,  55, -15, 4 },
  { 0, 0, 4, -18,  96,  58, -15, 3 }, { 0, 0, 4, -18,  94,  60, -16, 4 },
  { 0, 0, 4, -18,  91,  63, -16, 4 }, { 0, 0, 4, -18,  89,  65, -16, 4 },
  { 0, 0, 4, -18,  87,  68, -17, 4 }, { 0, 0, 4, -18,  85,  70, -17, 4 },
  { 0, 0, 4, -18,  82,  73, -17, 4 }, { 0, 0, 4, -18,  80,  75, -17, 4 },
  { 0, 0, 4, -18,  78,  78, -18, 4 }, { 0, 0, 4, -17,  75,  80, -18, 4 },
  { 0, 0, 4, -17,  73,  82, -18, 4 }, { 0, 0, 4, -17,  70,  85, -18, 4 },
  { 0, 0, 4, -17,  68,  87, -18, 4 }, { 0, 0, 4, -16,  65,  89, -18, 4 },
  { 0, 0, 4, -16,  63,  91, -18, 4 }, { 0, 0, 4, -16,  60,  94, -18, 4 },
  { 0, 0, 3, -15,  58,  96, -18, 4 }, { 0, 0, 4, -15,  55,  98, -18, 4 },
  { 0, 0, 3, -14,  52, 100, -17, 4 }, { 0, 0, 3, -14,  50, 102, -17, 4 },
  { 0, 0, 3, -13,  47, 104, -17, 4 }, { 0, 0, 3, -13,  45, 106, -17, 4 },
  { 0, 0, 3, -12,  42, 108, -16, 3 }, { 0, 0, 3, -11,  40, 109, -16, 3 },
  { 0, 0, 3, -11,  37, 111, -15, 3 }, { 0, 0, 2, -10,  35, 113, -15, 3 },
  { 0, 0, 3, -10,  32, 114, -14, 3 }, { 0, 0, 2, - 9,  29, 116, -13, 3 },
  { 0, 0, 2, - 8,  27, 117, -13, 3 }, { 0, 0, 2, - 8,  25, 119, -12, 2 },
  { 0, 0, 2, - 7,  22, 120, -11, 2 }, { 0, 0, 1, - 6,  20, 121, -10, 2 },
  { 0, 0, 1, - 6,  18, 122, - 9, 2 }, { 0, 0, 1, - 5,  15, 123, - 8, 2 },
  { 0, 0, 1, - 4,  13, 124, - 7, 1 }, { 0, 0, 1, - 4,  11, 125, - 6, 1 },
  { 0, 0, 1, - 3,   8, 126, - 5, 1 }, { 0, 0, 1, - 2,   6, 126, - 4, 1 },
  { 0, 0, 0, - 1,   4, 127, - 3, 1 }, { 0, 0, 0,   0,   2, 127, - 1, 0 },
  // dummy (replicate row index 191)
  { 0, 0, 0,   0,   2, 127, - 1, 0 },

#elif WARPEDPIXEL_PREC_BITS == 5
  // [-1, 0)
  {0,   0, 127,   1,   0, 0, 0, 0}, {1,  -3, 127,   4,  -1, 0, 0, 0},
  {1,  -5, 126,   8,  -3, 1, 0, 0}, {1,  -7, 124,  13,  -4, 1, 0, 0},
  {2,  -9, 122,  18,  -6, 1, 0, 0}, {2, -11, 120,  22,  -7, 2, 0, 0},
  {3, -13, 117,  27,  -8, 2, 0, 0}, {3, -14, 114,  32, -10, 3, 0, 0},
  {3, -15, 111,  37, -11, 3, 0, 0}, {3, -16, 108,  42, -12, 3, 0, 0},
  {4, -17, 104,  47, -13, 3, 0, 0}, {4, -17, 100,  52, -14, 3, 0, 0},
  {4, -18,  96,  58, -15, 3, 0, 0}, {4, -18,  91,  63, -16, 4, 0, 0},
  {4, -18,  87,  68, -17, 4, 0, 0}, {4, -18,  82,  73, -17, 4, 0, 0},
  {4, -18,  78,  78, -18, 4, 0, 0}, {4, -17,  73,  82, -18, 4, 0, 0},
  {4, -17,  68,  87, -18, 4, 0, 0}, {4, -16,  63,  91, -18, 4, 0, 0},
  {3, -15,  58,  96, -18, 4, 0, 0}, {3, -14,  52, 100, -17, 4, 0, 0},
  {3, -13,  47, 104, -17, 4, 0, 0}, {3, -12,  42, 108, -16, 3, 0, 0},
  {3, -11,  37, 111, -15, 3, 0, 0}, {3, -10,  32, 114, -14, 3, 0, 0},
  {2,  -8,  27, 117, -13, 3, 0, 0}, {2,  -7,  22, 120, -11, 2, 0, 0},
  {1,  -6,  18, 122,  -9, 2, 0, 0}, {1,  -4,  13, 124,  -7, 1, 0, 0},
  {1,  -3,   8, 126,  -5, 1, 0, 0}, {0,  -1,   4, 127,  -3, 1, 0, 0},
  // [0, 1)
  { 0,  0,   0, 127,   1,   0,   0,  0}, { 0,  1,  -3, 127,   4,  -2,   1,  0},
  { 0,  2,  -6, 126,   8,  -3,   1,  0}, {-1,  3,  -8, 125,  13,  -5,   2, -1},
  {-1,  4, -11, 123,  18,  -7,   3, -1}, {-1,  4, -13, 121,  23,  -8,   3, -1},
  {-1,  5, -15, 119,  27, -10,   4, -1}, {-2,  6, -17, 116,  33, -12,   5, -1},
  {-2,  6, -18, 113,  38, -13,   5, -1}, {-2,  7, -19, 110,  43, -15,   6, -2},
  {-2,  7, -20, 106,  49, -16,   6, -2}, {-2,  7, -21, 102,  54, -17,   7, -2},
  {-2,  8, -22,  98,  59, -18,   7, -2}, {-2,  8, -22,  94,  64, -19,   7, -2},
  {-2,  8, -22,  89,  69, -20,   8, -2}, {-2,  8, -21,  84,  74, -21,   8, -2},
  {-2,  8, -21,  79,  79, -21,   8, -2}, {-2,  8, -21,  74,  84, -21,   8, -2},
  {-2,  8, -20,  69,  89, -22,   8, -2}, {-2,  7, -19,  64,  94, -22,   8, -2},
  {-2,  7, -18,  59,  98, -22,   8, -2}, {-2,  7, -17,  54, 102, -21,   7, -2},
  {-2,  6, -16,  49, 106, -20,   7, -2}, {-2,  6, -15,  43, 110, -19,   7, -2},
  {-1,  5, -13,  38, 113, -18,   6, -2}, {-1,  5, -12,  33, 116, -17,   6, -2},
  {-1,  4, -10,  27, 119, -15,   5, -1}, {-1,  3,  -8,  23, 121, -13,   4, -1},
  {-1,  3,  -7,  18, 123, -11,   4, -1}, {-1,  2,  -5,  13, 125,  -8,   3, -1},
  { 0,  1,  -3,   8, 126,  -6,   2,  0}, { 0,  1,  -2,   4, 127,  -3,   1,  0},
  // [1, 2)
  {0, 0, 0,   1, 127,   0,   0, 0}, {0, 0, 1,  -3, 127,   4,  -1, 0},
  {0, 0, 1,  -5, 126,   8,  -3, 1}, {0, 0, 1,  -7, 124,  13,  -4, 1},
  {0, 0, 2,  -9, 122,  18,  -6, 1}, {0, 0, 2, -11, 120,  22,  -7, 2},
  {0, 0, 3, -13, 117,  27,  -8, 2}, {0, 0, 3, -14, 114,  32, -10, 3},
  {0, 0, 3, -15, 111,  37, -11, 3}, {0, 0, 3, -16, 108,  42, -12, 3},
  {0, 0, 4, -17, 104,  47, -13, 3}, {0, 0, 4, -17, 100,  52, -14, 3},
  {0, 0, 4, -18,  96,  58, -15, 3}, {0, 0, 4, -18,  91,  63, -16, 4},
  {0, 0, 4, -18,  87,  68, -17, 4}, {0, 0, 4, -18,  82,  73, -17, 4},
  {0, 0, 4, -18,  78,  78, -18, 4}, {0, 0, 4, -17,  73,  82, -18, 4},
  {0, 0, 4, -17,  68,  87, -18, 4}, {0, 0, 4, -16,  63,  91, -18, 4},
  {0, 0, 3, -15,  58,  96, -18, 4}, {0, 0, 3, -14,  52, 100, -17, 4},
  {0, 0, 3, -13,  47, 104, -17, 4}, {0, 0, 3, -12,  42, 108, -16, 3},
  {0, 0, 3, -11,  37, 111, -15, 3}, {0, 0, 3, -10,  32, 114, -14, 3},
  {0, 0, 2,  -8,  27, 117, -13, 3}, {0, 0, 2,  -7,  22, 120, -11, 2},
  {0, 0, 1,  -6,  18, 122,  -9, 2}, {0, 0, 1,  -4,  13, 124,  -7, 1},
  {0, 0, 1,  -3,   8, 126,  -5, 1}, {0, 0, 0,  -1,   4, 127,  -3, 1},
  // dummy (replicate row index 95)
  {0, 0, 0,  -1,   4, 127,  -3, 1},

#endif  // WARPEDPIXEL_PREC_BITS == 6
};

/* clang-format on */
    266 
#define DIV_LUT_PREC_BITS 14
#define DIV_LUT_BITS 8
#define DIV_LUT_NUM (1 << DIV_LUT_BITS)

// Reciprocal lookup table used by resolve_divisor_32/64():
//   div_lut[f] ~= 2^DIV_LUT_PREC_BITS * 256 / (256 + f),  f in [0, 256].
// e.g. div_lut[0] = 16384 = 2^14 and div_lut[256] = 8192 = 2^13, so a
// divisor with mantissa bits f is inverted as div_lut[f] >> shift.
static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
  16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
  15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
  15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
  14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
  13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
  13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
  13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
  12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
  12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
  11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
  11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
  11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
  10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
  10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
  10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
  9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
  9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,
  9468,  9447,  9425,  9404,  9383,  9362,  9341,  9321,  9300,  9279,  9259,
  9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,  9039,
  9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,
  8812,  8793,  8775,  8756,  8738,  8720,  8702,  8684,  8666,  8648,  8630,
  8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,  8456,  8439,
  8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,
  8240,  8224,  8208,  8192,
};
    297 
// Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
// at precision of DIV_LUT_PREC_BITS along with the shift.
static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
  int64_t f;
  // Index of the most significant set bit of D; get_msb() takes 32-bit
  // input, so split D into halves.  D must be non-zero.
  *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
                               : get_msb((unsigned int)D));
  // e is obtained from D after resetting the most significant 1 bit.
  const int64_t e = D - ((uint64_t)1 << *shift);
  // Get the most significant DIV_LUT_BITS (8) bits of e into f
  if (*shift > DIV_LUT_BITS)
    f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
  else
    f = e << (DIV_LUT_BITS - *shift);
  assert(f <= DIV_LUT_NUM);
  // Total shift combines the divisor magnitude with the LUT precision.
  *shift += DIV_LUT_PREC_BITS;
  // Use f as lookup into the precomputed table of multipliers
  return div_lut[f];
}
    316 
    317 static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
    318   int32_t f;
    319   *shift = get_msb(D);
    320   // e is obtained from D after resetting the most significant 1 bit.
    321   const int32_t e = D - ((uint32_t)1 << *shift);
    322   // Get the most significant DIV_LUT_BITS (8) bits of e into f
    323   if (*shift > DIV_LUT_BITS)
    324     f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
    325   else
    326     f = e << (DIV_LUT_BITS - *shift);
    327   assert(f <= DIV_LUT_NUM);
    328   *shift += DIV_LUT_PREC_BITS;
    329   // Use f as lookup into the precomputed table of multipliers
    330   return div_lut[f];
    331 }
    332 
    333 static int is_affine_valid(const WarpedMotionParams *const wm) {
    334   const int32_t *mat = wm->wmmat;
    335   return (mat[2] > 0);
    336 }
    337 
    338 static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
    339                                    int16_t delta) {
    340   if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
    341       (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
    342     return 0;
    343   else
    344     return 1;
    345 }
    346 
// Returns 1 on success or 0 on an invalid affine set
//
// Derives the shear parameters (alpha, beta, gamma, delta) from the affine
// matrix wmmat[2..5].  These express the warp as a horizontal shear followed
// by a vertical shear, which is how the SIMD/hardware warp filters consume
// it.  The results are clamped to int16 range and quantized to multiples of
// (1 << WARP_PARAM_REDUCE_BITS).
int get_shear_params(WarpedMotionParams *wm) {
  const int32_t *mat = wm->wmmat;
  if (!is_affine_valid(wm)) return 0;
  // alpha/beta come directly from the first matrix row, expressed as
  // deviations from the identity transform.
  wm->alpha =
      clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
  wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
  int16_t shift;
  // y/2^shift ~= 1/|mat[2]|, with the sign of mat[2] folded back in.
  int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
  // gamma = mat[4] / mat[2], evaluated in fixed point via the reciprocal.
  int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
  wm->gamma =
      clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
  // delta = mat[5] - mat[3] * mat[4] / mat[2] - identity.
  v = ((int64_t)mat[3] * mat[4]) * y;
  wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
                        (1 << WARPEDMODEL_PREC_BITS),
                    INT16_MIN, INT16_MAX);

  // Round each parameter to the reduced precision used by the warp filters.
  wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
             (1 << WARP_PARAM_REDUCE_BITS);
  wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);

  if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
    return 0;

  return 1;
}
    378 
    379 static INLINE int highbd_error_measure(int err, int bd) {
    380   const int b = bd - 8;
    381   const int bmask = (1 << b) - 1;
    382   const int v = (1 << b);
    383   err = abs(err);
    384   const int e1 = err >> b;
    385   const int e2 = err & bmask;
    386   return error_measure_lut[255 + e1] * (v - e2) +
    387          error_measure_lut[256 + e1] * e2;
    388 }
    389 
/* Note: For an explanation of the warp algorithm, and some notes on bit widths
    for hardware implementations, see the comments above av1_warp_affine_c
*/
// High-bitdepth reference implementation of the affine warp filter.
// For each 8x8 output block it computes the affine source position of the
// block centre, then applies a separable 8-tap horizontal + vertical filter
// whose per-pixel fractional offsets drift by (alpha, beta, gamma, delta).
// Output goes either to pred (single prediction) or, when
// conv_params->is_compound, to conv_params->dst with optional averaging.
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
                              int width, int height, int stride, uint16_t *pred,
                              int p_col, int p_row, int p_width, int p_height,
                              int p_stride, int subsampling_x,
                              int subsampling_y, int bd,
                              ConvolveParams *conv_params, int16_t alpha,
                              int16_t beta, int16_t gamma, int16_t delta) {
  // Intermediate horizontal-filter results: 8 output rows need 15 input
  // rows (8 rows plus 7 extra for the vertical filter's taps).
  int32_t tmp[15 * 8];
  const int reduce_bits_horiz =
      conv_params->round_0 +
      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  for (int i = p_row; i < p_row + p_height; i += 8) {
    for (int j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      // Split source position into integer and fractional parts.
      const int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      const int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Step back from the block centre to the top-left filter position.
      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      // Quantize the fractional offsets to the reduced warp precision.
      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (int k = -7; k < 8; ++k) {
        // Clamping iy (and sample_x below) replicates frame-edge pixels.
        const int iy = clamp(iy4 + k, 0, height - 1);

        int sx = sx4 + beta * (k + 4);
        for (int l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = warped_filter[offs];

          // Offset keeps the intermediate sum non-negative.
          int32_t sum = 1 << offset_bits_horiz;
          for (int m = 0; m < 8; ++m) {
            const int sample_x = clamp(ix + m, 0, width - 1);
            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (int m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }

          if (conv_params->is_compound) {
            // Compound path: write into the convolve buffer, optionally
            // (distance-weighted) averaging with the existing prediction.
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            if (conv_params->do_average) {
              uint16_t *dst16 =
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
              int32_t tmp32 = *p;
              if (conv_params->use_dist_wtd_comp_avg) {
                tmp32 = tmp32 * conv_params->fwd_offset +
                        sum * conv_params->bck_offset;
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
              } else {
                tmp32 += sum;
                tmp32 = tmp32 >> 1;
              }
              // Remove the rounding offsets added in both filter passes.
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
                      (1 << (offset_bits - conv_params->round_1 - 1));
              *dst16 =
                  clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
            } else {
              *p = sum;
            }
          } else {
            // Single-prediction path: round, remove offsets, clip to bd.
            uint16_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
          }
          sy += gamma;
        }
      }
    }
  }
}
    516 
    517 static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8,
    518                               int width, int height, int stride,
    519                               const uint8_t *const pred8, int p_col, int p_row,
    520                               int p_width, int p_height, int p_stride,
    521                               int subsampling_x, int subsampling_y, int bd,
    522                               ConvolveParams *conv_params) {
    523   assert(wm->wmtype <= AFFINE);
    524   if (wm->wmtype == ROTZOOM) {
    525     wm->wmmat[5] = wm->wmmat[2];
    526     wm->wmmat[4] = -wm->wmmat[3];
    527   }
    528   const int32_t *const mat = wm->wmmat;
    529   const int16_t alpha = wm->alpha;
    530   const int16_t beta = wm->beta;
    531   const int16_t gamma = wm->gamma;
    532   const int16_t delta = wm->delta;
    533 
    534   const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8);
    535   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
    536   av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
    537                          p_width, p_height, p_stride, subsampling_x,
    538                          subsampling_y, bd, conv_params, alpha, beta, gamma,
    539                          delta);
    540 }
    541 
    542 static int64_t highbd_frame_error(const uint16_t *const ref, int stride,
    543                                   const uint16_t *const dst, int p_width,
    544                                   int p_height, int p_stride, int bd) {
    545   int64_t sum_error = 0;
    546   for (int i = 0; i < p_height; ++i) {
    547     for (int j = 0; j < p_width; ++j) {
    548       sum_error +=
    549           highbd_error_measure(dst[j + i * p_stride] - ref[j + i * stride], bd);
    550     }
    551   }
    552   return sum_error;
    553 }
    554 
    555 static int64_t highbd_warp_error(
    556     WarpedMotionParams *wm, const uint8_t *const ref8, int width, int height,
    557     int stride, const uint8_t *const dst8, int p_col, int p_row, int p_width,
    558     int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd,
    559     int64_t best_error) {
    560   int64_t gm_sumerr = 0;
    561   const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
    562   const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
    563   uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
    564 
    565   ConvolveParams conv_params = get_conv_params(0, 0, bd);
    566   conv_params.use_dist_wtd_comp_avg = 0;
    567   for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
    568     for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
    569       // avoid warping extra 8x8 blocks in the padded region of the frame
    570       // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
    571       const int warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
    572       const int warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
    573       highbd_warp_plane(wm, ref8, width, height, stride,
    574                         CONVERT_TO_BYTEPTR(tmp), j, i, warp_w, warp_h,
    575                         WARP_ERROR_BLOCK, subsampling_x, subsampling_y, bd,
    576                         &conv_params);
    577 
    578       gm_sumerr += highbd_frame_error(
    579           tmp, WARP_ERROR_BLOCK, CONVERT_TO_SHORTPTR(dst8) + j + i * p_stride,
    580           warp_w, warp_h, p_stride, bd);
    581       if (gm_sumerr > best_error) return gm_sumerr;
    582     }
    583   }
    584   return gm_sumerr;
    585 }
    586 
    587 static INLINE int error_measure(int err) {
    588   return error_measure_lut[255 + err];
    589 }
    590 
    591 /* The warp filter for ROTZOOM and AFFINE models works as follows:
    592    * Split the input into 8x8 blocks
    593    * For each block, project the point (4, 4) within the block, to get the
    594      overall block position. Split into integer and fractional coordinates,
    595      maintaining full WARPEDMODEL precision
    596    * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
    597      variable horizontal offset. This means that, while the rows of the
    598      intermediate buffer align with the rows of the *reference* image, the
    599      columns align with the columns of the *destination* image.
    600    * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
    601      destination is too small we crop the output at this stage). Each pixel has
    602      a variable vertical offset, so that the resulting rows are aligned with
    603      the rows of the destination image.
    604 
    605    To accomplish these alignments, we factor the warp matrix as a
    606    product of two shear / asymmetric zoom matrices:
    607    / a b \  = /   1       0    \ * / 1+alpha  beta \
    608    \ c d /    \ gamma  1+delta /   \    0      1   /
    609    where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
    610    The horizontal shear (with alpha and beta) is applied first,
    611    then the vertical shear (with gamma and delta) is applied second.
    612 
    613    The only limitation is that, to fit this in a fixed 8-tap filter size,
    614    the fractional pixel offsets must be at most +-1. Since the horizontal filter
    615    generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
    616    within the block, the parameters must satisfy
    617    4 * |alpha| + 7 * |beta| <= 1   and   4 * |gamma| + 4 * |delta| <= 1
    618    for this filter to be applicable.
    619 
   Note: This function assumes that the caller has done all of the relevant
   checks, i.e. that we have a ROTZOOM or AFFINE model, that wmmat[4] and
   wmmat[5] are set appropriately (if using a ROTZOOM model), and that alpha,
   beta, gamma, delta are all in range.
    624 
    625    TODO(david.barker): Maybe support scaled references?
    626 */
    627 /* A note on hardware implementation:
    628     The warp filter is intended to be implementable using the same hardware as
    629     the high-precision convolve filters from the loop-restoration and
    630     convolve-round experiments.
    631 
    632     For a single filter stage, considering all of the coefficient sets for the
    633     warp filter and the regular convolution filter, an input in the range
    634     [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
    635     before rounding.
    636 
    637     Allowing for some changes to the filter coefficient sets, call the range
    638     [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
    639     we can replace this by the range [0, 256 * 2^k], which can be stored in an
    640     unsigned value with 8 + k bits.
    641 
    642     This allows the derivation of the appropriate bit widths and offsets for
    643     the various intermediate values: If
    644 
    645     F := FILTER_BITS = 7 (or else the above ranges need adjusting)
    646          So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
    647          intermediate value.
    648     H := ROUND0_BITS
    649     V := VERSHEAR_REDUCE_PREC_BITS
    650     (and note that we must have H + V = 2*F for the output to have the same
    651      scale as the input)
    652 
    653     then we end up with the following offsets and ranges:
    654     Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
    655                        uint{bd + F + 1}
    656     After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
    657     Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
    658                      uint{bd + 2*F + 2 - H}
    659     After rounding: The final value, before undoing the offset, fits into a
    660                     uint{bd + 2}.
    661 
    662     Then we need to undo the offsets before clamping to a pixel. Note that,
    663     if we do this at the end, the amount to subtract is actually independent
    664     of H and V:
    665 
    666     offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
    667                          (1 << ((bd + 2*F - H) - V))
    668                       == (1 << (bd - 1)) + (1 << bd)
    669 
    670     This allows us to entirely avoid clamping in both the warp filter and
    671     the convolve-round experiment. As of the time of writing, the Wiener filter
    672     from loop-restoration can encode a central coefficient up to 216, which
    673     leads to a maximum value of about 282 * 2^k after applying the offset.
    674     So in that case we still need to clamp.
    675 */
// C reference implementation of the warp filter for 8-bit planes. See the
// long comment above for the shear decomposition and the bit-width analysis
// that the offsets and rounding shifts below rely on.
//
// mat:          warp parameters wmmat[0..5] in WARPEDMODEL precision.
// ref/width/height/stride:              source plane.
// pred/p_col/p_row/p_width/p_height/p_stride: destination region.
// alpha, beta:  horizontal shear parameters (see comment above).
// gamma, delta: vertical shear parameters (see comment above).
void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
                       int height, int stride, uint8_t *pred, int p_col,
                       int p_row, int p_width, int p_height, int p_stride,
                       int subsampling_x, int subsampling_y,
                       ConvolveParams *conv_params, int16_t alpha, int16_t beta,
                       int16_t gamma, int16_t delta) {
  // Intermediate buffer: 15 rows of 8 horizontally-filtered samples per block.
  int32_t tmp[15 * 8];
  const int bd = 8;
  const int reduce_bits_horiz = conv_params->round_0;
  // Compound prediction rounds by round_1; single prediction rounds by the
  // remainder so that H + V = 2 * FILTER_BITS (output at input scale).
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;  // only used in the assert below
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  for (int i = p_row; i < p_row + p_height; i += 8) {
    for (int j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      // Split the projected position into integer pixel (ix4, iy4) and
      // fractional (sx4, sy4) parts, in WARPEDMODEL precision.
      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Shift the fractional offsets back to the block's top-left corner
      // (the projection above was taken at the block center, (4, 4)).
      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      // Drop the low bits of the fractional offsets.
      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (int k = -7; k < 8; ++k) {
        // Clamp to top/bottom edge of the frame
        const int iy = clamp(iy4 + k, 0, height - 1);

        int sx = sx4 + beta * (k + 4);

        for (int l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          // At this point, sx = sx4 + alpha * l + beta * k
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = warped_filter[offs];

          // Offset keeps the accumulator non-negative (see comment above).
          int32_t sum = 1 << offset_bits_horiz;
          for (int m = 0; m < 8; ++m) {
            // Clamp to left/right edge of the frame
            const int sample_x = clamp(ix + m, 0, width - 1);

            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter
      // Crop the loop bounds so we never write outside the destination block.
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          // At this point, sy = sy4 + gamma * l + delta * k
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (int m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }

          if (conv_params->is_compound) {
            // Compound path: write/average into the 16-bit conv buffer.
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            if (conv_params->do_average) {
              uint8_t *dst8 =
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
              int32_t tmp32 = *p;
              if (conv_params->use_dist_wtd_comp_avg) {
                // Distance-weighted average of the two predictions.
                tmp32 = tmp32 * conv_params->fwd_offset +
                        sum * conv_params->bck_offset;
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
              } else {
                tmp32 += sum;
                tmp32 = tmp32 >> 1;
              }
              // Undo the two accumulator offsets, then round to pixel scale.
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
                      (1 << (offset_bits - conv_params->round_1 - 1));
              *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
            } else {
              *p = sum;
            }
          } else {
            // Single-prediction path: round, undo the offsets
            // ((1 << (bd - 1)) + (1 << bd), see comment above), and clip.
            uint8_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
          }
          sy += gamma;
        }
      }
    }
  }
}
    803 
    804 static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref,
    805                        int width, int height, int stride, uint8_t *pred,
    806                        int p_col, int p_row, int p_width, int p_height,
    807                        int p_stride, int subsampling_x, int subsampling_y,
    808                        ConvolveParams *conv_params) {
    809   assert(wm->wmtype <= AFFINE);
    810   if (wm->wmtype == ROTZOOM) {
    811     wm->wmmat[5] = wm->wmmat[2];
    812     wm->wmmat[4] = -wm->wmmat[3];
    813   }
    814   const int32_t *const mat = wm->wmmat;
    815   const int16_t alpha = wm->alpha;
    816   const int16_t beta = wm->beta;
    817   const int16_t gamma = wm->gamma;
    818   const int16_t delta = wm->delta;
    819   av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width,
    820                   p_height, p_stride, subsampling_x, subsampling_y, conv_params,
    821                   alpha, beta, gamma, delta);
    822 }
    823 
    824 static int64_t frame_error(const uint8_t *const ref, int stride,
    825                            const uint8_t *const dst, int p_width, int p_height,
    826                            int p_stride) {
    827   int64_t sum_error = 0;
    828   for (int i = 0; i < p_height; ++i) {
    829     for (int j = 0; j < p_width; ++j) {
    830       sum_error +=
    831           (int64_t)error_measure(dst[j + i * p_stride] - ref[j + i * stride]);
    832     }
    833   }
    834   return sum_error;
    835 }
    836 
    837 static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
    838                           int width, int height, int stride,
    839                           const uint8_t *const dst, int p_col, int p_row,
    840                           int p_width, int p_height, int p_stride,
    841                           int subsampling_x, int subsampling_y,
    842                           int64_t best_error) {
    843   int64_t gm_sumerr = 0;
    844   int warp_w, warp_h;
    845   int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
    846   int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
    847   uint8_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
    848   ConvolveParams conv_params = get_conv_params(0, 0, 8);
    849   conv_params.use_dist_wtd_comp_avg = 0;
    850 
    851   for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
    852     for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
    853       // avoid warping extra 8x8 blocks in the padded region of the frame
    854       // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
    855       warp_w = AOMMIN(error_bsize_w, p_col + p_width - j);
    856       warp_h = AOMMIN(error_bsize_h, p_row + p_height - i);
    857       warp_plane(wm, ref, width, height, stride, tmp, j, i, warp_w, warp_h,
    858                  WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params);
    859 
    860       gm_sumerr += frame_error(tmp, WARP_ERROR_BLOCK, dst + j + i * p_stride,
    861                                warp_w, warp_h, p_stride);
    862       if (gm_sumerr > best_error) return gm_sumerr;
    863     }
    864   }
    865   return gm_sumerr;
    866 }
    867 
    868 int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
    869                         uint8_t *dst, int p_width, int p_height, int p_stride) {
    870   if (use_hbd) {
    871     return highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
    872                               CONVERT_TO_SHORTPTR(dst), p_width, p_height,
    873                               p_stride, bd);
    874   }
    875   return frame_error(ref, stride, dst, p_width, p_height, p_stride);
    876 }
    877 
    878 int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
    879                        const uint8_t *ref, int width, int height, int stride,
    880                        uint8_t *dst, int p_col, int p_row, int p_width,
    881                        int p_height, int p_stride, int subsampling_x,
    882                        int subsampling_y, int64_t best_error) {
    883   if (wm->wmtype <= AFFINE)
    884     if (!get_shear_params(wm)) return 1;
    885   if (use_hbd)
    886     return highbd_warp_error(wm, ref, width, height, stride, dst, p_col, p_row,
    887                              p_width, p_height, p_stride, subsampling_x,
    888                              subsampling_y, bd, best_error);
    889   return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width,
    890                     p_height, p_stride, subsampling_x, subsampling_y,
    891                     best_error);
    892 }
    893 
    894 void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
    895                     const uint8_t *ref, int width, int height, int stride,
    896                     uint8_t *pred, int p_col, int p_row, int p_width,
    897                     int p_height, int p_stride, int subsampling_x,
    898                     int subsampling_y, ConvolveParams *conv_params) {
    899   if (use_hbd)
    900     highbd_warp_plane(wm, ref, width, height, stride, pred, p_col, p_row,
    901                       p_width, p_height, p_stride, subsampling_x, subsampling_y,
    902                       bd, conv_params);
    903   else
    904     warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
    905                p_height, p_stride, subsampling_x, subsampling_y, conv_params);
    906 }
    907 
// Constants and helper macros for the integer least-squares affine fit in
// find_affine_int() below.
#define LS_MV_MAX 256  // max mv magnitude accepted for a sample, in 1/8-pel
// Use LS_STEP = 8 so that 2 fewer bits are needed for A, Bx, By.
#define LS_STEP 8

// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
// the precision needed is:
//   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
//   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
//   1 [for sign] +
//   LEAST_SQUARES_SAMPLES_MAX_BITS
//        [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
// The value is 23
#define LS_MAT_RANGE_BITS \
  ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)

// Bit-depth reduction from the full range above.
#define LS_MAT_DOWN_BITS 2

// Bit range of the A, Bx and By accumulator entries after downshifting.
#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)

// Because LS_STEP = 8, the least 2 bits of every element of A, Bx and By are
// 0, so the products below can be shifted down by 2 extra bits (on top of
// the LS_MAT_DOWN_BITS reduction) without losing information.
#define LS_SQUARE(a)                                          \
  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
   (2 + LS_MAT_DOWN_BITS))
#define LS_PRODUCT1(a, b)                                           \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
   (2 + LS_MAT_DOWN_BITS))
#define LS_PRODUCT2(a, b)                                               \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
   (2 + LS_MAT_DOWN_BITS))

// Select the limited-precision multiplier path below (0 = use the full
// 64-bit multiply implementation).
#define USE_LIMITED_PREC_MULT 0
    944 
    945 #if USE_LIMITED_PREC_MULT
    946 
    947 #define MUL_PREC_BITS 16
    948 static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
    949   int msb = 0;
    950   uint16_t mult = 0;
    951   *shift = 0;
    952   if (D != 0) {
    953     msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
    954                               : get_msb((unsigned int)D));
    955     if (msb >= MUL_PREC_BITS) {
    956       mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
    957       *shift = msb + 1 - MUL_PREC_BITS;
    958     } else {
    959       mult = (uint16_t)D;
    960       *shift = 0;
    961     }
    962   }
    963   return mult;
    964 }
    965 
    966 static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
    967   int32_t ret;
    968   int16_t mshift;
    969   uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
    970   int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
    971   shift -= mshift;
    972   if (shift > 0) {
    973     return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
    974                           -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
    975                           WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
    976   } else {
    977     return (int32_t)clamp(v * (1 << (-shift)),
    978                           -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
    979                           WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
    980   }
    981   return ret;
    982 }
    983 
    984 static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
    985   int16_t mshift;
    986   uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
    987   int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
    988   shift -= mshift;
    989   if (shift > 0) {
    990     return (int32_t)clamp(
    991         ROUND_POWER_OF_TWO_SIGNED(v, shift),
    992         (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
    993         (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
    994   } else {
    995     return (int32_t)clamp(
    996         v * (1 << (-shift)),
    997         (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
    998         (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
    999   }
   1000 }
   1001 
   1002 #else
   1003 
   1004 static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
   1005   int64_t v = Px * (int64_t)iDet;
   1006   return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
   1007                           -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
   1008                           WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
   1009 }
   1010 
   1011 static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
   1012   int64_t v = Px * (int64_t)iDet;
   1013   return (int32_t)clamp64(
   1014       ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
   1015       (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
   1016       (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
   1017 }
   1018 #endif  // USE_LIMITED_PREC_MULT
   1019 
// Fit an affine model (writing wm->wmmat[0..5]) to 'np' sample
// correspondences by integer least squares.
//   pts1 / pts2: interleaved (x, y) source / destination sample positions.
//   (mvy, mvx):  the block's transmitted motion vector, presumably in
//                1/8-pel (the *8 scaling and (WARPEDMODEL_PREC_BITS - 3)
//                shift below assume this — confirm against callers).
//   (mi_row, mi_col): block position in the frame, in mi units.
// Returns 0 on success, 1 if the system is degenerate (zero determinant).
static int find_affine_int(int np, const int *pts1, const int *pts2,
                           BLOCK_SIZE bsize, int mvy, int mvx,
                           WarpedMotionParams *wm, int mi_row, int mi_col) {
  // Normal-equation accumulators (A is symmetric; A[1][0] stays unused).
  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
  int32_t Bx[2] = { 0, 0 };
  int32_t By[2] = { 0, 0 };
  int i;

  const int bw = block_size_wide[bsize];
  const int bh = block_size_high[bsize];
  // Center offset within the block, in pixels (rsu*) and 1/8-pel (su*).
  const int rsuy = (AOMMAX(bh, MI_SIZE) / 2 - 1);
  const int rsux = (AOMMAX(bw, MI_SIZE) / 2 - 1);
  const int suy = rsuy * 8;
  const int sux = rsux * 8;
  // Destination-side center: source center displaced by the block MV.
  const int duy = suy + mvy;
  const int dux = sux + mvx;
  // Absolute (frame-level) position of the block center, in pixels.
  const int isuy = (mi_row * MI_SIZE + rsuy);
  const int isux = (mi_col * MI_SIZE + rsux);

  // Assume the center pixel of the block has exactly the same motion vector
  // as transmitted for the block. First shift the origin of the source
  // points to the block center, and the origin of the destination points to
  // the block center added to the motion vector transmitted.
  // Let (xi, yi) denote the source points and (xi', yi') denote destination
  // points after origin shifting, for i = 0, 1, 2, .... n-1.
  // Then if  P = [x0, y0,
  //               x1, y1
  //               x2, y2,
  //                ....
  //              ]
  //          q = [x0', x1', x2', ... ]'
  //          r = [y0', y1', y2', ... ]'
  // the least squares problems that need to be solved are:
  //          [h1, h2]' = inv(P'P)P'q and
  //          [h3, h4]' = inv(P'P)P'r
  // where the affine transformation is given by:
  //          x' = h1.x + h2.y
  //          y' = h3.x + h4.y
  //
  // The loop below computes: A = P'P, Bx = P'q, By = P'r
  // We need to just compute inv(A).Bx and inv(A).By for the solutions.
  // Contribution from neighbor block
  for (i = 0; i < np; i++) {
    const int dx = pts2[i * 2] - dux;
    const int dy = pts2[i * 2 + 1] - duy;
    const int sx = pts1[i * 2] - sux;
    const int sy = pts1[i * 2 + 1] - suy;
    // (TODO)yunqing: This comparison wouldn't be necessary if the sample
    // selection is done in find_samples(). Also, global offset can be removed
    // while collecting samples.
    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
      A[0][0] += LS_SQUARE(sx);
      A[0][1] += LS_PRODUCT1(sx, sy);
      A[1][1] += LS_SQUARE(sy);
      Bx[0] += LS_PRODUCT2(sx, dx);
      Bx[1] += LS_PRODUCT1(sy, dx);
      By[0] += LS_PRODUCT1(sx, dy);
      By[1] += LS_PRODUCT2(sy, dy);
    }
  }

  // Just for debugging, and can be removed later.
  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);

  int64_t Det;
  int16_t iDet, shift;

  // Compute Determinant of A
  Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
  if (Det == 0) return 1;
  // Represent 1/Det as iDet * 2^-shift, adjusted into WARPEDMODEL precision.
  iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
  shift -= WARPEDMODEL_PREC_BITS;
  if (shift < 0) {
    iDet <<= (-shift);
    shift = 0;
  }

  int64_t Px[2], Py[2];

  // These divided by the Det, are the least squares solutions
  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];

  // h1..h4 of the affine model, in WARPEDMODEL precision.
  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);

  // Note: In the vx, vy expressions below, the max value of each of the
  // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room
  // for the first term so that the overall sum in the worst case fits
  // within 32 bits overall.
  int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
               (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
                isuy * wm->wmmat[3]);
  int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
               (isux * wm->wmmat[4] +
                isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
  // Translation terms, clamped to the transmittable range.
  wm->wmmat[0] =
      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  wm->wmmat[1] =
      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);

  wm->wmmat[6] = wm->wmmat[7] = 0;
  return 0;
}
   1134 
   1135 int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
   1136                     int mvx, WarpedMotionParams *wm_params, int mi_row,
   1137                     int mi_col) {
   1138   assert(wm_params->wmtype == AFFINE);
   1139 
   1140   if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
   1141                       mi_col))
   1142     return 1;
   1143 
   1144   // check compatibility with the fast warp filter
   1145   if (!get_shear_params(wm_params)) return 1;
   1146 
   1147   return 0;
   1148 }
   1149