Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @/*******************************************************************************
     20 @* @file
     21 @*  ihevcd_fmt_conv_420sp_to_rgba8888.s
     22 @*
     23 @* @brief
     24 @*  contains function definitions for format conversions
     25 @*
     26 @* @author
     27 @*  ittiam
     28 @*
     29 @* @par list of functions:
     30 @*  - ihevcd_fmt_conv_420sp_to_rgba8888_a9q()
     31 @*
     32 @* @remarks
     33 @*  none
     34 @*
     35 @*******************************************************************************/
     36     .equ DO1STROUNDING, 0
     37 
     38     @ ARM
     39     @
     40     @ PRESERVE8
     41 
     42 .text
     43 .p2align 2
     44 
     45 
     46 
     47 
     @/*****************************************************************************
     @*                                                                            *
     @*  Function Name    : ihevcd_fmt_conv_420sp_to_rgba8888_a9q()                *
     @*                                                                            *
     @*  Description      : Converts an image from YUV 4:2:0 semi-planar format    *
     @*                     (one luma plane plus one plane of interleaved chroma   *
     @*                     bytes) to 32 bits per pixel. Chroma bytes at even      *
     @*                     offsets are used as U, bytes at odd offsets as V.      *
     @*                     Every output pixel is stored as four bytes in the      *
     @*                     memory order B, G, R, 0.                               *
     @*                     Two luma rows (sharing one chroma row) and 16 pixels   *
     @*                     per row are converted per inner-loop iteration.        *
     @*                                                                            *
     @*                       B = sat8(Y + (((U - 128) * C4) >> 13))               *
     @*                       R = sat8(Y + (((V - 128) * C1) >> 13))               *
     @*                       G = sat8(Y + (((U - 128) * C2                        *
     @*                                    + (V - 128) * C3) >> 13))               *
     @*                                                                            *
     @*                     C1..C4 are signed 16-bit constants (see code).         *
     @*                                                                            *
     @*  Arguments        : R0           pubY        (luma source pointer)         *
     @*                     R1           pubUV       (interleaved chroma pointer)  *
     @*                     R2           pusRGB      (destination pointer)         *
     @*                     R3           usWidth     (in pixels)                   *
     @*                     [R13 #104]   usHeight    (in rows)                     *
     @*                     [R13 #108]   usStrideY   (in bytes)                    *
     @*                     [R13 #112]   usStrideUV  (in bytes)                    *
     @*                     [R13 #116]   usStrideRGB (in pixels)                   *
     @*                     Stack offsets are relative to SP after the prologue    *
     @*                     (i.e. offsets 0, 4, 8, 12 from SP at function entry).  *
     @*                                                                            *
     @*  Values Returned  : None                                                   *
     @*                                                                            *
     @*  Register Usage   : R0 - R14, D0 - D31                                     *
     @*                     R4 - R12, LR and D8 - D15 are saved and restored.      *
     @*                                                                            *
     @*  Stack Usage      : 104 Bytes (40 for core registers + 64 for D8 - D15)    *
     @*                                                                            *
     @*  Interruptibility : Interruptible                                          *
     @*                                                                            *
     @*  Known Limitations                                                         *
     @*       Assumptions: Image Width:     Assumed to be multiple of 16 and       *
     @*                                     greater than or equal to 16.           *
     @*                    Image Height:    Assumed to be even and non-zero.       *
     @*                    Neither assumption is checked; both loops test their    *
     @*                    counters only after the first pass.                     *
     @*                                                                            *
     @*  Revision History :                                                        *
     @*         DD MM YYYY   Author(s)       Changes (Describe the changes made)   *
     @*         07 06 2010   Varshita        Draft                                 *
     @*         07 06 2010   Naveen Kr T     Completed                             *
     @*         05 08 2013   Naveen K P      Modified for HEVC                     *
     @*****************************************************************************/

    @// Offsets of the stack-passed arguments from SP once the prologue has
    @// pushed R4-R12,LR (40 bytes) and D8-D15 (64 bytes).
    .equ ARG_HEIGHT_OFFSET,     104
    .equ ARG_STRIDE_Y_OFFSET,   108
    .equ ARG_STRIDE_UV_OFFSET,  112
    .equ ARG_STRIDE_RGB_OFFSET, 116

    .global ihevcd_fmt_conv_420sp_to_rgba8888_a9q
.type ihevcd_fmt_conv_420sp_to_rgba8888_a9q, function
ihevcd_fmt_conv_420sp_to_rgba8888_a9q:

    @// push the callee-saved core registers and the return address
    STMFD       SP!,{R4-R12,LR}
    @// D8-D15 (Q4-Q7) are callee-saved under the AAPCS and every one of them
    @// is overwritten below, so they have to be preserved as well
    VPUSH       {D8-D15}


    @// Register roles in the loops below:
    @//R0  - Y PTR, CURRENT ROW
    @//R1  - UV PTR
    @//R2  - RGB PTR, CURRENT ROW
    @//R3  - PIC WIDTH
    @//R5  - HEIGHT LOOP COUNT
    @//R6  - STRIDE Y DURING SETUP, THEN WIDTH LOOP COUNT
    @//R7  - STRIDE UV DURING SETUP, THEN Y PTR OF THE NEXT ROW
    @//R8  - RGB PTR OF THE NEXT ROW
    @//R9  - STRIDE RGB (SETUP ONLY)
    @//R10 - SCRATCH DURING SETUP, THEN LUMA OFFSET  (STRIDE Y   - WIDTH)
    @//R11 - CHROMA OFFSET                           (STRIDE UV  - WIDTH)
    @//R14 - RGB OFFSET IN PIXELS                    (STRIDE RGB - WIDTH)

    @//TWO ROWS ARE PROCESSED AT A TIME

    @//THE FOUR CONSTANTS ARE:
    @//C1=0x3311,C2=0xF379,C3=0xE5F8,C4=0x4092
    @//They are consumed by signed 16-bit multiplies and the products are
    @//shifted right by 13, so C2 and C3 act as negative weights.

    @PLD        [R0]
    @PLD        [R1]
    @PLD        [R2]


    @/* can be loaded from a defined const type */
    MOVW        R10,#0x3311
    VMOV.16     D0[0],R10                   @//C1

    MOVW        R10,#0xF379
    VMOV.16     D0[1],R10                   @//C2

    MOVW        R10,#0xE5F8
    VMOV.16     D0[2],R10                   @//C3

    MOVW        R10,#0x4092
    VMOV.16     D0[3],R10                   @//C4

    @//LOAD CONSTANT 128 INTO EVERY BYTE LANE OF D1
    MOV         R10,#128
    VDUP.8      D1,R10

    @//D0 HAS C1-C2-C3-C4
    @// load other parameters from stack
    LDR         R5,[sp,#ARG_HEIGHT_OFFSET]      @// height
    LDR         R6,[sp,#ARG_STRIDE_Y_OFFSET]    @// luma stride
    LDR         R7,[sp,#ARG_STRIDE_UV_OFFSET]   @// chroma stride
    LDR         R9,[sp,#ARG_STRIDE_RGB_OFFSET]  @// rgb stride in pixels

    @// calculate offsets, offset = stride - width
    SUB         R10,R6,R3                   @// luma offset
    SUB         R11,R7,R3                   @// chroma offset (a chroma row holds width bytes)
    SUB         R14,R9,R3                   @// rgb offset in pixels

    @// calculate height loop count
    MOV         R5,R5, LSR #1               @// height_cnt = height / 2 (two rows per pass)

    @// create next row pointers for rgb and luma data
    ADD         R7,R0,R6                    @// luma_next_row = luma + luma_stride
    ADD         R8,R2,R9,LSL #2             @// rgb_next_row = rgb + rgb_stride * 4 bytes

LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:

    @//LOAD THE FIRST 16 CHROMA BYTES (8 U/V PAIRS) OF THIS ROW PAIR
    VLD1.8      {D2,D3},[R1]!

    @// calculate width loop count
    MOV         R6,R3, LSR #4               @// width_cnt = width / 16

    @//LOAD THE FIRST 16 LUMA BYTES OF BOTH ROWS, SPLIT INTO EVEN/ODD PIXELS
    VLD2.8      {D30,D31},[R0]!             @//D30 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
                                            @//D31 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
    VLD2.8      {D28,D29},[R7]!             @//D28 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 2
                                            @//D29 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15

    @// The loop body converts one 16-pixel group and loads the data of the
    @// following group. The last group of a row is converted after the loop,
    @// by a copy of the body that issues no further loads.
    SUBS        R6,R6,#1
    BEQ         LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP

LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
    @VMOV.I8 Q1,#128
    VUZP.8      D2,D3                       @//D2 = 8 U BYTES, D3 = 8 V BYTES


    @//NEED TO SUBTRACT (U-128) AND (V-128)
    @//(D2-D1),(D3-D1)
    VSUBL.U8    Q2,D2,D1                    @//(U-128)
    VSUBL.U8    Q3,D3,D1                    @//(V-128)

    @//LOAD THE CHROMA BYTES OF THE NEXT 16-PIXEL GROUP
    VLD1.8      {D2,D3},[R1]!

    @PLD        [R0]
    PLD         [R1]

    @//NEED TO MULTIPLY Q2,Q3 WITH THE CO-EFFICIENTS
    VMULL.S16   Q4,D4,D0[3]                 @//(U-128)*C4 FOR B
    VMULL.S16   Q5,D5,D0[3]                 @//(U-128)*C4 FOR B

    VMULL.S16   Q10,D6,D0[0]                @//(V-128)*C1 FOR R
    VMULL.S16   Q11,D7,D0[0]                @//(V-128)*C1 FOR R

    VMULL.S16   Q6,D4,D0[1]                 @//(U-128)*C2 FOR G
    VMLAL.S16   Q6,D6,D0[2]                 @//Q6 = (U-128)*C2 + (V-128)*C3
    VMULL.S16   Q7,D5,D0[1]                 @//(U-128)*C2 FOR G
    VMLAL.S16   Q7,D7,D0[2]                 @//Q7 = (U-128)*C2 + (V-128)*C3

    @//NARROW RIGHT SHIFT BY 13 FOR R&B
    VQSHRN.S32  D8,Q4,#13                   @//D8 = (U-128)*C4>>13 4 16-BIT VALUES
    VQSHRN.S32  D9,Q5,#13                   @//D9 = (U-128)*C4>>13 4 16-BIT VALUES
    @//Q4 - WEIGHT FOR B

    @//NARROW RIGHT SHIFT BY 13 FOR R&B
    VQSHRN.S32  D10,Q10,#13                 @//D10 = (V-128)*C1>>13 4 16-BIT VALUES
    VQSHRN.S32  D11,Q11,#13                 @//D11 = (V-128)*C1>>13 4 16-BIT VALUES
    @//Q5 - WEIGHT FOR R

    @//NARROW RIGHT SHIFT BY 13 FOR G
    VQSHRN.S32  D12,Q6,#13                  @//D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
    VQSHRN.S32  D13,Q7,#13                  @//D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
    @//Q6 - WEIGHT FOR G

    @//ROW 1: EACH CHROMA WEIGHT IS SHARED BY ONE EVEN AND ONE ODD PIXEL
    VADDW.U8    Q7,Q4,D30                   @//Q7 - HAS Y + B, EVEN PIXELS
    VADDW.U8    Q8,Q5,D30                   @//Q8 - HAS Y + R, EVEN PIXELS
    VADDW.U8    Q9,Q6,D30                   @//Q9 - HAS Y + G, EVEN PIXELS

    VADDW.U8    Q10,Q4,D31                  @//Q10 - HAS Y + B, ODD PIXELS
    VADDW.U8    Q11,Q5,D31                  @//Q11 - HAS Y + R, ODD PIXELS
    VADDW.U8    Q12,Q6,D31                  @//Q12 - HAS Y + G, ODD PIXELS

    @//SATURATE TO 8 BITS AND INTERLEAVE TO B,G,R,0 PER PIXEL (EVEN PIXELS)
    VQMOVUN.S16 D14,Q7                      @//B
    VQMOVUN.S16 D15,Q9                      @//G
    VQMOVUN.S16 D16,Q8                      @//R
    VMOV.I8     D17,#0                      @//FOURTH BYTE OF EVERY PIXEL IS 0

    VZIP.8      D14,D15                     @//B,G PAIRS
    VZIP.8      D16,D17                     @//R,0 PAIRS
    VZIP.16     Q7,Q8                       @//B,G,R,0 FOR THE 8 EVEN PIXELS


    @//SAME FOR THE ODD PIXELS
    VQMOVUN.S16 D20,Q10                     @//B
    VQMOVUN.S16 D21,Q12                     @//G
    VQMOVUN.S16 D22,Q11                     @//R
    VMOV.I8     D23,#0

    VZIP.8      D20,D21
    VZIP.8      D22,D23
    VZIP.16     Q10,Q11                     @//B,G,R,0 FOR THE 8 ODD PIXELS

    @//MERGE EVEN AND ODD PIXELS BACK INTO RASTER ORDER
    VZIP.32     Q7,Q10                      @//Q7 = PIXELS 0-3,  Q10 = PIXELS 4-7
    VZIP.32     Q8,Q11                      @//Q8 = PIXELS 8-11, Q11 = PIXELS 12-15

    @//STORE 16 PIXELS (64 BYTES) OF ROW 1
    VST1.32     D14,[R2]!
    VST1.32     D15,[R2]!
    VST1.32     D20,[R2]!
    VST1.32     D21,[R2]!
    VST1.32     D16,[R2]!
    VST1.32     D17,[R2]!
    VST1.32     D22,[R2]!
    VST1.32     D23,[R2]!

    @//ROW 2 REUSES THE SAME CHROMA WEIGHTS (Q4,Q5,Q6)
    VADDW.U8    Q7,Q4,D28                   @//Q7 - HAS Y + B, EVEN PIXELS
    VADDW.U8    Q8,Q5,D28                   @//Q8 - HAS Y + R, EVEN PIXELS
    VADDW.U8    Q9,Q6,D28                   @//Q9 - HAS Y + G, EVEN PIXELS

    VADDW.U8    Q10,Q4,D29                  @//Q10 - HAS Y + B, ODD PIXELS
    VADDW.U8    Q11,Q5,D29                  @//Q11 - HAS Y + R, ODD PIXELS
    VADDW.U8    Q12,Q6,D29                  @//Q12 - HAS Y + G, ODD PIXELS

    @//D28-D31 ARE NOW FREE: LOAD THE LUMA OF THE NEXT 16-PIXEL GROUP
    VLD2.8      {D30,D31},[R0]!             @//D30 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
                                            @//D31 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
    VLD2.8      {D28,D29},[R7]!             @//D28 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 2
                                            @//D29 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15

    PLD         [R0]
    PLD         [R7]

    @//SATURATE AND INTERLEAVE ROW 2 EXACTLY AS ROW 1
    VQMOVUN.S16 D14,Q7
    VQMOVUN.S16 D15,Q9
    VQMOVUN.S16 D16,Q8
    VMOV.I8     D17,#0

    VZIP.8      D14,D15
    VZIP.8      D16,D17
    VZIP.16     Q7,Q8


    VQMOVUN.S16 D20,Q10
    VQMOVUN.S16 D21,Q12
    VQMOVUN.S16 D22,Q11
    VMOV.I8     D23,#0

    VZIP.8      D20,D21
    VZIP.8      D22,D23
    VZIP.16     Q10,Q11

    VZIP.32     Q7,Q10
    VZIP.32     Q8,Q11

    @//STORE 16 PIXELS (64 BYTES) OF ROW 2
    VST1.32     D14,[R8]!
    VST1.32     D15,[R8]!
    VST1.32     D20,[R8]!
    VST1.32     D21,[R8]!
    VST1.32     D16,[R8]!
    VST1.32     D17,[R8]!
    VST1.32     D22,[R8]!
    VST1.32     D23,[R8]!

    SUBS        R6,R6,#1                    @// width_cnt -= 1
    BNE         LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP

LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    @// Last 16-pixel group of the row pair: same arithmetic as the loop body,
    @// working on the data that is already in D2,D3 and D28-D31; no loads.
    @VMOV.I8 Q1,#128
    VUZP.8      D2,D3                       @//D2 = 8 U BYTES, D3 = 8 V BYTES


    @//NEED TO SUBTRACT (U-128) AND (V-128)
    @//(D2-D1),(D3-D1)
    VSUBL.U8    Q2,D2,D1                    @//(U-128)
    VSUBL.U8    Q3,D3,D1                    @//(V-128)


    @//NEED TO MULTIPLY Q2,Q3 WITH THE CO-EFFICIENTS
    VMULL.S16   Q4,D4,D0[3]                 @//(U-128)*C4 FOR B
    VMULL.S16   Q5,D5,D0[3]                 @//(U-128)*C4 FOR B

    VMULL.S16   Q10,D6,D0[0]                @//(V-128)*C1 FOR R
    VMULL.S16   Q11,D7,D0[0]                @//(V-128)*C1 FOR R

    VMULL.S16   Q6,D4,D0[1]                 @//(U-128)*C2 FOR G
    VMLAL.S16   Q6,D6,D0[2]                 @//Q6 = (U-128)*C2 + (V-128)*C3
    VMULL.S16   Q7,D5,D0[1]                 @//(U-128)*C2 FOR G
    VMLAL.S16   Q7,D7,D0[2]                 @//Q7 = (U-128)*C2 + (V-128)*C3

    @//NARROW RIGHT SHIFT BY 13 FOR R&B
    VQSHRN.S32  D8,Q4,#13                   @//D8 = (U-128)*C4>>13 4 16-BIT VALUES
    VQSHRN.S32  D9,Q5,#13                   @//D9 = (U-128)*C4>>13 4 16-BIT VALUES
    @//Q4 - WEIGHT FOR B

    @//NARROW RIGHT SHIFT BY 13 FOR R&B
    VQSHRN.S32  D10,Q10,#13                 @//D10 = (V-128)*C1>>13 4 16-BIT VALUES
    VQSHRN.S32  D11,Q11,#13                 @//D11 = (V-128)*C1>>13 4 16-BIT VALUES
    @//Q5 - WEIGHT FOR R

    @//NARROW RIGHT SHIFT BY 13 FOR G
    VQSHRN.S32  D12,Q6,#13                  @//D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
    VQSHRN.S32  D13,Q7,#13                  @//D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
    @//Q6 - WEIGHT FOR G

    @//ROW 1
    VADDW.U8    Q7,Q4,D30                   @//Q7 - HAS Y + B, EVEN PIXELS
    VADDW.U8    Q8,Q5,D30                   @//Q8 - HAS Y + R, EVEN PIXELS
    VADDW.U8    Q9,Q6,D30                   @//Q9 - HAS Y + G, EVEN PIXELS

    VADDW.U8    Q10,Q4,D31                  @//Q10 - HAS Y + B, ODD PIXELS
    VADDW.U8    Q11,Q5,D31                  @//Q11 - HAS Y + R, ODD PIXELS
    VADDW.U8    Q12,Q6,D31                  @//Q12 - HAS Y + G, ODD PIXELS

    VQMOVUN.S16 D14,Q7
    VQMOVUN.S16 D15,Q9
    VQMOVUN.S16 D16,Q8
    VMOV.I8     D17,#0

    VZIP.8      D14,D15
    VZIP.8      D16,D17
    VZIP.16     Q7,Q8


    VQMOVUN.S16 D20,Q10
    VQMOVUN.S16 D21,Q12
    VQMOVUN.S16 D22,Q11
    VMOV.I8     D23,#0

    VZIP.8      D20,D21
    VZIP.8      D22,D23
    VZIP.16     Q10,Q11

    VZIP.32     Q7,Q10
    VZIP.32     Q8,Q11

    @//STORE 16 PIXELS (64 BYTES) OF ROW 1
    VST1.32     D14,[R2]!
    VST1.32     D15,[R2]!
    VST1.32     D20,[R2]!
    VST1.32     D21,[R2]!
    VST1.32     D16,[R2]!
    VST1.32     D17,[R2]!
    VST1.32     D22,[R2]!
    VST1.32     D23,[R2]!

    @//ROW 2 REUSES THE SAME CHROMA WEIGHTS (Q4,Q5,Q6)
    VADDW.U8    Q7,Q4,D28                   @//Q7 - HAS Y + B, EVEN PIXELS
    VADDW.U8    Q8,Q5,D28                   @//Q8 - HAS Y + R, EVEN PIXELS
    VADDW.U8    Q9,Q6,D28                   @//Q9 - HAS Y + G, EVEN PIXELS

    VADDW.U8    Q10,Q4,D29                  @//Q10 - HAS Y + B, ODD PIXELS
    VADDW.U8    Q11,Q5,D29                  @//Q11 - HAS Y + R, ODD PIXELS
    VADDW.U8    Q12,Q6,D29                  @//Q12 - HAS Y + G, ODD PIXELS


    VQMOVUN.S16 D14,Q7
    VQMOVUN.S16 D15,Q9
    VQMOVUN.S16 D16,Q8
    VMOV.I8     D17,#0

    VZIP.8      D14,D15
    VZIP.8      D16,D17
    VZIP.16     Q7,Q8


    VQMOVUN.S16 D20,Q10
    VQMOVUN.S16 D21,Q12
    VQMOVUN.S16 D22,Q11
    VMOV.I8     D23,#0

    VZIP.8      D20,D21
    VZIP.8      D22,D23
    VZIP.16     Q10,Q11

    VZIP.32     Q7,Q10
    VZIP.32     Q8,Q11

    @//STORE 16 PIXELS (64 BYTES) OF ROW 2
    VST1.32     D14,[R8]!
    VST1.32     D15,[R8]!
    VST1.32     D20,[R8]!
    VST1.32     D21,[R8]!
    VST1.32     D16,[R8]!
    VST1.32     D17,[R8]!
    VST1.32     D22,[R8]!
    VST1.32     D23,[R8]!

    @// Adjust the address pointers. R0/R2 have walked row 1 and R7/R8 have
    @// walked row 2, each by exactly width pixels.
    ADD         R0,R7,R10                   @// luma = luma_next + offset
    ADD         R2,R8,R14,LSL #2            @// rgb = rgb_next + offset

    ADD         R7,R0,R3                    @// luma_next = luma + width
    ADD         R8,R2,R3,LSL #2             @// rgb_next_row = rgb + width

    ADD         R1,R1,R11                   @// adjust uv pointer to the next chroma row

    @// the strides themselves are no longer held in registers, so they are
    @// rebuilt as width + offset
    ADD         R7,R7,R10                   @// luma_next = luma + width + offset (because of register crunch)
    ADD         R8,R8,R14,LSL #2            @// rgb_next_row = rgb + width + offset

    SUBS        R5,R5,#1                    @// height_cnt -= 1

    BNE         LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP

    @//POP THE REGISTERS (reverse order of the prologue)
    VPOP        {D8-D15}
    LDMFD       SP!,{R4-R12,PC}
    449 
    450 
    451 
    452 
    453     .section .note.GNU-stack,"",%progbits
    454 
    455