; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

@src64 = common global [4 x i64] zeroinitializer, align 32
@dst64 = common global [4 x i64] zeroinitializer, align 32
@src32 = common global [8 x i32] zeroinitializer, align 32
@dst32 = common global [8 x i32] zeroinitializer, align 32
@src16 = common global [16 x i16] zeroinitializer, align 32
@dst16 = common global [16 x i16] zeroinitializer, align 32
@src8  = common global [32 x i8] zeroinitializer, align 32
@dst8  = common global [32 x i8] zeroinitializer, align 32

declare i64 @llvm.bitreverse.i64(i64)
declare i32 @llvm.bitreverse.i32(i32)
declare i16 @llvm.bitreverse.i16(i16)
declare  i8 @llvm.bitreverse.i8(i8)

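; Two scalar i64 bitreverse calls: only the XOP targets (bdver2/bdver4)
; vectorize this to @llvm.bitreverse.v2i64; SSE and AVX keep scalar calls.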
define void @bitreverse_2i64() #0 {
; SSE-LABEL: @bitreverse_2i64(
; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
; SSE-NEXT:    [[BITREVERSE0:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD0]])
; SSE-NEXT:    [[BITREVERSE1:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD1]])
; SSE-NEXT:    store i64 [[BITREVERSE0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
; SSE-NEXT:    store i64 [[BITREVERSE1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
; SSE-NEXT:    ret void
;
; AVX-LABEL: @bitreverse_2i64(
; AVX-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
; AVX-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
; AVX-NEXT:    [[BITREVERSE0:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD0]])
; AVX-NEXT:    [[BITREVERSE1:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD1]])
; AVX-NEXT:    store i64 [[BITREVERSE0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
; AVX-NEXT:    store i64 [[BITREVERSE1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
; AVX-NEXT:    ret void
;
; XOP-LABEL: @bitreverse_2i64(
; XOP-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8
; XOP-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]])
; XOP-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8
; XOP-NEXT:    ret void
;
  %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
  %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
  %bitreverse0 = call i64 @llvm.bitreverse.i64(i64 %ld0)
  %bitreverse1 = call i64 @llvm.bitreverse.i64(i64 %ld1)
  store i64 %bitreverse0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
  store i64 %bitreverse1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
  ret void
}

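; Four i64 elements: AVX2 and XOP vectorize this to @llvm.bitreverse.v4i64;
; SSE and AVX1 keep scalar calls.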
define void @bitreverse_4i64() #0 {
; SSE-LABEL: @bitreverse_4i64(
; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
; SSE-NEXT:    [[BITREVERSE0:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD0]])
; SSE-NEXT:    [[BITREVERSE1:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD1]])
; SSE-NEXT:    [[BITREVERSE2:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD2]])
; SSE-NEXT:    [[BITREVERSE3:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD3]])
; SSE-NEXT:    store i64 [[BITREVERSE0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
; SSE-NEXT:    store i64 [[BITREVERSE1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
; SSE-NEXT:    store i64 [[BITREVERSE2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
; SSE-NEXT:    store i64 [[BITREVERSE3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
; SSE-NEXT:    ret void
;
; AVX1-LABEL: @bitreverse_4i64(
; AVX1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
; AVX1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
; AVX1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
; AVX1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
; AVX1-NEXT:    [[BITREVERSE0:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD0]])
; AVX1-NEXT:    [[BITREVERSE1:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD1]])
; AVX1-NEXT:    [[BITREVERSE2:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD2]])
; AVX1-NEXT:    [[BITREVERSE3:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD3]])
; AVX1-NEXT:    store i64 [[BITREVERSE0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
; AVX1-NEXT:    store i64 [[BITREVERSE1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
; AVX1-NEXT:    store i64 [[BITREVERSE2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
; AVX1-NEXT:    store i64 [[BITREVERSE3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
; AVX1-NEXT:    ret void
;
; AVX2-LABEL: @bitreverse_4i64(
; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
; AVX2-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> [[TMP1]])
; AVX2-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
; AVX2-NEXT:    ret void
;
; XOP-LABEL: @bitreverse_4i64(
; XOP-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
; XOP-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> [[TMP1]])
; XOP-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
; XOP-NEXT:    ret void
;
  %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
  %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
  %ld2 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
  %ld3 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
  %bitreverse0 = call i64 @llvm.bitreverse.i64(i64 %ld0)
  %bitreverse1 = call i64 @llvm.bitreverse.i64(i64 %ld1)
  %bitreverse2 = call i64 @llvm.bitreverse.i64(i64 %ld2)
  %bitreverse3 = call i64 @llvm.bitreverse.i64(i64 %ld3)
  store i64 %bitreverse0, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
  store i64 %bitreverse1, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
  store i64 %bitreverse2, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
  store i64 %bitreverse3, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
  ret void
}

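; Four i32 elements: AVX and XOP vectorize this to @llvm.bitreverse.v4i32;
; SSE keeps scalar calls.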
define void @bitreverse_4i32() #0 {
; SSE-LABEL: @bitreverse_4i32(
; SSE-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
; SSE-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
; SSE-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
; SSE-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
; SSE-NEXT:    [[BITREVERSE0:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD0]])
; SSE-NEXT:    [[BITREVERSE1:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD1]])
; SSE-NEXT:    [[BITREVERSE2:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD2]])
; SSE-NEXT:    [[BITREVERSE3:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD3]])
; SSE-NEXT:    store i32 [[BITREVERSE0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
; SSE-NEXT:    store i32 [[BITREVERSE1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
; SSE-NEXT:    store i32 [[BITREVERSE2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
; SSE-NEXT:    store i32 [[BITREVERSE3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
; SSE-NEXT:    ret void
;
; AVX-LABEL: @bitreverse_4i32(
; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
; AVX-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]])
; AVX-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
; AVX-NEXT:    ret void
;
; XOP-LABEL: @bitreverse_4i32(
; XOP-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
; XOP-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]])
; XOP-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
; XOP-NEXT:    ret void
;
  %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
  %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
  %ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
  %ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
  %bitreverse0 = call i32 @llvm.bitreverse.i32(i32 %ld0)
  %bitreverse1 = call i32 @llvm.bitreverse.i32(i32 %ld1)
  %bitreverse2 = call i32 @llvm.bitreverse.i32(i32 %ld2)
  %bitreverse3 = call i32 @llvm.bitreverse.i32(i32 %ld3)
  store i32 %bitreverse0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
  store i32 %bitreverse1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
  store i32 %bitreverse2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
  store i32 %bitreverse3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
  ret void
}

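; Eight i32 elements: AVX and XOP vectorize this to a single
; @llvm.bitreverse.v8i32; SSE keeps scalar calls.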
define void @bitreverse_8i32() #0 {
; SSE-LABEL: @bitreverse_8i32(
; SSE-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
; SSE-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
; SSE-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
; SSE-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
; SSE-NEXT:    [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
; SSE-NEXT:    [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
; SSE-NEXT:    [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
; SSE-NEXT:    [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
; SSE-NEXT:    [[BITREVERSE0:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD0]])
; SSE-NEXT:    [[BITREVERSE1:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD1]])
; SSE-NEXT:    [[BITREVERSE2:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD2]])
; SSE-NEXT:    [[BITREVERSE3:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD3]])
; SSE-NEXT:    [[BITREVERSE4:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD4]])
; SSE-NEXT:    [[BITREVERSE5:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD5]])
; SSE-NEXT:    [[BITREVERSE6:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD6]])
; SSE-NEXT:    [[BITREVERSE7:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD7]])
; SSE-NEXT:    store i32 [[BITREVERSE0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
; SSE-NEXT:    store i32 [[BITREVERSE1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
; SSE-NEXT:    store i32 [[BITREVERSE2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
; SSE-NEXT:    store i32 [[BITREVERSE3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
; SSE-NEXT:    store i32 [[BITREVERSE4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
; SSE-NEXT:    store i32 [[BITREVERSE5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
; SSE-NEXT:    store i32 [[BITREVERSE6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
; SSE-NEXT:    store i32 [[BITREVERSE7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
; SSE-NEXT:    ret void
;
; AVX-LABEL: @bitreverse_8i32(
; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
; AVX-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> [[TMP1]])
; AVX-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
; AVX-NEXT:    ret void
;
; XOP-LABEL: @bitreverse_8i32(
; XOP-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([8 x i32]* @src32 to <8 x i32>*), align 2
; XOP-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> [[TMP1]])
; XOP-NEXT:    store <8 x i32> [[TMP2]], <8 x i32>* bitcast ([8 x i32]* @dst32 to <8 x i32>*), align 2
; XOP-NEXT:    ret void
;
  %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
  %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
  %ld2 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
  %ld3 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
  %ld4 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
  %ld5 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
  %ld6 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
  %ld7 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
  %bitreverse0 = call i32 @llvm.bitreverse.i32(i32 %ld0)
  %bitreverse1 = call i32 @llvm.bitreverse.i32(i32 %ld1)
  %bitreverse2 = call i32 @llvm.bitreverse.i32(i32 %ld2)
  %bitreverse3 = call i32 @llvm.bitreverse.i32(i32 %ld3)
  %bitreverse4 = call i32 @llvm.bitreverse.i32(i32 %ld4)
  %bitreverse5 = call i32 @llvm.bitreverse.i32(i32 %ld5)
  %bitreverse6 = call i32 @llvm.bitreverse.i32(i32 %ld6)
  %bitreverse7 = call i32 @llvm.bitreverse.i32(i32 %ld7)
  store i32 %bitreverse0, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
  store i32 %bitreverse1, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
  store i32 %bitreverse2, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
  store i32 %bitreverse3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
  store i32 %bitreverse4, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
  store i32 %bitreverse5, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
  store i32 %bitreverse6, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
  store i32 %bitreverse7, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
  ret void
}

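; Eight i16 elements: AVX and XOP vectorize this to @llvm.bitreverse.v8i16;
; SSE keeps scalar calls.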
define void @bitreverse_8i16() #0 {
; SSE-LABEL: @bitreverse_8i16(
; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
; SSE-NEXT:    [[BITREVERSE0:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD0]])
; SSE-NEXT:    [[BITREVERSE1:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD1]])
; SSE-NEXT:    [[BITREVERSE2:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD2]])
; SSE-NEXT:    [[BITREVERSE3:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD3]])
; SSE-NEXT:    [[BITREVERSE4:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD4]])
; SSE-NEXT:    [[BITREVERSE5:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD5]])
; SSE-NEXT:    [[BITREVERSE6:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD6]])
; SSE-NEXT:    [[BITREVERSE7:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD7]])
; SSE-NEXT:    store i16 [[BITREVERSE0]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
; SSE-NEXT:    store i16 [[BITREVERSE1]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
; SSE-NEXT:    store i16 [[BITREVERSE2]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
; SSE-NEXT:    store i16 [[BITREVERSE3]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
; SSE-NEXT:    store i16 [[BITREVERSE4]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
; SSE-NEXT:    store i16 [[BITREVERSE5]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
; SSE-NEXT:    store i16 [[BITREVERSE6]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
; SSE-NEXT:    store i16 [[BITREVERSE7]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
; SSE-NEXT:    ret void
;
; AVX-LABEL: @bitreverse_8i16(
; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
; AVX-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]])
; AVX-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
; AVX-NEXT:    ret void
;
; XOP-LABEL: @bitreverse_8i16(
; XOP-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
; XOP-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]])
; XOP-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
; XOP-NEXT:    ret void
;
  %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
  %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
  %ld2 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
  %ld3 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
  %ld4 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
  %ld5 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
  %ld6 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
  %ld7 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
  %bitreverse0 = call i16 @llvm.bitreverse.i16(i16 %ld0)
  %bitreverse1 = call i16 @llvm.bitreverse.i16(i16 %ld1)
  %bitreverse2 = call i16 @llvm.bitreverse.i16(i16 %ld2)
  %bitreverse3 = call i16 @llvm.bitreverse.i16(i16 %ld3)
  %bitreverse4 = call i16 @llvm.bitreverse.i16(i16 %ld4)
  %bitreverse5 = call i16 @llvm.bitreverse.i16(i16 %ld5)
  %bitreverse6 = call i16 @llvm.bitreverse.i16(i16 %ld6)
  %bitreverse7 = call i16 @llvm.bitreverse.i16(i16 %ld7)
  store i16 %bitreverse0, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
  store i16 %bitreverse1, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
  store i16 %bitreverse2, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
  store i16 %bitreverse3, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
  store i16 %bitreverse4, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
  store i16 %bitreverse5, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
  store i16 %bitreverse6, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
  store i16 %bitreverse7, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
  ret void
}

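; Sixteen i16 elements: AVX and XOP vectorize this to a single
; @llvm.bitreverse.v16i16; SSE keeps scalar calls.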
define void @bitreverse_16i16() #0 {
; SSE-LABEL: @bitreverse_16i16(
; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
; SSE-NEXT:    [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8), align 2
; SSE-NEXT:    [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 9), align 2
; SSE-NEXT:    [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 10), align 2
; SSE-NEXT:    [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 11), align 2
; SSE-NEXT:    [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 12), align 2
; SSE-NEXT:    [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 13), align 2
; SSE-NEXT:    [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 14), align 2
; SSE-NEXT:    [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 15), align 2
; SSE-NEXT:    [[BITREVERSE0:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD0]])
; SSE-NEXT:    [[BITREVERSE1:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD1]])
; SSE-NEXT:    [[BITREVERSE2:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD2]])
; SSE-NEXT:    [[BITREVERSE3:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD3]])
; SSE-NEXT:    [[BITREVERSE4:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD4]])
; SSE-NEXT:    [[BITREVERSE5:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD5]])
; SSE-NEXT:    [[BITREVERSE6:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD6]])
; SSE-NEXT:    [[BITREVERSE7:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD7]])
; SSE-NEXT:    [[BITREVERSE8:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD8]])
; SSE-NEXT:    [[BITREVERSE9:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD9]])
; SSE-NEXT:    [[BITREVERSE10:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD10]])
; SSE-NEXT:    [[BITREVERSE11:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD11]])
; SSE-NEXT:    [[BITREVERSE12:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD12]])
; SSE-NEXT:    [[BITREVERSE13:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD13]])
; SSE-NEXT:    [[BITREVERSE14:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD14]])
; SSE-NEXT:    [[BITREVERSE15:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD15]])
; SSE-NEXT:    store i16 [[BITREVERSE0]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
; SSE-NEXT:    store i16 [[BITREVERSE1]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
; SSE-NEXT:    store i16 [[BITREVERSE2]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
; SSE-NEXT:    store i16 [[BITREVERSE3]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
; SSE-NEXT:    store i16 [[BITREVERSE4]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
; SSE-NEXT:    store i16 [[BITREVERSE5]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
; SSE-NEXT:    store i16 [[BITREVERSE6]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
; SSE-NEXT:    store i16 [[BITREVERSE7]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
; SSE-NEXT:    store i16 [[BITREVERSE8]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8), align 2
; SSE-NEXT:    store i16 [[BITREVERSE9]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 9), align 2
; SSE-NEXT:    store i16 [[BITREVERSE10]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 10), align 2
; SSE-NEXT:    store i16 [[BITREVERSE11]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 11), align 2
; SSE-NEXT:    store i16 [[BITREVERSE12]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 12), align 2
; SSE-NEXT:    store i16 [[BITREVERSE13]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 13), align 2
; SSE-NEXT:    store i16 [[BITREVERSE14]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 14), align 2
; SSE-NEXT:    store i16 [[BITREVERSE15]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 15), align 2
; SSE-NEXT:    ret void
;
; AVX-LABEL: @bitreverse_16i16(
; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
; AVX-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> [[TMP1]])
; AVX-NEXT:    store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
; AVX-NEXT:    ret void
;
; XOP-LABEL: @bitreverse_16i16(
; XOP-NEXT:    [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([16 x i16]* @src16 to <16 x i16>*), align 2
; XOP-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> [[TMP1]])
; XOP-NEXT:    store <16 x i16> [[TMP2]], <16 x i16>* bitcast ([16 x i16]* @dst16 to <16 x i16>*), align 2
; XOP-NEXT:    ret void
;
  %ld0  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  0), align 2
  %ld1  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  1), align 2
  %ld2  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  2), align 2
  %ld3  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  3), align 2
  %ld4  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  4), align 2
  %ld5  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  5), align 2
  %ld6  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  6), align 2
  %ld7  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  7), align 2
  %ld8  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  8), align 2
  %ld9  = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64  9), align 2
  %ld10 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 10), align 2
  %ld11 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 11), align 2
  %ld12 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 12), align 2
  %ld13 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 13), align 2
  %ld14 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 14), align 2
  %ld15 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 15), align 2
  %bitreverse0  = call i16 @llvm.bitreverse.i16(i16 %ld0)
  %bitreverse1  = call i16 @llvm.bitreverse.i16(i16 %ld1)
  %bitreverse2  = call i16 @llvm.bitreverse.i16(i16 %ld2)
  %bitreverse3  = call i16 @llvm.bitreverse.i16(i16 %ld3)
  %bitreverse4  = call i16 @llvm.bitreverse.i16(i16 %ld4)
  %bitreverse5  = call i16 @llvm.bitreverse.i16(i16 %ld5)
  %bitreverse6  = call i16 @llvm.bitreverse.i16(i16 %ld6)
  %bitreverse7  = call i16 @llvm.bitreverse.i16(i16 %ld7)
  %bitreverse8  = call i16 @llvm.bitreverse.i16(i16 %ld8)
  %bitreverse9  = call i16 @llvm.bitreverse.i16(i16 %ld9)
  %bitreverse10 = call i16 @llvm.bitreverse.i16(i16 %ld10)
  %bitreverse11 = call i16 @llvm.bitreverse.i16(i16 %ld11)
  %bitreverse12 = call i16 @llvm.bitreverse.i16(i16 %ld12)
  %bitreverse13 = call i16 @llvm.bitreverse.i16(i16 %ld13)
  %bitreverse14 = call i16 @llvm.bitreverse.i16(i16 %ld14)
  %bitreverse15 = call i16 @llvm.bitreverse.i16(i16 %ld15)
  store i16 %bitreverse0 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  0), align 2
  store i16 %bitreverse1 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  1), align 2
  store i16 %bitreverse2 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  2), align 2
  store i16 %bitreverse3 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  3), align 2
  store i16 %bitreverse4 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  4), align 2
  store i16 %bitreverse5 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  5), align 2
  store i16 %bitreverse6 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  6), align 2
  store i16 %bitreverse7 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  7), align 2
  store i16 %bitreverse8 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  8), align 2
  store i16 %bitreverse9 , i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64  9), align 2
  store i16 %bitreverse10, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 10), align 2
  store i16 %bitreverse11, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 11), align 2
  store i16 %bitreverse12, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 12), align 2
  store i16 %bitreverse13, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 13), align 2
  store i16 %bitreverse14, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 14), align 2
  store i16 %bitreverse15, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 15), align 2
  ret void
}

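; Sixteen i8 elements: AVX and XOP vectorize this to @llvm.bitreverse.v16i8;
; SSE keeps scalar calls.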
define void @bitreverse_16i8() #0 {
; SSE-LABEL: @bitreverse_16i8(
; SSE-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
; SSE-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
; SSE-NEXT:    [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
; SSE-NEXT:    [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
; SSE-NEXT:    [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
; SSE-NEXT:    [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
; SSE-NEXT:    [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1
; SSE-NEXT:    [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1
; SSE-NEXT:    [[LD8:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1
; SSE-NEXT:    [[LD9:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1
; SSE-NEXT:    [[LD10:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
; SSE-NEXT:    [[LD11:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
; SSE-NEXT:    [[LD12:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
; SSE-NEXT:    [[LD13:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
; SSE-NEXT:    [[LD14:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
; SSE-NEXT:    [[LD15:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
; SSE-NEXT:    [[BITREVERSE0:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD0]])
; SSE-NEXT:    [[BITREVERSE1:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD1]])
; SSE-NEXT:    [[BITREVERSE2:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD2]])
; SSE-NEXT:    [[BITREVERSE3:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD3]])
; SSE-NEXT:    [[BITREVERSE4:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD4]])
; SSE-NEXT:    [[BITREVERSE5:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD5]])
; SSE-NEXT:    [[BITREVERSE6:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD6]])
; SSE-NEXT:    [[BITREVERSE7:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD7]])
; SSE-NEXT:    [[BITREVERSE8:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD8]])
; SSE-NEXT:    [[BITREVERSE9:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD9]])
; SSE-NEXT:    [[BITREVERSE10:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD10]])
; SSE-NEXT:    [[BITREVERSE11:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD11]])
; SSE-NEXT:    [[BITREVERSE12:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD12]])
; SSE-NEXT:    [[BITREVERSE13:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD13]])
; SSE-NEXT:    [[BITREVERSE14:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD14]])
; SSE-NEXT:    [[BITREVERSE15:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD15]])
; SSE-NEXT:    store i8 [[BITREVERSE0]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1
; SSE-NEXT:    store i8 [[BITREVERSE1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1
; SSE-NEXT:    store i8 [[BITREVERSE2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1
; SSE-NEXT:    store i8 [[BITREVERSE3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1
; SSE-NEXT:    store i8 [[BITREVERSE4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1
; SSE-NEXT:    store i8 [[BITREVERSE5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1
; SSE-NEXT:    store i8 [[BITREVERSE6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1
; SSE-NEXT:    store i8 [[BITREVERSE7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1
; SSE-NEXT:    store i8 [[BITREVERSE8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1
; SSE-NEXT:    store i8 [[BITREVERSE9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1
; SSE-NEXT:    store i8 [[BITREVERSE10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
; SSE-NEXT:    store i8 [[BITREVERSE11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
; SSE-NEXT:    store i8 [[BITREVERSE12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
; SSE-NEXT:    store i8 [[BITREVERSE13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
; SSE-NEXT:    store i8 [[BITREVERSE14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
; SSE-NEXT:    store i8 [[BITREVERSE15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
; SSE-NEXT:    ret void
;
; AVX-LABEL: @bitreverse_16i8(
; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
; AVX-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]])
; AVX-NEXT:    store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
; AVX-NEXT:    ret void
;
; XOP-LABEL: @bitreverse_16i8(
; XOP-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
; XOP-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]])
; XOP-NEXT:    store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
; XOP-NEXT:    ret void
;
  %ld0  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  0), align 1
  %ld1  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  1), align 1
  %ld2  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  2), align 1
  %ld3  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  3), align 1
  %ld4  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  4), align 1
  %ld5  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  5), align 1
  %ld6  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  6), align 1
  %ld7  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  7), align 1
  %ld8  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  8), align 1
  %ld9  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  9), align 1
  %ld10 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
  %ld11 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
  %ld12 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
  %ld13 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
  %ld14 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
  %ld15 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
  %bitreverse0  = call i8 @llvm.bitreverse.i8(i8 %ld0)
  %bitreverse1  = call i8 @llvm.bitreverse.i8(i8 %ld1)
  %bitreverse2  = call i8 @llvm.bitreverse.i8(i8 %ld2)
  %bitreverse3  = call i8 @llvm.bitreverse.i8(i8 %ld3)
  %bitreverse4  = call i8 @llvm.bitreverse.i8(i8 %ld4)
  %bitreverse5  = call i8 @llvm.bitreverse.i8(i8 %ld5)
  %bitreverse6  = call i8 @llvm.bitreverse.i8(i8 %ld6)
  %bitreverse7  = call i8 @llvm.bitreverse.i8(i8 %ld7)
  %bitreverse8  = call i8 @llvm.bitreverse.i8(i8 %ld8)
  %bitreverse9  = call i8 @llvm.bitreverse.i8(i8 %ld9)
  %bitreverse10 = call i8 @llvm.bitreverse.i8(i8 %ld10)
  %bitreverse11 = call i8 @llvm.bitreverse.i8(i8 %ld11)
  %bitreverse12 = call i8 @llvm.bitreverse.i8(i8 %ld12)
  %bitreverse13 = call i8 @llvm.bitreverse.i8(i8 %ld13)
  %bitreverse14 = call i8 @llvm.bitreverse.i8(i8 %ld14)
  %bitreverse15 = call i8 @llvm.bitreverse.i8(i8 %ld15)
  store i8 %bitreverse0 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  0), align 1
  store i8 %bitreverse1 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  1), align 1
  store i8 %bitreverse2 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  2), align 1
  store i8 %bitreverse3 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  3), align 1
  store i8 %bitreverse4 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  4), align 1
  store i8 %bitreverse5 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  5), align 1
  store i8 %bitreverse6 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  6), align 1
  store i8 %bitreverse7 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  7), align 1
  store i8 %bitreverse8 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  8), align 1
  store i8 %bitreverse9 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  9), align 1
  store i8 %bitreverse10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
  store i8 %bitreverse11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
  store i8 %bitreverse12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
  store i8 %bitreverse13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
  store i8 %bitreverse14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
  store i8 %bitreverse15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
  ret void
}

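; Thirty-two i8 elements: AVX and XOP split this into two
; @llvm.bitreverse.v16i8 halves; SSE keeps scalar calls.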
    523 define void @bitreverse_32i8() #0 {
    524 ; SSE-LABEL: @bitreverse_32i8(
    525 ; SSE-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
    526 ; SSE-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
    527 ; SSE-NEXT:    [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
    528 ; SSE-NEXT:    [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
    529 ; SSE-NEXT:    [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
    530 ; SSE-NEXT:    [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
    531 ; SSE-NEXT:    [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1
    532 ; SSE-NEXT:    [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1
    533 ; SSE-NEXT:    [[LD8:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1
    534 ; SSE-NEXT:    [[LD9:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1
    535 ; SSE-NEXT:    [[LD10:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
    536 ; SSE-NEXT:    [[LD11:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
    537 ; SSE-NEXT:    [[LD12:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
    538 ; SSE-NEXT:    [[LD13:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
    539 ; SSE-NEXT:    [[LD14:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
    540 ; SSE-NEXT:    [[LD15:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
    541 ; SSE-NEXT:    [[LD16:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16), align 1
    542 ; SSE-NEXT:    [[LD17:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 17), align 1
    543 ; SSE-NEXT:    [[LD18:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 18), align 1
    544 ; SSE-NEXT:    [[LD19:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 19), align 1
    545 ; SSE-NEXT:    [[LD20:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 20), align 1
    546 ; SSE-NEXT:    [[LD21:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 21), align 1
    547 ; SSE-NEXT:    [[LD22:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 22), align 1
    548 ; SSE-NEXT:    [[LD23:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 23), align 1
    549 ; SSE-NEXT:    [[LD24:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 24), align 1
    550 ; SSE-NEXT:    [[LD25:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 25), align 1
    551 ; SSE-NEXT:    [[LD26:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 26), align 1
    552 ; SSE-NEXT:    [[LD27:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 27), align 1
    553 ; SSE-NEXT:    [[LD28:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 28), align 1
    554 ; SSE-NEXT:    [[LD29:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 29), align 1
    555 ; SSE-NEXT:    [[LD30:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 30), align 1
    556 ; SSE-NEXT:    [[LD31:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 31), align 1
    557 ; SSE-NEXT:    [[BITREVERSE0:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD0]])
    558 ; SSE-NEXT:    [[BITREVERSE1:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD1]])
    559 ; SSE-NEXT:    [[BITREVERSE2:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD2]])
    560 ; SSE-NEXT:    [[BITREVERSE3:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD3]])
    561 ; SSE-NEXT:    [[BITREVERSE4:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD4]])
    562 ; SSE-NEXT:    [[BITREVERSE5:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD5]])
    563 ; SSE-NEXT:    [[BITREVERSE6:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD6]])
    564 ; SSE-NEXT:    [[BITREVERSE7:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD7]])
    565 ; SSE-NEXT:    [[BITREVERSE8:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD8]])
    566 ; SSE-NEXT:    [[BITREVERSE9:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD9]])
    567 ; SSE-NEXT:    [[BITREVERSE10:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD10]])
    568 ; SSE-NEXT:    [[BITREVERSE11:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD11]])
    569 ; SSE-NEXT:    [[BITREVERSE12:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD12]])
    570 ; SSE-NEXT:    [[BITREVERSE13:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD13]])
    571 ; SSE-NEXT:    [[BITREVERSE14:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD14]])
    572 ; SSE-NEXT:    [[BITREVERSE15:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD15]])
    573 ; SSE-NEXT:    [[BITREVERSE16:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD16]])
    574 ; SSE-NEXT:    [[BITREVERSE17:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD17]])
    575 ; SSE-NEXT:    [[BITREVERSE18:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD18]])
; SSE-NEXT:    [[BITREVERSE19:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD19]])
; SSE-NEXT:    [[BITREVERSE20:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD20]])
; SSE-NEXT:    [[BITREVERSE21:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD21]])
; SSE-NEXT:    [[BITREVERSE22:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD22]])
; SSE-NEXT:    [[BITREVERSE23:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD23]])
; SSE-NEXT:    [[BITREVERSE24:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD24]])
; SSE-NEXT:    [[BITREVERSE25:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD25]])
; SSE-NEXT:    [[BITREVERSE26:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD26]])
; SSE-NEXT:    [[BITREVERSE27:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD27]])
; SSE-NEXT:    [[BITREVERSE28:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD28]])
; SSE-NEXT:    [[BITREVERSE29:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD29]])
; SSE-NEXT:    [[BITREVERSE30:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD30]])
; SSE-NEXT:    [[BITREVERSE31:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD31]])
; SSE-NEXT:    store i8 [[BITREVERSE0]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1
; SSE-NEXT:    store i8 [[BITREVERSE1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1
; SSE-NEXT:    store i8 [[BITREVERSE2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1
; SSE-NEXT:    store i8 [[BITREVERSE3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1
; SSE-NEXT:    store i8 [[BITREVERSE4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1
; SSE-NEXT:    store i8 [[BITREVERSE5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1
; SSE-NEXT:    store i8 [[BITREVERSE6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1
; SSE-NEXT:    store i8 [[BITREVERSE7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1
; SSE-NEXT:    store i8 [[BITREVERSE8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1
; SSE-NEXT:    store i8 [[BITREVERSE9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1
; SSE-NEXT:    store i8 [[BITREVERSE10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
; SSE-NEXT:    store i8 [[BITREVERSE11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
; SSE-NEXT:    store i8 [[BITREVERSE12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
; SSE-NEXT:    store i8 [[BITREVERSE13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
; SSE-NEXT:    store i8 [[BITREVERSE14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
; SSE-NEXT:    store i8 [[BITREVERSE15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
; SSE-NEXT:    store i8 [[BITREVERSE16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16), align 1
; SSE-NEXT:    store i8 [[BITREVERSE17]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 17), align 1
; SSE-NEXT:    store i8 [[BITREVERSE18]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 18), align 1
; SSE-NEXT:    store i8 [[BITREVERSE19]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 19), align 1
; SSE-NEXT:    store i8 [[BITREVERSE20]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 20), align 1
; SSE-NEXT:    store i8 [[BITREVERSE21]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 21), align 1
; SSE-NEXT:    store i8 [[BITREVERSE22]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 22), align 1
; SSE-NEXT:    store i8 [[BITREVERSE23]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 23), align 1
; SSE-NEXT:    store i8 [[BITREVERSE24]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 24), align 1
; SSE-NEXT:    store i8 [[BITREVERSE25]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 25), align 1
; SSE-NEXT:    store i8 [[BITREVERSE26]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 26), align 1
; SSE-NEXT:    store i8 [[BITREVERSE27]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 27), align 1
; SSE-NEXT:    store i8 [[BITREVERSE28]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 28), align 1
; SSE-NEXT:    store i8 [[BITREVERSE29]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 29), align 1
; SSE-NEXT:    store i8 [[BITREVERSE30]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 30), align 1
; SSE-NEXT:    store i8 [[BITREVERSE31]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 31), align 1
; SSE-NEXT:    ret void
;
; AVX-LABEL: @bitreverse_32i8(
; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
; AVX-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]])
; AVX-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP2]])
; AVX-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
; AVX-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
; AVX-NEXT:    ret void
;
; XOP-LABEL: @bitreverse_32i8(
; XOP-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
; XOP-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
; XOP-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]])
; XOP-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP2]])
; XOP-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
; XOP-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
; XOP-NEXT:    ret void
;
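; The scalar reference sequence below loads 32 bytes from @src8, reverses
; each with @llvm.bitreverse.i8, and stores the results to @dst8; this is
; the pattern the SLP vectorizer is expected to recognize and widen where
; the target cost model says it is profitable.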
  %ld0  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  0), align 1
  %ld1  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  1), align 1
  %ld2  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  2), align 1
  %ld3  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  3), align 1
  %ld4  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  4), align 1
  %ld5  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  5), align 1
  %ld6  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  6), align 1
  %ld7  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  7), align 1
  %ld8  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  8), align 1
  %ld9  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  9), align 1
  %ld10 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
  %ld11 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
  %ld12 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
  %ld13 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
  %ld14 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
  %ld15 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
  %ld16 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16), align 1
  %ld17 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 17), align 1
  %ld18 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 18), align 1
  %ld19 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 19), align 1
  %ld20 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 20), align 1
  %ld21 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 21), align 1
  %ld22 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 22), align 1
  %ld23 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 23), align 1
  %ld24 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 24), align 1
  %ld25 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 25), align 1
  %ld26 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 26), align 1
  %ld27 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 27), align 1
  %ld28 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 28), align 1
  %ld29 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 29), align 1
  %ld30 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 30), align 1
  %ld31 = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 31), align 1
  %bitreverse0  = call i8 @llvm.bitreverse.i8(i8 %ld0)
  %bitreverse1  = call i8 @llvm.bitreverse.i8(i8 %ld1)
  %bitreverse2  = call i8 @llvm.bitreverse.i8(i8 %ld2)
  %bitreverse3  = call i8 @llvm.bitreverse.i8(i8 %ld3)
  %bitreverse4  = call i8 @llvm.bitreverse.i8(i8 %ld4)
  %bitreverse5  = call i8 @llvm.bitreverse.i8(i8 %ld5)
  %bitreverse6  = call i8 @llvm.bitreverse.i8(i8 %ld6)
  %bitreverse7  = call i8 @llvm.bitreverse.i8(i8 %ld7)
  %bitreverse8  = call i8 @llvm.bitreverse.i8(i8 %ld8)
  %bitreverse9  = call i8 @llvm.bitreverse.i8(i8 %ld9)
  %bitreverse10 = call i8 @llvm.bitreverse.i8(i8 %ld10)
  %bitreverse11 = call i8 @llvm.bitreverse.i8(i8 %ld11)
  %bitreverse12 = call i8 @llvm.bitreverse.i8(i8 %ld12)
  %bitreverse13 = call i8 @llvm.bitreverse.i8(i8 %ld13)
  %bitreverse14 = call i8 @llvm.bitreverse.i8(i8 %ld14)
  %bitreverse15 = call i8 @llvm.bitreverse.i8(i8 %ld15)
  %bitreverse16 = call i8 @llvm.bitreverse.i8(i8 %ld16)
  %bitreverse17 = call i8 @llvm.bitreverse.i8(i8 %ld17)
  %bitreverse18 = call i8 @llvm.bitreverse.i8(i8 %ld18)
  %bitreverse19 = call i8 @llvm.bitreverse.i8(i8 %ld19)
  %bitreverse20 = call i8 @llvm.bitreverse.i8(i8 %ld20)
  %bitreverse21 = call i8 @llvm.bitreverse.i8(i8 %ld21)
  %bitreverse22 = call i8 @llvm.bitreverse.i8(i8 %ld22)
  %bitreverse23 = call i8 @llvm.bitreverse.i8(i8 %ld23)
  %bitreverse24 = call i8 @llvm.bitreverse.i8(i8 %ld24)
  %bitreverse25 = call i8 @llvm.bitreverse.i8(i8 %ld25)
  %bitreverse26 = call i8 @llvm.bitreverse.i8(i8 %ld26)
  %bitreverse27 = call i8 @llvm.bitreverse.i8(i8 %ld27)
  %bitreverse28 = call i8 @llvm.bitreverse.i8(i8 %ld28)
  %bitreverse29 = call i8 @llvm.bitreverse.i8(i8 %ld29)
  %bitreverse30 = call i8 @llvm.bitreverse.i8(i8 %ld30)
  %bitreverse31 = call i8 @llvm.bitreverse.i8(i8 %ld31)
  store i8 %bitreverse0 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  0), align 1
  store i8 %bitreverse1 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  1), align 1
  store i8 %bitreverse2 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  2), align 1
  store i8 %bitreverse3 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  3), align 1
  store i8 %bitreverse4 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  4), align 1
  store i8 %bitreverse5 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  5), align 1
  store i8 %bitreverse6 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  6), align 1
  store i8 %bitreverse7 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  7), align 1
  store i8 %bitreverse8 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  8), align 1
  store i8 %bitreverse9 , i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64  9), align 1
  store i8 %bitreverse10, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
  store i8 %bitreverse11, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
  store i8 %bitreverse12, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
  store i8 %bitreverse13, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
  store i8 %bitreverse14, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
  store i8 %bitreverse15, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
  store i8 %bitreverse16, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16), align 1
  store i8 %bitreverse17, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 17), align 1
  store i8 %bitreverse18, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 18), align 1
  store i8 %bitreverse19, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 19), align 1
  store i8 %bitreverse20, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 20), align 1
  store i8 %bitreverse21, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 21), align 1
  store i8 %bitreverse22, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 22), align 1
  store i8 %bitreverse23, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 23), align 1
  store i8 %bitreverse24, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 24), align 1
  store i8 %bitreverse25, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 25), align 1
  store i8 %bitreverse26, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 26), align 1
  store i8 %bitreverse27, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 27), align 1
  store i8 %bitreverse28, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 28), align 1
  store i8 %bitreverse29, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 29), align 1
  store i8 %bitreverse30, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 30), align 1
  store i8 %bitreverse31, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 31), align 1
  ret void
}
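; Summary of the expectations encoded above: the plain SSE run keeps all 32
; scalar @llvm.bitreverse.i8 calls, while the AVX and XOP run lines vectorize
; the whole sequence into two <16 x i8> @llvm.bitreverse.v16i8 operations over
; @src8/@dst8.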

attributes #0 = { nounwind }