; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr8 \
; RUN:   -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck -allow-deprecated-dag-overlap %s

; RUN: llc -verify-machineinstrs -O3 -mcpu=pwr8 -disable-ppc-vsx-swap-removal \
; RUN:   -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck -allow-deprecated-dag-overlap \
; RUN:   -check-prefix=NOOPTSWAP %s

; RUN: llc -O3 -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu \
; RUN:   -verify-machineinstrs -ppc-vsr-nums-as-vr < %s | FileCheck -allow-deprecated-dag-overlap \
; RUN:   -check-prefix=CHECK-P9 --implicit-check-not xxswapd %s

; RUN: llc -O3 -mcpu=pwr9 -disable-ppc-vsx-swap-removal -mattr=-power9-vector \
; RUN:   -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu < %s \
; RUN:   | FileCheck -allow-deprecated-dag-overlap -check-prefix=NOOPTSWAP %s

; LH: 2016-11-17
; Updated align attribute from 16 to 8 to keep swap instructions tests.
; Changes have been made on little-endian to use lvx and stvx
; instructions instead of lxvd2x/xxswapd and xxswapd/stxvd2x for
; aligned vectors with elements up to 4 bytes

; This test was generated from the following source:
;
; #define N 4096
; int ca[N] __attribute__((aligned(16)));
; int cb[N] __attribute__((aligned(16)));
; int cc[N] __attribute__((aligned(16)));
; int cd[N] __attribute__((aligned(16)));
;
; void foo ()
; {
;   int i;
;   for (i = 0; i < N; i++) {
;     ca[i] = (cb[i] + cc[i]) * cd[i];
;   }
; }

; NOTE: the globals are deliberately align 8 (not 16, as in the C source
; above) so that the little-endian lowering keeps using lxvd2x/xxswapd
; sequences, which is what the swap-removal checks below exercise.
@cb = common global [4096 x i32] zeroinitializer, align 8
@cc = common global [4096 x i32] zeroinitializer, align 8
@cd = common global [4096 x i32] zeroinitializer, align 8
@ca = common global [4096 x i32] zeroinitializer, align 8

; Vectorized ca[i] = (cb[i] + cc[i]) * cd[i] loop, manually unrolled by 4:
; each iteration of %vector.body processes four <4 x i32> groups (16 ints)
; and advances %index by 16 total, terminating at 4096.
define void @foo() {
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next.3, %vector.body ]
  ; --- unrolled group 0: elements [%index, %index+3] ---
  %0 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 8
  %2 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index
  %3 = bitcast i32* %2 to <4 x i32>*
  %wide.load13 = load <4 x i32>, <4 x i32>* %3, align 8
  %4 = add nsw <4 x i32> %wide.load13, %wide.load
  %5 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index
  %6 = bitcast i32* %5 to <4 x i32>*
  %wide.load14 = load <4 x i32>, <4 x i32>* %6, align 8
  %7 = mul nsw <4 x i32> %4, %wide.load14
  %8 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index
  %9 = bitcast i32* %8 to <4 x i32>*
  store <4 x i32> %7, <4 x i32>* %9, align 8
  %index.next = add nuw nsw i64 %index, 4
  ; --- unrolled group 1: elements [%index+4, %index+7] ---
  %10 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index.next
  %11 = bitcast i32* %10 to <4 x i32>*
  %wide.load.1 = load <4 x i32>, <4 x i32>* %11, align 8
  %12 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index.next
  %13 = bitcast i32* %12 to <4 x i32>*
  %wide.load13.1 = load <4 x i32>, <4 x i32>* %13, align 8
  %14 = add nsw <4 x i32> %wide.load13.1, %wide.load.1
  %15 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index.next
  %16 = bitcast i32* %15 to <4 x i32>*
  %wide.load14.1 = load <4 x i32>, <4 x i32>* %16, align 8
  %17 = mul nsw <4 x i32> %14, %wide.load14.1
  %18 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index.next
  %19 = bitcast i32* %18 to <4 x i32>*
  store <4 x i32> %17, <4 x i32>* %19, align 8
  %index.next.1 = add nuw nsw i64 %index.next, 4
  ; --- unrolled group 2: elements [%index+8, %index+11] ---
  %20 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index.next.1
  %21 = bitcast i32* %20 to <4 x i32>*
  %wide.load.2 = load <4 x i32>, <4 x i32>* %21, align 8
  %22 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index.next.1
  %23 = bitcast i32* %22 to <4 x i32>*
  %wide.load13.2 = load <4 x i32>, <4 x i32>* %23, align 8
  %24 = add nsw <4 x i32> %wide.load13.2, %wide.load.2
  %25 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index.next.1
  %26 = bitcast i32* %25 to <4 x i32>*
  %wide.load14.2 = load <4 x i32>, <4 x i32>* %26, align 8
  %27 = mul nsw <4 x i32> %24, %wide.load14.2
  %28 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index.next.1
  %29 = bitcast i32* %28 to <4 x i32>*
  store <4 x i32> %27, <4 x i32>* %29, align 8
  %index.next.2 = add nuw nsw i64 %index.next.1, 4
  ; --- unrolled group 3: elements [%index+12, %index+15] ---
  %30 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index.next.2
  %31 = bitcast i32* %30 to <4 x i32>*
  %wide.load.3 = load <4 x i32>, <4 x i32>* %31, align 8
  %32 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index.next.2
  %33 = bitcast i32* %32 to <4 x i32>*
  %wide.load13.3 = load <4 x i32>, <4 x i32>* %33, align 8
  %34 = add nsw <4 x i32> %wide.load13.3, %wide.load.3
  %35 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index.next.2
  %36 = bitcast i32* %35 to <4 x i32>*
  %wide.load14.3 = load <4 x i32>, <4 x i32>* %36, align 8
  %37 = mul nsw <4 x i32> %34, %wide.load14.3
  %38 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index.next.2
  %39 = bitcast i32* %38 to <4 x i32>*
  store <4 x i32> %37, <4 x i32>* %39, align 8
  %index.next.3 = add nuw nsw i64 %index.next.2, 4
  %40 = icmp eq i64 %index.next.3, 4096
  br i1 %40, label %for.end, label %vector.body

for.end:
  ret void
}

; With swap removal enabled (default, pwr8): no xxswapd/xxpermdi should
; survive — each unrolled group is three loads, add, mul, one store.
; CHECK-LABEL: @foo
; CHECK-NOT: xxpermdi
; CHECK-NOT: xxswapd
; CHECK-P9-NOT: xxpermdi

; CHECK: lxvd2x
; CHECK: lxvd2x
; CHECK-DAG: lxvd2x
; CHECK-DAG: vadduwm
; CHECK: vmuluwm
; CHECK: stxvd2x

; CHECK: lxvd2x
; CHECK: lxvd2x
; CHECK-DAG: lxvd2x
; CHECK-DAG: vadduwm
; CHECK: vmuluwm
; CHECK: stxvd2x

; CHECK: lxvd2x
; CHECK: lxvd2x
; CHECK-DAG: lxvd2x
; CHECK-DAG: vadduwm
; CHECK: vmuluwm
; CHECK: stxvd2x

; CHECK: lxvd2x
; CHECK: lxvd2x
; CHECK-DAG: lxvd2x
; CHECK-DAG: vadduwm
; CHECK: vmuluwm
; CHECK: stxvd2x

; With -disable-ppc-vsx-swap-removal the xxswapd companions of the
; lxvd2x/stxvd2x pairs must still be present.
; NOOPTSWAP-LABEL: @foo

; NOOPTSWAP: lxvd2x
; NOOPTSWAP-DAG: lxvd2x
; NOOPTSWAP-DAG: lxvd2x
; NOOPTSWAP-DAG: xxswapd
; NOOPTSWAP-DAG: xxswapd
; NOOPTSWAP-DAG: xxswapd
; NOOPTSWAP-DAG: vadduwm
; NOOPTSWAP: vmuluwm
; NOOPTSWAP: xxswapd
; NOOPTSWAP-DAG: xxswapd
; NOOPTSWAP-DAG: xxswapd
; NOOPTSWAP-DAG: stxvd2x
; NOOPTSWAP-DAG: stxvd2x
; NOOPTSWAP: stxvd2x

; pwr9 uses the endian-neutral lxvx/stxvx forms, so no swaps are needed
; at all (enforced by --implicit-check-not xxswapd on the RUN line).
; CHECK-P9-LABEL: @foo
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: lxvx
; CHECK-P9-DAG: vadduwm
; CHECK-P9-DAG: vadduwm
; CHECK-P9-DAG: vadduwm
; CHECK-P9-DAG: vadduwm
; CHECK-P9-DAG: vmuluwm
; CHECK-P9-DAG: vmuluwm
; CHECK-P9-DAG: vmuluwm
; CHECK-P9-DAG: vmuluwm
; CHECK-P9-DAG: stxvx
; CHECK-P9-DAG: stxvx
; CHECK-P9-DAG: stxvx
; CHECK-P9-DAG: stxvx