In [1]:
using ISPC
Linker: /usr/bin/libtool
This is the function we will call for every pixel. Declare it @inline so that the @kernel below can use it.
In [2]:
@inline function mandel(c_re, c_im, count)
z_re = c_re
z_im = c_im
i = 0
while i < count
if (z_re * z_re + z_im * z_im > 4.0f0)
break
end
new_re = z_re*z_re - z_im*z_im
new_im = 2.0f0 * z_re * z_im
z_re = c_re + new_re
z_im = c_im + new_im
i += 1
end
return i
end
Out[2]:
mandel (generic function with 1 method)
This is the main function. Tag it with @ispc so that all kernel fragments inside are extracted and compiled separately by ISPC.
In [3]:
@ispc function mandelbrot_ispc(x0, y0, x1, y1, output, max_iters)
height, width = size(output)
dx = (x1 - x0) / width
dy = (y1 - y0) / height
@kernel(`--target=avx1-i32x8`) do
for i = 1:width
@foreach(1:height) do j
x = x0 + i * dx
y = y0 + j * dy
output[j,i] = mandel(x, y, max_iters)
end
end
end
output
end;
Extracted kernel Val{symbol("##8014")}((:x0,:y0,:output,:max_iters,:height,:width,:dx,:dy))
begin
GenSym(3) = (Main.colon)(1,width)
#s40 = (top(start))(GenSym(3))
unless (top(!))((top(done))(GenSym(3),#s40)) goto 1
2:
GenSym(4) = (top(next))(GenSym(3),#s40)
i = (top(getfield))(GenSym(4),1)
#s40 = (top(getfield))(GenSym(4),2)
$(Expr(:meta, :ispc, symbol("##foreach#8022"), :foreach, (:(1:height),)))
j = (ISPC.foreachindex)(1,(Main.colon)(1,height))
x = ((top(getfield))(Base.FastMath,:add_fast))(x0,((top(getfield))(Base.FastMath,:mul_fast))(i,dx))
y = ((top(getfield))(Base.FastMath,:add_fast))(y0,((top(getfield))(Base.FastMath,:mul_fast))(j,dy))
GenSym(5) = (Main.mandel)(x,y,max_iters)
(Main.setindex!)(output,GenSym(5),j,i)
$(Expr(:meta, :ispc, symbol("##foreach#8022")))
3:
unless (top(!))((top(!))((top(done))(GenSym(3),#s40))) goto 2
1:
0:
return
end
In [4]:
output = zeros(Float32, 768, 1024);
Check that kernel fragments in the main Julia function have been replaced by kernel calls:
In [5]:
@code_lowered mandelbrot_ispc(-2.0f0, -1.0f0, 1.0f0, 1.0f0, output, 256)
Out[5]:
1-element Array{Any,1}:
:($(Expr(:lambda, Any[:x0,:y0,:x1,:y1,:output,:max_iters], Any[Any[Any[:x0,:Any,1],Any[:y0,:Any,1],Any[:x1,:Any,0],Any[:y1,:Any,0],Any[:output,:Any,1],Any[:max_iters,:Any,1],Any[:height,:Any,19],Any[:width,:Any,19],Any[symbol("#s41"),:Any,2],Any[:dx,:Any,19],Any[:dy,:Any,19]],Any[],6,Any[]], :(begin # In[3], line 2:
NewvarNode(:height)
NewvarNode(:width)
NewvarNode(:dx)
NewvarNode(:dy)
GenSym(0) = (Main.size)(output)
#s41 = (top(start))(GenSym(0))
GenSym(1) = (top(indexed_next))(GenSym(0),1,#s41)
height = (top(getfield))(GenSym(1),1)
#s41 = (top(getfield))(GenSym(1),2)
GenSym(2) = (top(indexed_next))(GenSym(0),2,#s41)
width = (top(getfield))(GenSym(2),1)
#s41 = (top(getfield))(GenSym(2),2) # In[3], line 3:
dx = (x1 - x0) / width # In[3], line 4:
dy = (y1 - y0) / height # In[3], line 5: # /Users/plantagenet/.julia/v0.5/ISPC.jl/src/macros.jl, line 141:
((top(getfield))(ISPC,:kernel_call))(Val{symbol("##8014")},x0,y0,output,max_iters,height,width,dx,dy) # In[3], line 14:
return output
end))))
Calling the main function the first time will trigger the compilation of all its fragments:
In [6]:
mandelbrot_ispc(-2.0f0, -1.0f0, 1.0f0, 1.0f0, output, 256);
Generating kernel ##8014 for argument types (Float32,Float32,Array{Float32,2},Int64,Int64,Int64,Float32,Float32)
Compile options: `--target=avx1-i32x8`
Running type inference...
Lambda function:
arguments: Any[:x0,:y0,:output,:max_iters,:height,:width,:dx,:dy]
local variables:
(symbol("#s40"),Int64,2)
(:i,Int64,18)
(:j,Int64,18)
(:x,Float32,18)
(:y,Float32,18)
(:x0,Float32,0)
(:y0,Float32,0)
(:output,Array{Float32,2},0)
(:max_iters,Int64,0)
(:height,Int64,18)
(:width,Int64,18)
(:dx,Float32,18)
(:dy,Float32,18)
(symbol("##zs#8510"),Tuple{},0)
(symbol("##zs#8511"),Tuple{},0)
(symbol("##z_re#8512"),Float32,2)
(symbol("##z_im#8513"),Float32,2)
(symbol("##i#8514"),Int64,2)
(symbol("##new_re#8515"),Float32,18)
(symbol("##new_im#8516"),Float32,18)
(symbol("####xs#8509#8517"),Tuple{},0)
(symbol("##I#8518"),Tuple{},0)
closure variables:
SSA types: Any[Union{},Union{},Union{},UnitRange{Int64},Tuple{Int64,Int64},Int64,Int64,Int64]
parameters: Any[]
begin
GenSym(3) = $(Expr(:new, UnitRange{Int64}, 1, :(((top(getfield))(Base.Intrinsics,:select_value))((Base.sle_int)(1,width::Int64),width::Int64,(Base.sub_int)(1,1)))))
#s40 = (top(getfield))(GenSym(3),:start)
unless (Base.not_int)(#s40::Int64 === (Base.add_int)((top(getfield))(GenSym(3),:stop),1)) goto 1
2:
GenSym(6) = #s40::Int64
GenSym(7) = (Base.add_int)(#s40::Int64,1)
i = GenSym(6)
#s40 = GenSym(7)
$(Expr(:meta, :ispc, symbol("##foreach#8022"), :foreach, (:(1:height),)))
j = (ISPC.foreachindex)(1,$(Expr(:new, UnitRange{Int64}, 1, :(((top(getfield))(Base.Intrinsics,:select_value))((Base.sle_int)(1,height::Int64),height::Int64,(Base.sub_int)(1,1))))))
x = (Base.FastMath.box)(Float32,((top(getfield))(Base.FastMath.Base,:add_float_fast))(x0::Float32,(Base.FastMath.box)(Float32,((top(getfield))(Base.FastMath.Base,:mul_float_fast))((Base.sitofp)(Float32,i::Int64),dx::Float32))))
y = (Base.FastMath.box)(Float32,((top(getfield))(Base.FastMath.Base,:add_float_fast))(y0::Float32,(Base.FastMath.box)(Float32,((top(getfield))(Base.FastMath.Base,:mul_float_fast))((Base.sitofp)(Float32,j::Int64),dy::Float32))))
##z_re#8512 = x::Float32
##z_im#8513 = y::Float32
##i#8514 = 0
NewvarNode(symbol("##new_re#8515"))
NewvarNode(symbol("##new_im#8516"))
unless (Base.slt_int)(##i#8514::Int64,max_iters::Int64) goto 18
15:
unless (Base.lt_float)(4.0f0,(Base.add_float)((Base.mul_float)(##z_re#8512::Float32,##z_re#8512::Float32),(Base.mul_float)(##z_im#8513::Float32,##z_im#8513::Float32))) goto 16
goto 19
16:
##new_re#8515 = (Base.sub_float)((Base.mul_float)(##z_re#8512::Float32,##z_re#8512::Float32),(Base.mul_float)(##z_im#8513::Float32,##z_im#8513::Float32))
##new_im#8516 = (Base.mul_float)((Base.mul_float)(2.0f0,##z_re#8512::Float32),##z_im#8513::Float32)
##z_re#8512 = (Base.add_float)(x::Float32,##new_re#8515::Float32)
##z_im#8513 = (Base.add_float)(y::Float32,##new_im#8516::Float32)
##i#8514 = (Base.add_int)(##i#8514::Int64,1)
17:
unless (Base.not_int)((Base.slt_int)(##i#8514::Int64,max_iters::Int64)) goto 15
18:
19:
GenSym(5) = ##i#8514::Int64
(Base.arrayset)(output::Array{Float32,2},(Base.sitofp)(Float32,GenSym(5)),j::Int64,i::Int64)
$(Expr(:meta, :ispc, symbol("##foreach#8022")))
3:
unless (Base.not_int)((Base.not_int)(#s40::Int64 === (Base.add_int)((top(getfield))(GenSym(3),:stop),1))) goto 2
1:
0:
return
end
@generated function kernel_call{##8014}(::Type{Val{##8014}}, args...)
begin
$(Expr(:meta, :inline))
@inbounds begin
if UInt(ISPC.ispc_fptr[1]) == 0
ISPC.compile_all()
if UInt(ISPC.ispc_fptr[1]) == 0
error("Could not compile ISPC kernel $(id)")
end
end
begin
ccall(ISPC.ispc_fptr[1],Void,(Float32,Float32,Ref{Float32},Int64,Int64,Int64,Int64,Int64,Float32,Float32),args[1],args[2],args[3],size(args[3],1),size(args[3],2),args[4],args[5],args[6],args[7],args[8])
end
end
end
Compiling ISPC file...
// Use ISPC's multiple dispatch capabilities to deal with the fact
// that Julia uses the same function for bitwise and boolean NOT,
// whereas the ~ operator in ISPC does not work on booleans:
inline bool __not(bool val) {return !val;} // boolean NOT
inline int8 __not(int8 val) {return ~val;} // all others are bitwise
inline int16 __not(int16 val) {return ~val;}
inline int32 __not(int32 val) {return ~val;}
inline int64 __not(int64 val) {return ~val;}
inline unsigned int8 __not(unsigned int8 val) {return ~val;}
inline unsigned int16 __not(unsigned int16 val) {return ~val;}
inline unsigned int32 __not(unsigned int32 val) {return ~val;}
inline unsigned int64 __not(unsigned int64 val) {return ~val;}
struct UnitRange {
int64 start;
int64 stop;
};
export void ispc_func_1(uniform float x0, uniform float y0, uniform float output[], uniform int64 output__len__1, uniform int64 output__len__2, uniform int64 max_iters, uniform int64 height, uniform int64 width, uniform float dx, uniform float dy) {
uniform UnitRange _gensym3 = {1, ((1 <= width) ? width : (1 - 1))};
uniform int64 _s40 = _gensym3.start;
while(__not((_s40 == (_gensym3.stop + 1)))) {
uniform int64 _gensym6 = _s40;
uniform int64 _gensym7 = (_s40 + 1);
uniform int64 i = _gensym6;
_s40 = _gensym7;
foreach(j = 1 ... (height+1)) {
float x = (x0 + (((float)i) * dx));
float y = (y0 + (((float)j) * dy));
float __z_re_8512 = x;
float __z_im_8513 = y;
int64 __i_8514 = 0;
float __new_re_8515;
float __new_im_8516;
while((__i_8514 < max_iters)) {
if ((0x1p+2 < ((__z_re_8512 * __z_re_8512) + (__z_im_8513 * __z_im_8513)))) {
break;
} else {
__new_re_8515 = ((__z_re_8512 * __z_re_8512) - (__z_im_8513 * __z_im_8513));
__new_im_8516 = ((0x1p+1 * __z_re_8512) * __z_im_8513);
__z_re_8512 = (x + __new_re_8515);
__z_im_8513 = (y + __new_im_8516);
__i_8514 = (__i_8514 + 1);
}
}
int64 _gensym5 = __i_8514;
output[((j - 1) + (output__len__1 * (i - 1)))] = ((float)_gensym5);
}
}
return;
}
ld: warning: -macosx_version_min not specified, assuming 10.10
Loaded function ispc_func_1 at Ptr{Void} @0x000000030ec987c0
Now let's call it again to get an accurate measure of execution time:
In [15]:
x0, x1 = -2.1f0, 0.8f0
y0, y1 = -1.2f0, 1.2f0
@time out = mandelbrot_ispc(x0, y0, x1, y1, output, 256);
0.091577 seconds (15 allocations: 352 bytes)
Plot the generated fractal:
In [16]:
using PyPlot
In [17]:
imshow(out, cmap="flag", aspect="equal", extent=(x0, x1, y0, y1))
display(gcf())
close()
Wow, nice!
Pure-Julia version for comparison. Note how it uses the same mandel() routine as the ISPC version. We can't use @simd here because of the branch statements.
In [26]:
@fastmath function mandelbrot_julia(x0, y0, x1, y1, output, max_iters)
height, width = size(output)
dx = (x1 - x0) / width
dy = (y1 - y0) / height
@inbounds begin
for i = 1:width
for j = 1:height
x = x0 + i * dx
y = y0 + j * dy
output[j, i] = mandel(x, y, max_iters)
end
end
end
output
end
Out[26]:
mandelbrot_julia (generic function with 1 method)
How fast does it run?
In [28]:
@time out = mandelbrot_julia(x0, y0, x1, y1, output, 256);
0.233861 seconds (4 allocations: 160 bytes)
Not bad, only about 2.5x to 3x slower. But we got a decent speedup from ISPC!
Let's look at the generated C code, x86 assembly and LLVM assembly.
In [22]:
func = ISPC.ispc_funcs[1];
In [23]:
func_code = ISPC.gen_code(func)
println(func_code)
// Use ISPC's multiple dispatch capabilities to deal with the fact
// that Julia uses the same function for bitwise and boolean NOT,
// whereas the ~ operator in ISPC does not work on booleans:
inline bool __not(bool val) {return !val;} // boolean NOT
inline int8 __not(int8 val) {return ~val;} // all others are bitwise
inline int16 __not(int16 val) {return ~val;}
inline int32 __not(int32 val) {return ~val;}
inline int64 __not(int64 val) {return ~val;}
inline unsigned int8 __not(unsigned int8 val) {return ~val;}
inline unsigned int16 __not(unsigned int16 val) {return ~val;}
inline unsigned int32 __not(unsigned int32 val) {return ~val;}
inline unsigned int64 __not(unsigned int64 val) {return ~val;}
struct UnitRange {
int64 start;
int64 stop;
};
export void ispc_func_1(uniform float x0, uniform float y0, uniform float output[], uniform int64 output__len__1, uniform int64 output__len__2, uniform int64 max_iters, uniform int64 height, uniform int64 width, uniform float dx, uniform float dy) {
uniform UnitRange _gensym3 = {1, ((1 <= width) ? width : (1 - 1))};
uniform int64 _s40 = _gensym3.start;
while(__not((_s40 == (_gensym3.stop + 1)))) {
uniform int64 _gensym6 = _s40;
uniform int64 _gensym7 = (_s40 + 1);
uniform int64 i = _gensym6;
_s40 = _gensym7;
foreach(j = 1 ... (height+1)) {
float x = (x0 + (((float)i) * dx));
float y = (y0 + (((float)j) * dy));
float __z_re_8512 = x;
float __z_im_8513 = y;
int64 __i_8514 = 0;
float __new_re_8515;
float __new_im_8516;
while((__i_8514 < max_iters)) {
if ((0x1p+2 < ((__z_re_8512 * __z_re_8512) + (__z_im_8513 * __z_im_8513)))) {
break;
} else {
__new_re_8515 = ((__z_re_8512 * __z_re_8512) - (__z_im_8513 * __z_im_8513));
__new_im_8516 = ((0x1p+1 * __z_re_8512) * __z_im_8513);
__z_re_8512 = (x + __new_re_8515);
__z_im_8513 = (y + __new_im_8516);
__i_8514 = (__i_8514 + 1);
}
}
int64 _gensym5 = __i_8514;
output[((j - 1) + (output__len__1 * (i - 1)))] = ((float)_gensym5);
}
}
return;
}
In [24]:
ISPC.ispc_native(func_code, func.file.compile_opts)
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 13, 4
.section __TEXT,__literal16,16byte_literals
.align 4
LCPI0_0:
.long 0 ## 0x0
.long 1 ## 0x1
.long 2 ## 0x2
.long 3 ## 0x3
LCPI0_1:
.long 4 ## 0x4
.long 5 ## 0x5
.long 6 ## 0x6
.long 7 ## 0x7
LCPI0_2:
.byte 0 ## 0x0
.byte 1 ## 0x1
.byte 4 ## 0x4
.byte 5 ## 0x5
.byte 8 ## 0x8
.byte 9 ## 0x9
.byte 12 ## 0xc
.byte 13 ## 0xd
.byte 8 ## 0x8
.byte 9 ## 0x9
.byte 12 ## 0xc
.byte 13 ## 0xd
.byte 12 ## 0xc
.byte 13 ## 0xd
.byte 14 ## 0xe
.byte 15 ## 0xf
LCPI0_4:
.quad 1 ## 0x1
.quad 1 ## 0x1
.section __TEXT,__const
.align 5
LCPI0_3:
.long 1082130432 ## float 4.000000e+00
.long 1082130432 ## float 4.000000e+00
.long 1082130432 ## float 4.000000e+00
.long 1082130432 ## float 4.000000e+00
.long 1082130432 ## float 4.000000e+00
.long 1082130432 ## float 4.000000e+00
.long 1082130432 ## float 4.000000e+00
.long 1082130432 ## float 4.000000e+00
LCPI0_5:
.space 32
.section __TEXT,__text,regular,pure_instructions
.globl _ispc_func_1___unfunfun_3C_unf_3E_unIunIunIunIunIunfunf
.align 4, 0x90
_ispc_func_1___unfunfun_3C_unf_3E_unIunIunIunIunIunfunf: ## @ispc_func_1___unfunfun_3C_unf_3E_unIunIunIunIunIunfunf
## BB#0: ## %allocas
pushq %rbp
pushq %r15
pushq %r14
pushq %rbx
subq $408, %rsp ## imm = 0x198
vmovups %ymm4, (%rsp) ## 32-byte Spill
vmovss %xmm2, 60(%rsp) ## 4-byte Spill
vmovss %xmm0, 56(%rsp) ## 4-byte Spill
vmovmskps %ymm4, %edx
leaq 1(%r9), %r10
testq %r9, %r9
movl $1, %r14d
cmovleq %r14, %r10
leal 1(%r8), %r11d
movl %r8d, %eax
sarl $31, %eax
shrl $29, %eax
addl %r8d, %eax
andl $-8, %eax
movl %r8d, %ebx
subl %eax, %ebx
negl %ebx
leal 1(%r8,%rbx), %eax
vpermilps $0, %xmm1, %xmm0 ## xmm0 = xmm1[0,0,0,0]
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vmovups %ymm0, 160(%rsp) ## 32-byte Spill
vpermilps $0, %xmm3, %xmm0 ## xmm0 = xmm3[0,0,0,0]
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vmovups %ymm0, 128(%rsp) ## 32-byte Spill
vmovq %rcx, %xmm1
vpcmpeqd %xmm0, %xmm0, %xmm0
vunpcklpd %xmm1, %xmm1, %xmm1 ## xmm1 = xmm1[0,0]
vinsertf128 $1, %xmm1, %ymm1, %ymm5
cmpl $255, %edx
jne LBB0_1
## BB#5: ## %for_test.outer.preheader
xorl %ecx, %ecx
testq %r9, %r9
cmovsq %rcx, %r9
vinsertf128 $1, %xmm0, %ymm0, %ymm1
movl $1, %r14d
movl $-1, %r8d
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vmovups %ymm0, 96(%rsp) ## 32-byte Spill
vmovdqa LCPI0_2(%rip), %xmm4 ## xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
## implicit-def: YMM0
vmovups %ymm0, (%rsp) ## 32-byte Spill
## implicit-def: YMM0
vmovups %ymm0, -32(%rsp) ## 32-byte Spill
## implicit-def: YMM0
vmovups %ymm0, 288(%rsp) ## 32-byte Spill
## implicit-def: YMM0
vmovups %ymm0, 256(%rsp) ## 32-byte Spill
jmp LBB0_6
LBB0_1:
vinsertf128 $1, %xmm0, %ymm0, %ymm1
movl $-1, %r8d
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vmovups %ymm0, 96(%rsp) ## 32-byte Spill
vmovdqa LCPI0_2(%rip), %xmm8 ## xmm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
## implicit-def: YMM0
vmovups %ymm0, -32(%rsp) ## 32-byte Spill
## implicit-def: YMM0
vmovups %ymm0, -64(%rsp) ## 32-byte Spill
## implicit-def: YMM0
vmovups %ymm0, 288(%rsp) ## 32-byte Spill
## implicit-def: YMM0
vmovups %ymm0, 256(%rsp) ## 32-byte Spill
jmp LBB0_2
LBB0_53: ## %for_exit519
## in Loop: Header=BB0_2 Depth=1
decl %r9d
movl %esi, %ecx
imull %r9d, %ecx
vpextrq $1, %xmm2, %rbp
vcvtsi2ssq %rbp, %xmm0, %xmm1
vmovq %xmm2, %rbp
vcvtsi2ssq %rbp, %xmm0, %xmm2
vinsertps $16, %xmm1, %xmm2, %xmm1 ## xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
vmovq %xmm3, %rbp
vcvtsi2ssq %rbp, %xmm0, %xmm2
vinsertps $32, %xmm2, %xmm1, %xmm1 ## xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
vpextrq $1, %xmm3, %rbp
vcvtsi2ssq %rbp, %xmm0, %xmm2
vinsertps $48, %xmm2, %xmm1, %xmm1 ## xmm1 = xmm1[0,1,2],xmm2[0]
vpextrq $1, %xmm7, %rbp
vcvtsi2ssq %rbp, %xmm0, %xmm2
vmovq %xmm7, %rbp
vcvtsi2ssq %rbp, %xmm0, %xmm3
vinsertps $16, %xmm2, %xmm3, %xmm2 ## xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
vmovq %xmm0, %rbp
vcvtsi2ssq %rbp, %xmm0, %xmm3
vinsertps $32, %xmm3, %xmm2, %xmm2 ## xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
vpextrq $1, %xmm0, %rbp
vcvtsi2ssq %rbp, %xmm0, %xmm0
vinsertps $48, %xmm0, %xmm2, %xmm0 ## xmm0 = xmm2[0,1,2],xmm0[0]
vinsertf128 $1, %xmm1, %ymm0, %ymm0
addl %ecx, %edx
leal -4(,%rdx,4), %ecx
movslq %ecx, %rcx
vmovups -96(%rsp), %ymm1 ## 32-byte Reload
vmaskmovps %ymm0, %ymm1, (%rdi,%rcx)
vmovups (%rsp), %ymm4 ## 32-byte Reload
vmovups 64(%rsp), %ymm1 ## 32-byte Reload
.align 4, 0x90
LBB0_2: ## %for_test288
## =>This Loop Header: Depth=1
## Child Loop BB0_36 Depth 2
## Child Loop BB0_52 Depth 3
## Child Loop BB0_38 Depth 4
## Child Loop BB0_45 Depth 2
## Child Loop BB0_46 Depth 3
movq %r14, %r9
cmpq %r10, %r9
movl $0, %ecx
cmovel %r8d, %ecx
vmovd %ecx, %xmm0
vpermilps $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vandnps %ymm1, %ymm0, %ymm1
vandps %ymm4, %ymm1, %ymm0
vmovmskps %ymm0, %ecx
testl %ecx, %ecx
je LBB0_9
## BB#3: ## %for_loop290
## in Loop: Header=BB0_2 Depth=1
movl $1, %edx
cmpl $2, %eax
jl LBB0_4
## BB#35: ## %foreach_full_body317.lr.ph
## in Loop: Header=BB0_2 Depth=1
vmovups %ymm1, 64(%rsp) ## 32-byte Spill
vxorps %xmm0, %xmm0, %xmm0
vcvtsi2ssq %r9, %xmm0, %xmm0
vmulss 60(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload
vaddss 56(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload
vpermilps $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vmovups %ymm0, 224(%rsp) ## 32-byte Spill
leaq -1(%r9), %rcx
imulq %rsi, %rcx
movl $1, %edx
.align 4, 0x90
LBB0_36: ## %foreach_full_body317
## Parent Loop BB0_2 Depth=1
## => This Loop Header: Depth=2
## Child Loop BB0_52 Depth 3
## Child Loop BB0_38 Depth 4
vmovd %edx, %xmm0
vpermilps $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
vpaddd LCPI0_0(%rip), %xmm0, %xmm1
vpaddd LCPI0_1(%rip), %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm1, %ymm0
vcvtdq2ps %ymm0, %ymm0
vmulps 128(%rsp), %ymm0, %ymm0 ## 32-byte Folded Reload
vaddps 160(%rsp), %ymm0, %ymm0 ## 32-byte Folded Reload
vmovups %ymm0, 192(%rsp) ## 32-byte Spill
vxorps %xmm2, %xmm2, %xmm2
vxorps %xmm4, %xmm4, %xmm4
vmovaps %ymm0, %ymm10
vmovups 224(%rsp), %ymm12 ## 32-byte Reload
vmovups 96(%rsp), %ymm15 ## 32-byte Reload
jmp LBB0_52
.align 4, 0x90
LBB0_51: ## %if_done400
## in Loop: Header=BB0_52 Depth=3
vandnps %ymm3, %ymm11, %ymm15
LBB0_52: ## %for_test378.outer
## Parent Loop BB0_2 Depth=1
## Parent Loop BB0_36 Depth=2
## => This Loop Header: Depth=3
## Child Loop BB0_38 Depth 4
vmovups %ymm12, 320(%rsp) ## 32-byte Spill
vmovups %ymm10, 352(%rsp) ## 32-byte Spill
vmulps %ymm12, %ymm12, %ymm6
vmulps %ymm10, %ymm10, %ymm14
vaddps %ymm6, %ymm14, %ymm13
.align 4, 0x90
LBB0_38: ## %for_test378
## Parent Loop BB0_2 Depth=1
## Parent Loop BB0_36 Depth=2
## Parent Loop BB0_52 Depth=3
## => This Inner Loop Header: Depth=4
vextractf128 $1, %ymm2, %xmm1
vextractf128 $1, %ymm5, %xmm3
vpcmpgtq %xmm1, %xmm3, %xmm0
vpcmpgtq %xmm2, %xmm5, %xmm7
vshufps $-120, %xmm0, %xmm7, %xmm0 ## xmm0 = xmm7[0,2],xmm0[0,2]
vpshufb %xmm8, %xmm0, %xmm0
vextractf128 $1, %ymm4, %xmm9
vpcmpgtq %xmm9, %xmm3, %xmm3
vpcmpgtq %xmm4, %xmm5, %xmm7
vshufps $-120, %xmm3, %xmm7, %xmm3 ## xmm3 = xmm7[0,2],xmm3[0,2]
vpshufb %xmm8, %xmm3, %xmm3
vpunpcklqdq %xmm3, %xmm0, %xmm0 ## xmm0 = xmm0[0],xmm3[0]
vpmovzxwd %xmm0, %xmm3
vpslld $31, %xmm3, %xmm3
vpsrad $31, %xmm3, %xmm3
vpunpckhwd %xmm0, %xmm0, %xmm0 ## xmm0 = xmm0[4,4,5,5,6,6,7,7]
vpslld $31, %xmm0, %xmm0
vpsrad $31, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm3, %ymm0
vandps %ymm15, %ymm0, %ymm3
vmovmskps %ymm3, %ebp
testl %ebp, %ebp
je LBB0_42
## BB#39: ## %for_loop380
## in Loop: Header=BB0_38 Depth=4
vcmpnleps LCPI0_3(%rip), %ymm13, %ymm10
vandps %ymm3, %ymm10, %ymm12
vmovmskps %ymm12, %ebx
vxorps %xmm11, %xmm11, %xmm11
testl %ebx, %ebx
je LBB0_40
## BB#37: ## %safe_if_run_true402
## in Loop: Header=BB0_38 Depth=4
vxorps %xmm15, %xmm15, %xmm15
vmovaps %ymm12, %ymm11
cmpl %ebp, %ebx
je LBB0_38
LBB0_40: ## %safe_if_after_true401
## in Loop: Header=BB0_52 Depth=3
vblendvps %ymm10, LCPI0_5(%rip), %ymm3, %ymm13
vmovmskps %ymm13, %ebx
testl %ebx, %ebx
je LBB0_41
## BB#50: ## %safe_if_run_false421
## in Loop: Header=BB0_52 Depth=3
vsubps %ymm14, %ymm6, %ymm0
vmovups 256(%rsp), %ymm7 ## 32-byte Reload
vblendvps %ymm13, %ymm0, %ymm7, %ymm7
vmovups %ymm7, 256(%rsp) ## 32-byte Spill
vmovups 320(%rsp), %ymm12 ## 32-byte Reload
vaddps %ymm12, %ymm12, %ymm0
vmovups 352(%rsp), %ymm10 ## 32-byte Reload
vmulps %ymm0, %ymm10, %ymm0
vmovups 288(%rsp), %ymm6 ## 32-byte Reload
vblendvps %ymm13, %ymm0, %ymm6, %ymm6
vmovups %ymm6, 288(%rsp) ## 32-byte Spill
vaddps 224(%rsp), %ymm7, %ymm0 ## 32-byte Folded Reload
vblendvps %ymm13, %ymm0, %ymm12, %ymm12
vaddps 192(%rsp), %ymm6, %ymm0 ## 32-byte Folded Reload
vblendvps %ymm13, %ymm0, %ymm10, %ymm10
vmovdqa LCPI0_4(%rip), %xmm0 ## xmm0 = [1,1]
vmovdqa %xmm0, %xmm7
vpaddq %xmm7, %xmm9, %xmm0
vpaddq %xmm7, %xmm4, %xmm6
vinsertf128 $1, %xmm0, %ymm6, %ymm0
vpaddq %xmm7, %xmm1, %xmm1
vpaddq %xmm7, %xmm2, %xmm6
vinsertf128 $1, %xmm1, %ymm6, %ymm1
vunpcklps %xmm13, %xmm13, %xmm6 ## xmm6 = xmm13[0,0,1,1]
vunpckhps %xmm13, %xmm13, %xmm7 ## xmm7 = xmm13[2,2,3,3]
vinsertf128 $1, %xmm7, %ymm6, %ymm6
vextractf128 $1, %ymm13, %xmm7
vblendvps %ymm6, %ymm1, %ymm2, %ymm2
vunpcklps %xmm7, %xmm7, %xmm1 ## xmm1 = xmm7[0,0,1,1]
vunpckhps %xmm7, %xmm7, %xmm6 ## xmm6 = xmm7[2,2,3,3]
vinsertf128 $1, %xmm6, %ymm1, %ymm1
vblendvps %ymm1, %ymm0, %ymm4, %ymm4
jmp LBB0_51
.align 4, 0x90
LBB0_41: ## in Loop: Header=BB0_52 Depth=3
vmovups 352(%rsp), %ymm10 ## 32-byte Reload
vmovups 320(%rsp), %ymm12 ## 32-byte Reload
jmp LBB0_51
.align 4, 0x90
LBB0_42: ## %for_exit381
## in Loop: Header=BB0_36 Depth=2
vpextrq $1, %xmm4, %rbx
vxorps %xmm0, %xmm0, %xmm0
vcvtsi2ssq %rbx, %xmm0, %xmm0
vmovq %xmm4, %rbx
vcvtsi2ssq %rbx, %xmm0, %xmm3
vinsertps $16, %xmm0, %xmm3, %xmm0 ## xmm0 = xmm3[0],xmm0[0],xmm3[2,3]
vmovq %xmm9, %rbx
vcvtsi2ssq %rbx, %xmm0, %xmm3
vinsertps $32, %xmm3, %xmm0, %xmm0 ## xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
vpextrq $1, %xmm9, %rbx
vcvtsi2ssq %rbx, %xmm0, %xmm3
vinsertps $48, %xmm3, %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],xmm3[0]
vpextrq $1, %xmm2, %rbx
vcvtsi2ssq %rbx, %xmm0, %xmm3
vmovq %xmm2, %rbx
vcvtsi2ssq %rbx, %xmm0, %xmm2
vinsertps $16, %xmm3, %xmm2, %xmm2 ## xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
vmovq %xmm1, %rbx
vcvtsi2ssq %rbx, %xmm0, %xmm3
vinsertps $32, %xmm3, %xmm2, %xmm2 ## xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
vpextrq $1, %xmm1, %rbx
vcvtsi2ssq %rbx, %xmm0, %xmm1
vinsertps $48, %xmm1, %xmm2, %xmm1 ## xmm1 = xmm2[0,1,2],xmm1[0]
leal (%rdx,%rcx), %ebx
leal -4(,%rbx,4), %ebx
movslq %ebx, %rbx
vmovups %xmm1, (%rdi,%rbx)
vmovups %xmm0, 16(%rdi,%rbx)
addl $8, %edx
cmpl %eax, %edx
jl LBB0_36
jmp LBB0_43
.align 4, 0x90
LBB0_4: ## in Loop: Header=BB0_2 Depth=1
vmovups %ymm1, 64(%rsp) ## 32-byte Spill
LBB0_43: ## %partial_inner_all_outer353
## in Loop: Header=BB0_2 Depth=1
leaq 1(%r9), %r14
cmpl %r11d, %edx
vmovups (%rsp), %ymm4 ## 32-byte Reload
vmovups 64(%rsp), %ymm1 ## 32-byte Reload
jge LBB0_2
## BB#44: ## %partial_inner_only485
## in Loop: Header=BB0_2 Depth=1
vmovd %edx, %xmm0
vpermilps $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
vpaddd LCPI0_0(%rip), %xmm0, %xmm1
vpaddd LCPI0_1(%rip), %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm1, %ymm2
vmovd %r11d, %xmm3
vpermilps $0, %xmm3, %xmm3 ## xmm3 = xmm3[0,0,0,0]
vpcmpgtd %xmm0, %xmm3, %xmm0
vpcmpgtd %xmm1, %xmm3, %xmm1
vinsertf128 $1, %xmm0, %ymm1, %ymm14
vmovups %ymm14, -96(%rsp) ## 32-byte Spill
vxorps %xmm0, %xmm0, %xmm0
vcvtsi2ssq %r9, %xmm0, %xmm0
vmulss 60(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload
vaddss 56(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload
vpermilps $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
vinsertf128 $1, %xmm0, %ymm0, %ymm1
vmovups %ymm1, 224(%rsp) ## 32-byte Spill
vcvtdq2ps %ymm2, %ymm0
vmulps 128(%rsp), %ymm0, %ymm0 ## 32-byte Folded Reload
vaddps 160(%rsp), %ymm0, %ymm0 ## 32-byte Folded Reload
vmovups %ymm0, 192(%rsp) ## 32-byte Spill
vxorps %xmm7, %xmm7, %xmm7
vxorps %xmm2, %xmm2, %xmm2
vmovaps %ymm0, %ymm11
vmovaps %ymm1, %ymm10
jmp LBB0_45
.align 4, 0x90
LBB0_55: ## %safe_if_run_false559
## in Loop: Header=BB0_45 Depth=2
vsubps %ymm9, %ymm10, %ymm4
vmovups -64(%rsp), %ymm9 ## 32-byte Reload
vblendvps %ymm14, %ymm4, %ymm9, %ymm9
vmovups %ymm9, -64(%rsp) ## 32-byte Spill
vmovups 320(%rsp), %ymm10 ## 32-byte Reload
vaddps %ymm10, %ymm10, %ymm4
vmovups 352(%rsp), %ymm11 ## 32-byte Reload
vmulps %ymm4, %ymm11, %ymm4
vmovups -32(%rsp), %ymm6 ## 32-byte Reload
vblendvps %ymm14, %ymm4, %ymm6, %ymm6
vmovups %ymm6, -32(%rsp) ## 32-byte Spill
vaddps 224(%rsp), %ymm9, %ymm4 ## 32-byte Folded Reload
vblendvps %ymm14, %ymm4, %ymm10, %ymm10
vaddps 192(%rsp), %ymm6, %ymm4 ## 32-byte Folded Reload
vblendvps %ymm14, %ymm4, %ymm11, %ymm11
vmovdqa LCPI0_4(%rip), %xmm4 ## xmm4 = [1,1]
vmovdqa %xmm4, %xmm6
vpaddq %xmm6, %xmm3, %xmm3
vpaddq %xmm6, %xmm2, %xmm4
vinsertf128 $1, %xmm3, %ymm4, %ymm3
vpaddq %xmm6, %xmm0, %xmm0
vpaddq %xmm6, %xmm7, %xmm4
vinsertf128 $1, %xmm0, %ymm4, %ymm0
vunp
Out[24]:
2328
cklps %xmm14, %xmm14, %xmm4 ## xmm4 = xmm14[0,0,1,1]
vunpckhps %xmm14, %xmm14, %xmm6 ## xmm6 = xmm14[2,2,3,3]
vinsertf128 $1, %xmm6, %ymm4, %ymm4
vextractf128 $1, %ymm14, %xmm6
vblendvps %ymm4, %ymm0, %ymm7, %ymm7
vunpcklps %xmm6, %xmm6, %xmm0 ## xmm0 = xmm6[0,0,1,1]
vunpckhps %xmm6, %xmm6, %xmm4 ## xmm4 = xmm6[2,2,3,3]
vinsertf128 $1, %xmm4, %ymm0, %ymm0
vblendvps %ymm0, %ymm3, %ymm2, %ymm2
vandnps %ymm1, %ymm12, %ymm14
LBB0_45: ## %for_test516.outer
## Parent Loop BB0_2 Depth=1
## => This Loop Header: Depth=2
## Child Loop BB0_46 Depth 3
vmovups %ymm10, 320(%rsp) ## 32-byte Spill
vmovups %ymm11, 352(%rsp) ## 32-byte Spill
vmulps %ymm10, %ymm10, %ymm10
vmulps %ymm11, %ymm11, %ymm9
vaddps %ymm10, %ymm9, %ymm15
.align 4, 0x90
LBB0_46: ## %for_test516
## Parent Loop BB0_2 Depth=1
## Parent Loop BB0_45 Depth=2
## => This Inner Loop Header: Depth=3
vextractf128 $1, %ymm7, %xmm0
vextractf128 $1, %ymm5, %xmm1
vpcmpgtq %xmm0, %xmm1, %xmm3
vpcmpgtq %xmm7, %xmm5, %xmm4
vshufps $-120, %xmm3, %xmm4, %xmm3 ## xmm3 = xmm4[0,2],xmm3[0,2]
vpshufb %xmm8, %xmm3, %xmm4
vextractf128 $1, %ymm2, %xmm3
vpcmpgtq %xmm3, %xmm1, %xmm1
vpcmpgtq %xmm2, %xmm5, %xmm6
vshufps $-120, %xmm1, %xmm6, %xmm1 ## xmm1 = xmm6[0,2],xmm1[0,2]
vpshufb %xmm8, %xmm1, %xmm1
vpunpcklqdq %xmm1, %xmm4, %xmm1 ## xmm1 = xmm4[0],xmm1[0]
vpmovzxwd %xmm1, %xmm4
vpslld $31, %xmm4, %xmm4
vpsrad $31, %xmm4, %xmm4
vpunpckhwd %xmm1, %xmm1, %xmm1 ## xmm1 = xmm1[4,4,5,5,6,6,7,7]
vpslld $31, %xmm1, %xmm1
vpsrad $31, %xmm1, %xmm1
vinsertf128 $1, %xmm1, %ymm4, %ymm1
vandps %ymm14, %ymm1, %ymm1
vmovmskps %ymm1, %ecx
testl %ecx, %ecx
je LBB0_53
## BB#47: ## %for_loop518
## in Loop: Header=BB0_46 Depth=3
vcmpnleps LCPI0_3(%rip), %ymm15, %ymm13
vandps %ymm1, %ymm13, %ymm11
vmovmskps %ymm11, %ebx
vxorps %xmm12, %xmm12, %xmm12
testl %ebx, %ebx
je LBB0_48
## BB#54: ## %safe_if_run_true540
## in Loop: Header=BB0_46 Depth=3
vxorps %xmm14, %xmm14, %xmm14
vmovaps %ymm11, %ymm12
cmpl %ecx, %ebx
je LBB0_46
LBB0_48: ## %safe_if_after_true539
## in Loop: Header=BB0_45 Depth=2
vblendvps %ymm13, LCPI0_5(%rip), %ymm1, %ymm14
vmovmskps %ymm14, %ecx
testl %ecx, %ecx
jne LBB0_55
## BB#49: ## in Loop: Header=BB0_45 Depth=2
vmovups 352(%rsp), %ymm11 ## 32-byte Reload
vmovups 320(%rsp), %ymm10 ## 32-byte Reload
vandnps %ymm1, %ymm12, %ymm14
jmp LBB0_45
.align 4, 0x90
LBB0_31: ## %for_exit156
## in Loop: Header=BB0_6 Depth=1
decl %r15d
movl %esi, %ecx
imull %r15d, %ecx
vpextrq $1, %xmm0, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm1
vmovq %xmm0, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm0
vinsertps $16, %xmm1, %xmm0, %xmm0 ## xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
vmovq %xmm3, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm1
vinsertps $32, %xmm1, %xmm0, %xmm0 ## xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
vpextrq $1, %xmm3, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm1
vinsertps $48, %xmm1, %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],xmm1[0]
vpextrq $1, %xmm14, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm1
vmovq %xmm14, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm3
vinsertps $16, %xmm1, %xmm3, %xmm1 ## xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
vmovq %xmm2, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm3
vinsertps $32, %xmm3, %xmm1, %xmm1 ## xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
vpextrq $1, %xmm2, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm2
vinsertps $48, %xmm2, %xmm1, %xmm1 ## xmm1 = xmm1[0,1,2],xmm2[0]
vinsertf128 $1, %xmm0, %ymm1, %ymm0
addl %ecx, %ebx
leal -4(,%rbx,4), %ecx
movslq %ecx, %rcx
vmovups -64(%rsp), %ymm1 ## 32-byte Reload
vmaskmovps %ymm0, %ymm1, (%rdi,%rcx)
vmovups 64(%rsp), %ymm1 ## 32-byte Reload
LBB0_6: ## %for_test.outer
## =>This Loop Header: Depth=1
## Child Loop BB0_8 Depth 2
## Child Loop BB0_12 Depth 2
## Child Loop BB0_14 Depth 3
## Child Loop BB0_22 Depth 4
## Child Loop BB0_15 Depth 5
## Child Loop BB0_26 Depth 2
## Child Loop BB0_27 Depth 3
cmpl $1, %eax
jle LBB0_7
.align 4, 0x90
LBB0_12: ## %for_test.us
## Parent Loop BB0_6 Depth=1
## => This Loop Header: Depth=2
## Child Loop BB0_14 Depth 3
## Child Loop BB0_22 Depth 4
## Child Loop BB0_15 Depth 5
movq %r14, %r15
cmpq %r10, %r15
movl $0, %ecx
cmovel %r8d, %ecx
vmovd %ecx, %xmm0
vpermilps $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vandnps %ymm1, %ymm0, %ymm1
vmovmskps %ymm1, %ecx
testl %ecx, %ecx
je LBB0_9
## BB#13: ## %foreach_full_body.lr.ph.us
## in Loop: Header=BB0_12 Depth=2
vmovups %ymm1, 64(%rsp) ## 32-byte Spill
leaq 1(%r15), %r14
vxorps %xmm0, %xmm0, %xmm0
vcvtsi2ssq %r15, %xmm0, %xmm0
vmulss 60(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload
vaddss 56(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload
vpermilps $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vmovups %ymm0, 224(%rsp) ## 32-byte Spill
leaq -1(%r15), %rdx
imulq %rsi, %rdx
movl $1, %ebx
.align 4, 0x90
LBB0_14: ## %foreach_full_body.us
## Parent Loop BB0_6 Depth=1
## Parent Loop BB0_12 Depth=2
## => This Loop Header: Depth=3
## Child Loop BB0_22 Depth 4
## Child Loop BB0_15 Depth 5
vmovd %ebx, %xmm0
vpermilps $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
vpaddd LCPI0_0(%rip), %xmm0, %xmm1
vpaddd LCPI0_1(%rip), %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm1, %ymm0
vcvtdq2ps %ymm0, %ymm0
vmulps 128(%rsp), %ymm0, %ymm0 ## 32-byte Folded Reload
vaddps 160(%rsp), %ymm0, %ymm0 ## 32-byte Folded Reload
vmovups %ymm0, 192(%rsp) ## 32-byte Spill
vxorps %xmm14, %xmm14, %xmm14
vxorps %xmm9, %xmm9, %xmm9
vmovaps %ymm0, %ymm11
vmovups 224(%rsp), %ymm12 ## 32-byte Reload
vmovups 96(%rsp), %ymm13 ## 32-byte Reload
jmp LBB0_22
.align 4, 0x90
LBB0_21: ## %if_done.us
## in Loop: Header=BB0_22 Depth=4
vandnps %ymm3, %ymm10, %ymm13
LBB0_22: ## %for_test52.outer.us
## Parent Loop BB0_6 Depth=1
## Parent Loop BB0_12 Depth=2
## Parent Loop BB0_14 Depth=3
## => This Loop Header: Depth=4
## Child Loop BB0_15 Depth 5
vmovups %ymm12, 320(%rsp) ## 32-byte Spill
vmovups %ymm11, 352(%rsp) ## 32-byte Spill
vmulps %ymm12, %ymm12, %ymm6
vmulps %ymm11, %ymm11, %ymm15
vaddps %ymm6, %ymm15, %ymm7
.align 4, 0x90
LBB0_15: ## %for_test52.us
## Parent Loop BB0_6 Depth=1
## Parent Loop BB0_12 Depth=2
## Parent Loop BB0_14 Depth=3
## Parent Loop BB0_22 Depth=4
## => This Inner Loop Header: Depth=5
vextractf128 $1, %ymm14, %xmm1
vextractf128 $1, %ymm5, %xmm3
vpcmpgtq %xmm1, %xmm3, %xmm0
vpcmpgtq %xmm14, %xmm5, %xmm2
vshufps $-120, %xmm0, %xmm2, %xmm0 ## xmm0 = xmm2[0,2],xmm0[0,2]
vpshufb %xmm4, %xmm0, %xmm0
vextractf128 $1, %ymm9, %xmm8
vpcmpgtq %xmm8, %xmm3, %xmm2
vpcmpgtq %xmm9, %xmm5, %xmm3
vshufps $-120, %xmm2, %xmm3, %xmm2 ## xmm2 = xmm3[0,2],xmm2[0,2]
vpshufb %xmm4, %xmm2, %xmm2
vpunpcklqdq %xmm2, %xmm0, %xmm0 ## xmm0 = xmm0[0],xmm2[0]
vpmovzxwd %xmm0, %xmm2
vpslld $31, %xmm2, %xmm2
vpsrad $31, %xmm2, %xmm2
vpunpckhwd %xmm0, %xmm0, %xmm0 ## xmm0 = xmm0[4,4,5,5,6,6,7,7]
vpslld $31, %xmm0, %xmm0
vpsrad $31, %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm2, %ymm0
vandps %ymm13, %ymm0, %ymm3
vmovmskps %ymm3, %ebp
testl %ebp, %ebp
je LBB0_10
## BB#16: ## %for_loop54.us
## in Loop: Header=BB0_15 Depth=5
vcmpnleps LCPI0_3(%rip), %ymm7, %ymm12
vandps %ymm3, %ymm12, %ymm11
vmovmskps %ymm11, %ecx
vxorps %xmm10, %xmm10, %xmm10
testl %ecx, %ecx
je LBB0_18
## BB#17: ## %safe_if_run_true.us
## in Loop: Header=BB0_15 Depth=5
vxorps %xmm13, %xmm13, %xmm13
vmovaps %ymm11, %ymm10
cmpl %ebp, %ecx
je LBB0_15
LBB0_18: ## %safe_if_after_true.us
## in Loop: Header=BB0_22 Depth=4
vblendvps %ymm12, LCPI0_5(%rip), %ymm3, %ymm7
vmovmskps %ymm7, %ecx
testl %ecx, %ecx
je LBB0_19
## BB#20: ## %safe_if_run_false.us
## in Loop: Header=BB0_22 Depth=4
vsubps %ymm15, %ymm6, %ymm0
vmovups 256(%rsp), %ymm6 ## 32-byte Reload
vblendvps %ymm7, %ymm0, %ymm6, %ymm6
vmovups %ymm6, 256(%rsp) ## 32-byte Spill
vmovups 320(%rsp), %ymm12 ## 32-byte Reload
vaddps %ymm12, %ymm12, %ymm0
vmovups 352(%rsp), %ymm11 ## 32-byte Reload
vmulps %ymm0, %ymm11, %ymm0
vmovups 288(%rsp), %ymm2 ## 32-byte Reload
vblendvps %ymm7, %ymm0, %ymm2, %ymm2
vmovups %ymm2, 288(%rsp) ## 32-byte Spill
vaddps 224(%rsp), %ymm6, %ymm0 ## 32-byte Folded Reload
vblendvps %ymm7, %ymm0, %ymm12, %ymm12
vaddps 192(%rsp), %ymm2, %ymm0 ## 32-byte Folded Reload
vblendvps %ymm7, %ymm0, %ymm11, %ymm11
vmovdqa LCPI0_4(%rip), %xmm0 ## xmm0 = [1,1]
vmovdqa %xmm0, %xmm6
vpaddq %xmm6, %xmm8, %xmm0
vpaddq %xmm6, %xmm9, %xmm2
vinsertf128 $1, %xmm0, %ymm2, %ymm0
vpaddq %xmm6, %xmm1, %xmm1
vpaddq %xmm6, %xmm14, %xmm2
vinsertf128 $1, %xmm1, %ymm2, %ymm1
vunpcklps %xmm7, %xmm7, %xmm2 ## xmm2 = xmm7[0,0,1,1]
vunpckhps %xmm7, %xmm7, %xmm6 ## xmm6 = xmm7[2,2,3,3]
vinsertf128 $1, %xmm6, %ymm2, %ymm2
vextractf128 $1, %ymm7, %xmm6
vblendvps %ymm2, %ymm1, %ymm14, %ymm14
vunpcklps %xmm6, %xmm6, %xmm1 ## xmm1 = xmm6[0,0,1,1]
vunpckhps %xmm6, %xmm6, %xmm2 ## xmm2 = xmm6[2,2,3,3]
vinsertf128 $1, %xmm2, %ymm1, %ymm1
vblendvps %ymm1, %ymm0, %ymm9, %ymm9
jmp LBB0_21
.align 4, 0x90
LBB0_19: ## in Loop: Header=BB0_22 Depth=4
vmovups 352(%rsp), %ymm11 ## 32-byte Reload
vmovups 320(%rsp), %ymm12 ## 32-byte Reload
jmp LBB0_21
.align 4, 0x90
LBB0_10: ## %for_exit55.us
## in Loop: Header=BB0_14 Depth=3
vpextrq $1, %xmm9, %rcx
vxorps %xmm0, %xmm0, %xmm0
vcvtsi2ssq %rcx, %xmm0, %xmm0
vmovq %xmm9, %rcx
vcvtsi2ssq %rcx, %xmm0, %xmm2
vinsertps $16, %xmm0, %xmm2, %xmm0 ## xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
vmovq %xmm8, %rcx
vcvtsi2ssq %rcx, %xmm0, %xmm2
vinsertps $32, %xmm2, %xmm0, %xmm0 ## xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
vpextrq $1, %xmm8, %rcx
vcvtsi2ssq %rcx, %xmm0, %xmm2
vinsertps $48, %xmm2, %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],xmm2[0]
vpextrq $1, %xmm14, %rcx
vcvtsi2ssq %rcx, %xmm0, %xmm2
vmovq %xmm14, %rcx
vcvtsi2ssq %rcx, %xmm0, %xmm3
vinsertps $16, %xmm2, %xmm3, %xmm2 ## xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
vmovq %xmm1, %rcx
vcvtsi2ssq %rcx, %xmm0, %xmm3
vinsertps $32, %xmm3, %xmm2, %xmm2 ## xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
vpextrq $1, %xmm1, %rcx
vcvtsi2ssq %rcx, %xmm0, %xmm1
vinsertps $48, %xmm1, %xmm2, %xmm1 ## xmm1 = xmm2[0,1,2],xmm1[0]
leal (%rbx,%rdx), %ecx
leal -4(,%rcx,4), %ecx
movslq %ecx, %rcx
vmovups %xmm1, (%rdi,%rcx)
vmovups %xmm0, 16(%rdi,%rcx)
addl $8, %ebx
cmpl %eax, %ebx
jl LBB0_14
## BB#11: ## %partial_inner_all_outer.us
## in Loop: Header=BB0_12 Depth=2
cmpl %r11d, %ebx
vmovups 64(%rsp), %ymm1 ## 32-byte Reload
jge LBB0_12
jmp LBB0_25
.align 4, 0x90
LBB0_7: ## %for_test.preheader
## in Loop: Header=BB0_6 Depth=1
decq %r14
movq %r14, %r15
.align 4, 0x90
LBB0_8: ## %for_test
## Parent Loop BB0_6 Depth=1
## => This Inner Loop Header: Depth=2
cmpq %r15, %r9
movl $0, %ecx
cmovel %r8d, %ecx
vmovd %ecx, %xmm0
vpermilps $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vandnps %ymm1, %ymm0, %ymm1
vmovmskps %ymm1, %ecx
testl %ecx, %ecx
je LBB0_9
## BB#23: ## %partial_inner_all_outer
## in Loop: Header=BB0_8 Depth=2
incq %r15
cmpl $2, %r11d
jl LBB0_8
## BB#24: ## %partial_inner_only.loopexit913
## in Loop: Header=BB0_6 Depth=1
leaq 1(%r15), %r14
movl $1, %ebx
LBB0_25: ## %partial_inner_only
## in Loop: Header=BB0_6 Depth=1
vmovups %ymm1, 64(%rsp) ## 32-byte Spill
vmovd %ebx, %xmm0
vpermilps $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
vpaddd LCPI0_0(%rip), %xmm0, %xmm1
vpaddd LCPI0_1(%rip), %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm1, %ymm2
vmovd %r11d, %xmm3
vpermilps $0, %xmm3, %xmm3 ## xmm3 = xmm3[0,0,0,0]
vpcmpgtd %xmm0, %xmm3, %xmm0
vpcmpgtd %xmm1, %xmm3, %xmm1
vinsertf128 $1, %xmm0, %ymm1, %ymm11
vmovups %ymm11, -64(%rsp) ## 32-byte Spill
vxorps %xmm0, %xmm0, %xmm0
vcvtsi2ssq %r15, %xmm0, %xmm0
vmulss 60(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload
vaddss 56(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload
vpermilps $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
vinsertf128 $1, %xmm0, %ymm0, %ymm1
vmovups %ymm1, 224(%rsp) ## 32-byte Spill
vcvtdq2ps %ymm2, %ymm0
vmulps 128(%rsp), %ymm0, %ymm0 ## 32-byte Folded Reload
vaddps 160(%rsp), %ymm0, %ymm2 ## 32-byte Folded Reload
vmovups %ymm2, 192(%rsp) ## 32-byte Spill
vxorps %xmm14, %xmm14, %xmm14
vxorps %xmm0, %xmm0, %xmm0
vmovaps %ymm2, %ymm10
vmovaps %ymm1, %ymm9
jmp LBB0_26
.align 4, 0x90
LBB0_33: ## %safe_if_run_false196
## in Loop: Header=BB0_26 Depth=2
vsubps %ymm8, %ymm9, %ymm6
vmovups -32(%rsp), %ymm8 ## 32-byte Reload
vblendvps %ymm11, %ymm6, %ymm8, %ymm8
vmovups %ymm8, -32(%rsp) ## 32-byte Spill
vmovups 320(%rsp), %ymm9 ## 32-byte Reload
vaddps %ymm9, %ymm9, %ymm6
vmovups 352(%rsp), %ymm10 ## 32-byte Reload
vmulps %ymm6, %ymm10, %ymm6
vmovups (%rsp), %ymm7 ## 32-byte Reload
vblendvps %ymm11, %ymm6, %ymm7, %ymm7
vmovups %ymm7, (%rsp) ## 32-byte Spill
vaddps 224(%rsp), %ymm8, %ymm6 ## 32-byte Folded Reload
vblendvps %ymm11, %ymm6, %ymm9, %ymm9
vaddps 192(%rsp), %ymm7, %ymm6 ## 32-byte Folded Reload
vblendvps %ymm11, %ymm6, %ymm10, %ymm10
vmovdqa LCPI0_4(%rip), %xmm6 ## xmm6 = [1,1]
vmovdqa %xmm6, %xmm7
vpaddq %xmm7, %xmm3, %xmm3
vpaddq %xmm7, %xmm0, %xmm6
vinsertf128 $1, %xmm3, %ymm6, %ymm3
vpaddq %xmm7, %xmm2, %xmm2
vpaddq %xmm7, %xmm14, %xmm6
vinsertf128 $1, %xmm2, %ymm6, %ymm2
vunpcklps %xmm11, %xmm11, %xmm6 ## xmm6 = xmm11[0,0,1,1]
vunpckhps %xmm11, %xmm11, %xmm7 ## xmm7 = xmm11[2,2,3,3]
vinsertf128 $1, %xmm7, %ymm6, %ymm6
vextractf128 $1, %ymm11, %xmm7
vblendvps %ymm6, %ymm2, %ymm14, %ymm14
vunpcklps %xmm7, %xmm7, %xmm2 ## xmm2 = xmm7[0,0,1,1]
vunpckhps %xmm7, %xmm7, %xmm6 ## xmm6 = xmm7[2,2,3,3]
vinsertf128 $1, %xmm6, %ymm2, %ymm2
vblendvps %ymm2, %ymm3, %ymm0, %ymm0
vandnps %ymm1, %ymm12, %ymm11
LBB0_26: ## %for_test153.outer
## Parent Loop BB0_6 Depth=1
## => This Loop Header: Depth=2
## Child Loop BB0_27 Depth 3
vmovups %ymm9, 320(%rsp) ## 32-byte Spill
vmovups %ymm10, 352(%rsp) ## 32-byte Spill
vmulps %ymm9, %ymm9, %ymm9
vmulps %ymm10, %ymm10, %ymm8
vaddps %ymm9, %ymm8, %ymm13
.align 4, 0x90
LBB0_27: ## %for_test153
## Parent Loop BB0_6 Depth=1
## Parent Loop BB0_26 Depth=2
## => This Inner Loop Header: Depth=3
vextractf128 $1, %ymm14, %xmm2
vextractf128 $1, %ymm5, %xmm1
vpcmpgtq %xmm2, %xmm1, %xmm3
vpcmpgtq %xmm14, %xmm5, %xmm7
vshufps $-120, %xmm3, %xmm7, %xmm3 ## xmm3 = xmm7[0,2],xmm3[0,2]
vpshufb %xmm4, %xmm3, %xmm7
vextractf128 $1, %ymm0, %xmm3
vpcmpgtq %xmm3, %xmm1, %xmm1
vpcmpgtq %xmm0, %xmm5, %xmm6
vshufps $-120, %xmm1, %xmm6, %xmm1 ## xmm1 = xmm6[0,2],xmm1[0,2]
vpshufb %xmm4, %xmm1, %xmm1
vpunpcklqdq %xmm1, %xmm7, %xmm1 ## xmm1 = xmm7[0],xmm1[0]
vpmovzxwd %xmm1, %xmm6
vpslld $31, %xmm6, %xmm6
vpsrad $31, %xmm6, %xmm6
vpunpckhwd %xmm1, %xmm1, %xmm1 ## xmm1 = xmm1[4,4,5,5,6,6,7,7]
vpslld $31, %xmm1, %xmm1
vpsrad $31, %xmm1, %xmm1
vinsertf128 $1, %xmm1, %ymm6, %ymm1
vandps %ymm11, %ymm1, %ymm1
vmovmskps %ymm1, %edx
testl %edx, %edx
je LBB0_31
## BB#28: ## %for_loop155
## in Loop: Header=BB0_27 Depth=3
vcmpnleps LCPI0_3(%rip), %ymm13, %ymm15
vandps %ymm1, %ymm15, %ymm10
vmovmskps %ymm10, %ecx
vxorps %xmm12, %xmm12, %xmm12
testl %ecx, %ecx
je LBB0_29
## BB#32: ## %safe_if_run_true177
## in Loop: Header=BB0_27 Depth=3
vxorps %xmm11, %xmm11, %xmm11
vmovaps %ymm10, %ymm12
cmpl %edx, %ecx
je LBB0_27
LBB0_29: ## %safe_if_after_true176
## in Loop: Header=BB0_26 Depth=2
vblendvps %ymm15, LCPI0_5(%rip), %ymm1, %ymm11
vmovmskps %ymm11, %ecx
testl %ecx, %ecx
jne LBB0_33
## BB#30: ## in Loop: Header=BB0_26 Depth=2
vmovups 352(%rsp), %ymm10 ## 32-byte Reload
vmovups 320(%rsp), %ymm9 ## 32-byte Reload
vandnps %ymm1, %ymm12, %ymm11
jmp LBB0_26
LBB0_9: ## %for_exit
addq $408, %rsp ## imm = 0x198
popq %rbx
popq %r14
popq %r15
popq %rbp
vzeroupper
retq
.section __TEXT,__literal16,16byte_literals
.align 4
LCPI1_0:
.long 0 ## 0x0
.long 1 ## 0x1
.long 2 ## 0x2
.long 3 ## 0x3
LCPI1_1:
.long 4 ## 0x4
.long 5 ## 0x5
.long 6 ## 0x6
.long 7 ## 0x7
LCPI1_2:
.byte 0 ## 0x0
.byte 1 ## 0x1
.byte 4 ## 0x4
.byte 5 ## 0x5
.byte 8 ## 0x8
.byte 9 ## 0x9
.byte 12 ## 0xc
.byte 13 ## 0xd
.byte 8 ## 0x8
.byte 9 ## 0x9
.byte 12 ## 0xc
.byte 13 ## 0xd
.byte 12 ## 0xc
.byte 13 ## 0xd
.byte 14 ## 0xe
.byte 15 ## 0xf
LCPI1_4:
.quad 1 ## 0x1
.quad 1 ## 0x1
.section __TEXT,__const
.align 5
LCPI1_3:
.long 1082130432 ## float 4.000000e+00
.long 1082130432 ## float 4.000000e+00
.long 1082130432 ## float 4.000000e+00
.long 1082130432 ## float 4.000000e+00
.long 1082130432 ## float 4.000000e+00
.long 1082130432 ## float 4.000000e+00
.long 1082130432 ## float 4.000000e+00
.long 1082130432 ## float 4.000000e+00
LCPI1_5:
.space 32
.section __TEXT,__text,regular,pure_instructions
.globl _ispc_func_1
.align 4, 0x90
_ispc_func_1: ## @ispc_func_1
## BB#0: ## %allocas
pushq %rbp
pushq %r14
pushq %rbx
subq $320, %rsp ## imm = 0x140
vmovss %xmm2, 28(%rsp) ## 4-byte Spill
vmovss %xmm0, 24(%rsp) ## 4-byte Spill
testq %r9, %r9
leaq 1(%r9), %r10
movl $1, %r11d
cmovleq %r11, %r10
leal 1(%r8), %r9d
movl %r8d, %eax
sarl $31, %eax
shrl $29, %eax
addl %r8d, %eax
andl $-8, %eax
movl %r8d, %edx
subl %eax, %edx
negl %edx
leal 1(%r8,%rdx), %ebp
vpermilps $0, %xmm1, %xmm0 ## xmm0 = xmm1[0,0,0,0]
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vmovups %ymm0, 128(%rsp) ## 32-byte Spill
vpermilps $0, %xmm3, %xmm0 ## xmm0 = xmm3[0,0,0,0]
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vmovups %ymm0, 96(%rsp) ## 32-byte Spill
vmovq %rcx, %xmm0
vpcmpeqd %xmm1, %xmm1, %xmm1
vinsertf128 $1, %xmm1, %ymm1, %ymm2
vunpcklpd %xmm0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0]
vinsertf128 $1, %xmm0, %ymm0, %ymm5
movl $-1, %r8d
vinsertf128 $1, %xmm1, %ymm1, %ymm0
vmovups %ymm0, 64(%rsp) ## 32-byte Spill
vmovdqa LCPI1_2(%rip), %xmm4 ## xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
## implicit-def: YMM0
vmovups %ymm0, -32(%rsp) ## 32-byte Spill
## implicit-def: YMM0
vmovups %ymm0, -64(%rsp) ## 32-byte Spill
## implicit-def: YMM0
vmovups %ymm0, 256(%rsp) ## 32-byte Spill
## implicit-def: YMM0
vmovups %ymm0, 224(%rsp) ## 32-byte Spill
jmp LBB1_1
LBB1_23: ## %for_exit156
## in Loop: Header=BB1_1 Depth=1
decl %r14d
movl %esi, %eax
imull %r14d, %eax
vpextrq $1, %xmm9, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm1
vmovq %xmm9, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm3
vinsertps $16, %xmm1, %xmm3, %xmm1 ## xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
vmovq %xmm0, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm3
vinsertps $32, %xmm3, %xmm1, %xmm1 ## xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
vpextrq $1, %xmm0, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm0
vinsertps $48, %xmm0, %xmm1, %xmm0 ## xmm0 = xmm1[0,1,2],xmm0[0]
vpextrq $1, %xmm14, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm1
vmovq %xmm14, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm3
vinsertps $16, %xmm1, %xmm3, %xmm1 ## xmm1 = xmm3[0],xmm1[0],xmm3[2,3]
vmovq %xmm2, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm3
vinsertps $32, %xmm3, %xmm1, %xmm1 ## xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
vpextrq $1, %xmm2, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm2
vinsertps $48, %xmm2, %xmm1, %xmm1 ## xmm1 = xmm1[0,1,2],xmm2[0]
vinsertf128 $1, %xmm0, %ymm1, %ymm0
addl %eax, %ecx
leal -4(,%rcx,4), %eax
cltq
vmovups -96(%rsp), %ymm1 ## 32-byte Reload
vmaskmovps %ymm0, %ymm1, (%rdi,%rax)
vmovups 32(%rsp), %ymm2 ## 32-byte Reload
.align 4, 0x90
LBB1_1: ## %for_test
## =>This Loop Header: Depth=1
## Child Loop BB1_6 Depth 2
## Child Loop BB1_22 Depth 3
## Child Loop BB1_8 Depth 4
## Child Loop BB1_15 Depth 2
## Child Loop BB1_16 Depth 3
movq %r11, %r14
cmpq %r10, %r14
movl $0, %eax
cmovel %r8d, %eax
vmovd %eax, %xmm0
vpermilps $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vandnps %ymm2, %ymm0, %ymm2
vmovmskps %ymm2, %eax
testl %eax, %eax
je LBB1_4
## BB#2: ## %for_loop
## in Loop: Header=BB1_1 Depth=1
movl $1, %ecx
cmpl $2, %ebp
jl LBB1_3
## BB#5: ## %foreach_full_body.lr.ph
## in Loop: Header=BB1_1 Depth=1
vmovups %ymm2, 32(%rsp) ## 32-byte Spill
vxorps %xmm0, %xmm0, %xmm0
vcvtsi2ssq %r14, %xmm0, %xmm0
vmulss 28(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload
vaddss 24(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload
vpermilps $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
vinsertf128 $1, %xmm0, %ymm0, %ymm0
vmovups %ymm0, 192(%rsp) ## 32-byte Spill
leaq -1(%r14), %rax
imulq %rsi, %rax
movl $1, %ecx
.align 4, 0x90
LBB1_6: ## %foreach_full_body
## Parent Loop BB1_1 Depth=1
## => This Loop Header: Depth=2
## Child Loop BB1_22 Depth 3
## Child Loop BB1_8 Depth 4
vmovd %ecx, %xmm0
vpermilps $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
vpaddd LCPI1_0(%rip), %xmm0, %xmm1
vpaddd LCPI1_1(%rip), %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm1, %ymm0
vcvtdq2ps %ymm0, %ymm0
vmulps 96(%rsp), %ymm0, %ymm0 ## 32-byte Folded Reload
vaddps 128(%rsp), %ymm0, %ymm14 ## 32-byte Folded Reload
vmovups %ymm14, 160(%rsp) ## 32-byte Spill
vxorps %xmm9, %xmm9, %xmm9
vxorps %xmm0, %xmm0, %xmm0
vmovups 192(%rsp), %ymm12 ## 32-byte Reload
vmovups 64(%rsp), %ymm13 ## 32-byte Reload
jmp LBB1_22
.align 4, 0x90
LBB1_21: ## %if_done
## in Loop: Header=BB1_22 Depth=3
vandnps %ymm1, %ymm10, %ymm13
LBB1_22: ## %for_test52.outer
## Parent Loop BB1_1 Depth=1
## Parent Loop BB1_6 Depth=2
## => This Loop Header: Depth=3
## Child Loop BB1_8 Depth 4
vmovups %ymm12, 288(%rsp) ## 32-byte Spill
vmulps %ymm12, %ymm12, %ymm6
vmulps %ymm14, %ymm14, %ymm15
vaddps %ymm6, %ymm15, %ymm7
.align 4, 0x90
LBB1_8: ## %for_test52
## Parent Loop BB1_1 Depth=1
## Parent Loop BB1_6 Depth=2
## Parent Loop BB1_22 Depth=3
## => This Inner Loop Header: Depth=4
vextractf128 $1, %ymm9, %xmm11
vextractf128 $1, %ymm5, %xmm1
vpcmpgtq %xmm11, %xmm1, %xmm3
vpcmpgtq %xmm9, %xmm5, %xmm2
vshufps $-120, %xmm3, %xmm2, %xmm2 ## xmm2 = xmm2[0,2],xmm3[0,2]
vpshufb %xmm4, %xmm2, %xmm2
vextractf128 $1, %ymm0, %xmm8
vpcmpgtq %xmm8, %xmm1, %xmm1
vpcmpgtq %xmm0, %xmm5, %xmm3
vshufps $-120, %xmm1, %xmm3, %xmm1 ## xmm1 = xmm3[0,2],xmm1[0,2]
vpshufb %xmm4, %xmm1, %xmm1
vpunpcklqdq %xmm1, %xmm2, %xmm1 ## xmm1 = xmm2[0],xmm1[0]
vpmovzxwd %xmm1, %xmm2
vpslld $31, %xmm2, %xmm2
vpsrad $31, %xmm2, %xmm2
vpunpckhwd %xmm1, %xmm1, %xmm1 ## xmm1 = xmm1[4,4,5,5,6,6,7,7]
vpslld $31, %xmm1, %xmm1
vpsrad $31, %xmm1, %xmm1
vinsertf128 $1, %xmm1, %ymm2, %ymm1
vandps %ymm13, %ymm1, %ymm1
vmovmskps %ymm1, %ebx
testl %ebx, %ebx
je LBB1_12
## BB#9: ## %for_loop54
## in Loop: Header=BB1_8 Depth=4
vcmpnleps LCPI1_3(%rip), %ymm7, %ymm12
vandps %ymm1, %ymm12, %ymm3
vmovmskps %ymm3, %edx
vxorps %xmm10, %xmm10, %xmm10
testl %edx, %edx
je LBB1_10
## BB#7: ## %safe_if_run_true
## in Loop: Header=BB1_8 Depth=4
vxorps %xmm13, %xmm13, %xmm13
vmovaps %ymm3, %ymm10
cmpl %ebx, %edx
je LBB1_8
LBB1_10: ## %safe_if_after_true
## in Loop: Header=BB1_22 Depth=3
vblendvps %ymm12, LCPI1_5(%rip), %ymm1, %ymm7
vmovmskps %ymm7, %edx
testl %edx, %edx
je LBB1_11
## BB#20: ## %safe_if_run_false
## in Loop: Header=BB1_22 Depth=3
vsubps %ymm15, %ymm6, %ymm2
vmovups 224(%rsp), %ymm6 ## 32-byte Reload
vblendvps %ymm7, %ymm2, %ymm6, %ymm6
vmovups %ymm6, 224(%rsp) ## 32-byte Spill
vmovups 288(%rsp), %ymm12 ## 32-byte Reload
vaddps %ymm12, %ymm12, %ymm2
vmulps %ymm2, %ymm14, %ymm2
vmovups 256(%rsp), %ymm3 ## 32-byte Reload
vblendvps %ymm7, %ymm2, %ymm3, %ymm3
vmovups %ymm3, 256(%rsp) ## 32-byte Spill
vaddps 192(%rsp), %ymm6, %ymm2 ## 32-byte Folded Reload
vblendvps %ymm7, %ymm2, %ymm12, %ymm12
vaddps 160(%rsp), %ymm3, %ymm2 ## 32-byte Folded Reload
vblendvps %ymm7, %ymm2, %ymm14, %ymm14
vmovdqa LCPI1_4(%rip), %xmm2 ## xmm2 = [1,1]
vmovdqa %xmm2, %xmm6
vpaddq %xmm6, %xmm8, %xmm2
vpaddq %xmm6, %xmm0, %xmm3
vinsertf128 $1, %xmm2, %ymm3, %ymm8
vpaddq %xmm6, %xmm11, %xmm3
vpaddq %xmm6, %xmm9, %xmm6
vinsertf128 $1, %xmm3, %ymm6, %ymm3
vunpcklps %xmm7, %xmm7, %xmm6 ## xmm6 = xmm7[0,0,1,1]
vunpckhps %xmm7, %xmm7, %xmm2 ## xmm2 = xmm7[2,2,3,3]
vinsertf128 $1, %xmm2, %ymm6, %ymm2
vextractf128 $1, %ymm7, %xmm6
vblendvps %ymm2, %ymm3, %ymm9, %ymm9
vunpcklps %xmm6, %xmm6, %xmm2 ## xmm2 = xmm6[0,0,1,1]
vunpckhps %xmm6, %xmm6, %xmm3 ## xmm3 = xmm6[2,2,3,3]
vinsertf128 $1, %xmm3, %ymm2, %ymm2
vblendvps %ymm2, %ymm8, %ymm0, %ymm0
jmp LBB1_21
.align 4, 0x90
LBB1_11: ## in Loop: Header=BB1_22 Depth=3
vmovups 288(%rsp), %ymm12 ## 32-byte Reload
jmp LBB1_21
.align 4, 0x90
LBB1_12: ## %for_exit55
## in Loop: Header=BB1_6 Depth=2
vpextrq $1, %xmm0, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm1
vmovq %xmm0, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm0
vinsertps $16, %xmm1, %xmm0, %xmm0 ## xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
vmovq %xmm8, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm1
vinsertps $32, %xmm1, %xmm0, %xmm0 ## xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
vpextrq $1, %xmm8, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm1
vinsertps $48, %xmm1, %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],xmm1[0]
vpextrq $1, %xmm9, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm1
vmovq %xmm9, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm2
vinsertps $16, %xmm1, %xmm2, %xmm1 ## xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
vmovq %xmm11, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm2
vinsertps $32, %xmm2, %xmm1, %xmm1 ## xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
vpextrq $1, %xmm11, %rdx
vcvtsi2ssq %rdx, %xmm0, %xmm2
vinsertps $48, %xmm2, %xmm1, %xmm1 ## xmm1 = xmm1[0,1,2],xmm2[0]
leal (%rcx,%rax), %edx
leal -4(,%rdx,4), %edx
movslq %edx, %rdx
vmovups %xmm1, (%rdi,%rdx)
vmovups %xmm0, 16(%rdi,%rdx)
addl $8, %ecx
cmpl %ebp, %ecx
jl LBB1_6
jmp LBB1_13
.align 4, 0x90
LBB1_3: ## in Loop: Header=BB1_1 Depth=1
vmovups %ymm2, 32(%rsp) ## 32-byte Spill
LBB1_13: ## %partial_inner_all_outer
## in Loop: Header=BB1_1 Depth=1
leaq 1(%r14), %r11
cmpl %r9d, %ecx
vmovups 32(%rsp), %ymm2 ## 32-byte Reload
jge LBB1_1
## BB#14: ## %partial_inner_only
## in Loop: Header=BB1_1 Depth=1
vmovd %ecx, %xmm0
vpermilps $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
vpaddd LCPI1_0(%rip), %xmm0, %xmm1
vpaddd LCPI1_1(%rip), %xmm0, %xmm0
vinsertf128 $1, %xmm0, %ymm1, %ymm2
vmovd %r9d, %xmm3
vpermilps $0, %xmm3, %xmm3 ## xmm3 = xmm3[0,0,0,0]
vpcmpgtd %xmm0, %xmm3, %xmm0
vpcmpgtd %xmm1, %xmm3, %xmm1
vinsertf128 $1, %xmm0, %ymm1, %ymm13
vmovups %ymm13, -96(%rsp) ## 32-byte Spill
vxorps %xmm0, %xmm0, %xmm0
vcvtsi2ssq %r14, %xmm0, %xmm0
vmulss 28(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload
vaddss 24(%rsp), %xmm0, %xmm0 ## 4-byte Folded Reload
vpermilps $0, %xmm0, %xmm0 ## xmm0 = xmm0[0,0,0,0]
vinsertf128 $1, %xmm0, %ymm0, %ymm11
vmovups %ymm11, 192(%rsp) ## 32-byte Spill
vcvtdq2ps %ymm2, %ymm0
vmulps 96(%rsp), %ymm0, %ymm0 ## 32-byte Folded Reload
vaddps 128(%rsp), %ymm0, %ymm0 ## 32-byte Folded Reload
vmovups %ymm0, 160(%rsp) ## 32-byte Spill
vxorps %xmm14, %xmm14, %xmm14
vxorps %xmm9, %xmm9, %xmm9
vmovaps %ymm0, %ymm10
vmovaps %ymm11, %ymm12
jmp LBB1_15
.align 4, 0x90
LBB1_25: ## %safe_if_run_false196
## in Loop: Header=BB1_15 Depth=2
vsubps %ymm8, %ymm6, %ymm6
vmovups -64(%rsp), %ymm8 ## 32-byte Reload
vblendvps %ymm13, %ymm6, %ymm8, %ymm8
vmovups %ymm8, -64(%rsp) ## 32-byte Spill
vmovups 288(%rsp), %ymm12 ## 32-byte Reload
vaddps %ymm12, %ymm12, %ymm6
vmovaps %ymm11, %ymm10
vmulps %ymm6, %ymm10, %ymm6
vmovups -32(%rsp), %ymm7 ## 32-byte Reload
vblendvps %ymm13, %ymm6, %ymm7, %ymm7
vmovups %ymm7, -32(%rsp) ## 32-byte Spill
vaddps 192(%rsp), %ymm8, %ymm6 ## 32-byte Folded Reload
vblendvps %ymm13, %ymm6, %ymm12, %ymm12
vaddps 160(%rsp), %ymm7, %ymm6 ## 32-byte Folded Reload
vblendvps %ymm13, %ymm6, %ymm10, %ymm10
vmovdqa LCPI1_4(%rip), %xmm6 ## xmm6 = [1,1]
vmovdqa %xmm6, %xmm7
vpaddq %xmm7, %xmm0, %xmm0
vpaddq %xmm7, %xmm9, %xmm6
vinsertf128 $1, %xmm0, %ymm6, %ymm0
vpaddq %xmm7, %xmm2, %xmm2
vpaddq %xmm7, %xmm14, %xmm6
vinsertf128 $1, %xmm2, %ymm6, %ymm2
vunpcklps %xmm13, %xmm13, %xmm6 ## xmm6 = xmm13[0,0,1,1]
vunpckhps %xmm13, %xmm13, %xmm7 ## xmm7 = xmm13[2,2,3,3]
vinsertf128 $1, %xmm7, %ymm6, %ymm6
vextractf128 $1, %ymm13, %xmm7
vblendvps %ymm6, %ymm2, %ymm14, %ymm14
vunpcklps %xmm7, %xmm7, %xmm2 ## xmm2 = xmm7[0,0,1,1]
vunpckhps %xmm7, %xmm7, %xmm6 ## xmm6 = xmm7[2,2,3,3]
vinsertf128 $1, %xmm6, %ymm2, %ymm2
vblendvps %ymm2, %ymm0, %ymm9, %ymm9
vandnps %ymm1, %ymm3, %ymm13
LBB1_15: ## %for_test153.outer
## Parent Loop BB1_1 Depth=1
## => This Loop Header: Depth=2
## Child Loop BB1_16 Depth 3
vmovups %ymm12, 288(%rsp) ## 32-byte Spill
vmulps %ymm12, %ymm12, %ymm6
vmulps %ymm10, %ymm10, %ymm8
vmovaps %ymm10, %ymm11
vaddps %ymm6, %ymm8, %ymm15
.align 4, 0x90
LBB1_16: ## %for_test153
## Parent Loop BB1_1 Depth=1
## Parent Loop BB1_15 Depth=2
## => This Inner Loop Header: Depth=3
vextractf128 $1, %ymm14, %xmm2
vextractf128 $1, %ymm5, %xmm1
vpcmpgtq %xmm2, %xmm1, %xmm0
vpcmpgtq %xmm14, %xmm5, %xmm3
vshufps $-120, %xmm0, %xmm3, %xmm0 ## xmm0 = xmm3[0,2],xmm0[0,2]
vpshufb %xmm4, %xmm0, %xmm3
vextractf128 $1, %ymm9, %xmm0
vpcmpgtq %xmm0, %xmm1, %xmm1
vpcmpgtq %xmm9, %xmm5, %xmm7
vshufps $-120, %xmm1, %xmm7, %xmm1 ## xmm1 = xmm7[0,2],xmm1[0,2]
vpshufb %xmm4, %xmm1, %xmm1
vpunpcklqdq %xmm1, %xmm3, %xmm1 ## xmm1 = xmm3[0],xmm1[0]
vpmovzxwd %xmm1, %xmm3
vpslld $31, %xmm3, %xmm3
vpsrad $31, %xmm3, %xmm3
vpunpckhwd %xmm1, %xmm1, %xmm1 ## xmm1 = xmm1[4,4,5,5,6,6,7,7]
vpslld $31, %xmm1, %xmm1
vpsrad $31, %xmm1, %xmm1
vinsertf128 $1, %xmm1, %ymm3, %ymm1
vandps %ymm13, %ymm1, %ymm1
vmovmskps %ymm1, %eax
testl %eax, %eax
je LBB1_23
## BB#17: ## %for_loop155
## in Loop: Header=BB1_16 Depth=3
vcmpnleps LCPI1_3(%rip), %ymm15, %ymm12
vandps %ymm1, %ymm12, %ymm10
vmovmskps %ymm10, %edx
vpxor %xmm3, %xmm3, %xmm3
testl %edx, %edx
je LBB1_18
## BB#24: ## %safe_if_run_true177
## in Loop: Header=BB1_16 Depth=3
vxorps %xmm13, %xmm13, %xmm13
vmovaps %ymm10, %ymm3
cmpl %eax, %edx
je LBB1_16
LBB1_18: ## %safe_if_after_true176
## in Loop: Header=BB1_15 Depth=2
vblendvps %ymm12, LCPI1_5(%rip), %ymm1, %ymm13
vmovmskps %ymm13, %eax
testl %eax, %eax
jne LBB1_25
## BB#19: ## in Loop: Header=BB1_15 Depth=2
vmovaps %ymm11, %ymm10
vmovups 288(%rsp), %ymm12 ## 32-byte Reload
vandnps %ymm1, %ymm3, %ymm13
jmp LBB1_15
LBB1_4: ## %for_exit
addq $320, %rsp ## imm = 0x140
popq %rbx
popq %r14
popq %rbp
vzeroupper
retq
.subsections_via_symbols
In [25]:
ISPC.ispc_llvm(func_code, func.file.compile_opts)
; ModuleID = '<stdin>'
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin13.4.0"
; Function Attrs: nounwind readnone
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) #0
; Function Attrs: nounwind
declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x float>, <8 x float>) #1
; Function Attrs: nounwind readnone
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) #0
; Function Attrs: nounwind
define void @ispc_func_1___unfunfun_3C_unf_3E_unIunIunIunIunIunfunf(float %x0, float %y0, float* noalias nocapture %output, i64 %output__len__1, i64 %output__len__2, i64 %max_iters, i64 %height, i64 %width, float %dx, float %dy, <8 x i32> %__mask) #1 {
allocas:
%floatmask.i = bitcast <8 x i32> %__mask to <8 x float>
%v.i = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i) #0
%cmp.i = icmp eq i32 %v.i, 255
%lessequal__width_load = icmp sgt i64 %width, 0
%width.op = add i64 %width, 1
%add__gensym318_stop_ = select i1 %lessequal__width_load, i64 %width.op, i64 1
%add_height_load_ = add i64 %height, 1
%add_height_load__to_int32 = trunc i64 %add_height_load_ to i32
%nitems = add i32 %add_height_load__to_int32, -1
%nextras = srem i32 %nitems, 8
%aligned_end = sub i32 %add_height_load__to_int32, %nextras
%before_aligned_end44888 = icmp sgt i32 %aligned_end, 1
%y0_load_broadcast_init = insertelement <8 x float> undef, float %y0, i32 0
%y0_load_broadcast = shufflevector <8 x float> %y0_load_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%dy_load_broadcast_init = insertelement <8 x float> undef, float %dy, i32 0
%dy_load_broadcast = shufflevector <8 x float> %dy_load_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%max_iters_load_broadcast_init = insertelement <8 x i64> undef, i64 %max_iters, i32 0
%max_iters_load_broadcast = shufflevector <8 x i64> %max_iters_load_broadcast_init, <8 x i64> undef, <8 x i32> zeroinitializer
%output_load_ptr2int_2void = bitcast float* %output to i8*
br i1 %cmp.i, label %for_test.outer, label %for_test288.outer
for_test.outer: ; preds = %for_exit156, %allocas
%blend.i.i795850.ph = phi <8 x float> [ %blend.i.i795851.ph, %for_exit156 ], [ undef, %allocas ]
%blend.i.i798847.ph = phi <8 x float> [ %blend.i.i798848.ph, %for_exit156 ], [ undef, %allocas ]
%blend.i.i826837.ph = phi <8 x float> [ %blend.i.i826838.lcssa.lcssa, %for_exit156 ], [ undef, %allocas ]
%blend.i.i833.ph = phi <8 x float> [ %blend.i.i834.lcssa.lcssa, %for_exit156 ], [ undef, %allocas ]
%_s40.0.ph = phi i64 [ %add__s40_load30_.lcssa, %for_exit156 ], [ 1, %allocas ]
%internal_mask_memory.0.ph = phi <8 x i32> [ %"oldMask&test.lcssa882", %for_exit156 ], [ <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %allocas ]
br i1 %before_aligned_end44888, label %for_test.us, label %for_test
for_test.us: ; preds = %partial_inner_all_outer.us, %for_test.outer
%blend.i.i826837.us = phi <8 x float> [ %blend.i.i826839.ph.us, %partial_inner_all_outer.us ], [ %blend.i.i826837.ph, %for_test.outer ]
%blend.i.i833.us = phi <8 x float> [ %blend.i.i835.ph.us, %partial_inner_all_outer.us ], [ %blend.i.i833.ph, %for_test.outer ]
%_s40.0.us = phi i64 [ %add__s40_load30_.us, %partial_inner_all_outer.us ], [ %_s40.0.ph, %for_test.outer ]
%internal_mask_memory.0.us = phi <8 x i32> [ %"oldMask&test.us", %partial_inner_all_outer.us ], [ %internal_mask_memory.0.ph, %for_test.outer ]
%equal__s40_load_add__gensym318_stop_.us = icmp eq i64 %_s40.0.us, %add__gensym318_stop_
%equal__s40_load_add__gensym318_stop_to_i_bool.us = sext i1 %equal__s40_load_add__gensym318_stop_.us to i32
%equal__s40_load_add__gensym318_stop_to_i_bool_broadcast_init.us = insertelement <8 x i32> undef, i32 %equal__s40_load_add__gensym318_stop_to_i_bool.us, i32 0
%equal__s40_load_add__gensym318_stop_to_i_bool_broadcast.us = shufflevector <8 x i32> %equal__s40_load_add__gensym318_stop_to_i_bool_broadcast_init.us, <8 x i32> undef, <8 x i32> zeroinitializer
%val_load_logicalnot.i728.us = xor <8 x i32> %equal__s40_load_add__gensym318_stop_to_i_bool_broadcast.us, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%"oldMask&test.us" = and <8 x i32> %internal_mask_memory.0.us, %val_load_logicalnot.i728.us
%floatmask.i725.us = bitcast <8 x i32> %"oldMask&test.us" to <8 x float>
%v.i726.us = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i725.us) #0
%cmp.i727.us = icmp eq i32 %v.i726.us, 0
br i1 %cmp.i727.us, label %for_exit, label %foreach_full_body.lr.ph.us
partial_inner_all_outer.us: ; preds = %for_exit55.us
%before_full_end.us = icmp slt i32 %new_counter.us, %add_height_load__to_int32
br i1 %before_full_end.us, label %partial_inner_only, label %for_test.us
foreach_full_body.us: ; preds = %foreach_full_body.lr.ph.us, %for_exit55.us
%counter.1891.us = phi i32 [ 1, %foreach_full_body.lr.ph.us ], [ %new_counter.us, %for_exit55.us ]
%blend.i.i834890.us = phi <8 x float> [ %blend.i.i833.us, %foreach_full_body.lr.ph.us ], [ %blend.i.i835.ph.us, %for_exit55.us ]
%blend.i.i826838889.us = phi <8 x float> [ %blend.i.i826837.us, %foreach_full_body.lr.ph.us ], [ %blend.i.i826839.ph.us, %for_exit55.us ]
%smear_counter_init48.us = insertelement <8 x i32> undef, i32 %counter.1891.us, i32 0
%smear_counter49.us = shufflevector <8 x i32> %smear_counter_init48.us, <8 x i32> undef, <8 x i32> zeroinitializer
%iter_val50.us = add <8 x i32> %smear_counter49.us, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%j_load51_to_float.us = sitofp <8 x i32> %iter_val50.us to <8 x float>
%mul_j_load51_to_float_dy_load_broadcast.us = fmul <8 x float> %dy_load_broadcast, %j_load51_to_float.us
%add_y0_load_broadcast_mul_j_load51_to_float_dy_load_broadcast.us = fadd <8 x float> %y0_load_broadcast, %mul_j_load51_to_float_dy_load_broadcast.us
br label %for_test52.outer.us
for_test52.us: ; preds = %for_test52.outer.us, %safe_if_run_true.us
%internal_mask_memory.2.us = phi <8 x i32> [ zeroinitializer, %safe_if_run_true.us ], [ %internal_mask_memory.2.ph.us, %for_test52.outer.us ]
%"oldMask&test60.us" = select <8 x i1> %less___i_8514_load_max_iters_load_broadcast.us, <8 x i32> %internal_mask_memory.2.us, <8 x i32> zeroinitializer
%floatmask.i722.us = bitcast <8 x i32> %"oldMask&test60.us" to <8 x float>
%v.i723.us = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i722.us) #0
%cmp.i724.us = icmp eq i32 %v.i723.us, 0
br i1 %cmp.i724.us, label %for_exit55.us, label %for_loop54.us
for_loop54.us: ; preds = %for_test52.us
%"oldMask&test70.us" = select <8 x i1> %less__add_mul___z_re_8512_load___z_re_8512_load67_mul___z_im_8513_load___z_im_8513_load68.us, <8 x i32> %"oldMask&test60.us", <8 x i32> zeroinitializer
%floatmask.i719.us = bitcast <8 x i32> %"oldMask&test70.us" to <8 x float>
%v.i720.us = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i719.us) #0
%cmp.i721.us = icmp eq i32 %v.i720.us, 0
br i1 %cmp.i721.us, label %safe_if_after_true.us, label %safe_if_run_true.us
safe_if_run_true.us: ; preds = %for_loop54.us
%"equal_finished&func_internal_mask&function_mask66.us" = icmp eq i32 %v.i720.us, %v.i723.us
br i1 %"equal_finished&func_internal_mask&function_mask66.us", label %for_test52.us, label %safe_if_after_true.us
safe_if_after_true.us: ; preds = %safe_if_run_true.us, %for_loop54.us
%break_lanes_memory58.1.us = phi <8 x i32> [ %"oldMask&test70.us", %safe_if_run_true.us ], [ zeroinitializer, %for_loop54.us ]
%0 = bitcast <8 x i32> %"oldMask&test60.us" to <8 x float>
%floatmask.i716.us = select <8 x i1> %less__add_mul___z_re_8512_load___z_re_8512_load67_mul___z_im_8513_load___z_im_8513_load68.us, <8 x float> zeroinitializer, <8 x float> %0
%v.i717.us = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i716.us) #0
%cmp.i718.us = icmp eq i32 %v.i717.us, 0
br i1 %cmp.i718.us, label %if_done.us, label %safe_if_run_false.us
safe_if_run_false.us: ; preds = %safe_if_after_true.us
%sub_mul___z_re_8512_load82___z_re_8512_load83_mul___z_im_8513_load84___z_im_8513_load85.us = fsub <8 x float> %mul___z_re_8512_load___z_re_8512_load67.us, %mul___z_im_8513_load___z_im_8513_load68.us
%blend.i.i.us = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i835.ph.us, <8 x float> %sub_mul___z_re_8512_load82___z_re_8512_load83_mul___z_im_8513_load84___z_im_8513_load85.us, <8 x float> %floatmask.i716.us) #1
%mul____z_re_8512_load87.us = fmul <8 x float> %blend.i.i823828.ph.us, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
%mul_mul____z_re_8512_load87___z_im_8513_load88.us = fmul <8 x float> %blend.i.i820830.ph.us, %mul____z_re_8512_load87.us
%blend.i.i826.us = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i826839.ph.us, <8 x float> %mul_mul____z_re_8512_load87___z_im_8513_load88.us, <8 x float> %floatmask.i716.us) #1
%add_x_load90___new_re_8515_load.us = fadd <8 x float> %add_x0_load_mul_i_load_to_float_dx_load_broadcast.us, %blend.i.i.us
%blend.i.i823.us = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i823828.ph.us, <8 x float> %add_x_load90___new_re_8515_load.us, <8 x float> %floatmask.i716.us) #1
%add_y_load92___new_im_8516_load.us = fadd <8 x float> %add_y0_load_broadcast_mul_j_load51_to_float_dy_load_broadcast.us, %blend.i.i826.us
%blend.i.i820.us = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i820830.ph.us, <8 x float> %add_y_load92___new_im_8516_load.us, <8 x float> %floatmask.i716.us) #1
%add___i_8514_load94_.us = add <8 x i64> %final.i817832.ph.us, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
%old01.i803.us = shufflevector <8 x i64> %final.i817832.ph.us, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%old01f.i804.us = bitcast <4 x i64> %old01.i803.us to <8 x float>
%new01.i805.us = shufflevector <8 x i64> %add___i_8514_load94_.us, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%new01f.i806.us = bitcast <4 x i64> %new01.i805.us to <8 x float>
%mask01.i807.us = shufflevector <8 x float> %floatmask.i716.us, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
%result01f.i808.us = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f.i804.us, <8 x float> %new01f.i806.us, <8 x float> %mask01.i807.us) #1
%result01.i809.us = bitcast <8 x float> %result01f.i808.us to <4 x i64>
%old23.i810.us = shufflevector <8 x i64> %final.i817832.ph.us, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%old23f.i811.us = bitcast <4 x i64> %old23.i810.us to <8 x float>
%new23.i812.us = shufflevector <8 x i64> %add___i_8514_load94_.us, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%new23f.i813.us = bitcast <4 x i64> %new23.i812.us to <8 x float>
%mask23.i814.us = shufflevector <8 x float> %floatmask.i716.us, <8 x float> undef, <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
%result23f.i815.us = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old23f.i811.us, <8 x float> %new23f.i813.us, <8 x float> %mask23.i814.us) #1
%result23.i816.us = bitcast <8 x float> %result23f.i815.us to <4 x i64>
%final.i817.us = shufflevector <4 x i64> %result01.i809.us, <4 x i64> %result23.i816.us, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
br label %if_done.us
if_done.us: ; preds = %safe_if_run_false.us, %safe_if_after_true.us
%blend.i.i826840.us = phi <8 x float> [ %blend.i.i826839.ph.us, %safe_if_after_true.us ], [ %blend.i.i826.us, %safe_if_run_false.us ]
%blend.i.i836.us = phi <8 x float> [ %blend.i.i835.ph.us, %safe_if_after_true.us ], [ %blend.i.i.us, %safe_if_run_false.us ]
%final.i817831.us = phi <8 x i64> [ %final.i817832.ph.us, %safe_if_after_true.us ], [ %final.i817.us, %safe_if_run_false.us ]
%blend.i.i820829.us = phi <8 x float> [ %blend.i.i820830.ph.us, %safe_if_after_true.us ], [ %blend.i.i820.us, %safe_if_run_false.us ]
%blend.i.i823827.us = phi <8 x float> [ %blend.i.i823828.ph.us, %safe_if_after_true.us ], [ %blend.i.i823.us, %safe_if_run_false.us ]
%"!(break|continue)_lanes.us" = xor <8 x i32> %break_lanes_memory58.1.us, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%new_mask102.us = and <8 x i32> %"oldMask&test60.us", %"!(break|continue)_lanes.us"
br label %for_test52.outer.us
for_exit55.us: ; preds = %for_test52.us
%_gensym5_load_to_float.us = sitofp <8 x i64> %final.i817832.ph.us to <8 x float>
%smear_counter49_cast.elt0.us = zext i32 %counter.1891.us to i64
%add_mul_output__len__1_load_sub_i_load115__broadcast_smear_counter49_cast.elt0.us = add i64 %smear_counter49_cast.elt0.us, %mul_output__len__1_load_sub_i_load115_.us
%add_mul_output__len__1_load_sub_i_load115__broadcast_smear_counter49_cast_cast.elt0.us = trunc i64 %add_mul_output__len__1_load_sub_i_load115__broadcast_smear_counter49_cast.elt0.us to i32
%shl_add_mul_output__len__1_load_sub_i_load115__broadcast_smear_counter49_cast_cast_.elt0.us = shl i32 %add_mul_output__len__1_load_sub_i_load115__broadcast_smear_counter49_cast_cast.elt0.us, 2
%"varying+const_offsets.elt0.us" = add i32 %shl_add_mul_output__len__1_load_sub_i_load115__broadcast_smear_counter49_cast_cast_.elt0.us, -4
%1 = sext i32 %"varying+const_offsets.elt0.us" to i64
%ptr.us = getelementptr i8* %output_load_ptr2int_2void, i64 %1, !filename !2, !first_line !3, !first_column !4, !last_line !3, !last_column !5
%ptrcast.us = bitcast i8* %ptr.us to <8 x float>*
store <8 x float> %_gensym5_load_to_float.us, <8 x float>* %ptrcast.us, align 4, !filename !2, !first_line !3, !first_column !4, !last_line !3, !last_column !5
%new_counter.us = add i32 %counter.1891.us, 8
%before_aligned_end44.us = icmp slt i32 %new_counter.us, %aligned_end
br i1 %before_aligned_end44.us, label %foreach_full_body.us, label %partial_inner_all_outer.us
for_test52.outer.us: ; preds = %if_done.us, %foreach_full_body.us
%blend.i.i826839.ph.us = phi <8 x float> [ %blend.i.i826840.us, %if_done.us ], [ %blend.i.i826838889.us, %foreach_full_body.us ]
%blend.i.i835.ph.us = phi <8 x float> [ %blend.i.i836.us, %if_done.us ], [ %blend.i.i834890.us, %foreach_full_body.us ]
%final.i817832.ph.us = phi <8 x i64> [ %final.i817831.us, %if_done.us ], [ zeroinitializer, %foreach_full_body.us ]
%blend.i.i820830.ph.us = phi <8 x float> [ %blend.i.i820829.us, %if_done.us ], [ %add_y0_load_broadcast_mul_j_load51_to_float_dy_load_broadcast.us, %foreach_full_body.us ]
%blend.i.i823828.ph.us = phi <8 x float> [ %blend.i.i823827.us, %if_done.us ], [ %add_x0_load_mul_i_load_to_float_dx_load_broadcast.us, %foreach_full_body.us ]
%internal_mask_memory.2.ph.us = phi <8 x i32> [ %new_mask102.us, %if_done.us ], [ <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %foreach_full_body.us ]
%less___i_8514_load_max_iters_load_broadcast.us = icmp slt <8 x i64> %final.i817832.ph.us, %max_iters_load_broadcast
%mul___z_re_8512_load___z_re_8512_load67.us = fmul <8 x float> %blend.i.i823828.ph.us, %blend.i.i823828.ph.us
%mul___z_im_8513_load___z_im_8513_load68.us = fmul <8 x float> %blend.i.i820830.ph.us, %blend.i.i820830.ph.us
%add_mul___z_re_8512_load___z_re_8512_load67_mul___z_im_8513_load___z_im_8513_load68.us = fadd <8 x float> %mul___z_im_8513_load___z_im_8513_load68.us, %mul___z_re_8512_load___z_re_8512_load67.us
%less__add_mul___z_re_8512_load___z_re_8512_load67_mul___z_im_8513_load___z_im_8513_load68.us = fcmp ugt <8 x float> %add_mul___z_re_8512_load___z_re_8512_load67_mul___z_im_8513_load___z_im_8513_load68.us, <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>
br label %for_test52.us
foreach_full_body.lr.ph.us: ; preds = %for_test.us
%add__s40_load30_.us = add i64 %_s40.0.us, 1
%i_load_to_float.us = sitofp i64 %_s40.0.us to float
%mul_i_load_to_float_dx_load.us = fmul float %i_load_to_float.us, %dx
%add_x0_load_mul_i_load_to_float_dx_load.us = fadd float %mul_i_load_to_float_dx_load.us, %x0
%add_x0_load_mul_i_load_to_float_dx_load_broadcast_init.us = insertelement <8 x float> undef, float %add_x0_load_mul_i_load_to_float_dx_load.us, i32 0
%add_x0_load_mul_i_load_to_float_dx_load_broadcast.us = shufflevector <8 x float> %add_x0_load_mul_i_load_to_float_dx_load_broadcast_init.us, <8 x float> undef, <8 x i32> zeroinitializer
%sub_i_load115_.us = add i64 %_s40.0.us, -1
%mul_output__len__1_load_sub_i_load115_.us = mul i64 %sub_i_load115_.us, %output__len__1
br label %foreach_full_body.us
for_test: ; preds = %partial_inner_all_outer, %for_test.outer
%_s40.0 = phi i64 [ %add__s40_load30_, %partial_inner_all_outer ], [ %_s40.0.ph, %for_test.outer ]
%internal_mask_memory.0 = phi <8 x i32> [ %"oldMask&test", %partial_inner_all_outer ], [ %internal_mask_memory.0.ph, %for_test.outer ]
%equal__s40_load_add__gensym318_stop_ = icmp eq i64 %_s40.0, %add__gensym318_stop_
%equal__s40_load_add__gensym318_stop_to_i_bool = sext i1 %equal__s40_load_add__gensym318_stop_ to i32
%equal__s40_load_add__gensym318_stop_to_i_bool_broadcast_init = insertelement <8 x i32> undef, i32 %equal__s40_load_add__gensym318_stop_to_i_bool, i32 0
%equal__s40_load_add__gensym318_stop_to_i_bool_broadcast = shufflevector <8 x i32> %equal__s40_load_add__gensym318_stop_to_i_bool_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
%val_load_logicalnot.i728 = xor <8 x i32> %equal__s40_load_add__gensym318_stop_to_i_bool_broadcast, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%"oldMask&test" = and <8 x i32> %internal_mask_memory.0, %val_load_logicalnot.i728
%floatmask.i725 = bitcast <8 x i32> %"oldMask&test" to <8 x float>
%v.i726 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i725) #0
%cmp.i727 = icmp eq i32 %v.i726, 0
br i1 %cmp.i727, label %for_exit, label %partial_inner_all_outer
for_exit: ; preds = %for_test288, %for_test, %for_test.us
ret void
partial_inner_all_outer: ; preds = %for_test
%add__s40_load30_ = add i64 %_s40.0, 1
%before_full_end = icmp sgt i32 %add_height_load__to_int32, 1
br i1 %before_full_end, label %partial_inner_only, label %for_test
partial_inner_only: ; preds = %partial_inner_all_outer, %partial_inner_all_outer.us
%add__s40_load30_.lcssa = phi i64 [ %add__s40_load30_.us, %partial_inner_all_outer.us ], [ %add__s40_load30_, %partial_inner_all_outer ]
%"oldMask&test.lcssa882" = phi <8 x i32> [ %"oldMask&test.us", %partial_inner_all_outer.us ], [ %"oldMask&test", %partial_inner_all_outer ]
%_s40.0.lcssa881 = phi i64 [ %_s40.0.us, %partial_inner_all_outer.us ], [ %_s40.0, %partial_inner_all_outer ]
%counter.1.lcssa.lcssa = phi i32 [ %new_counter.us, %partial_inner_all_outer.us ], [ 1, %partial_inner_all_outer ]
%blend.i.i834.lcssa.lcssa = phi <8 x float> [ %blend.i.i835.ph.us, %partial_inner_all_outer.us ], [ %blend.i.i833.ph, %partial_inner_all_outer ]
%blend.i.i826838.lcssa.lcssa = phi <8 x float> [ %blend.i.i826839.ph.us, %partial_inner_all_outer.us ], [ %blend.i.i826837.ph, %partial_inner_all_outer ]
%smear_counter_init128 = insertelement <8 x i32> undef, i32 %counter.1.lcssa.lcssa, i32 0
%smear_counter129 = shufflevector <8 x i32> %smear_counter_init128, <8 x i32> undef, <8 x i32> zeroinitializer
%iter_val130 = add <8 x i32> %smear_counter129, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%smear_end_init131 = insertelement <8 x i32> undef, i32 %add_height_load__to_int32, i32 0
%smear_end132 = shufflevector <8 x i32> %smear_end_init131, <8 x i32> undef, <8 x i32> zeroinitializer
%cmp133 = icmp slt <8 x i32> %iter_val130, %smear_end132
%cmp133_to_boolvec = sext <8 x i1> %cmp133 to <8 x i32>
%i_load140_to_float = sitofp i64 %_s40.0.lcssa881 to float
%mul_i_load140_to_float_dx_load141 = fmul float %i_load140_to_float, %dx
%add_x0_load139_mul_i_load140_to_float_dx_load141 = fadd float %mul_i_load140_to_float_dx_load141, %x0
%add_x0_load139_mul_i_load140_to_float_dx_load141_broadcast_init = insertelement <8 x float> undef, float %add_x0_load139_mul_i_load140_to_float_dx_load141, i32 0
%add_x0_load139_mul_i_load140_to_float_dx_load141_broadcast = shufflevector <8 x float> %add_x0_load139_mul_i_load140_to_float_dx_load141_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%j_load144_to_float = sitofp <8 x i32> %iter_val130 to <8 x float>
%mul_j_load144_to_float_dy_load145_broadcast = fmul <8 x float> %dy_load_broadcast, %j_load144_to_float
%add_y0_load143_broadcast_mul_j_load144_to_float_dy_load145_broadcast = fadd <8 x float> %y0_load_broadcast, %mul_j_load144_to_float_dy_load145_broadcast
br label %for_test153.outer
for_test153.outer: ; preds = %if_done175, %partial_inner_only
%blend.i.i795851.ph = phi <8 x float> [ %blend.i.i795852, %if_done175 ], [ %blend.i.i795850.ph, %partial_inner_only ]
%blend.i.i798848.ph = phi <8 x float> [ %blend.i.i798849, %if_done175 ], [ %blend.i.i798847.ph, %partial_inner_only ]
%final.i786846.ph = phi <8 x i64> [ %final.i786845, %if_done175 ], [ zeroinitializer, %partial_inner_only ]
%blend.i.i789844.ph = phi <8 x float> [ %blend.i.i789843, %if_done175 ], [ %add_y0_load143_broadcast_mul_j_load144_to_float_dy_load145_broadcast, %partial_inner_only ]
%blend.i.i792842.ph = phi <8 x float> [ %blend.i.i792841, %if_done175 ], [ %add_x0_load139_mul_i_load140_to_float_dx_load141_broadcast, %partial_inner_only ]
%internal_mask_memory.4.ph = phi <8 x i32> [ %new_mask229, %if_done175 ], [ %cmp133_to_boolvec, %partial_inner_only ]
%less___i_8514_load160_max_iters_load161_broadcast = icmp slt <8 x i64> %final.i786846.ph, %max_iters_load_broadcast
%mul___z_re_8512_load170___z_re_8512_load171 = fmul <8 x float> %blend.i.i792842.ph, %blend.i.i792842.ph
%mul___z_im_8513_load172___z_im_8513_load173 = fmul <8 x float> %blend.i.i789844.ph, %blend.i.i789844.ph
%add_mul___z_re_8512_load170___z_re_8512_load171_mul___z_im_8513_load172___z_im_8513_load173 = fadd <8 x float> %mul___z_im_8513_load172___z_im_8513_load173, %mul___z_re_8512_load170___z_re_8512_load171
%less__add_mul___z_re_8512_load170___z_re_8512_load171_mul___z_im_8513_load172___z_im_8513_load173 = fcmp ugt <8 x float> %add_mul___z_re_8512_load170___z_re_8512_load171_mul___z_im_8513_load172___z_im_8513_load173, <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>
br label %for_test153
for_test153: ; preds = %safe_if_run_true177, %for_test153.outer
%internal_mask_memory.4 = phi <8 x i32> [ zeroinitializer, %safe_if_run_true177 ], [ %internal_mask_memory.4.ph, %for_test153.outer ]
%"oldMask&test163" = select <8 x i1> %less___i_8514_load160_max_iters_load161_broadcast, <8 x i32> %internal_mask_memory.4, <8 x i32> zeroinitializer
%floatmask.i707 = bitcast <8 x i32> %"oldMask&test163" to <8 x float>
%v.i708 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i707) #0
%cmp.i709 = icmp eq i32 %v.i708, 0
br i1 %cmp.i709, label %for_exit156, label %for_loop155
for_loop155: ; preds = %for_test153
%"oldMask&test178" = select <8 x i1> %less__add_mul___z_re_8512_load170___z_re_8512_load171_mul___z_im_8513_load172___z_im_8513_load173, <8 x i32> %"oldMask&test163", <8 x i32> zeroinitializer
%floatmask.i704 = bitcast <8 x i32> %"oldMask&test178" to <8 x float>
%v.i705 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i704) #0
%cmp.i706 = icmp eq i32 %v.i705, 0
br i1 %cmp.i706, label %safe_if_after_true176, label %safe_if_run_true177
for_exit156: ; preds = %for_test153
%sub_i_load246_ = add i64 %_s40.0.lcssa881, -1
%mul_output__len__1_load245_sub_i_load246_ = mul i64 %sub_i_load246_, %output__len__1
%_gensym5_load248_to_float = sitofp <8 x i64> %final.i786846.ph to <8 x float>
%j.0_cast.elt0 = zext i32 %counter.1.lcssa.lcssa to i64
%add_mul_output__len__1_load245_sub_i_load246__broadcast_j.0_cast.elt0 = add i64 %j.0_cast.elt0, %mul_output__len__1_load245_sub_i_load246_
%add_mul_output__len__1_load245_sub_i_load246__broadcast_j.0_cast_cast.elt0 = trunc i64 %add_mul_output__len__1_load245_sub_i_load246__broadcast_j.0_cast.elt0 to i32
%shl_add_mul_output__len__1_load245_sub_i_load246__broadcast_j.0_cast_cast_.elt0 = shl i32 %add_mul_output__len__1_load245_sub_i_load246__broadcast_j.0_cast_cast.elt0, 2
%"varying+const_offsets.elt0646" = add i32 %shl_add_mul_output__len__1_load245_sub_i_load246__broadcast_j.0_cast_cast_.elt0, -4
%2 = sext i32 %"varying+const_offsets.elt0646" to i64
%ptr647 = getelementptr i8* %output_load_ptr2int_2void, i64 %2
%mask.i.i800 = bitcast <8 x i32> %cmp133_to_boolvec to <8 x float>
call void @llvm.x86.avx.maskstore.ps.256(i8* %ptr647, <8 x float> %mask.i.i800, <8 x float> %_gensym5_load248_to_float) #1
br label %for_test.outer
if_done175: ; preds = %safe_if_run_false196, %safe_if_after_true176
%blend.i.i795852 = phi <8 x float> [ %blend.i.i795851.ph, %safe_if_after_true176 ], [ %blend.i.i795, %safe_if_run_false196 ]
%blend.i.i798849 = phi <8 x float> [ %blend.i.i798848.ph, %safe_if_after_true176 ], [ %blend.i.i798, %safe_if_run_false196 ]
%final.i786845 = phi <8 x i64> [ %final.i786846.ph, %safe_if_after_true176 ], [ %final.i786, %safe_if_run_false196 ]
%blend.i.i789843 = phi <8 x float> [ %blend.i.i789844.ph, %safe_if_after_true176 ], [ %blend.i.i789, %safe_if_run_false196 ]
%blend.i.i792841 = phi <8 x float> [ %blend.i.i792842.ph, %safe_if_after_true176 ], [ %blend.i.i792, %safe_if_run_false196 ]
%"!(break|continue)_lanes227" = xor <8 x i32> %break_lanes_memory159.1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%new_mask229 = and <8 x i32> %"oldMask&test163", %"!(break|continue)_lanes227"
br label %for_test153.outer
safe_if_after_true176: ; preds = %safe_if_run_true177, %for_loop155
%break_lanes_memory159.1 = phi <8 x i32> [ %"oldMask&test178", %safe_if_run_true177 ], [ zeroinitializer, %for_loop155 ]
%3 = bitcast <8 x i32> %"oldMask&test163" to <8 x float>
%floatmask.i701 = select <8 x i1> %less__add_mul___z_re_8512_load170___z_re_8512_load171_mul___z_im_8513_load172___z_im_8513_load173, <8 x float> zeroinitializer, <8 x float> %3
%v.i702 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i701) #0
%cmp.i703 = icmp eq i32 %v.i702, 0
br i1 %cmp.i703, label %if_done175, label %safe_if_run_false196
safe_if_run_true177: ; preds = %for_loop155
%"equal_finished&func193_internal_mask&function_mask169" = icmp eq i32 %v.i705, %v.i708
br i1 %"equal_finished&func193_internal_mask&function_mask169", label %for_test153, label %safe_if_after_true176
safe_if_run_false196: ; preds = %safe_if_after_true176
%sub_mul___z_re_8512_load203___z_re_8512_load204_mul___z_im_8513_load205___z_im_8513_load206 = fsub <8 x float> %mul___z_re_8512_load170___z_re_8512_load171, %mul___z_im_8513_load172___z_im_8513_load173
%blend.i.i798 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i798848.ph, <8 x float> %sub_mul___z_re_8512_load203___z_re_8512_load204_mul___z_im_8513_load205___z_im_8513_load206, <8 x float> %floatmask.i701) #1
%mul____z_re_8512_load208 = fmul <8 x float> %blend.i.i792842.ph, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
%mul_mul____z_re_8512_load208___z_im_8513_load209 = fmul <8 x float> %blend.i.i789844.ph, %mul____z_re_8512_load208
%blend.i.i795 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i795851.ph, <8 x float> %mul_mul____z_re_8512_load208___z_im_8513_load209, <8 x float> %floatmask.i701) #1
%add_x_load211___new_re_8515_load212 = fadd <8 x float> %add_x0_load139_mul_i_load140_to_float_dx_load141_broadcast, %blend.i.i798
%blend.i.i792 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i792842.ph, <8 x float> %add_x_load211___new_re_8515_load212, <8 x float> %floatmask.i701) #1
%add_y_load214___new_im_8516_load215 = fadd <8 x float> %add_y0_load143_broadcast_mul_j_load144_to_float_dy_load145_broadcast, %blend.i.i795
%blend.i.i789 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i789844.ph, <8 x float> %add_y_load214___new_im_8516_load215, <8 x float> %floatmask.i701) #1
%add___i_8514_load217_ = add <8 x i64> %final.i786846.ph, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
%old01.i772 = shufflevector <8 x i64> %final.i786846.ph, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%old01f.i773 = bitcast <4 x i64> %old01.i772 to <8 x float>
%new01.i774 = shufflevector <8 x i64> %add___i_8514_load217_, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%new01f.i775 = bitcast <4 x i64> %new01.i774 to <8 x float>
%mask01.i776 = shufflevector <8 x float> %floatmask.i701, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
%result01f.i777 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f.i773, <8 x float> %new01f.i775, <8 x float> %mask01.i776) #1
%result01.i778 = bitcast <8 x float> %result01f.i777 to <4 x i64>
%old23.i779 = shufflevector <8 x i64> %final.i786846.ph, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%old23f.i780 = bitcast <4 x i64> %old23.i779 to <8 x float>
%new23.i781 = shufflevector <8 x i64> %add___i_8514_load217_, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%new23f.i782 = bitcast <4 x i64> %new23.i781 to <8 x float>
%mask23.i783 = shufflevector <8 x float> %floatmask.i701, <8 x float> undef, <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
%result23f.i784 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old23f.i780, <8 x float> %new23f.i782, <8 x float> %mask23.i783) #1
%result23.i785 = bitcast <8 x float> %result23f.i784 to <4 x i64>
%final.i786 = shufflevector <4 x i64> %result01.i778, <4 x i64> %result23.i785, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
br label %if_done175
for_test288: ; preds = %for_test288.outer, %partial_inner_all_outer353
%blend.i.i766863 = phi <8 x float> [ %blend.i.i766864.lcssa, %partial_inner_all_outer353 ], [ %blend.i.i766863.ph, %for_test288.outer ]
%blend.i.i769859 = phi <8 x float> [ %blend.i.i769860.lcssa, %partial_inner_all_outer353 ], [ %blend.i.i769859.ph, %for_test288.outer ]
%_s40285.0 = phi i64 [ %add__s40_load312_, %partial_inner_all_outer353 ], [ %_s40285.0.ph, %for_test288.outer ]
%internal_mask_memory.6 = phi <8 x i32> [ %"oldMask&test302", %partial_inner_all_outer353 ], [ %internal_mask_memory.6.ph, %for_test288.outer ]
%equal__s40_load295_add__gensym3276296_stop_ = icmp eq i64 %_s40285.0, %add__gensym318_stop_
%equal__s40_load295_add__gensym3276296_stop_to_i_bool = sext i1 %equal__s40_load295_add__gensym3276296_stop_ to i32
%equal__s40_load295_add__gensym3276296_stop_to_i_bool_broadcast_init = insertelement <8 x i32> undef, i32 %equal__s40_load295_add__gensym3276296_stop_to_i_bool, i32 0
%equal__s40_load295_add__gensym3276296_stop_to_i_bool_broadcast = shufflevector <8 x i32> %equal__s40_load295_add__gensym3276296_stop_to_i_bool_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
%val_load_logicalnot.i = xor <8 x i32> %equal__s40_load295_add__gensym3276296_stop_to_i_bool_broadcast, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%"oldMask&test302" = and <8 x i32> %internal_mask_memory.6, %val_load_logicalnot.i
%"internal_mask&function_mask306" = and <8 x i32> %"oldMask&test302", %__mask
%floatmask.i692 = bitcast <8 x i32> %"internal_mask&function_mask306" to <8 x float>
%v.i693 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i692) #0
%cmp.i694 = icmp eq i32 %v.i693, 0
br i1 %cmp.i694, label %for_exit, label %for_loop290
for_loop290: ; preds = %for_test288
%add__s40_load312_ = add i64 %_s40285.0, 1
br i1 %before_aligned_end44888, label %foreach_full_body317.lr.ph, label %partial_inner_all_outer353
foreach_full_body317.lr.ph: ; preds = %for_loop290
%i_load365_to_float = sitofp i64 %_s40285.0 to float
%mul_i_load365_to_float_dx_load366 = fmul float %i_load365_to_float, %dx
%add_x0_load364_mul_i_load365_to_float_dx_load366 = fadd float %mul_i_load365_to_float_dx_load366, %x0
%add_x0_load364_mul_i_load365_to_float_dx_load366_broadcast_init = insertelement <8 x float> undef, float %add_x0_load364_mul_i_load365_to_float_dx_load366, i32 0
%add_x0_load364_mul_i_load365_to_float_dx_load366_broadcast = shufflevector <8 x float> %add_x0_load364_mul_i_load365_to_float_dx_load366_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%sub_i_load471_ = add i64 %_s40285.0, -1
%mul_output__len__1_load470_sub_i_load471_ = mul i64 %sub_i_load471_, %output__len__1
br label %foreach_full_body317
foreach_full_body317: ; preds = %for_exit381, %foreach_full_body317.lr.ph
%counter331.1897 = phi i32 [ 1, %foreach_full_body317.lr.ph ], [ %new_counter484, %for_exit381 ]
%blend.i.i769860896 = phi <8 x float> [ %blend.i.i769859, %foreach_full_body317.lr.ph ], [ %blend.i.i769861.ph, %for_exit381 ]
%blend.i.i766864895 = phi <8 x float> [ %blend.i.i766863, %foreach_full_body317.lr.ph ], [ %blend.i.i766865.ph, %for_exit381 ]
%smear_counter_init360 = insertelement <8 x i32> undef, i32 %counter331.1897, i32 0
%smear_counter361 = shufflevector <8 x i32> %smear_counter_init360, <8 x i32> undef, <8 x i32> zeroinitializer
%iter_val362 = add <8 x i32> %smear_counter361, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%j_load369_to_float = sitofp <8 x i32> %iter_val362 to <8 x float>
%mul_j_load369_to_float_dy_load370_broadcast = fmul <8 x float> %dy_load_broadcast, %j_load369_to_float
%add_y0_load368_broadcast_mul_j_load369_to_float_dy_load370_broadcast = fadd <8 x float> %y0_load_broadcast, %mul_j_load369_to_float_dy_load370_broadcast
br label %for_test378.outer
partial_inner_all_outer353: ; preds = %for_exit381, %for_loop290
%counter331.1.lcssa = phi i32 [ 1, %for_loop290 ], [ %new_counter484, %for_exit381 ]
%blend.i.i769860.lcssa = phi <8 x float> [ %blend.i.i769859, %for_loop290 ], [ %blend.i.i769861.ph, %for_exit381 ]
%blend.i.i766864.lcssa = phi <8 x float> [ %blend.i.i766863, %for_loop290 ], [ %blend.i.i766865.ph, %for_exit381 ]
%before_full_end487 = icmp slt i32 %counter331.1.lcssa, %add_height_load__to_int32
br i1 %before_full_end487, label %partial_inner_only485, label %for_test288
for_test378: ; preds = %safe_if_run_true402, %for_test378.outer
%internal_mask_memory.8 = phi <8 x i32> [ zeroinitializer, %safe_if_run_true402 ], [ %internal_mask_memory.8.ph, %for_test378.outer ]
%"oldMask&test388" = select <8 x i1> %less___i_8514_load385_max_iters_load386_broadcast, <8 x i32> %internal_mask_memory.8, <8 x i32> zeroinitializer
%floatmask.i689 = bitcast <8 x i32> %"oldMask&test388" to <8 x float>
%v.i690 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i689) #0
%cmp.i691 = icmp eq i32 %v.i690, 0
br i1 %cmp.i691, label %for_exit381, label %for_loop380
for_loop380: ; preds = %for_test378
%"oldMask&test403" = select <8 x i1> %less__add_mul___z_re_8512_load395___z_re_8512_load396_mul___z_im_8513_load397___z_im_8513_load398, <8 x i32> %"oldMask&test388", <8 x i32> zeroinitializer
%floatmask.i686 = bitcast <8 x i32> %"oldMask&test403" to <8 x float>
%v.i687 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i686) #0
%cmp.i688 = icmp eq i32 %v.i687, 0
br i1 %cmp.i688, label %safe_if_after_true401, label %safe_if_run_true402
for_exit381: ; preds = %for_test378
%_gensym5_load473_to_float = sitofp <8 x i64> %final.i757858.ph to <8 x float>
%smear_counter361_cast.elt0 = zext i32 %counter331.1897 to i64
%add_mul_output__len__1_load470_sub_i_load471__broadcast_smear_counter361_cast.elt0 = add i64 %smear_counter361_cast.elt0, %mul_output__len__1_load470_sub_i_load471_
%add_mul_output__len__1_load470_sub_i_load471__broadcast_smear_counter361_cast_cast.elt0 = trunc i64 %add_mul_output__len__1_load470_sub_i_load471__broadcast_smear_counter361_cast.elt0 to i32
%shl_add_mul_output__len__1_load470_sub_i_load471__broadcast_smear_counter361_cast_cast_.elt0 = shl i32 %add_mul_output__len__1_load470_sub_i_load471__broadcast_smear_counter361_cast_cast.elt0, 2
%"varying+const_offsets.elt0652" = add i32 %shl_add_mul_output__len__1_load470_sub_i_load471__broadcast_smear_counter361_cast_cast_.elt0, -4
%4 = sext i32 %"varying+const_offsets.elt0652" to i64
%ptr653 = getelementptr i8* %output_load_ptr2int_2void, i64 %4, !filename !2, !first_line !3, !first_column !4, !last_line !3, !last_column !5
%ptrcast654 = bitcast i8* %ptr653 to <8 x float>*
store <8 x float> %_gensym5_load473_to_float, <8 x float>* %ptrcast654, align 4, !filename !2, !first_line !3, !first_column !4, !last_line !3, !last_column !5
%new_counter484 = add i32 %counter331.1897, 8
%before_aligned_end355 = icmp slt i32 %new_counter484, %aligned_end
br i1 %before_aligned_end355, label %foreach_full_body317, label %partial_inner_all_outer353
if_done400: ; preds = %safe_if_run_false421, %safe_if_after_true401
%blend.i.i766866 = phi <8 x float> [ %blend.i.i766865.ph, %safe_if_after_true401 ], [ %blend.i.i766, %safe_if_run_false421 ]
%blend.i.i769862 = phi <8 x float> [ %blend.i.i769861.ph, %safe_if_after_true401 ], [ %blend.i.i769, %safe_if_run_false421 ]
%final.i757857 = phi <8 x i64> [ %final.i757858.ph, %safe_if_after_true401 ], [ %final.i757, %safe_if_run_false421 ]
%blend.i.i760855 = phi <8 x float> [ %blend.i.i760856.ph, %safe_if_after_true401 ], [ %blend.i.i760, %safe_if_run_false421 ]
%blend.i.i763853 = phi <8 x float> [ %blend.i.i763854.ph, %safe_if_after_true401 ], [ %blend.i.i763, %safe_if_run_false421 ]
%"!(break|continue)_lanes452" = xor <8 x i32> %break_lanes_memory384.1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%new_mask454 = and <8 x i32> %"oldMask&test388", %"!(break|continue)_lanes452"
br label %for_test378.outer
for_test378.outer: ; preds = %if_done400, %foreach_full_body317
%blend.i.i766865.ph = phi <8 x float> [ %blend.i.i766866, %if_done400 ], [ %blend.i.i766864895, %foreach_full_body317 ]
%blend.i.i769861.ph = phi <8 x float> [ %blend.i.i769862, %if_done400 ], [ %blend.i.i769860896, %foreach_full_body317 ]
%final.i757858.ph = phi <8 x i64> [ %final.i757857, %if_done400 ], [ zeroinitializer, %foreach_full_body317 ]
%blend.i.i760856.ph = phi <8 x float> [ %blend.i.i760855, %if_done400 ], [ %add_y0_load368_broadcast_mul_j_load369_to_float_dy_load370_broadcast, %foreach_full_body317 ]
%blend.i.i763854.ph = phi <8 x float> [ %blend.i.i763853, %if_done400 ], [ %add_x0_load364_mul_i_load365_to_float_dx_load366_broadcast, %foreach_full_body317 ]
%internal_mask_memory.8.ph = phi <8 x i32> [ %new_mask454, %if_done400 ], [ <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %foreach_full_body317 ]
%less___i_8514_load385_max_iters_load386_broadcast = icmp slt <8 x i64> %final.i757858.ph, %max_iters_load_broadcast
%mul___z_re_8512_load395___z_re_8512_load396 = fmul <8 x float> %blend.i.i763854.ph, %blend.i.i763854.ph
%mul___z_im_8513_load397___z_im_8513_load398 = fmul <8 x float> %blend.i.i760856.ph, %blend.i.i760856.ph
%add_mul___z_re_8512_load395___z_re_8512_load396_mul___z_im_8513_load397___z_im_8513_load398 = fadd <8 x float> %mul___z_im_8513_load397___z_im_8513_load398, %mul___z_re_8512_load395___z_re_8512_load396
%less__add_mul___z_re_8512_load395___z_re_8512_load396_mul___z_im_8513_load397___z_im_8513_load398 = fcmp ugt <8 x float> %add_mul___z_re_8512_load395___z_re_8512_load396_mul___z_im_8513_load397___z_im_8513_load398, <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>
br label %for_test378
safe_if_after_true401: ; preds = %safe_if_run_true402, %for_loop380
%break_lanes_memory384.1 = phi <8 x i32> [ %"oldMask&test403", %safe_if_run_true402 ], [ zeroinitializer, %for_loop380 ]
%5 = bitcast <8 x i32> %"oldMask&test388" to <8 x float>
%floatmask.i683 = select <8 x i1> %less__add_mul___z_re_8512_load395___z_re_8512_load396_mul___z_im_8513_load397___z_im_8513_load398, <8 x float> zeroinitializer, <8 x float> %5
%v.i684 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i683) #0
%cmp.i685 = icmp eq i32 %v.i684, 0
br i1 %cmp.i685, label %if_done400, label %safe_if_run_false421
safe_if_run_true402: ; preds = %for_loop380
%"equal_finished&func418_internal_mask&function_mask394" = icmp eq i32 %v.i687, %v.i690
br i1 %"equal_finished&func418_internal_mask&function_mask394", label %for_test378, label %safe_if_after_true401
safe_if_run_false421: ; preds = %safe_if_after_true401
%sub_mul___z_re_8512_load428___z_re_8512_load429_mul___z_im_8513_load430___z_im_8513_load431 = fsub <8 x float> %mul___z_re_8512_load395___z_re_8512_load396, %mul___z_im_8513_load397___z_im_8513_load398
%blend.i.i769 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i769861.ph, <8 x float> %sub_mul___z_re_8512_load428___z_re_8512_load429_mul___z_im_8513_load430___z_im_8513_load431, <8 x float> %floatmask.i683) #1
%mul____z_re_8512_load433 = fmul <8 x float> %blend.i.i763854.ph, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
%mul_mul____z_re_8512_load433___z_im_8513_load434 = fmul <8 x float> %blend.i.i760856.ph, %mul____z_re_8512_load433
%blend.i.i766 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i766865.ph, <8 x float> %mul_mul____z_re_8512_load433___z_im_8513_load434, <8 x float> %floatmask.i683) #1
%add_x_load436___new_re_8515_load437 = fadd <8 x float> %add_x0_load364_mul_i_load365_to_float_dx_load366_broadcast, %blend.i.i769
%blend.i.i763 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i763854.ph, <8 x float> %add_x_load436___new_re_8515_load437, <8 x float> %floatmask.i683) #1
%add_y_load439___new_im_8516_load440 = fadd <8 x float> %add_y0_load368_broadcast_mul_j_load369_to_float_dy_load370_broadcast, %blend.i.i766
%blend.i.i760 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i760856.ph, <8 x float> %add_y_load439___new_im_8516_load440, <8 x float> %floatmask.i683) #1
%add___i_8514_load442_ = add <8 x i64> %final.i757858.ph, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
%old01.i743 = shufflevector <8 x i64> %final.i757858.ph, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%old01f.i744 = bitcast <4 x i64> %old01.i743 to <8 x float>
%new01.i745 = shufflevector <8 x i64> %add___i_8514_load442_, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%new01f.i746 = bitcast <4 x i64> %new01.i745 to <8 x float>
%mask01.i747 = shufflevector <8 x float> %floatmask.i683, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
%result01f.i748 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f.i744, <8 x float> %new01f.i746, <8 x float> %mask01.i747) #1
%result01.i749 = bitcast <8 x float> %result01f.i748 to <4 x i64>
%old23.i750 = shufflevector <8 x i64> %final.i757858.ph, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%old23f.i751 = bitcast <4 x i64> %old23.i750 to <8 x float>
%new23.i752 = shufflevector <8 x i64> %add___i_8514_load442_, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%new23f.i753 = bitcast <4 x i64> %new23.i752 to <8 x float>
%mask23.i754 = shufflevector <8 x float> %floatmask.i683, <8 x float> undef, <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
%result23f.i755 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old23f.i751, <8 x float> %new23f.i753, <8 x float> %mask23.i754) #1
%result23.i756 = bitcast <8 x float> %result23f.i755 to <4 x i64>
%final.i757 = shufflevector <4 x i64> %result01.i749, <4 x i64> %result23.i756, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
br label %if_done400
partial_inner_only485: ; preds = %partial_inner_all_outer353
%smear_counter_init489 = insertelement <8 x i32> undef, i32 %counter331.1.lcssa, i32 0
%smear_counter490 = shufflevector <8 x i32> %smear_counter_init489, <8 x i32> undef, <8 x i32> zeroinitializer
%iter_val491 = add <8 x i32> %smear_counter490, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%smear_end_init492 = insertelement <8 x i32> undef, i32 %add_height_load__to_int32, i32 0
%smear_end493 = shufflevector <8 x i32> %smear_end_init492, <8 x i32> undef, <8 x i32> zeroinitializer
%cmp494 = icmp slt <8 x i32> %iter_val491, %smear_end493
%cmp494_to_boolvec = sext <8 x i1> %cmp494 to <8 x i32>
%i_load503_to_float = sitofp i64 %_s40285.0 to float
%mul_i_load503_to_float_dx_load504 = fmul float %i_load503_to_float, %dx
%add_x0_load502_mul_i_load503_to_float_dx_load504 = fadd float %mul_i_load503_to_float_dx_load504, %x0
%add_x0_load502_mul_i_load503_to_float_dx_load504_broadcast_init = insertelement <8 x float> undef, float %add_x0_load502_mul_i_load503_to_float_dx_load504, i32 0
%add_x0_load502_mul_i_load503_to_float_dx_load504_broadcast = shufflevector <8 x float> %add_x0_load502_mul_i_load503_to_float_dx_load504_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%j_load507_to_float = sitofp <8 x i32> %iter_val491 to <8 x float>
%mul_j_load507_to_float_dy_load508_broadcast = fmul <8 x float> %dy_load_broadcast, %j_load507_to_float
%add_y0_load506_broadcast_mul_j_load507_to_float_dy_load508_broadcast = fadd <8 x float> %y0_load_broadcast, %mul_j_load507_to_float_dy_load508_broadcast
br label %for_test516.outer
for_test516.outer: ; preds = %if_done538, %partial_inner_only485
%blend.i.i737877.ph = phi <8 x float> [ %blend.i.i737878, %if_done538 ], [ %blend.i.i737876.ph, %partial_inner_only485 ]
%blend.i.i740874.ph = phi <8 x float> [ %blend.i.i740875, %if_done538 ], [ %blend.i.i740873.ph, %partial_inner_only485 ]
%final.i872.ph = phi <8 x i64> [ %final.i871, %if_done538 ], [ zeroinitializer, %partial_inner_only485 ]
%blend.i.i731870.ph = phi <8 x float> [ %blend.i.i731869, %if_done538 ], [ %add_y0_load506_broadcast_mul_j_load507_to_float_dy_load508_broadcast, %partial_inner_only485 ]
%blend.i.i734868.ph = phi <8 x float> [ %blend.i.i734867, %if_done538 ], [ %add_x0_load502_mul_i_load503_to_float_dx_load504_broadcast, %partial_inner_only485 ]
%internal_mask_memory.10.ph = phi <8 x i32> [ %new_mask592, %if_done538 ], [ %cmp494_to_boolvec, %partial_inner_only485 ]
%less___i_8514_load523_max_iters_load524_broadcast = icmp slt <8 x i64> %final.i872.ph, %max_iters_load_broadcast
%mul___z_re_8512_load533___z_re_8512_load534 = fmul <8 x float> %blend.i.i734868.ph, %blend.i.i734868.ph
%mul___z_im_8513_load535___z_im_8513_load536 = fmul <8 x float> %blend.i.i731870.ph, %blend.i.i731870.ph
%add_mul___z_re_8512_load533___z_re_8512_load534_mul___z_im_8513_load535___z_im_8513_load536 = fadd <8 x float> %mul___z_im_8513_load535___z_im_8513_load536, %mul___z_re_8512_load533___z_re_8512_load534
%less__add_mul___z_re_8512_load533___z_re_8512_load534_mul___z_im_8513_load535___z_im_8513_load536 = fcmp ugt <8 x float> %add_mul___z_re_8512_load533___z_re_8512_load534_mul___z_im_8513_load535___z_im_8513_load536, <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>
br label %for_test516
for_test516: ; preds = %safe_if_run_true540, %for_test516.outer
%internal_mask_memory.10 = phi <8 x i32> [ zeroinitializer, %safe_if_run_true540 ], [ %internal_mask_memory.10.ph, %for_test516.outer ]
%"oldMask&test526" = select <8 x i1> %less___i_8514_load523_max_iters_load524_broadcast, <8 x i32> %internal_mask_memory.10, <8 x i32> zeroinitializer
%floatmask.i674 = bitcast <8 x i32> %"oldMask&test526" to <8 x float>
%v.i675 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i674) #0
%cmp.i676 = icmp eq i32 %v.i675, 0
br i1 %cmp.i676, label %for_exit519, label %for_loop518
for_loop518: ; preds = %for_test516
%"oldMask&test541" = select <8 x i1> %less__add_mul___z_re_8512_load533___z_re_8512_load534_mul___z_im_8513_load535___z_im_8513_load536, <8 x i32> %"oldMask&test526", <8 x i32> zeroinitializer
%floatmask.i671 = bitcast <8 x i32> %"oldMask&test541" to <8 x float>
%v.i672 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i671) #0
%cmp.i673 = icmp eq i32 %v.i672, 0
br i1 %cmp.i673, label %safe_if_after_true539, label %safe_if_run_true540
for_exit519: ; preds = %for_test516
%sub_i_load609_ = add i64 %_s40285.0, -1
%mul_output__len__1_load608_sub_i_load609_ = mul i64 %sub_i_load609_, %output__len__1
%_gensym5_load611_to_float = sitofp <8 x i64> %final.i872.ph to <8 x float>
%j332.0_cast.elt0 = zext i32 %counter331.1.lcssa to i64
%add_mul_output__len__1_load608_sub_i_load609__broadcast_j332.0_cast.elt0 = add i64 %j332.0_cast.elt0, %mul_output__len__1_load608_sub_i_load609_
%add_mul_output__len__1_load608_sub_i_load609__broadcast_j332.0_cast_cast.elt0 = trunc i64 %add_mul_output__len__1_load608_sub_i_load609__broadcast_j332.0_cast.elt0 to i32
%shl_add_mul_output__len__1_load608_sub_i_load609__broadcast_j332.0_cast_cast_.elt0 = shl i32 %add_mul_output__len__1_load608_sub_i_load609__broadcast_j332.0_cast_cast.elt0, 2
%"varying+const_offsets.elt0660" = add i32 %shl_add_mul_output__len__1_load608_sub_i_load609__broadcast_j332.0_cast_cast_.elt0, -4
%6 = sext i32 %"varying+const_offsets.elt0660" to i64
%ptr661 = getelementptr i8* %output_load_ptr2int_2void, i64 %6
%mask.i.i = bitcast <8 x i32> %cmp494_to_boolvec to <8 x float>
call void @llvm.x86.avx.maskstore.ps.256(i8* %ptr661, <8 x float> %mask.i.i, <8 x float> %_gensym5_load611_to_float) #1
br label %for_test288.outer
for_test288.outer: ; preds = %for_exit519, %allocas
%blend.i.i737876.ph = phi <8 x float> [ %blend.i.i737877.ph, %for_exit519 ], [ undef, %allocas ]
%blend.i.i740873.ph = phi <8 x float> [ %blend.i.i740874.ph, %for_exit519 ], [ undef, %allocas ]
%blend.i.i766863.ph = phi <8 x float> [ %blend.i.i766864.lcssa, %for_exit519 ], [ undef, %allocas ]
%blend.i.i769859.ph = phi <8 x float> [ %blend.i.i769860.lcssa, %for_exit519 ], [ undef, %allocas ]
%_s40285.0.ph = phi i64 [ %add__s40_load312_, %for_exit519 ], [ 1, %allocas ]
%internal_mask_memory.6.ph = phi <8 x i32> [ %"oldMask&test302", %for_exit519 ], [ <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %allocas ]
br label %for_test288
if_done538: ; preds = %safe_if_run_false559, %safe_if_after_true539
%blend.i.i737878 = phi <8 x float> [ %blend.i.i737877.ph, %safe_if_after_true539 ], [ %blend.i.i737, %safe_if_run_false559 ]
%blend.i.i740875 = phi <8 x float> [ %blend.i.i740874.ph, %safe_if_after_true539 ], [ %blend.i.i740, %safe_if_run_false559 ]
%final.i871 = phi <8 x i64> [ %final.i872.ph, %safe_if_after_true539 ], [ %final.i, %safe_if_run_false559 ]
%blend.i.i731869 = phi <8 x float> [ %blend.i.i731870.ph, %safe_if_after_true539 ], [ %blend.i.i731, %safe_if_run_false559 ]
%blend.i.i734867 = phi <8 x float> [ %blend.i.i734868.ph, %safe_if_after_true539 ], [ %blend.i.i734, %safe_if_run_false559 ]
%"!(break|continue)_lanes590" = xor <8 x i32> %break_lanes_memory522.1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%new_mask592 = and <8 x i32> %"oldMask&test526", %"!(break|continue)_lanes590"
br label %for_test516.outer
safe_if_after_true539: ; preds = %safe_if_run_true540, %for_loop518
%break_lanes_memory522.1 = phi <8 x i32> [ %"oldMask&test541", %safe_if_run_true540 ], [ zeroinitializer, %for_loop518 ]
%7 = bitcast <8 x i32> %"oldMask&test526" to <8 x float>
%floatmask.i668 = select <8 x i1> %less__add_mul___z_re_8512_load533___z_re_8512_load534_mul___z_im_8513_load535___z_im_8513_load536, <8 x float> zeroinitializer, <8 x float> %7
%v.i669 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i668) #0
%cmp.i670 = icmp eq i32 %v.i669, 0
br i1 %cmp.i670, label %if_done538, label %safe_if_run_false559
safe_if_run_true540: ; preds = %for_loop518
%"equal_finished&func556_internal_mask&function_mask532" = icmp eq i32 %v.i672, %v.i675
br i1 %"equal_finished&func556_internal_mask&function_mask532", label %for_test516, label %safe_if_after_true539
safe_if_run_false559: ; preds = %safe_if_after_true539
%sub_mul___z_re_8512_load566___z_re_8512_load567_mul___z_im_8513_load568___z_im_8513_load569 = fsub <8 x float> %mul___z_re_8512_load533___z_re_8512_load534, %mul___z_im_8513_load535___z_im_8513_load536
%blend.i.i740 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i740874.ph, <8 x float> %sub_mul___z_re_8512_load566___z_re_8512_load567_mul___z_im_8513_load568___z_im_8513_load569, <8 x float> %floatmask.i668) #1
%mul____z_re_8512_load571 = fmul <8 x float> %blend.i.i734868.ph, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
%mul_mul____z_re_8512_load571___z_im_8513_load572 = fmul <8 x float> %blend.i.i731870.ph, %mul____z_re_8512_load571
%blend.i.i737 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i737877.ph, <8 x float> %mul_mul____z_re_8512_load571___z_im_8513_load572, <8 x float> %floatmask.i668) #1
%add_x_load574___new_re_8515_load575 = fadd <8 x float> %add_x0_load502_mul_i_load503_to_float_dx_load504_broadcast, %blend.i.i740
%blend.i.i734 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i734868.ph, <8 x float> %add_x_load574___new_re_8515_load575, <8 x float> %floatmask.i668) #1
%add_y_load577___new_im_8516_load578 = fadd <8 x float> %add_y0_load506_broadcast_mul_j_load507_to_float_dy_load508_broadcast, %blend.i.i737
%blend.i.i731 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i731870.ph, <8 x float> %add_y_load577___new_im_8516_load578, <8 x float> %floatmask.i668) #1
%add___i_8514_load580_ = add <8 x i64> %final.i872.ph, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
%old01.i = shufflevector <8 x i64> %final.i872.ph, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%old01f.i = bitcast <4 x i64> %old01.i to <8 x float>
%new01.i = shufflevector <8 x i64> %add___i_8514_load580_, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%new01f.i = bitcast <4 x i64> %new01.i to <8 x float>
%mask01.i = shufflevector <8 x float> %floatmask.i668, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
%result01f.i = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f.i, <8 x float> %new01f.i, <8 x float> %mask01.i) #1
%result01.i = bitcast <8 x float> %result01f.i to <4 x i64>
%old23.i = shufflevector <8 x i64> %final.i872.ph, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%old23f.i = bitcast <4 x i64> %old23.i to <8 x float>
%new23.i = shufflevector <8 x i64> %add___i_8514_load580_, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%new23f.i = bitcast <4 x i64> %new23.i to <8 x float>
%mask23.i = shufflevector <8 x float> %floatmask.i668, <8 x float> undef, <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
%result23f.i = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old23f.i, <8 x float> %new23f.i, <8 x float> %mask23.i) #1
%result23.i = bitcast <8 x float> %result23f.i to <4 x i64>
%final.i = shufflevector <4 x i64> %result01.i, <4 x i64> %result23.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
br label %if_done538
}
; Function Attrs: nounwind
define void @ispc_func_1(float %x0, float %y0, float* noalias nocapture %output, i64 %output__len__1, i64 %output__len__2, i64 %max_iters, i64 %height, i64 %width, float %dx, float %dy) #1 {
allocas:
%lessequal__width_load = icmp sgt i64 %width, 0
%width.op = add i64 %width, 1
%add__gensym318_stop_ = select i1 %lessequal__width_load, i64 %width.op, i64 1
%add_height_load_ = add i64 %height, 1
%add_height_load__to_int32 = trunc i64 %add_height_load_ to i32
%nitems = add i32 %add_height_load__to_int32, -1
%nextras = srem i32 %nitems, 8
%aligned_end = sub i32 %add_height_load__to_int32, %nextras
%before_aligned_end44796 = icmp sgt i32 %aligned_end, 1
%y0_load_broadcast_init = insertelement <8 x float> undef, float %y0, i32 0
%y0_load_broadcast = shufflevector <8 x float> %y0_load_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%dy_load_broadcast_init = insertelement <8 x float> undef, float %dy, i32 0
%dy_load_broadcast = shufflevector <8 x float> %dy_load_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%max_iters_load_broadcast_init = insertelement <8 x i64> undef, i64 %max_iters, i32 0
%max_iters_load_broadcast = shufflevector <8 x i64> %max_iters_load_broadcast_init, <8 x i64> undef, <8 x i32> zeroinitializer
%output_load_ptr2int_2void = bitcast float* %output to i8*
br label %for_test.outer
for_test.outer: ; preds = %for_exit156, %allocas
%blend.i.i736789.ph = phi <8 x float> [ %blend.i.i736790.ph, %for_exit156 ], [ undef, %allocas ]
%blend.i.i739786.ph = phi <8 x float> [ %blend.i.i739787.ph, %for_exit156 ], [ undef, %allocas ]
%blend.i.i765776.ph = phi <8 x float> [ %blend.i.i765777.lcssa, %for_exit156 ], [ undef, %allocas ]
%blend.i.i772.ph = phi <8 x float> [ %blend.i.i773.lcssa, %for_exit156 ], [ undef, %allocas ]
%_s40.0.ph = phi i64 [ %add__s40_load30_, %for_exit156 ], [ 1, %allocas ]
%internal_mask_memory.0.ph = phi <8 x i32> [ %"oldMask&test", %for_exit156 ], [ <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %allocas ]
br label %for_test
for_test: ; preds = %partial_inner_all_outer, %for_test.outer
%blend.i.i765776 = phi <8 x float> [ %blend.i.i765777.lcssa, %partial_inner_all_outer ], [ %blend.i.i765776.ph, %for_test.outer ]
%blend.i.i772 = phi <8 x float> [ %blend.i.i773.lcssa, %partial_inner_all_outer ], [ %blend.i.i772.ph, %for_test.outer ]
%_s40.0 = phi i64 [ %add__s40_load30_, %partial_inner_all_outer ], [ %_s40.0.ph, %for_test.outer ]
%internal_mask_memory.0 = phi <8 x i32> [ %"oldMask&test", %partial_inner_all_outer ], [ %internal_mask_memory.0.ph, %for_test.outer ]
%equal__s40_load_add__gensym318_stop_ = icmp eq i64 %_s40.0, %add__gensym318_stop_
%equal__s40_load_add__gensym318_stop_to_i_bool = sext i1 %equal__s40_load_add__gensym318_stop_ to i32
%equal__s40_load_add__gensym318_stop_to_i_bool_broadcast_init = insertelement <8 x i32> undef, i32 %equal__s40_load_add__gensym318_stop_to_i_bool, i32 0
%equal__s40_load_add__gensym318_stop_to_i_bool_broadcast = shufflevector <8 x i32> %equal__s40_load_add__gensym318_stop_to_i_bool_broadcast_init, <8 x i32> undef, <8 x i32> zeroinitializer
%val_load_logicalnot.i727 = xor <8 x i32> %equal__s40_load_add__gensym318_stop_to_i_bool_broadcast, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%"oldMask&test" = and <8 x i32> %internal_mask_memory.0, %val_load_logicalnot.i727
%floatmask.i724 = bitcast <8 x i32> %"oldMask&test" to <8 x float>
%v.i725 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i724) #0
%cmp.i726 = icmp eq i32 %v.i725, 0
br i1 %cmp.i726, label %for_exit, label %for_loop
for_loop: ; preds = %for_test
%add__s40_load30_ = add i64 %_s40.0, 1
br i1 %before_aligned_end44796, label %foreach_full_body.lr.ph, label %partial_inner_all_outer
foreach_full_body.lr.ph: ; preds = %for_loop
%i_load_to_float = sitofp i64 %_s40.0 to float
%mul_i_load_to_float_dx_load = fmul float %i_load_to_float, %dx
%add_x0_load_mul_i_load_to_float_dx_load = fadd float %mul_i_load_to_float_dx_load, %x0
%add_x0_load_mul_i_load_to_float_dx_load_broadcast_init = insertelement <8 x float> undef, float %add_x0_load_mul_i_load_to_float_dx_load, i32 0
%add_x0_load_mul_i_load_to_float_dx_load_broadcast = shufflevector <8 x float> %add_x0_load_mul_i_load_to_float_dx_load_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%sub_i_load115_ = add i64 %_s40.0, -1
%mul_output__len__1_load_sub_i_load115_ = mul i64 %sub_i_load115_, %output__len__1
br label %foreach_full_body
for_exit: ; preds = %for_test
ret void
foreach_full_body: ; preds = %for_exit55, %foreach_full_body.lr.ph
%counter.1799 = phi i32 [ 1, %foreach_full_body.lr.ph ], [ %new_counter, %for_exit55 ]
%blend.i.i773798 = phi <8 x float> [ %blend.i.i772, %foreach_full_body.lr.ph ], [ %blend.i.i774.ph, %for_exit55 ]
%blend.i.i765777797 = phi <8 x float> [ %blend.i.i765776, %foreach_full_body.lr.ph ], [ %blend.i.i765778.ph, %for_exit55 ]
%smear_counter_init48 = insertelement <8 x i32> undef, i32 %counter.1799, i32 0
%smear_counter49 = shufflevector <8 x i32> %smear_counter_init48, <8 x i32> undef, <8 x i32> zeroinitializer
%iter_val50 = add <8 x i32> %smear_counter49, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%j_load51_to_float = sitofp <8 x i32> %iter_val50 to <8 x float>
%mul_j_load51_to_float_dy_load_broadcast = fmul <8 x float> %dy_load_broadcast, %j_load51_to_float
%add_y0_load_broadcast_mul_j_load51_to_float_dy_load_broadcast = fadd <8 x float> %y0_load_broadcast, %mul_j_load51_to_float_dy_load_broadcast
br label %for_test52.outer
partial_inner_all_outer: ; preds = %for_exit55, %for_loop
%counter.1.lcssa = phi i32 [ 1, %for_loop ], [ %new_counter, %for_exit55 ]
%blend.i.i773.lcssa = phi <8 x float> [ %blend.i.i772, %for_loop ], [ %blend.i.i774.ph, %for_exit55 ]
%blend.i.i765777.lcssa = phi <8 x float> [ %blend.i.i765776, %for_loop ], [ %blend.i.i765778.ph, %for_exit55 ]
%before_full_end = icmp slt i32 %counter.1.lcssa, %add_height_load__to_int32
br i1 %before_full_end, label %partial_inner_only, label %for_test
for_test52: ; preds = %safe_if_run_true, %for_test52.outer
%internal_mask_memory.2 = phi <8 x i32> [ zeroinitializer, %safe_if_run_true ], [ %internal_mask_memory.2.ph, %for_test52.outer ]
%"oldMask&test60" = select <8 x i1> %less___i_8514_load_max_iters_load_broadcast, <8 x i32> %internal_mask_memory.2, <8 x i32> zeroinitializer
%floatmask.i721 = bitcast <8 x i32> %"oldMask&test60" to <8 x float>
%v.i722 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i721) #0
%cmp.i723 = icmp eq i32 %v.i722, 0
br i1 %cmp.i723, label %for_exit55, label %for_loop54
for_loop54: ; preds = %for_test52
%"oldMask&test70" = select <8 x i1> %less__add_mul___z_re_8512_load___z_re_8512_load67_mul___z_im_8513_load___z_im_8513_load68, <8 x i32> %"oldMask&test60", <8 x i32> zeroinitializer
%floatmask.i718 = bitcast <8 x i32> %"oldMask&test70" to <8 x float>
%v.i719 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i718) #0
%cmp.i720 = icmp eq i32 %v.i719, 0
br i1 %cmp.i720, label %safe_if_after_true, label %safe_if_run_true
for_exit55: ; preds = %for_test52
%_gensym5_load_to_float = sitofp <8 x i64> %final.i756771.ph to <8 x float>
%smear_counter49_cast.elt0 = zext i32 %counter.1799 to i64
%add_mul_output__len__1_load_sub_i_load115__broadcast_smear_counter49_cast.elt0 = add i64 %smear_counter49_cast.elt0, %mul_out
Out[25]:
2328
put__len__1_load_sub_i_load115_
%add_mul_output__len__1_load_sub_i_load115__broadcast_smear_counter49_cast_cast.elt0 = trunc i64 %add_mul_output__len__1_load_sub_i_load115__broadcast_smear_counter49_cast.elt0 to i32
%shl_add_mul_output__len__1_load_sub_i_load115__broadcast_smear_counter49_cast_cast_.elt0 = shl i32 %add_mul_output__len__1_load_sub_i_load115__broadcast_smear_counter49_cast_cast.elt0, 2
%"varying+const_offsets.elt0" = add i32 %shl_add_mul_output__len__1_load_sub_i_load115__broadcast_smear_counter49_cast_cast_.elt0, -4
%0 = sext i32 %"varying+const_offsets.elt0" to i64
%ptr = getelementptr i8* %output_load_ptr2int_2void, i64 %0, !filename !2, !first_line !3, !first_column !4, !last_line !3, !last_column !5
%ptrcast = bitcast i8* %ptr to <8 x float>*
store <8 x float> %_gensym5_load_to_float, <8 x float>* %ptrcast, align 4, !filename !2, !first_line !3, !first_column !4, !last_line !3, !last_column !5
%new_counter = add i32 %counter.1799, 8
%before_aligned_end44 = icmp slt i32 %new_counter, %aligned_end
br i1 %before_aligned_end44, label %foreach_full_body, label %partial_inner_all_outer
if_done: ; preds = %safe_if_run_false, %safe_if_after_true
%blend.i.i765779 = phi <8 x float> [ %blend.i.i765778.ph, %safe_if_after_true ], [ %blend.i.i765, %safe_if_run_false ]
%blend.i.i775 = phi <8 x float> [ %blend.i.i774.ph, %safe_if_after_true ], [ %blend.i.i, %safe_if_run_false ]
%final.i756770 = phi <8 x i64> [ %final.i756771.ph, %safe_if_after_true ], [ %final.i756, %safe_if_run_false ]
%blend.i.i759768 = phi <8 x float> [ %blend.i.i759769.ph, %safe_if_after_true ], [ %blend.i.i759, %safe_if_run_false ]
%blend.i.i762766 = phi <8 x float> [ %blend.i.i762767.ph, %safe_if_after_true ], [ %blend.i.i762, %safe_if_run_false ]
%"!(break|continue)_lanes" = xor <8 x i32> %break_lanes_memory58.1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%new_mask102 = and <8 x i32> %"oldMask&test60", %"!(break|continue)_lanes"
br label %for_test52.outer
for_test52.outer: ; preds = %if_done, %foreach_full_body
%blend.i.i765778.ph = phi <8 x float> [ %blend.i.i765779, %if_done ], [ %blend.i.i765777797, %foreach_full_body ]
%blend.i.i774.ph = phi <8 x float> [ %blend.i.i775, %if_done ], [ %blend.i.i773798, %foreach_full_body ]
%final.i756771.ph = phi <8 x i64> [ %final.i756770, %if_done ], [ zeroinitializer, %foreach_full_body ]
%blend.i.i759769.ph = phi <8 x float> [ %blend.i.i759768, %if_done ], [ %add_y0_load_broadcast_mul_j_load51_to_float_dy_load_broadcast, %foreach_full_body ]
%blend.i.i762767.ph = phi <8 x float> [ %blend.i.i762766, %if_done ], [ %add_x0_load_mul_i_load_to_float_dx_load_broadcast, %foreach_full_body ]
%internal_mask_memory.2.ph = phi <8 x i32> [ %new_mask102, %if_done ], [ <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %foreach_full_body ]
%less___i_8514_load_max_iters_load_broadcast = icmp slt <8 x i64> %final.i756771.ph, %max_iters_load_broadcast
%mul___z_re_8512_load___z_re_8512_load67 = fmul <8 x float> %blend.i.i762767.ph, %blend.i.i762767.ph
%mul___z_im_8513_load___z_im_8513_load68 = fmul <8 x float> %blend.i.i759769.ph, %blend.i.i759769.ph
%add_mul___z_re_8512_load___z_re_8512_load67_mul___z_im_8513_load___z_im_8513_load68 = fadd <8 x float> %mul___z_im_8513_load___z_im_8513_load68, %mul___z_re_8512_load___z_re_8512_load67
%less__add_mul___z_re_8512_load___z_re_8512_load67_mul___z_im_8513_load___z_im_8513_load68 = fcmp ugt <8 x float> %add_mul___z_re_8512_load___z_re_8512_load67_mul___z_im_8513_load___z_im_8513_load68, <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>
br label %for_test52
safe_if_after_true: ; preds = %safe_if_run_true, %for_loop54
%break_lanes_memory58.1 = phi <8 x i32> [ %"oldMask&test70", %safe_if_run_true ], [ zeroinitializer, %for_loop54 ]
%1 = bitcast <8 x i32> %"oldMask&test60" to <8 x float>
%floatmask.i715 = select <8 x i1> %less__add_mul___z_re_8512_load___z_re_8512_load67_mul___z_im_8513_load___z_im_8513_load68, <8 x float> zeroinitializer, <8 x float> %1
%v.i716 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i715) #0
%cmp.i717 = icmp eq i32 %v.i716, 0
br i1 %cmp.i717, label %if_done, label %safe_if_run_false
safe_if_run_true: ; preds = %for_loop54
%"equal_finished&func_internal_mask&function_mask66" = icmp eq i32 %v.i719, %v.i722
br i1 %"equal_finished&func_internal_mask&function_mask66", label %for_test52, label %safe_if_after_true
safe_if_run_false: ; preds = %safe_if_after_true
%sub_mul___z_re_8512_load82___z_re_8512_load83_mul___z_im_8513_load84___z_im_8513_load85 = fsub <8 x float> %mul___z_re_8512_load___z_re_8512_load67, %mul___z_im_8513_load___z_im_8513_load68
%blend.i.i = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i774.ph, <8 x float> %sub_mul___z_re_8512_load82___z_re_8512_load83_mul___z_im_8513_load84___z_im_8513_load85, <8 x float> %floatmask.i715) #1
%mul____z_re_8512_load87 = fmul <8 x float> %blend.i.i762767.ph, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
%mul_mul____z_re_8512_load87___z_im_8513_load88 = fmul <8 x float> %blend.i.i759769.ph, %mul____z_re_8512_load87
%blend.i.i765 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i765778.ph, <8 x float> %mul_mul____z_re_8512_load87___z_im_8513_load88, <8 x float> %floatmask.i715) #1
%add_x_load90___new_re_8515_load = fadd <8 x float> %add_x0_load_mul_i_load_to_float_dx_load_broadcast, %blend.i.i
%blend.i.i762 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i762767.ph, <8 x float> %add_x_load90___new_re_8515_load, <8 x float> %floatmask.i715) #1
%add_y_load92___new_im_8516_load = fadd <8 x float> %add_y0_load_broadcast_mul_j_load51_to_float_dy_load_broadcast, %blend.i.i765
%blend.i.i759 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i759769.ph, <8 x float> %add_y_load92___new_im_8516_load, <8 x float> %floatmask.i715) #1
%add___i_8514_load94_ = add <8 x i64> %final.i756771.ph, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
%old01.i742 = shufflevector <8 x i64> %final.i756771.ph, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%old01f.i743 = bitcast <4 x i64> %old01.i742 to <8 x float>
%new01.i744 = shufflevector <8 x i64> %add___i_8514_load94_, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%new01f.i745 = bitcast <4 x i64> %new01.i744 to <8 x float>
%mask01.i746 = shufflevector <8 x float> %floatmask.i715, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
%result01f.i747 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f.i743, <8 x float> %new01f.i745, <8 x float> %mask01.i746) #1
%result01.i748 = bitcast <8 x float> %result01f.i747 to <4 x i64>
%old23.i749 = shufflevector <8 x i64> %final.i756771.ph, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%old23f.i750 = bitcast <4 x i64> %old23.i749 to <8 x float>
%new23.i751 = shufflevector <8 x i64> %add___i_8514_load94_, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%new23f.i752 = bitcast <4 x i64> %new23.i751 to <8 x float>
%mask23.i753 = shufflevector <8 x float> %floatmask.i715, <8 x float> undef, <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
%result23f.i754 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old23f.i750, <8 x float> %new23f.i752, <8 x float> %mask23.i753) #1
%result23.i755 = bitcast <8 x float> %result23f.i754 to <4 x i64>
%final.i756 = shufflevector <4 x i64> %result01.i748, <4 x i64> %result23.i755, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
br label %if_done
partial_inner_only: ; preds = %partial_inner_all_outer
%smear_counter_init128 = insertelement <8 x i32> undef, i32 %counter.1.lcssa, i32 0
%smear_counter129 = shufflevector <8 x i32> %smear_counter_init128, <8 x i32> undef, <8 x i32> zeroinitializer
%iter_val130 = add <8 x i32> %smear_counter129, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%smear_end_init131 = insertelement <8 x i32> undef, i32 %add_height_load__to_int32, i32 0
%smear_end132 = shufflevector <8 x i32> %smear_end_init131, <8 x i32> undef, <8 x i32> zeroinitializer
%cmp133 = icmp slt <8 x i32> %iter_val130, %smear_end132
%cmp133_to_boolvec = sext <8 x i1> %cmp133 to <8 x i32>
%i_load140_to_float = sitofp i64 %_s40.0 to float
%mul_i_load140_to_float_dx_load141 = fmul float %i_load140_to_float, %dx
%add_x0_load139_mul_i_load140_to_float_dx_load141 = fadd float %mul_i_load140_to_float_dx_load141, %x0
%add_x0_load139_mul_i_load140_to_float_dx_load141_broadcast_init = insertelement <8 x float> undef, float %add_x0_load139_mul_i_load140_to_float_dx_load141, i32 0
%add_x0_load139_mul_i_load140_to_float_dx_load141_broadcast = shufflevector <8 x float> %add_x0_load139_mul_i_load140_to_float_dx_load141_broadcast_init, <8 x float> undef, <8 x i32> zeroinitializer
%j_load144_to_float = sitofp <8 x i32> %iter_val130 to <8 x float>
%mul_j_load144_to_float_dy_load145_broadcast = fmul <8 x float> %dy_load_broadcast, %j_load144_to_float
%add_y0_load143_broadcast_mul_j_load144_to_float_dy_load145_broadcast = fadd <8 x float> %y0_load_broadcast, %mul_j_load144_to_float_dy_load145_broadcast
br label %for_test153.outer
for_test153.outer: ; preds = %if_done175, %partial_inner_only
%blend.i.i736790.ph = phi <8 x float> [ %blend.i.i736791, %if_done175 ], [ %blend.i.i736789.ph, %partial_inner_only ]
%blend.i.i739787.ph = phi <8 x float> [ %blend.i.i739788, %if_done175 ], [ %blend.i.i739786.ph, %partial_inner_only ]
%final.i785.ph = phi <8 x i64> [ %final.i784, %if_done175 ], [ zeroinitializer, %partial_inner_only ]
%blend.i.i730783.ph = phi <8 x float> [ %blend.i.i730782, %if_done175 ], [ %add_y0_load143_broadcast_mul_j_load144_to_float_dy_load145_broadcast, %partial_inner_only ]
%blend.i.i733781.ph = phi <8 x float> [ %blend.i.i733780, %if_done175 ], [ %add_x0_load139_mul_i_load140_to_float_dx_load141_broadcast, %partial_inner_only ]
%internal_mask_memory.4.ph = phi <8 x i32> [ %new_mask229, %if_done175 ], [ %cmp133_to_boolvec, %partial_inner_only ]
%less___i_8514_load160_max_iters_load161_broadcast = icmp slt <8 x i64> %final.i785.ph, %max_iters_load_broadcast
%mul___z_re_8512_load170___z_re_8512_load171 = fmul <8 x float> %blend.i.i733781.ph, %blend.i.i733781.ph
%mul___z_im_8513_load172___z_im_8513_load173 = fmul <8 x float> %blend.i.i730783.ph, %blend.i.i730783.ph
%add_mul___z_re_8512_load170___z_re_8512_load171_mul___z_im_8513_load172___z_im_8513_load173 = fadd <8 x float> %mul___z_im_8513_load172___z_im_8513_load173, %mul___z_re_8512_load170___z_re_8512_load171
%less__add_mul___z_re_8512_load170___z_re_8512_load171_mul___z_im_8513_load172___z_im_8513_load173 = fcmp ugt <8 x float> %add_mul___z_re_8512_load170___z_re_8512_load171_mul___z_im_8513_load172___z_im_8513_load173, <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00>
br label %for_test153
for_test153: ; preds = %safe_if_run_true177, %for_test153.outer
%internal_mask_memory.4 = phi <8 x i32> [ zeroinitializer, %safe_if_run_true177 ], [ %internal_mask_memory.4.ph, %for_test153.outer ]
%"oldMask&test163" = select <8 x i1> %less___i_8514_load160_max_iters_load161_broadcast, <8 x i32> %internal_mask_memory.4, <8 x i32> zeroinitializer
%floatmask.i706 = bitcast <8 x i32> %"oldMask&test163" to <8 x float>
%v.i707 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i706) #0
%cmp.i708 = icmp eq i32 %v.i707, 0
br i1 %cmp.i708, label %for_exit156, label %for_loop155
for_loop155: ; preds = %for_test153
%"oldMask&test178" = select <8 x i1> %less__add_mul___z_re_8512_load170___z_re_8512_load171_mul___z_im_8513_load172___z_im_8513_load173, <8 x i32> %"oldMask&test163", <8 x i32> zeroinitializer
%floatmask.i703 = bitcast <8 x i32> %"oldMask&test178" to <8 x float>
%v.i704 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i703) #0
%cmp.i705 = icmp eq i32 %v.i704, 0
br i1 %cmp.i705, label %safe_if_after_true176, label %safe_if_run_true177
for_exit156: ; preds = %for_test153
%sub_i_load246_ = add i64 %_s40.0, -1
%mul_output__len__1_load245_sub_i_load246_ = mul i64 %sub_i_load246_, %output__len__1
%_gensym5_load248_to_float = sitofp <8 x i64> %final.i785.ph to <8 x float>
%j.0_cast.elt0 = zext i32 %counter.1.lcssa to i64
%add_mul_output__len__1_load245_sub_i_load246__broadcast_j.0_cast.elt0 = add i64 %j.0_cast.elt0, %mul_output__len__1_load245_sub_i_load246_
%add_mul_output__len__1_load245_sub_i_load246__broadcast_j.0_cast_cast.elt0 = trunc i64 %add_mul_output__len__1_load245_sub_i_load246__broadcast_j.0_cast.elt0 to i32
%shl_add_mul_output__len__1_load245_sub_i_load246__broadcast_j.0_cast_cast_.elt0 = shl i32 %add_mul_output__len__1_load245_sub_i_load246__broadcast_j.0_cast_cast.elt0, 2
%"varying+const_offsets.elt0646" = add i32 %shl_add_mul_output__len__1_load245_sub_i_load246__broadcast_j.0_cast_cast_.elt0, -4
%2 = sext i32 %"varying+const_offsets.elt0646" to i64
%ptr647 = getelementptr i8* %output_load_ptr2int_2void, i64 %2
%mask.i.i = bitcast <8 x i32> %cmp133_to_boolvec to <8 x float>
call void @llvm.x86.avx.maskstore.ps.256(i8* %ptr647, <8 x float> %mask.i.i, <8 x float> %_gensym5_load248_to_float) #1
br label %for_test.outer
if_done175: ; preds = %safe_if_run_false196, %safe_if_after_true176
%blend.i.i736791 = phi <8 x float> [ %blend.i.i736790.ph, %safe_if_after_true176 ], [ %blend.i.i736, %safe_if_run_false196 ]
%blend.i.i739788 = phi <8 x float> [ %blend.i.i739787.ph, %safe_if_after_true176 ], [ %blend.i.i739, %safe_if_run_false196 ]
%final.i784 = phi <8 x i64> [ %final.i785.ph, %safe_if_after_true176 ], [ %final.i, %safe_if_run_false196 ]
%blend.i.i730782 = phi <8 x float> [ %blend.i.i730783.ph, %safe_if_after_true176 ], [ %blend.i.i730, %safe_if_run_false196 ]
%blend.i.i733780 = phi <8 x float> [ %blend.i.i733781.ph, %safe_if_after_true176 ], [ %blend.i.i733, %safe_if_run_false196 ]
%"!(break|continue)_lanes227" = xor <8 x i32> %break_lanes_memory159.1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%new_mask229 = and <8 x i32> %"oldMask&test163", %"!(break|continue)_lanes227"
br label %for_test153.outer
safe_if_after_true176: ; preds = %safe_if_run_true177, %for_loop155
%break_lanes_memory159.1 = phi <8 x i32> [ %"oldMask&test178", %safe_if_run_true177 ], [ zeroinitializer, %for_loop155 ]
%3 = bitcast <8 x i32> %"oldMask&test163" to <8 x float>
%floatmask.i700 = select <8 x i1> %less__add_mul___z_re_8512_load170___z_re_8512_load171_mul___z_im_8513_load172___z_im_8513_load173, <8 x float> zeroinitializer, <8 x float> %3
%v.i701 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i700) #0
%cmp.i702 = icmp eq i32 %v.i701, 0
br i1 %cmp.i702, label %if_done175, label %safe_if_run_false196
safe_if_run_true177: ; preds = %for_loop155
%"equal_finished&func193_internal_mask&function_mask169" = icmp eq i32 %v.i704, %v.i707
br i1 %"equal_finished&func193_internal_mask&function_mask169", label %for_test153, label %safe_if_after_true176
safe_if_run_false196: ; preds = %safe_if_after_true176
%sub_mul___z_re_8512_load203___z_re_8512_load204_mul___z_im_8513_load205___z_im_8513_load206 = fsub <8 x float> %mul___z_re_8512_load170___z_re_8512_load171, %mul___z_im_8513_load172___z_im_8513_load173
%blend.i.i739 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i739787.ph, <8 x float> %sub_mul___z_re_8512_load203___z_re_8512_load204_mul___z_im_8513_load205___z_im_8513_load206, <8 x float> %floatmask.i700) #1
%mul____z_re_8512_load208 = fmul <8 x float> %blend.i.i733781.ph, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
%mul_mul____z_re_8512_load208___z_im_8513_load209 = fmul <8 x float> %blend.i.i730783.ph, %mul____z_re_8512_load208
%blend.i.i736 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i736790.ph, <8 x float> %mul_mul____z_re_8512_load208___z_im_8513_load209, <8 x float> %floatmask.i700) #1
%add_x_load211___new_re_8515_load212 = fadd <8 x float> %add_x0_load139_mul_i_load140_to_float_dx_load141_broadcast, %blend.i.i739
%blend.i.i733 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i733781.ph, <8 x float> %add_x_load211___new_re_8515_load212, <8 x float> %floatmask.i700) #1
%add_y_load214___new_im_8516_load215 = fadd <8 x float> %add_y0_load143_broadcast_mul_j_load144_to_float_dy_load145_broadcast, %blend.i.i736
%blend.i.i730 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %blend.i.i730783.ph, <8 x float> %add_y_load214___new_im_8516_load215, <8 x float> %floatmask.i700) #1
%add___i_8514_load217_ = add <8 x i64> %final.i785.ph, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
%old01.i = shufflevector <8 x i64> %final.i785.ph, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%old01f.i = bitcast <4 x i64> %old01.i to <8 x float>
%new01.i = shufflevector <8 x i64> %add___i_8514_load217_, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%new01f.i = bitcast <4 x i64> %new01.i to <8 x float>
%mask01.i = shufflevector <8 x float> %floatmask.i700, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
%result01f.i = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old01f.i, <8 x float> %new01f.i, <8 x float> %mask01.i) #1
%result01.i = bitcast <8 x float> %result01f.i to <4 x i64>
%old23.i = shufflevector <8 x i64> %final.i785.ph, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%old23f.i = bitcast <4 x i64> %old23.i to <8 x float>
%new23.i = shufflevector <8 x i64> %add___i_8514_load217_, <8 x i64> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%new23f.i = bitcast <4 x i64> %new23.i to <8 x float>
%mask23.i = shufflevector <8 x float> %floatmask.i700, <8 x float> undef, <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
%result23f.i = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %old23f.i, <8 x float> %new23f.i, <8 x float> %mask23.i) #1
%result23.i = bitcast <8 x float> %result23f.i to <4 x i64>
%final.i = shufflevector <4 x i64> %result01.i, <4 x i64> %result23.i, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
br label %if_done175
}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
!llvm.ident = !{!0}
!llvm.module.flags = !{!1}
!0 = !{!"clang version 3.6.1 (tags/RELEASE_361/final 238309)"}
!1 = !{i32 1, !"PIC Level", i32 2}
!2 = !{!"<stdin>"}
!3 = !{i32 47}
!4 = !{i32 13}
!5 = !{i32 59}
In [ ]:
In [ ]:
Content source: damiendr/ISPC.jl
Similar notebooks: