1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126 | // -----// IR Dump After SpecializeEncodingsPass (iree-stream-specialize-encodings) //----- //
// Attribute aliases referenced throughout the module below.
//
// #encoding, #encoding1, #encoding2: matmul encodings for operand_index 0, 1
// and 2 (all f32). Each carries a `layouts` list holding one
// #iree_cpu.cpu_encoding_layout whose configuration spells out
// innerDimsPos / innerTileSizes / outerDimsPerm. Operands 0 and 1 use inner
// tile sizes [16, 1] (operand 1 with dims swapped: innerDimsPos = [1, 0],
// outerDimsPerm = [1, 0]); operand 2 uses [16, 16].
#encoding = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], layouts = [#iree_cpu.cpu_encoding_layout<configuration = {encoding_info = {innerDimsPos = [0, 1], innerTileSizes = [16, 1], outerDimsPerm = [0, 1]}}>]>
#encoding1 = #iree_encoding.encoding<operand_index = 1 : index, op_type = matmul, element_types = [f32, f32, f32], layouts = [#iree_cpu.cpu_encoding_layout<configuration = {encoding_info = {innerDimsPos = [1, 0], innerTileSizes = [16, 1], outerDimsPerm = [1, 0]}}>]>
#encoding2 = #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [f32, f32, f32], layouts = [#iree_cpu.cpu_encoding_layout<configuration = {encoding_info = {innerDimsPos = [0, 1], innerTileSizes = [16, 16], outerDimsPerm = [0, 1]}}>]>
// Executable target: "llvm-cpu" backend, "embedded-elf-x86_64" format,
// cpu = "znver4", native_vector_size = 64. It also sets
// iree.encoding.resolver to an empty #iree_cpu.cpu_encoding_layout<>.
#executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver4", cpu_features = "+mmx,+popcnt,+sse,+sse2,+sse3,+ssse3,+sse4.1,+sse4.2,+avx,+avx2,+sse4a,+fma,+avx512f,+bmi,+bmi2,+aes,+pclmul,+avx512vl,+avx512bw,+avx512dq,+avx512cd,+avx512vbmi,+avx512ifma,+avx512vpopcntdq,+avx512vbmi2,+gfni,+vpclmulqdq,+avx512vnni,+avx512bitalg,+avx512bf16,+adx,+clflushopt,+clwb,+clzero,+cx16,+cx8,+f16c,+fsgsbase,+crc32,+invpcid,+rdpru,+sahf,+lzcnt,+movbe,+mwaitx,+x87,+pku,+evex512,+prfchw,+rdpid,+rdrnd,+rdseed,+sha,+shstk,+vaes,+wbnoinvd,+xsave,+xsavec,+xsaveopt,+xsaves,+fxsr", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver= #iree_cpu.cpu_encoding_layout<>, native_vector_size = 64 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
// Indexing maps over (d0, d1, d2), used as the user_indexing_maps of
// #encoding3..#encoding5 in the order [#map, #map1, #map2].
// NOTE(review): presumably d0 = M, d1 = N, d2 = K of the matmul - confirm.
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
// "local" HAL device target built from the single executable target above.
#device_target_local = #hal.device.target<"local", [#executable_target_embedded_elf_x86_64_]> : !hal.device
// #encoding3, #encoding4, #encoding5: matmul encodings for operand_index 0, 1
// and 2 that carry user_indexing_maps and round_dims_to = [32, 32, 32] and have
// no `layouts` list. In this dump they appear only on tensor values inside the
// dispatch function bodies; binding types and the stream.tensor.* ops in @foo
// use #encoding..#encoding2 instead.
#encoding3 = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 32, 32, 32>>
#encoding4 = #iree_encoding.encoding<operand_index = 1 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 32, 32, 32>>
#encoding5 = #iree_encoding.encoding<operand_index = 2 : index, op_type = matmul, element_types = [f32, f32, f32], user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 32, 32, 32>>
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
// Private global initialized with #device_target_local. It is the symbol named
// by #hal.device.affinity<@__device_0>, which is the module's
// stream.affinity.default and the affinity of every `on(...)` clause in @foo.
util.global private @__device_0 = #device_target_local
// Dispatch 0: reads a plain tensor<?x?xf32> from the first binding, applies
// iree_encoding.set_encoding (operand_index = 0 encoding), and writes the
// result to the second binding.
stream.executable private @foo_dispatch_0 {
// Export taking two index workload values; the 3-D workgroup count is
// produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @foo_dispatch_0_set_encoding_LHS_DxD workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// %arg0: input binding, %arg1/%arg2: workload values, %arg3: output binding.
func.func @foo_dispatch_0_set_encoding_LHS_DxD(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: !stream.binding) {
%c0 = arith.constant 0 : index
// Tag the two index arguments as workload ordinals 0 and 1; %0 and %1 are
// then used as the two dynamic dims of every tensor type in this function.
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
// Both bindings are viewed at byte offset %c0 (= 0).
%2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
// The output binding's tensor type uses #encoding (the variant with layouts).
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #encoding>>{%0, %1}
// Load the whole input (zero offsets, unit strides, sizes = full dims).
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
// The encoded value is typed with #encoding3 (the variant without layouts).
%5 = iree_encoding.set_encoding %4 : tensor<?x?xf32> -> tensor<?x?xf32, #encoding3>
// Store the whole result. Note the source value carries #encoding3 while the
// destination dispatch tensor carries #encoding.
flow.dispatch.tensor.store %5, %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32, #encoding3> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #encoding>>{%0, %1}
return
}
}
}
// Dispatch 1: same shape as dispatch 0 but for the operand_index = 1 encoding.
// Reads a plain tensor<?x?xf32>, applies iree_encoding.set_encoding, and
// writes the encoded tensor to the second binding.
stream.executable private @foo_dispatch_1 {
// Export taking two index workload values; the 3-D workgroup count is
// produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @foo_dispatch_1_set_encoding_RHS_DxD workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// %arg0: input binding, %arg1/%arg2: workload values, %arg3: output binding.
func.func @foo_dispatch_1_set_encoding_RHS_DxD(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: !stream.binding) {
%c0 = arith.constant 0 : index
// Workload ordinals 0 and 1 supply the two dynamic dims used below.
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
// Both bindings are viewed at byte offset %c0 (= 0).
%2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1}
// The output binding's tensor type uses #encoding1 (the variant with layouts).
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #encoding1>>{%0, %1}
// Load the whole input (zero offsets, unit strides, sizes = full dims).
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32>>{%0, %1} -> tensor<?x?xf32>
// The encoded value is typed with #encoding4 (the variant without layouts).
%5 = iree_encoding.set_encoding %4 : tensor<?x?xf32> -> tensor<?x?xf32, #encoding4>
// Store the whole result. The source value carries #encoding4 while the
// destination dispatch tensor carries #encoding1.
flow.dispatch.tensor.store %5, %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32, #encoding4> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #encoding1>>{%0, %1}
return
}
}
}
// Dispatch 2: loads two encoded operands, zero-fills an encoded accumulator,
// runs linalg.matmul on the encoded tensors, and stores the encoded result.
stream.executable private @foo_dispatch_2 {
// Export taking four index workload values; the 3-D workgroup count is
// produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @foo_dispatch_2_matmul_DxDxD_f32 workgroups(%arg0: index, %arg1: index, %arg2: index, %arg3: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1, %arg2, %arg3
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// %arg0/%arg1: input bindings, %arg2..%arg5: workload values,
// %arg6: output binding.
func.func @foo_dispatch_2_matmul_DxDxD_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: !stream.binding) {
%c0 = arith.constant 0 : index
// Zero used to initialize the matmul accumulator.
%cst = arith.constant 0.000000e+00 : f32
// Workload ordinals 0..3. As used below: first operand is {%2, %0}, second
// operand is {%1, %3}, result is {%2, %3}.
%0 = flow.dispatch.workload.ordinal %arg2, 0 : index
%1 = flow.dispatch.workload.ordinal %arg3, 1 : index
%2 = flow.dispatch.workload.ordinal %arg4, 2 : index
%3 = flow.dispatch.workload.ordinal %arg5, 3 : index
// Binding types use the layout-carrying encodings #encoding / #encoding1 /
// #encoding2; all three are viewed at byte offset %c0 (= 0).
%4 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32, #encoding>>{%2, %0}
%5 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32, #encoding1>>{%1, %3}
%6 = stream.binding.subspan %arg6[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #encoding2>>{%2, %3}
// Full-tensor loads; the loaded values are typed with the layout-free
// encodings #encoding3 / #encoding4.
%7 = flow.dispatch.tensor.load %4, offsets = [0, 0], sizes = [%2, %0], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #encoding>>{%2, %0} -> tensor<?x?xf32, #encoding3>
%8 = flow.dispatch.tensor.load %5, offsets = [0, 0], sizes = [%1, %3], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #encoding1>>{%1, %3} -> tensor<?x?xf32, #encoding4>
// Accumulator: empty {%2, %3} tensor with #encoding5, filled with %cst.
%9 = tensor.empty(%2, %3) : tensor<?x?xf32, #encoding5>
%10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<?x?xf32, #encoding5>) -> tensor<?x?xf32, #encoding5>
// Matmul of the two loaded operands into the zero-filled accumulator.
%11 = linalg.matmul ins(%7, %8 : tensor<?x?xf32, #encoding3>, tensor<?x?xf32, #encoding4>) outs(%10 : tensor<?x?xf32, #encoding5>) -> tensor<?x?xf32, #encoding5>
// Store the whole result; the value carries #encoding5 while the destination
// dispatch tensor carries #encoding2.
flow.dispatch.tensor.store %11, %6, offsets = [0, 0], sizes = [%2, %3], strides = [1, 1] : tensor<?x?xf32, #encoding5> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32, #encoding2>>{%2, %3}
return
}
}
}
// Dispatch 3: loads the encoded matmul result, applies
// iree_encoding.unset_encoding to get back a plain tensor<?x?xf32>, and stores
// it to the output binding.
stream.executable private @foo_dispatch_3 {
// Export taking two index workload values; the 3-D workgroup count is
// produced by flow.dispatch.workgroup_count_from_slice.
stream.executable.export public @foo_dispatch_3_unset_encoding_RESULT_DxD workgroups(%arg0: index, %arg1: index) -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice %arg0, %arg1
stream.return %x, %y, %z : index, index, index
}
builtin.module {
// %arg0: input binding, %arg1/%arg2: workload values, %arg3: output binding.
func.func @foo_dispatch_3_unset_encoding_RESULT_DxD(%arg0: !stream.binding, %arg1: index, %arg2: index, %arg3: !stream.binding) {
%c0 = arith.constant 0 : index
// Workload ordinals 0 and 1 supply the two dynamic dims used below.
%0 = flow.dispatch.workload.ordinal %arg1, 0 : index
%1 = flow.dispatch.workload.ordinal %arg2, 1 : index
// Input binding type uses #encoding2 (with layouts); the output binding is an
// unencoded tensor<?x?xf32>. Both are viewed at byte offset %c0 (= 0).
%2 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<?x?xf32, #encoding2>>{%0, %1}
%3 = stream.binding.subspan %arg3[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
// Full-tensor load; the loaded value is typed with #encoding5 (no layouts).
%4 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<?x?xf32, #encoding2>>{%0, %1} -> tensor<?x?xf32, #encoding5>
// Drop the encoding; the result's dynamic dims are given explicitly as {%0, %1}.
%5 = iree_encoding.unset_encoding %4 : tensor<?x?xf32, #encoding5> -> tensor<?x?xf32>{%0, %1}
// Store the whole plain result.
flow.dispatch.tensor.store %5, %3, offsets = [0, 0], sizes = [%0, %1], strides = [1, 1] : tensor<?x?xf32> -> !flow.dispatch.tensor<writeonly:tensor<?x?xf32>>{%0, %1}
return
}
}
}
// Public entry point. Takes two !hal.buffer_view inputs and returns one; the
// reflection string declares them as tensor<?x?xf32> values. The body imports
// both inputs, runs dispatches 0..3 in order (set_encoding on each input,
// matmul, unset_encoding), and exports the final plain tensor.
util.func public @foo(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @foo(%input0: tensor<?x?xf32>, %input1: tensor<?x?xf32>) -> (%output0: tensor<?x?xf32>)"}} {
// --- input0: query dims, validate, import ---
%0 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[0] : index
%1 = hal.buffer_view.dim<%arg0 : !hal.buffer_view>[1] : index
// Element-type / encoding-type codes shared by both buffer_view asserts.
%element_type_f32 = hal.element_type<f32> : i32
%dense_row_major = hal.encoding_type<dense_row_major> : i32
hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%0, %1]) type(%element_type_f32) encoding(%dense_row_major)
// %2: size of the unencoded {%0, %1} tensor, used as the resource size.
%2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?xf32>{%0, %1} : index
%3 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg0 : !hal.buffer_view -> tensor<?x?xf32>{%0, %1} in !stream.resource<external>{%2}
// Transfer on the same device; the resource type changes from <external> to <*>.
%4 = stream.async.transfer %3 : !stream.resource<external>{%2} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%2}
// --- input1: same sequence as input0 ---
%5 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[0] : index
%6 = hal.buffer_view.dim<%arg1 : !hal.buffer_view>[1] : index
hal.buffer_view.assert<%arg1 : !hal.buffer_view> message("input1") shape([%5, %6]) type(%element_type_f32) encoding(%dense_row_major)
%7 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?xf32>{%5, %6} : index
%8 = stream.tensor.import on(#hal.device.affinity<@__device_0>) %arg1 : !hal.buffer_view -> tensor<?x?xf32>{%5, %6} in !stream.resource<external>{%7}
%9 = stream.async.transfer %8 : !stream.resource<external>{%7} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%7}
// --- dispatch 0: encode input0; %10 is the size of the #encoding tensor ---
%10 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?xf32, #encoding>{%0, %1} : index
%11 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @foo_dispatch_0::@foo_dispatch_0_set_encoding_LHS_DxD[%0, %1](%4, %0, %1) : (tensor<?x?xf32>{%0, %1} in !stream.resource<*>{%2}, index, index) -> tensor<?x?xf32, #encoding>{%0, %1} in !stream.resource<*>{%10}
// --- dispatch 1: encode input1; %12 is the size of the #encoding1 tensor ---
%12 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?xf32, #encoding1>{%5, %6} : index
%13 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @foo_dispatch_1::@foo_dispatch_1_set_encoding_RHS_DxD[%5, %6](%9, %5, %6) : (tensor<?x?xf32>{%5, %6} in !stream.resource<*>{%7}, index, index) -> tensor<?x?xf32, #encoding1>{%5, %6} in !stream.resource<*>{%12}
// --- dispatch 2: matmul on the encoded operands; result dims are {%0, %6} ---
%14 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?xf32, #encoding2>{%0, %6} : index
// Workload order is [%1, %5, %0, %6]: input0 dim 1, input1 dim 0, input0 dim 0,
// input1 dim 1.
// NOTE(review): nothing visible in this function checks that %1 == %5.
%15 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @foo_dispatch_2::@foo_dispatch_2_matmul_DxDxD_f32[%1, %5, %0, %6](%11, %13, %1, %5, %0, %6) : (tensor<?x?xf32, #encoding>{%0, %1} in !stream.resource<*>{%10}, tensor<?x?xf32, #encoding1>{%5, %6} in !stream.resource<*>{%12}, index, index, index, index) -> tensor<?x?xf32, #encoding2>{%0, %6} in !stream.resource<*>{%14}
// --- dispatch 3: strip the encoding from the result ---
%16 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<?x?xf32>{%0, %6} : index
%17 = stream.tensor.dispatch on(#hal.device.affinity<@__device_0>) @foo_dispatch_3::@foo_dispatch_3_unset_encoding_RESULT_DxD[%0, %6](%15, %0, %6) : (tensor<?x?xf32, #encoding2>{%0, %6} in !stream.resource<*>{%14}, index, index) -> tensor<?x?xf32>{%0, %6} in !stream.resource<*>{%16}
// --- output: transfer back to an <external> resource and export ---
%18 = stream.async.transfer %17 : !stream.resource<*>{%16} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%16}
%19 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %18 : tensor<?x?xf32>{%0, %6} in !stream.resource<external>{%16} -> !hal.buffer_view
util.return %19 : !hal.buffer_view
}
}
|