branch: master
prog.s
2508 bytesRaw
.global _start
_start:
.rodata
.align 0x10
.global code.kd
.type code.kd,STT_OBJECT
# amd_kernel_code_t (must be at 0x440 for kernel_code_entry_byte_offset to be right)
code.kd:
# amd_kernel_..., amd_machine_...
.long 0,0,0,0
# kernel_code_entry_byte_offset, kernel_code_prefetch_byte_offset
.long 0x00000bc0,0x00000000,0x00000000,0x00000000
# kernel_code_prefetch_byte_size, max_scratch_backing_memory_byte_size
.long 0,0,0,0
# compute_pgm_rsrc1, compute_pgm_rsrc2, kernel_code_properties, workitem_private_segment_byte_size
.long 0x60af0000,0x0000009e,0x00000408,0x00000000
# compute_pgm_rsrc1 |= AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32 | AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64
# compute_pgm_rsrc1 |= AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP | AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE
# compute_pgm_rsrc2 |= AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT = 0xF
# compute_pgm_rsrc2 |= AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X
# kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR = 1
# kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_RESERVED1 = 1
.text
.global code
.type code,STT_FUNC
code:
# https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state
# s[0:1] contains the kernarg_address
# TODO: can we use s[2:3] if this was really a wave since we only alloced 2 SGPRs?
s_load_b64 s[2:3], s[0:1], null

s_mov_b32 s8, 0
loop:
s_addk_i32 s8, 1
s_cmp_eq_u32 s8, 100000
// FLOPS
s_cbranch_scc0 loop

# wait for the s_load_b64
s_waitcnt lgkmcnt(0)

v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 2.0
global_store_b32 v0, v1, s[2:3]

# Deallocate all VGPRs for this wave. Use only when next instruction is S_ENDPGM.
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
s_endpgm
s_code_end

.amdgpu_metadata
amdhsa.kernels:
  - .args:
      - .address_space:  global
        .name:           a
        .offset:         0
        .size:           8
        .type_name:      'float*'
        .value_kind:     global_buffer
    .group_segment_fixed_size: 0
    .kernarg_segment_align: 8
    .kernarg_segment_size: 8
    .language:       OpenCL C
    .language_version:
      - 1
      - 2
    .max_flat_workgroup_size: 256
    .name:           code
    .private_segment_fixed_size: 0
    .sgpr_count:     2
    .sgpr_spill_count: 0
    .symbol:         code.kd
    .uses_dynamic_stack: false
    .vgpr_count:     256
    .vgpr_spill_count: 0
    .wavefront_size: 32
amdhsa.target:   amdgcn-amd-amdhsa--gfx1100
amdhsa.version:
  - 1
  - 2
.end_amdgpu_metadata