embedded/raspberry pi

aarch64, armv8 asimd build (neon)

구차니 2021. 6. 30. 18:25


5.7.2 Advanced SIMD Mnemonics

Although derived from the AArch32 Advanced SIMD syntax, a number of changes have been made to harmonise with the AArch64 core integer and floating point instruction set syntax, and to unify AArch32’s divergent “architectural” and “programmers’” notations:

• The ‘V’ mnemonic prefix has been removed, and S/U/F/P added to indicate signed/unsigned/floating-point/polynomial data type. The mnemonic always indicates the data type(s) of the operation.
• The vector organisation (element size and number of lanes) is described by the register qualifiers and never by a mnemonic qualifier. See the description of the vector register syntax in §4.4.2 above.
• The ‘P’ prefix for “pairwise” operations becomes a suffix.
• A ‘V’ suffix has been added for the new reduction (across-all-lanes) operations.
• A ‘2’ suffix has been added for the new widening/narrowing “second part” instructions, described below.
• Vector compares now use the integer condition code names to indicate whether an integer comparison is signed or unsigned (e.g. CMLT, CMLO, CMGE, CMHI, etc.).
• Some mnemonics have been renamed where the removal of the V prefix caused clash with the core instruction set mnemonics.

ADD Vd.<T>, Vn.<T>, Vm.<T> 
Integer add (vector). Where <T> is 8B, 16B, 4H, 8H, 2S, 4S or 2D.

[링크 : https://www.element14.com/community/servlet/JiveServlet/previewBody/41836-102-1-229511/ARM.Reference_Manual.pdf]


$ gcc neon.c -fopt-info-vec -O3
neon.c:10:9: note: loop vectorized


vadd 같은 건 안 보이네? (AArch64에서는 'V' 접두사가 빠지므로, 벡터화된 덧셈은 아래 main의 `add v0.4s, v0.4s, v1.4s`로 나타난다.)

$ objdump -d a.out

a.out:     file format elf64-littleaarch64

Disassembly of section .init:

00000000000005d0 <_init>:
 5d0:   a9bf7bfd        stp     x29, x30, [sp, #-16]!
 5d4:   910003fd        mov     x29, sp
 5d8:   94000043        bl      6e4 <call_weak_fn>
 5dc:   a8c17bfd        ldp     x29, x30, [sp], #16
 5e0:   d65f03c0        ret

Disassembly of section .plt:

00000000000005f0 <.plt>:
 5f0:   a9bf7bf0        stp     x16, x30, [sp, #-16]!
 5f4:   90000090        adrp    x16, 10000 <__FRAME_END__+0xf680>
 5f8:   f947fe11        ldr     x17, [x16, #4088]
 5fc:   913fe210        add     x16, x16, #0xff8
 600:   d61f0220        br      x17
 604:   d503201f        nop
 608:   d503201f        nop
 60c:   d503201f        nop

0000000000000610 <__cxa_finalize@plt>:
 610:   b0000090        adrp    x16, 11000 <__cxa_finalize@GLIBC_2.17>
 614:   f9400211        ldr     x17, [x16]
 618:   91000210        add     x16, x16, #0x0
 61c:   d61f0220        br      x17

0000000000000620 <__libc_start_main@plt>:
 620:   b0000090        adrp    x16, 11000 <__cxa_finalize@GLIBC_2.17>
 624:   f9400611        ldr     x17, [x16, #8]
 628:   91002210        add     x16, x16, #0x8
 62c:   d61f0220        br      x17

0000000000000630 <__gmon_start__@plt>:
 630:   b0000090        adrp    x16, 11000 <__cxa_finalize@GLIBC_2.17>
 634:   f9400a11        ldr     x17, [x16, #16]
 638:   91004210        add     x16, x16, #0x10
 63c:   d61f0220        br      x17

0000000000000640 <abort@plt>:
 640:   b0000090        adrp    x16, 11000 <__cxa_finalize@GLIBC_2.17>
 644:   f9400e11        ldr     x17, [x16, #24]
 648:   91006210        add     x16, x16, #0x18
 64c:   d61f0220        br      x17

0000000000000650 <printf@plt>:
 650:   b0000090        adrp    x16, 11000 <__cxa_finalize@GLIBC_2.17>
 654:   f9401211        ldr     x17, [x16, #32]
 658:   91008210        add     x16, x16, #0x20
 65c:   d61f0220        br      x17

Disassembly of section .text:

0000000000000660 <main>:
 660:   d13003ff        sub     sp, sp, #0xc00
 664:   912003e0        add     x0, sp, #0x800
 668:   911003e2        add     x2, sp, #0x400
 66c:   910003e1        mov     x1, sp
 670:   913003e3        add     x3, sp, #0xc00
 674:   d503201f        nop
 678:   3cc10401        ldr     q1, [x0], #16
 67c:   3cc10440        ldr     q0, [x2], #16
 680:   eb03001f        cmp     x0, x3
 684:   4ea18400        add     v0.4s, v0.4s, v1.4s
 688:   3c810420        str     q0, [x1], #16
 68c:   54ffff61        b.ne    678 <main+0x18>  // b.any
 690:   b94003e1        ldr     w1, [sp]
 694:   90000000        adrp    x0, 0 <_init-0x5d0>
 698:   b94403e2        ldr     w2, [sp, #1024]
 69c:   91214000        add     x0, x0, #0x850
 6a0:   b94803e3        ldr     w3, [sp, #2048]
 6a4:   913003ff        add     sp, sp, #0xc00
 6a8:   17ffffea        b       650 <printf@plt>

00000000000006ac <_start>:
 6ac:   d280001d        mov     x29, #0x0                       // #0
 6b0:   d280001e        mov     x30, #0x0                       // #0
 6b4:   aa0003e5        mov     x5, x0
 6b8:   f94003e1        ldr     x1, [sp]
 6bc:   910023e2        add     x2, sp, #0x8
 6c0:   910003e6        mov     x6, sp
 6c4:   90000080        adrp    x0, 10000 <__FRAME_END__+0xf680>
 6c8:   f947ec00        ldr     x0, [x0, #4056]
 6cc:   90000083        adrp    x3, 10000 <__FRAME_END__+0xf680>
 6d0:   f947e863        ldr     x3, [x3, #4048]
 6d4:   90000084        adrp    x4, 10000 <__FRAME_END__+0xf680>
 6d8:   f947d884        ldr     x4, [x4, #4016]
 6dc:   97ffffd1        bl      620 <__libc_start_main@plt>
 6e0:   97ffffd8        bl      640 <abort@plt>

00000000000006e4 <call_weak_fn>:
 6e4:   90000080        adrp    x0, 10000 <__FRAME_END__+0xf680>
 6e8:   f947e400        ldr     x0, [x0, #4040]
 6ec:   b4000040        cbz     x0, 6f4 <call_weak_fn+0x10>
 6f0:   17ffffd0        b       630 <__gmon_start__@plt>
 6f4:   d65f03c0        ret

00000000000006f8 <deregister_tm_clones>:
 6f8:   b0000080        adrp    x0, 11000 <__cxa_finalize@GLIBC_2.17>
 6fc:   9100e000        add     x0, x0, #0x38
 700:   b0000081        adrp    x1, 11000 <__cxa_finalize@GLIBC_2.17>
 704:   9100e021        add     x1, x1, #0x38
 708:   eb00003f        cmp     x1, x0
 70c:   540000a0        b.eq    720 <deregister_tm_clones+0x28>  // b.none
 710:   90000081        adrp    x1, 10000 <__FRAME_END__+0xf680>
 714:   f947dc21        ldr     x1, [x1, #4024]
 718:   b4000041        cbz     x1, 720 <deregister_tm_clones+0x28>
 71c:   d61f0020        br      x1
 720:   d65f03c0        ret
 724:   d503201f        nop

0000000000000728 <register_tm_clones>:
 728:   b0000080        adrp    x0, 11000 <__cxa_finalize@GLIBC_2.17>
 72c:   9100e000        add     x0, x0, #0x38
 730:   b0000081        adrp    x1, 11000 <__cxa_finalize@GLIBC_2.17>
 734:   9100e021        add     x1, x1, #0x38
 738:   cb000021        sub     x1, x1, x0
 73c:   9343fc21        asr     x1, x1, #3
 740:   8b41fc21        add     x1, x1, x1, lsr #63
 744:   9341fc21        asr     x1, x1, #1
 748:   b40000a1        cbz     x1, 75c <register_tm_clones+0x34>
 74c:   90000082        adrp    x2, 10000 <__FRAME_END__+0xf680>
 750:   f947f042        ldr     x2, [x2, #4064]
 754:   b4000042        cbz     x2, 75c <register_tm_clones+0x34>
 758:   d61f0040        br      x2
 75c:   d65f03c0        ret

0000000000000760 <__do_global_dtors_aux>:
 760:   a9be7bfd        stp     x29, x30, [sp, #-32]!
 764:   910003fd        mov     x29, sp
 768:   f9000bf3        str     x19, [sp, #16]
 76c:   b0000093        adrp    x19, 11000 <__cxa_finalize@GLIBC_2.17>
 770:   3940e260        ldrb    w0, [x19, #56]
 774:   35000140        cbnz    w0, 79c <__do_global_dtors_aux+0x3c>
 778:   90000080        adrp    x0, 10000 <__FRAME_END__+0xf680>
 77c:   f947e000        ldr     x0, [x0, #4032]
 780:   b4000080        cbz     x0, 790 <__do_global_dtors_aux+0x30>
 784:   b0000080        adrp    x0, 11000 <__cxa_finalize@GLIBC_2.17>
 788:   f9401800        ldr     x0, [x0, #48]
 78c:   97ffffa1        bl      610 <__cxa_finalize@plt>
 790:   97ffffda        bl      6f8 <deregister_tm_clones>
 794:   52800020        mov     w0, #0x1                        // #1
 798:   3900e260        strb    w0, [x19, #56]
 79c:   f9400bf3        ldr     x19, [sp, #16]
 7a0:   a8c27bfd        ldp     x29, x30, [sp], #32
 7a4:   d65f03c0        ret

00000000000007a8 <frame_dummy>:
 7a8:   17ffffe0        b       728 <register_tm_clones>
 7ac:   d503201f        nop

00000000000007b0 <__libc_csu_init>:
 7b0:   a9bc7bfd        stp     x29, x30, [sp, #-64]!
 7b4:   910003fd        mov     x29, sp
 7b8:   a90153f3        stp     x19, x20, [sp, #16]
 7bc:   90000094        adrp    x20, 10000 <__FRAME_END__+0xf680>
 7c0:   91370294        add     x20, x20, #0xdc0
 7c4:   a9025bf5        stp     x21, x22, [sp, #32]
 7c8:   90000095        adrp    x21, 10000 <__FRAME_END__+0xf680>
 7cc:   9136e2b5        add     x21, x21, #0xdb8
 7d0:   cb150294        sub     x20, x20, x21
 7d4:   2a0003f6        mov     w22, w0
 7d8:   a90363f7        stp     x23, x24, [sp, #48]
 7dc:   aa0103f7        mov     x23, x1
 7e0:   aa0203f8        mov     x24, x2
 7e4:   9343fe94        asr     x20, x20, #3
 7e8:   97ffff7a        bl      5d0 <_init>
 7ec:   b4000174        cbz     x20, 818 <__libc_csu_init+0x68>
 7f0:   d2800013        mov     x19, #0x0                       // #0
 7f4:   d503201f        nop
 7f8:   f8737aa3        ldr     x3, [x21, x19, lsl #3]
 7fc:   aa1803e2        mov     x2, x24
 800:   91000673        add     x19, x19, #0x1
 804:   aa1703e1        mov     x1, x23
 808:   2a1603e0        mov     w0, w22
 80c:   d63f0060        blr     x3
 810:   eb13029f        cmp     x20, x19
 814:   54ffff21        b.ne    7f8 <__libc_csu_init+0x48>  // b.any
 818:   a94153f3        ldp     x19, x20, [sp, #16]
 81c:   a9425bf5        ldp     x21, x22, [sp, #32]
 820:   a94363f7        ldp     x23, x24, [sp, #48]
 824:   a8c47bfd        ldp     x29, x30, [sp], #64
 828:   d65f03c0        ret
 82c:   d503201f        nop

0000000000000830 <__libc_csu_fini>:
 830:   d65f03c0        ret

Disassembly of section .fini:

0000000000000834 <_fini>:
 834:   a9bf7bfd        stp     x29, x30, [sp, #-16]!
 838:   910003fd        mov     x29, sp
 83c:   a8c17bfd        ldp     x29, x30, [sp], #16
 840:   d65f03c0        ret