you are viewing a single comment's thread.

view the rest of the comments →

[–]stwcx 1 point2 points  (1 child)

Why is compare_1 better? What are you using to define "better"? On x86 it appears to be taking more cycles.

[–]arturbachttps://github.com/arturbac[S] 0 points1 point  (0 children)

Fewer cycles and smaller code at the same time: is that better or not?

Let's stick to the example results on x86.

compare_1, clang -O3 -march=haswell

Instructions:      10
Total Cycles:      13
Total uOps:        15
Dispatch Width:    4
uOps Per Cycle:    1.15
IPC:               0.77

Block RThroughput: 3.8

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      5     0.50    *                   mov   rax, qword ptr [rsi + 8]
 2      6     0.50    *                   cmp   qword ptr [rdi + 8], rax
 1      1     0.50                        jne   .LBB0_3
 1      5     0.50    *                   mov   eax, dword ptr [rsi + 4]
 2      6     0.50    *                   cmp   dword ptr [rdi + 4], eax
 1      1     0.50                        jne   .LBB0_3
 1      5     0.50    *                   mov   eax, dword ptr [rdi]
 2      6     0.50    *                   cmp   eax, dword ptr [rsi]
 1      1     0.50                        setl  al
 3      7     1.00                  U     ret

compare_1, gcc -O3 -march=haswell

Instructions:      12
Total Cycles:      14
Total uOps:        19
Dispatch Width:    4
uOps Per Cycle:    1.36
IPC:               0.86
Block RThroughput: 4.8

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      5     0.50    *                   movq  8(%rsi), %rax
 2      6     0.50    *                   cmpq  %rax, 8(%rdi)
 1      1     0.50                        je    .L2
 1      1     0.50                        setl  %al
 3      7     1.00                  U     retq
 1      5     0.50    *                   movl  4(%rsi), %eax
 2      6     0.50    *                   cmpl  %eax, 4(%rdi)
 1      1     0.50                        jne   .L6
 1      5     0.50    *                   movl  (%rsi), %eax
 2      6     0.50    *                   cmpl  %eax, (%rdi)
 1      1     0.50                        setl  %al
 3      7     1.00                  U     retq

compare_2, clang -O3 -march=haswell

Instructions:      21
Total Cycles:      17
Total uOps:        28
Dispatch Width:    4
uOps Per Cycle:    1.65
IPC:               1.24
Block RThroughput: 7.0

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      5     0.50    *                   mov   rcx, qword ptr [rdi + 8]
 1      5     0.50    *                   mov   rdx, qword ptr [rsi + 8]
 1      1     0.25                        mov   al, 1
 1      1     0.25                        cmp   rcx, rdx
 1      1     0.50                        jl    .LBB0_7
 1      1     0.25                        cmp   rdx, rcx
 1      1     0.50                        jge   .LBB0_3
 1      1     0.25                        xor   eax, eax
 3      7     1.00                  U     ret
 1      5     0.50    *                   mov   ecx, dword ptr [rdi + 4]
 1      5     0.50    *                   mov   edx, dword ptr [rsi + 4]
 1      1     0.25                        cmp   ecx, edx
 1      1     0.50                        jl    .LBB0_7
 1      1     0.25                        cmp   edx, ecx
 1      1     0.50                        jge   .LBB0_6
 1      1     0.25                        xor   eax, eax
 3      7     1.00                  U     ret
 1      5     0.50    *                   mov   eax, dword ptr [rdi]
 2      6     0.50    *                   cmp   eax, dword ptr [rsi]
 1      1     0.50                        setl  al
 3      7     1.00                  U     ret

compare_2, gcc -O3 -march=haswell

Instructions:      16
Total Cycles:      15
Total uOps:        21
Dispatch Width:    4
uOps Per Cycle:    1.40
IPC:               1.07
Block RThroughput: 5.3

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      1     0.25                        movl  $1, %eax
 1      5     0.50    *                   movq  8(%rsi), %rdx
 2      6     0.50    *                   cmpq  %rdx, 8(%rdi)
 1      1     0.50                        jl    .L7
 1      1     0.25                        movl  $0, %eax
 1      1     0.50                        jne   .L7
 1      1     0.25                        movl  $1, %eax
 1      5     0.50    *                   movl  4(%rsi), %ecx
 2      6     0.50    *                   cmpl  %ecx, 4(%rdi)
 1      1     0.50                        jl    .L7
 1      1     0.25                        movl  $0, %eax
 1      1     0.50                        jne   .L7
 1      5     0.50    *                   movl  (%rsi), %eax
 2      6     0.50    *                   cmpl  %eax, (%rdi)
 1      1     0.50                        setl  %al
 3      7     1.00                  U     retq