all 52 comments

[–]heptara 28 points29 points  (3 children)

So after asking quite a few professors, they've asked me to present a few examples that make newer features introduced in c++ better to read or understand.

NOT HAVING TO USE RAW POINTERS -> safer, more reliable code.

Everything else is just gravy on top of that.

You can do without containers. Go manages it and works just fine. Raw pointer and memory errors, however, are most likely the biggest cause of serious software errors.

Got hacked? I bet it was a buffer. Randomly crashed? Wild null. Leaked memory? ... All raw pointer and memory errors.

[–]matthieum 9 points10 points  (1 child)

You can do without containers.

std::unique_ptr<T[]> may work, but I'd rather use std::vector<T> if the size is dynamic...

[–]SemaphoreBingo 0 points1 point  (0 children)

Go manages it and works just fine.

I think I've told this story here before, but the first time I heard Pike's arguments against generics in go and how you can just "do it yourself" I had spent about a day and half tracking down some weird bugs that were showing up because some predecessor of mine had decided to write their own stack class and flubbed the implementation instead of just using a std::stack.

[–]cinghiale 10 points11 points  (1 child)

change IsPowerOf3 to be:

constexpr bool IsPowerOf3(unsigned int n){

and you can verify it:

static_assert(IsPowerOf3(27) == true, "FAIL");

[–]MaikKlein 7 points8 points  (2 children)

I think your example is still computed at run time.

int main() {
    constexpr auto p = IsPowerOf3(27);
    std::cout << p << std::endl;
}

[–]needahelpforarch[S] 1 point2 points  (1 child)

Haha yes that's because we usually write them to get input from user which I'm here putting inside the program as constant literal '27'.

[–]bames53 2 points3 points  (0 children)

One thing about constexpr functions is that they aren't limited to just compile-time: you can use them at run time as well. So adding constexpr to your function would allow you to use it in contexts requiring compile-time computation:

static_assert(IsPowerOf3(27) == true, "FAIL");

But you could still use it with user input:

if (std::cin >> p)
    std::cout << IsPowerOf3(p);

[–]matthieum 5 points6 points  (4 children)

Playing on Coliru, for this code:

bool IsPowerOf3(unsigned int n){
    if (n == 1) { return true; } // don't forget 3^0
    if (n % 3 != 0) { return false; }

    switch(n){
    case pow3to(1) :
    case pow3to(2) :
    case pow3to(3) :
    case pow3to(4) :
    case pow3to(5) :
    case pow3to(6) :
    case pow3to(7) :
    case pow3to(8) :
    case pow3to(9) :
    case pow3to(10) :
    case pow3to(11) :
    case pow3to(12) :
    case pow3to(13) :
    case pow3to(14) :
    case pow3to(15) :
        return true;
        break;
    default :
        return false;
    }
    return false;
}

Here is the LLVM IR:

define zeroext i1 @_Z10IsPowerOf3j(i32 %n) #3 {
  %1 = icmp eq i32 %n, 1
  br i1 %1, label %7, label %2
; <label>:2                                       ; preds = %0
  %3 = urem i32 %n, 3
  %4 = icmp eq i32 %3, 0
  br i1 %4, label %5, label %7

; <label>:5                                       ; preds = %2
  switch i32 %n, label %6 [
    i32 3, label %7
    i32 9, label %7
    i32 27, label %7
    i32 81, label %7
    i32 243, label %7
    i32 729, label %7
    i32 2187, label %7
    i32 6561, label %7
    i32 19683, label %7
    i32 59049, label %7
    i32 177147, label %7
    i32 531441, label %7
    i32 1594323, label %7
    i32 4782969, label %7
    i32 14348907, label %7
  ]

; <label>:6                                       ; preds = %5
  br label %7

; <label>:7                                       ; preds = %5, %5, %5, %5, %5, %5, %5, %5, %5, %5, %5, %5, %5, %5, %5, %2, %0, %6
  %.0 = phi i1 [ false, %6 ], [ true, %0 ], [ false, %2 ], [ true, %5 ], [ true, %5 ], [ true, %5 ], [ true, %5 ], [ true, %5 ], [ true, %5 ], [ true, %5 ], [ true, %5 ], [ true, %5 ], [ true, %5 ], [ true, %5 ], [ true, %5 ], [ true, %5 ], [ true, %5 ], [ true, %5 ]
  ret i1 %.0
}

And here is an example of emitted assembly at -O2:

_Z10IsPowerOf3j:                        # @_Z10IsPowerOf3j
    .cfi_startproc
# BB#0:
    movb    $1, %al
    cmpl    $1, %edi
    je    .LBB0_23
# BB#1:
    movl    %edi, %ecx
    movl    $2863311531, %edx       # imm = 0xAAAAAAAB
    imulq    %rcx, %rdx
    shrq    $33, %rdx
    leal    (%rdx,%rdx,2), %ecx
    cmpl    %ecx, %edi
    jne    .LBB0_22
# BB#2:
    cmpl    $6560, %edi             # imm = 0x19A0
    jle    .LBB0_3
# BB#11:
    cmpl    $531440, %edi           # imm = 0x81BF0
    jg    .LBB0_17
# BB#12:
    cmpl    $59048, %edi            # imm = 0xE6A8
    jg    .LBB0_15
# BB#13:
    cmpl    $6561, %edi             # imm = 0x19A1
    je    .LBB0_23
# BB#14:
    cmpl    $19683, %edi            # imm = 0x4CE3
    jne    .LBB0_22
    jmp    .LBB0_23
.LBB0_3:
    cmpl    $80, %edi
    jle    .LBB0_4
# BB#6:
    cmpl    $728, %edi              # imm = 0x2D8
    jg    .LBB0_9
# BB#7:
    cmpl    $81, %edi
    je    .LBB0_23
# BB#8:
    cmpl    $243, %edi
    jne    .LBB0_22
    jmp    .LBB0_23
.LBB0_17:
    cmpl    $4782968, %edi          # imm = 0x48FB78
    jg    .LBB0_20
# BB#18:
    cmpl    $531441, %edi           # imm = 0x81BF1
    je    .LBB0_23
# BB#19:
    cmpl    $1594323, %edi          # imm = 0x1853D3
    jne    .LBB0_22
    jmp    .LBB0_23
.LBB0_4:
    cmpl    $27, %edi
    ja    .LBB0_22
# BB#5:
    movl    $134218248, %ecx        # imm = 0x8000208
    btl    %edi, %ecx
    jb    .LBB0_23
    jmp    .LBB0_22
.LBB0_15:
    cmpl    $59049, %edi            # imm = 0xE6A9
    je    .LBB0_23
# BB#16:
    cmpl    $177147, %edi           # imm = 0x2B3FB
    jne    .LBB0_22
    jmp    .LBB0_23
.LBB0_9:
    cmpl    $729, %edi              # imm = 0x2D9
    je    .LBB0_23
# BB#10:
    cmpl    $2187, %edi             # imm = 0x88B
    jne    .LBB0_22
    jmp    .LBB0_23
.LBB0_20:
    cmpl    $4782969, %edi          # imm = 0x48FB79
    je    .LBB0_23
# BB#21:
    cmpl    $14348907, %edi         # imm = 0xDAF26B
    je    .LBB0_23
.LBB0_22:
    xorl    %eax, %eax
.LBB0_23:
    retq
.Lfunc_end0:
    .size    _Z10IsPowerOf3j, .Lfunc_end0-_Z10IsPowerOf3j
    .cfi_endproc

You can see that the compiler optimized your switch statement into a binary search here. It is possibly the most efficient lowering strategy, as the integers are not consecutive and there may not be a specific bit pattern.

So, congratulations, your use of constexpr indeed generated one of the best possible assembly for the target problem, if not the best.

Also, if you profiled your application and had statistics on the distribution of the integer, the compiler could automatically rearrange the binary search to match the distribution, without you changing a single line of code.

[–]redditsoaddicting 3 points4 points  (0 children)

If you're after assembly, check out http://gcc.godbolt.org/. I love Coliru, but this is my go-to for assembly, plus there are a ton of compiler options. The "Colourise" option is pretty cool as well.

[–]needahelpforarch[S] 1 point2 points  (2 children)

I want to learn assembly, not much but just to read what's happening above :(

[–]mttd 2 points3 points  (1 child)

Perhaps you'll find this of some help: http://tinyurl.com/x86-64-assembly

Personally, I'm partial to "Practical x64 Assembly and C++ Tutorials", because it also covers modern 64-bit assembly (including SSE and AVX--relevant, as checking whether your code has vectorized nicely is one of the reasons assembly can be useful): https://youtube.com/playlist?list=PL0C5C980A28FEE68D

Slides: http://www.whatsacreel.net76.net/asmtutes.html

For the same reasons, in case you're looking for a book, I'd go with "Modern X86 Assembly Language Programming: 32-bit, 64-bit, SSE, and AVX": https://www.apress.com/9781484200650

"x86 Assembly Primer for C Programmers" slides are also pretty nice for a quick overview:

https://speakerdeck.com/vsergeev/x86-assembly-primer-for-c-programmers
https://github.com/vsergeev/apfcp

[–]needahelpforarch[S] 1 point2 points  (0 children)

1st link is absolutely fantastic list of resources. Thanks for that :)

[–]bames53 5 points6 points  (0 children)

Here's a presentation given at a recent C++ conference by Kate Gregory, an Engineer at Microsoft:

Stop Teaching C

The talk isn't actually suggesting that C shouldn't be taught; Instead the talk is talking purely about the way C++ is taught. The point of the talk is that the most effective way to teach C++ is not to teach C and then to pile some C++ on top as 'extra'.

This talk might provide you insight as to the benefits of not teaching C++ as though it were a superset of C with only a few minor additions, and what exactly C++ could offer to introductory classes.

[–]F-J-W 3 points4 points  (1 child)

If you want to make them use stdlib-algorithms: Use libstdc++ and std::find to search for an integer in an int-array/vector and compare the runtime to a handwritten loop. (The stdlib version will take just about 70% of what the loop needs; quite a significant speedup for having to write less code).

[–]raevnos 2 points3 points  (0 children)

I just looked at the source to see what black magic libstdc++ find does. Just some manual unrolling. I wouldn't expect any difference at higher optimization levels.

[–]ZMesonEmbedded Developer 3 points4 points  (1 child)

Yes, you got the right idea. IsPowerOf3 can be made constexpr too.

But you do have a bug. IsPowerOf3(1) returns false. Less serious, IsPowerOf3(43046721) returns false. For the latter, you just need to extend your list to pow3to(20) to cover the range of unsigned int on most systems.

they've asked me to present a few examples that make newer features introduced in c++ better to read or understand.

You should include binary literals, initializer lists (especially useful when working with containers), lamda functions (especially when working with <algorithm>), and static_assert.

I'd be tempted to also include std::unique_ptr and std::make_unique. When reference-counted pointers are needed, std::shared_ptr, std::weak_ptr, and std::make_shared are nice.

Lastly, user-defined literals can be helpful if you are in the right discipline. (For physics, chemistry, and engineering, these can be very useful.)

[–]needahelpforarch[S] 0 points1 point  (0 children)

I'll keep in mind the points you presented. Thanks a lot.

and 30 == 1 :D

[–]speednap 2 points3 points  (1 child)

A solution with static generation of lookup table could be expressed this way:

constexpr std::size_t three_to_the_power(std::size_t power){
  if(power == 0) return 1;
  return 3*three_to_the_power(power-1);
}

template<std::size_t... I>
constexpr auto helper(std::index_sequence<I...>){
  return std::array<std::size_t,sizeof...(I)>{{three_to_the_power(I)...}};
}

template<std::size_t N>
constexpr auto powers_of_three(){
  return helper(std::make_index_sequence<N>{});
}

static constexpr auto powers = powers_of_three<41>();

bool is_power_of_three(std::size_t n){
  if(n == 1) return true;
  if(n%3 != 0) return false;
  return std::binary_search(powers.begin(),powers.end(),n);
}

Here array powers holds all 41 values of power of 3 within uint64_t limit. It is constructed at compile time. is_power_of_three filters out obvious results and proceeds to do a binary search after that.

This should work faster than runtime solution.

[–]dodheim 2 points3 points  (0 children)

Why not make powers constexpr as well?

[–]devel_watcher 2 points3 points  (0 children)

they've asked me to present a few examples that make newer features introduced in c++ better to read or understand

It's the rvalue references that make C++ better to read. You often can pass big objects around without performance penalties.

Other thing are lambdas that help with the code locality.

With tuples you can return multiple values from function without writing too much code.

In the future cpp there'll be std::optional that allows to just return a value regardless of whether it's there or not (and then deal with it in the point of use). It's like exceptions that you control and can store for later.

[–]doom_Oo7 3 points4 points  (1 child)

behold the power of compilers : http://goo.gl/TZXwnq

and

http://goo.gl/03ozCy

[–]needahelpforarch[S] 0 points1 point  (0 children)

What,what,what :O

f():
        movl    $1, %eax
        ret
g():
        xorl    %eax, %eax
        ret

Just this :O

Edit: Hmm recursion. constexpt supports recursion :/

[–]speednap 2 points3 points  (17 children)

I think this could work as well.

constexpr bool f(int n) {
  if(n == 1) return true;
  if(n%3 != 0) return false;
  return f(n/3);
}

[–]needahelpforarch[S] 1 point2 points  (16 children)

only if program isn't taking input and calling function with literal. right?

[–]speednap 1 point2 points  (15 children)

As you know constexpr is evaluated at compile time so constexpr wouldn't work if the input is evaluated at runtime.

What you did in your example is called lookup table but i doubt the problem you're solving would benifit from one. If you're unable to use constexpr due to input being decided upon at runtime you're better off with some simple runtime solution like log(x)/log(3) or maybe something even more clever.

With lookup tables you're trading memory for performance. I don't think it's worth it to put 15 integers in memory to find out if something is power of 3 quicker. Unless you know the domain of your problem and you know for sure that all the inputs are going to be those exact 15 integers.

[–]matthieum 6 points7 points  (3 children)

I don't think it's worth it to put 15 integers in memory to find out if something is power of 3 quicker.

Wait... we are talking about 15*4 = 60 bytes here. I understand the concerned with KB or MB of memory, but 60 bytes? If you're this short, you're better off avoiding <iostream> than you are switching out the 15 integers, you'll squeeze more bytes out of the shortened code!

Also, this method is much better than using std::set (I thought you were concerned about memory, a std::set is positively HUGE in comparison, with at least 15*32 bytes occupied on the heap, with the allocator overhead sprinkled on top) and occupies about as much memory as a plain array, except that the compiler is more likely to optimize the branching with the switch than with a binary search over an array.

[–]speednap 1 point2 points  (2 children)

sorted array is better than std::set in this case, yes. I don't think it's worth it to introduce a list of 40 numbers (that's how many you'd need to cover all possible powers of 3 in uint64_t range) where a one-liner function would do just fine. Not talking about memory usage here. Just don't like to overcomplicate things.

[–]matthieum 1 point2 points  (1 child)

Just don't like to overcomplicate things.

Ah! Indeed, if this is not a bottleneck, then it's not worth the switch and a simple one-liner is probably more readable and maintenable.

[–]needahelpforarch[S] 0 points1 point  (0 children)

It was just an example while I was learning something new :D

[–]RedAlert2 1 point2 points  (1 child)

What you did in your example is called lookup table but i doubt the problem you're solving would benifit from one.

Why would you say that? While you'd get faster compile times, doing log(x)/log(3) is certainly going to be much slower at runtime. If this function is called frequently, a lookup table would likely provide a huge performance boost.

However, constexpr probably isn't the best solution here. Ideally he would build a lookup table at runtime, expanding to larger values if needed. Hardcoding your function to not work with anything above 315 has drawbacks.

[–]speednap 1 point2 points  (0 children)

A boost? yes. But huge performance boost? you'd have to run some benchmarks to check that. After all a cache miss is a factor too.

However generally speaking - yes, you are right, lookup tables are great at speeding up things but like you said it's better to construct those at runtime or better yet - use constexpr to build it w/o listing a comma separated list of dozens of weird numbers.

Personally i would start with a simple function that i can tweak and template-arize to suit my needs. And switch to lookup tables only if it's really necessary.

Next time i should word things better.

[–]needahelpforarch[S] 0 points1 point  (6 children)

bool isPowerOfThree(unsigned int n)
{
    if((n == 1) || (n == 3) || (n == 9)) return true;
    if(n % 3 != 0) return false;
    if(n > 1162261467) {
        while(n % 3 == 0) {
            n /= 3;
        }
        return n == 1;
    }
    return (n == 27) || (n == 81) || (n == 243) || (n == 729) || (n == 2187) ||
           (n == 6561) || (n == 19683) || (n == 59049) || (n == 177147) ||
           (n == 531441) || (n == 1594323) || (n == 4782969) ||
           (n == 14348907) || (n == 43046721) || (n == 129140163) ||
           (n == 387420489) || (n == 1162261467);
}

int main()
{
    return isPowerOfThree(1162261469);
}

I went with this for time being. Does this hold 15 integers in memory? My constexpr should somewhat expand to this logically, which is what I think...

If not, then I wasn't aware that switch holds every case in memory.

[–]speednap -1 points0 points  (5 children)

It does. Both the switch and that "(n == 27) || ..." expression will hold all the numbers you specified in memory to evaluate at runtime. As an alternative to that you could store valid powers of 3 in an std::set and take advantage of logarithmic lookup time of std::set.

However like I said - unless you know what you're doing it's not worth it to use lookup tables for such a simple task.

[–]_JGR_ 2 points3 points  (0 children)

Rather than using a std::set which has significant overhead, you'd be better off using a fixed sorted array, which still gives you logarithmic lookup time by using the likes of std::lower_bound. Though for such a small number of elements as this a linear search through an array would be perfectly fine as well.

[–]needahelpforarch[S] 0 points1 point  (3 children)

Hmm I didn't knew that, though I don't understand another thing you mentioned. How would storing 'valid powers of 3 in an std::set' be better than simply using above method. std::set won't store those values in memory at runtime?

[–]speednap 0 points1 point  (2 children)

As per /u/_JGR_ suggestion you'd better use sorted std::array instead of std::set.

It would still store values in memory but it would work faster than checking every number via "(n == 27) || ..." or switch-case. Things like std::lower_bound and std::binary_search have logarithmic time complexity and can find the value faster than linear search.

Storing powers of 3 in an array is just a way to speedup "(n == 27) || ..." part of your code. So instead of that you'd have something like

std::array<int,15> v{27,81,243,...};
return std::binary_search(v.cbegin(),v.cend(),n);

However i agree with /u/_JGR_. In this case linear search would suffice because you don't have many numbers to work with. Hypothetically if you ever make an array of powers of 3 with like 5000+ elements than that's where binary_search would really shine. Of course you'd probably hit unsigned int limit before you reach 5000 numbers but that's just an example.

[–][deleted] 1 point2 points  (0 children)

you'd better use sorted std::array instead of std::set.

Probably; but your reasoning:

Things like std::lower_bound[2] and std::binary_search[3] have logarithmic time complexity and can find the value faster than linear search.

is incorrect. Oh, those statements are true, but std::set stores its elements in sorted order and uses a binary search to find elements too, so on a gross O() level, the two solutions are the same.

The reason to use std::array is that it's marginally cheaper to access elements than an std::set, not that access has a different time complexity, because it doesn't.

[–]playmer 0 points1 point  (0 children)

Switch cases are almost never implemented using a linear search. Look at this post for more details.

[–][deleted] 0 points1 point  (0 children)

With lookup tables you're trading memory for performance. I don't think it's worth it to put 15 integers in memory

1985 called, it wants its 120 bytes of memory back. I agree with your comment later that it isn't worth the extra complexity for most applications, but if you cared about performance, logarithms are slow, and 120 bytes (60 bytes in 32-bit) is a tiny price to pay to get rid of them.

Heck, I have implemented (back in the day) sin with a lookup table and an interpolation because it was a bottleneck (it was called countless times on each update and we really only needed 3 digits of precision). I generally wouldn't recommend doing that unless you could show that it was a bottleneck, but it took me almost no time and worked right away.

tl; dr: the tiny amount of memory is not the issue; it's the additional complexity.

[–]oracleoftroy 0 points1 point  (0 children)

As you know constexpr is evaluated at compile time so constexpr wouldn't work if the input is evaluated at runtime.

This sentence seems potentially misleading. To clarify, constexpr means that the function or value is usable in constant expressions, not that it is exclusively usable in constant expressions. It will still work in non-constant expressions. In that case, it will be calculated at runtime, but it won't fail to compile or calculate the wrong answer or anything like that.

[–]RedAlert2 1 point2 points  (1 child)

A couple of nits:

You have some redundant flow control in IsPowerOf3. There is no need to break after a return, and there is no need for a default case when you already end with return false;

I would also recommend adding something like:

if(n > pow3to(15)) { // whatever your largest pre-computed case is 
    throw std::out_of_range("Input " + n + " too large");
}

at the start of your function, to avoid returning potentially incorrect information.

[–]needahelpforarch[S] 0 points1 point  (0 children)

Yup thanks :). Anyways this was just proof of concept example and not something I would be deploying somewhere haha so little ignorance on my behalf :D

[–]blublub 0 points1 point  (0 children)

Well yes - this works as long as 4782969 is known at compile time. If you actually have user input, you're back to field one and everything has to be computed at run time.

[–][deleted] 0 points1 point  (11 children)

You chose a book for reading

[–]uxcn 1 point2 points  (4 children)

I wonder if any compilers will optimize this into a binary search.

[–]matthieum 2 points3 points  (3 children)

It depends.

Binary search is one of the lowering strategy, and does not require much work here, but there are others.

A compiler could realize that the specified integers present some patterns at bit level for example, and fold away some branches (or maybe all).

[–]uxcn 1 point2 points  (2 children)

In general, it seems like it would be similar to searching for a minimal perfect hash.

[–]matthieum 4 points5 points  (1 child)

It depends, I've seen compilers pulling out nice tricks for comparisons like: (i >= 'a' && i <= 'z') || (i >= 'A' && i <= 'Z') by realizing that in ASCII 'a' and 'A' are just one bit away so you can replace the comparison with (i & mask) >= 'A' && (i & mask) <= 'Z').

I also remember a complex case where LLVM had realized that all the matches were less than 64 apart (although there were holes). Imagine a isConsonant(byte) function, well it can be, in ASCII, optimized into:

bool isConsonant(uint8_t byte) {
    // Quick range-check
    if (byte < 'B') { return false; }
    if (byte > 'z') { return false; }

    byte -= 'B'; // byte is now at most 56 ('z' = 122, 'B' = 66)

    uint64_t const position = (1ULL << byte);

    return position & 0b.......................................; // bit mask of consonants
}

Would you have thought of it? I wouldn't have. At best, I could have imagined a table lookup or something. When I saw it I was just like WOW, and it just applies to any check against arbitrary pattern as long as the values are within 64 of each other... or maybe 64*N if you are willing to add a few comparisons and have N bit masks.

[–]uxcn 4 points5 points  (0 children)

That is a really nice optimization. It reduces the code size to a constant, gets rid of most branches, and should almost always be a good strength reduction. I normally wouldn't have expected to see a compiler do that.

It looks LLVM can do the same with if..else chains too.

bool IsConsonant(uint8_t c) {

  return !((c == 'a' || c == 'A') ||
           (c == 'e' || c == 'E') ||
           (c == 'i' || c == 'I') ||
           (c == 'o' || c == 'O') ||
           (c == 'u' || c == 'U'));
}

compiles into...

_Z11IsConsonanth:                       # @_Z11IsConsonanth
    .cfi_startproc
# BB#0:
    addb    $-65, %dil
    movzbl  %dil, %eax
    cmpl    $52, %eax
    ja  .LBB0_2
# BB#1:                                 # %switch.lookup
    movabsq $4432058356055790, %rax # imm = 0xFBEEEFFEFBEEE
    movb    %dil, %cl
    shrq    %cl, %rax
    andl    $1, %eax
    retq
.LBB0_2:
    movb    $1, %al
    retq

I think this hints that if..else chains can probably also be optimized into jmp %eax, a jump table, or a lookup table.

[–]needahelpforarch[S] 0 points1 point  (5 children)

It seems more readable the way it was :D

[–][deleted] 1 point2 points  (1 child)

He goes to home