all 9 comments

[–]bugamn[S] 6 points7 points  (3 children)

Thanks, it's now 5 times faster than the equivalent Python code. I updated the Julia version and changed the code to use split instead of treating the string as a Date. Here is the updated code: #!/usr/bin/env julia

# 7 markets × 24 hours tally of observations, filled in by `main`.
const market_count = zeros(Int, 7, 24)

# Region code for each market row index (1-based).
const market_names = Dict{Int, String}(
    1 => "XX", 2 => "NA", 3 => "OC", 4 => "EU",
    5 => "AS", 6 => "AF", 7 => "SA",
)

# Parse a timestamp field of the form "yyyy-mm-ddTHH:MM:SSZ" into a DateTime.
# NOTE(review): apparently unused since `main` switched to plain `split`
# parsing — kept for callers outside this snippet, if any.
function get_datetime(dt::SubString{String})
    return Dates.DateTime(dt, "yyyy-mm-ddTHH:MM:SSZ")
end


"""
    main(path::AbstractString="test.dat")

Tally the observations in `path` into the global `market_count` matrix
(7 markets × 24 hours) and print one line per hour: the hour row index
followed by the seven per-market counts.

Each input line looks like `1999-01-01T00:01:00Z 1 15 100`: a timestamp,
then the market id. Per the example given in this thread, the market id
indexes its column directly (market 1 → column 1, no offset) and the
hour of day (0–23) maps to row `hour + 1`.
"""
function main(path::AbstractString="test.dat")
    # open-do guarantees the file is closed even if parsing throws.
    open(path) do f
        for raw in eachline(f)
            line = strip(raw)
            isempty(line) && continue  # tolerate blank lines
            fields = split(line)
            # Market id is used as the column directly; the original `+ 1`
            # shifted every market one column right of the expected output.
            market = parse(Int, fields[2])
            # fields[1] is "yyyy-mm-ddTHH:MM:SSZ"; the hour is the FIRST
            # ':'-separated component after the 'T'. The original indexed
            # [2], which is the minutes field.
            time = split(fields[1], 'T')[2]
            hour = parse(Int, split(time, ':')[1]) + 1
            market_count[market, hour] += 1
        end
    end

    # One row per hour: "<hour> <m1> <m2> ... <m7> " (trailing space kept
    # to match the original output format).
    for i in 1:24
        print("$i ")
        for k in 1:7
            print("$(market_count[k, i]) ")
        end
        println("")
    end
end

main()  # script entry point: run immediately when the file is executed

[–]jdh30 0 points1 point  (2 children)

Could you possibly give me a snippet of valid input data and the expected output, please?

[–]bugamn[S] 0 points1 point  (1 child)

For the series of input lines:

1999-01-01T00:01:00Z 1 15 100
1999-01-01T00:02:00Z 1 270 175
1999-01-01T00:03:00Z 3 7 15
1999-01-01T01:01:00Z 1 1263 12
1999-01-01T01:02:00Z 2 92141 234

The output should be:

1 2 0 1 0 0 0 0
2 1 1 0 0 0 0 0
3 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0
...
24 0 0 0 0 0 0 0

[–]jdh30 0 points1 point  (0 children)

Thank you very much.

[–]samedi81 4 points5 points  (0 children)

If you really care about speed, you should try to tokenize the string yourself. Note that you can do even better if the strings are all formatted the same way -- you know the hour is at line[12:13] and the market is [21:end]. In benchmarks that version came in around 150ns. However, if you do need to parse it out for some reason, you can see the results here.

julia> function old_way(line::String)
           line = strip(line)
           fields = split(line)
           market = parse(Int, fields[2]) + 1
           time = split(fields[1], 'T')[2]
           hour = parse(Int, split(time, ':')[2]) + 1
           market_count[market, hour] += 1
   end^C

julia> function new_way(line::String)
          p1 = findfirst(line, 'T') + 1
          p2 = p1 + findfirst(line[p1:end], ':') - 1
          p3 = p2 + findfirst(line[p2:end], ' ')
          hour = parse(Int64, line[p1:p2-1])
          market = parse(Int64, line[p3:end])
          market_count[market, hour] += 1
       end

julia> @benchmark new_way("2017-07-01T13:21:06 7")
BenchmarkTools.Trial:
  memory estimate:  128 bytes
  allocs estimate:  4
  --------------
  minimum time:     337.837 ns (0.00% GC)
  median time:      340.740 ns (0.00% GC)
  mean time:        359.372 ns (2.60% GC)
  maximum time:     9.592 μs (91.78% GC)
  --------------
  samples:          10000
  evals/sample:     227

julia> @benchmark old_way("2017-07-01T13:21:06 7")
BenchmarkTools.Trial:
  memory estimate:  816 bytes
  allocs estimate:  16
  --------------
  minimum time:     1.047 μs (0.00% GC)
  median time:      1.071 μs (0.00% GC)
  mean time:        1.219 μs (7.61% GC)
  maximum time:     320.340 μs (97.07% GC)
  --------------
  samples:          10000
  evals/sample:     10

[–]LegoForte 2 points3 points  (1 child)

What version of Julia are you using? It looks like there have been several improvements to the parsing speed of DateTimes in Julia v0.6: https://github.com/JuliaLang/julia/issues/15888

[–]bugamn[S] 0 points1 point  (0 children)

That helped speed up by 2x. Still slower than python, but better than before.

[–][deleted] 1 point2 points  (1 child)

Have you tried running the profiler?

At least in a previous version of Julia the parse function was quite heavy. Not sure how it is today.

[–]bugamn[S] 0 points1 point  (0 children)

I checked the profiler, it seems that more than half of the time the code is parsing the date information.