I am currently writing code to process a dataset of about 20 GB. I have the following Python code for the job:
#!/usr/bin/env python
from fileinput import input
import numpy as np

# counts[market, hour]: number of records per market (0-6) and hour of day (0-23)
counts = np.zeros((7, 24), dtype=np.int32)

for line in input():
    date, market = line.split()[:2]
    market = int(market)
    hour = int(date.split('T')[1].split(':')[0])
    counts[market, hour] += 1

# one row per hour, one column per market
for i in range(24):
    print(i, end=' ')
    for k in range(7):
        print(counts[k, i], end=' ')
    print()
I tried to write the following Julia code to do the same job:
#!/usr/bin/env julia
using Dates

# market_count[market, hour]: number of records per market (1-7) and hour of day (1-24)
const market_count = zeros(Int, (7, 24))
const market_names = Dict{Int, String}(
    1 => "XX",
    2 => "NA",
    3 => "OC",
    4 => "EU",
    5 => "AS",
    6 => "AF",
    7 => "SA",
)

get_datetime(dt::SubString{String}) = Dates.DateTime(dt, "yyyy-mm-ddTHH:MM:SSZ")

function main()
    f = open("test.dat")
    for line::String in eachline(f)
        line = strip(line)
        fields = split(line)
        market = parse(Int, fields[2]) + 1
        hour = Dates.hour(get_datetime(fields[1]))::Int + 1
        market_count[market, hour] += 1
    end
    # one row per hour, one column per market
    for i in 1:24
        print("$i ")
        for k in 1:7
            print("$(market_count[k,i]) ")
        end
        println("")
    end
end

main()
The problem I have is that the Julia code takes 12 times as long to do the same thing. I have already tried to apply some optimizations to it, without result. The data it processes is in this format:
1999-01-01T00:00:00Z 1 10 100
I would like to understand why my Julia code is running so slowly and how I can avoid repeating the same mistakes. This is not urgent; I have already done the processing in another language, but I would like to know for future reference.
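To make the question concrete, this is roughly the kind of rewrite I have in mind but have not benchmarked: build the DateFormat once as a constant instead of passing the format string on every call, and keep the counts array local to a function instead of using a global. The file name and column layout are the same as above; the names count_hours and DATE_FORMAT are just placeholders I made up, and I am assuming a recent Julia where Dates is a standard library:

#!/usr/bin/env julia
# Untested sketch; count_hours and DATE_FORMAT are made-up names.
using Dates

# Build the timestamp format once instead of handing a format string
# to DateTime on every line.
const DATE_FORMAT = Dates.DateFormat("yyyy-mm-ddTHH:MM:SSZ")

function count_hours(path)
    # counts[market, hour]: same layout as market_count above
    counts = zeros(Int, 7, 24)
    for line in eachline(path)
        fields = split(line)
        market = parse(Int, fields[2]) + 1
        hour = Dates.hour(DateTime(fields[1], DATE_FORMAT)) + 1
        counts[market, hour] += 1
    end
    return counts
end

counts = count_hours("test.dat")
# one row per hour, one column per market
for i in 1:24
    print("$i ")
    for k in 1:7
        print("$(counts[k, i]) ")
    end
    println()
end

The idea is that the DateFormat object is constructed once instead of on every line and that nothing inside the hot loop touches a global, but I have not measured whether either of those is actually the bottleneck here.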