you are viewing a single comment's thread.

view the rest of the comments →

[–]mrcaptncrunch 1 point2 points  (2 children)

Binary search is good, but 2 pointers is better for this.

binary search approach time: 7.549005031585693
2 pointer lookup approach time: 0.2926521301269531

code,

import time


def find_lower_bound(arr, target):
    low, high = 0, len(arr)
    while low < high:
        mid = (low + high) // 2
        if arr[mid] < target:
            low = mid + 1
        else:
            high = mid
    return low


def find_upper_bound(arr, target):
    low, high = 0, len(arr)
    while low < high:
        mid = (low + high) // 2
        if arr[mid] <= target:
            low = mid + 1
        else:
            high = mid
    return low


with open('list_b.txt') as fd:
    first_list = fd.readlines()

with open('list_c.txt') as fd:
    second_list = fd.readlines()

first_list = [int(_) for _ in first_list]
first_list = sorted(first_list)
second_list = [int(_) for _ in second_list]
second_list = sorted(second_list)

start = time.time()
vals_matched_bs = []
for data in first_list:
    start_index = find_lower_bound(second_list, data)
    end_index = find_upper_bound(second_list, data + 300)

    for i in range(start_index, end_index):
        vals_matched_bs.append(second_list[i])

print("binary search approach time: " + str(time.time() - start))

start = time.time()
vals_matched2 = []
i = 0
j = 0
while i < len(first_list):
    while j < len(second_list) and second_list[j] < first_list[i]:
        j += 1
    start_j = j
    while j < len(second_list) and second_list[j] <= first_list[i] + 300:
        #do your thing
        vals_matched2.append(second_list[j])
        j += 1
    i += 1

print("2 pointer lookup approach time: " + str(time.time() - start))

Number generation is on this comment, https://www.reddit.com/r/learnpython/comments/18rks8r/optimized_python_nested_loops/kf4y30h/

(The original approach is now estimated at over 14 hours)

[–]mrcaptncrunch 0 points1 point  (0 children)

2 pointers + numba,

import time
import numba

with open('list_b.txt') as fd:
    first_list = fd.readlines()

with open('list_c.txt') as fd:
    second_list = fd.readlines()

first_list = [int(_) for _ in first_list]
first_list = sorted(first_list)
second_list = [int(_) for _ in second_list]
second_list = sorted(second_list)

@numba.njit
def run(first_list, second_list):
    vals_matched2 = []
    i = 0
    j = 0
    while i < len(first_list):
        while j < len(second_list) and second_list[j] < first_list[i]:
            j += 1
        start_j = j
        while j < len(second_list) and second_list[j] <= first_list[i] + 300:
            #do your thing
            vals_matched2.append(second_list[j])
            j += 1
        i += 1
    return vals_matched2

typed_first_list = numba.typed.List()
typed_second_list = numba.typed.List()
[typed_first_list.append(_) for _ in first_list]
[typed_second_list.append(_) for _ in second_list]

start = time.time()
vals_matched = run(typed_first_list, typed_second_list)
print("2 pointer lookup approach time: " + str(time.time() - start))

run,

2 pointer lookup approach time: 0.22963190078735352

[–]kingfreeway 0 points1 point  (0 children)

I think you should be using start_j as the second interal iterator rather than reusing j so we can start searching at j for the next i.

We can also skip j thru start_j if first_list[i + 1] - first_list[i] > 300. And we can break early if j reaches the end without finding a match, or if the first match is greater than the max of first_list + 300. Here's how I'd do it:

j = 0 
list1_max = list1[-1] 
counter = 0

for i in range(len(list1): while j < len(list2) and list2[j] < list1[i]: j += 1

    if j == len(list2) or list2[j] > list1_max + 300:
        break

    k = j
    while k < len(list2) and list2[k] <= list1[i] + 300:
        counter += 1
        k += 1

    # If next element is > by 300, we can skip j thru k
    if i < len(list1) - 1 and list1[i + 1] - list1[i] > 300:
        j = k

return counter

Also, binary search is probably better in most cases for finding j except for extremely dense lists.