@@ -126,150 +126,56 @@ def left_outer_join(ndarray[int64_t] left, ndarray[int64_t] right,
126126
127127
def left_outer_asof_join(ndarray[int64_t] left, ndarray[int64_t] right,
                         Py_ssize_t max_groups,  # ignored
                         bint allow_exact_matches=1,
                         left_values=None,
                         right_values=None,
                         tolerance=None):
    """
    Compute the indexers for a left outer "as-of" join.

    For every position ``i`` in ``left`` a single forward scan over ``right``
    locates the last position ``j`` for which ``right[j] <= left[i]`` (or
    ``right[j] < left[i]`` when ``allow_exact_matches`` is false).  The scan
    never rewinds by more than one step, so the result is only meaningful
    when both ``left`` and ``right`` are sorted in ascending order
    (NOTE(review): this is assumed, not checked -- confirm against callers).

    Parameters
    ----------
    left, right : ndarray[int64_t]
        Keys to join on.
    max_groups : Py_ssize_t
        Ignored; kept only so the signature matches the other join routines.
    allow_exact_matches : bint, default 1
        If true, a right key equal to the left key is a valid match;
        otherwise only strictly smaller right keys match.
    left_values, right_values : ndarray[int64_t], optional
        Values whose difference is compared against ``tolerance``.
    tolerance : int64-compatible, optional
        Maximum allowed ``left_values[i] - right_values[j]``.  The tolerance
        check is applied only when ``left_values``, ``right_values`` and
        ``tolerance`` are all given.

    Returns
    -------
    left_indexer, right_indexer : ndarray[int64_t]
        Both have ``len(left)`` elements.  ``left_indexer[i] == i`` and
        ``right_indexer[i]`` is the matched position in ``right`` or ``-1``
        when there is no match (or the match exceeds ``tolerance``).
    """

    cdef:
        Py_ssize_t left_pos, right_pos, left_size, right_size
        ndarray[int64_t] left_indexer, right_indexer
        bint has_tolerance = 0
        ndarray[int64_t] left_values_, right_values_
        int64_t tolerance_, diff

    # if we are using tolerance, bind the typed buffers once so the
    # loop below never has to index the untyped Python objects
    if (left_values is not None and right_values is not None
            and tolerance is not None):
        has_tolerance = 1
        left_values_ = left_values
        right_values_ = right_values
        tolerance_ = tolerance

    left_size = len(left)
    right_size = len(right)

    # exactly one output row per left row
    left_indexer = np.empty(left_size, dtype=np.int64)
    right_indexer = np.empty(left_size, dtype=np.int64)

    right_pos = 0
    for left_pos in range(left_size):
        # restart right_pos if it went negative in a previous iteration
        if right_pos < 0:
            right_pos = 0

        # advance past every right key that is still a valid match for the
        # current left key (<= when exact matches are allowed, < otherwise)
        if allow_exact_matches:
            while (right_pos < right_size
                   and right[right_pos] <= left[left_pos]):
                right_pos += 1
        else:
            while (right_pos < right_size
                   and right[right_pos] < left[left_pos]):
                right_pos += 1

        # step back onto the last valid match; -1 means "no match"
        right_pos -= 1

        # save positions as the desired index
        left_indexer[left_pos] = left_pos
        right_indexer[left_pos] = right_pos

        # if needed, verify that tolerance is met
        if has_tolerance and right_pos != -1:
            diff = left_values_[left_pos] - right_values_[right_pos]
            if diff > tolerance_:
                right_indexer[left_pos] = -1

    return left_indexer, right_indexer
275181
0 commit comments