@@ -66,8 +66,7 @@ mca_smsc_endpoint_t *mca_smsc_xpmem_get_endpoint(opal_proc_t *peer_proc)
     return &endpoint->super;
 }
 
-/* look up the remote pointer in the peer rcache and attach if
- * necessary */
+/* look up the remote pointer in the peer rcache and attach if necessary */
 void *mca_smsc_xpmem_map_peer_region(mca_smsc_endpoint_t *endpoint, uint64_t flags,
                                      void *remote_ptr, size_t size, void **local_ptr)
 {
@@ -77,71 +76,81 @@ void *mca_smsc_xpmem_map_peer_region(mca_smsc_endpoint_t *endpoint, uint64_t flags,
     mca_rcache_base_registration_t *reg = NULL;
     xpmem_addr_t xpmem_addr;
     uintptr_t base, bound;
+    size_t region_size;
     int rc;
 
+    /* base is the first byte of the region, bound is the last (inclusive) */
     base = OPAL_DOWN_ALIGN((uintptr_t) remote_ptr, attach_align, uintptr_t);
-    bound = OPAL_ALIGN((uintptr_t) remote_ptr + size, attach_align, uintptr_t);
+    bound = OPAL_ALIGN((uintptr_t) remote_ptr + size, attach_align, uintptr_t) - 1;
     if (OPAL_UNLIKELY(bound > xpmem_endpoint->address_max)) {
         bound = xpmem_endpoint->address_max;
    }
+    region_size = bound - base + 1;
+
+    printf("user ptr %p size %zu base %p bound %p\n", remote_ptr, size, (void *) base, (void *) bound);
+    printf("search base %p len 0x%zx\n", (void *) base, region_size);
+
+    rc = mca_rcache_base_vma_find(vma_module, (void *) base, region_size, &reg);
+    assert(OPAL_SUCCESS == rc);
 
-    rc = mca_rcache_base_vma_find(vma_module, (void *) base, bound - base, &reg);
-    assert(rc != OPAL_SUCCESS);
-
     // TODO Add rcache stats?
-
-    /* For a number of operations here, while support for multiple threads is
-     * existent, might not produce the fully ideal result. Looks like this can't
-     * be fully resolved without respective support from the regcache tree.
-     * TODO finish comment. is it accurate?*/
-
+
     // TODO what if reg is deleted between finding it and atomically fetching the
-    // ref count? Or will the tree block? And this could also happen inside the
-    // tree's code.
-
+    // ref count? Or will the tree block? (this could also happen inside the tree's code)
+
     if (reg) {
+        printf("region match %p-%p\n", (void *) reg->base, (void *) reg->bound);
+
         int32_t old_ref_count = opal_atomic_fetch_add_32(&reg->ref_count, 1);
         if (0 == old_ref_count) {
             /* Registration is being deleted by another thread
              * in mca_smsc_xpmem_unmap_peer_region, ignore it. */
             reg = NULL;
         }
-
+
         // TODO what if two threads increment the ref counter while a third one is
         // deleting it? One of the increment-threads will see 1 as the old value
         // and go ahead with using the registration, while the writer will delete it!
-
+
+        // Do we ultimately have to do something like this?
+
         // int32_t ref_count = opal_atomic_load_32(&reg->ref_count);
-
-        // while(1) {
+
+        // while(true) {
         //     if(0 == ref_count) {
         //         reg = NULL;
         //         break;
         //     }
-
+
         //     if(opal_atomic_compare_exchange_strong_32(
         //         &reg->ref_count, &ref_count, ref_count + 1)) {
         //         break;
         //     }
         // }
-
     } else {
+        printf("no region match\n");
+
         /* If there is a registration that overlaps with the requested range, but
-         * does not fully cover it, we destroy it and make a new one in its place
-         * to covers both the previous range and the new requested one. */
-
-        rc = mca_rcache_base_vma_find(vma_module, (void *) base, 1, &reg);
-        assert(rc != OPAL_SUCCESS);
-
-        // TODO is this correct?
-        // TODO check for hang. Only with non-debug?
-
-        if (NULL == reg) {
-            rc = mca_rcache_base_vma_find(vma_module, (void *) (bound + 1), 1, &reg);
-            assert(rc != OPAL_SUCCESS);
-        }
-
+         * does not fully cover it, we destroy it and make in its place a new one
+         * that covers both the existing and the new range. */
+
+        // uintptr_t search_begin[4] = {base, bound, base - 1, bound + 1};
+        uintptr_t search_begin[2] = {base, bound};
+        for (size_t i = 0; i < 2; i++) {
+            printf("search overlapping %p-%p\n",
+                   (void *) search_begin[i], (void *) (search_begin[i] + 1));
+
+            rc = mca_rcache_base_vma_find(vma_module, (void *) search_begin[i], 1, &reg);
+            assert(OPAL_SUCCESS == rc);
+
+            if (reg) {
+                break;
+            }
+        }
+
         if (reg) {
+            printf("found overlapping\n");
+
             /* Set the invalid flag, to mark the deletion of this registration
              * (will take place in unmap_peer_region). If another thread has
              * already marked deletion, ignore. */
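
Note on the new inclusive-bound convention: a worked example of the alignment
arithmetic above, assuming a 4 MiB attach_align (the real value comes from the
component's log_attach_align parameter; all numbers here are illustrative):

    uintptr_t remote = 0x1234567;   /* user pointer */
    size_t    size   = 0x100;
    uintptr_t align  = 0x400000;    /* assumed 4 MiB attach_align */

    uintptr_t base  = remote & ~(align - 1);                            /* 0x1000000 */
    uintptr_t bound = ((remote + size + align - 1) & ~(align - 1)) - 1; /* 0x13fffff */
    size_t region_size = bound - base + 1;                              /* 0x400000 */

Under the old convention bound would have been 0x1400000 (one past the end);
making it inclusive matches how reg->base/reg->bound are compared and makes the
address_max clamp exact, at the cost of the explicit +1/-1 adjustments.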
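
The TODOs above describe a window where a reader increments ref_count after a
deleter has already driven it to zero. The commented-out loop sketches a
compare-and-swap fix; a compilable version of that sketch follows, using the
opal_atomic_* calls the comment already references (the helper name is
illustrative, not part of this commit):

    /* Take a reference only if the registration is still live. */
    static inline bool smsc_xpmem_reg_tryref(mca_rcache_base_registration_t *reg)
    {
        int32_t ref_count = opal_atomic_load_32(&reg->ref_count);

        while (true) {
            if (0 == ref_count) {
                /* A concurrent unmap_peer_region already won; treat as a miss. */
                return false;
            }

            /* On failure the CAS reloads ref_count with the value seen in
             * memory, so the loop re-checks for zero before retrying. */
            if (opal_atomic_compare_exchange_strong_32(&reg->ref_count, &ref_count,
                                                       ref_count + 1)) {
                return true;
            }
        }
    }

Unlike the unconditional fetch_add, this never resurrects a registration once
any thread has observed it at zero, which closes the race in the second TODO.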
@@ -150,31 +159,38 @@ void *mca_smsc_xpmem_map_peer_region(mca_smsc_endpoint_t *endpoint, uint64_t flags,
                (volatile opal_atomic_int32_t *) &reg->flags, MCA_RCACHE_FLAGS_INVALID);
 
             if (!(old_flags & MCA_RCACHE_FLAGS_INVALID)) {
+                printf("handling merge\n");
+
                 base = opal_min(base, (uintptr_t) reg->base);
                 bound = opal_max(bound, (uintptr_t) reg->bound);
+                region_size = bound - base + 1;
 
-                /* We did not increment the ref count when we found the registration.
-                 * When PERSIST is set, a superfluous ref is present, so no need to do
-                 * anything. If not, we must increment the ref counter before calling
-                 * unmap_peer_region (which will decrement it), to avoid it going negative. */
+                /* unmap_peer_region will decrement the ref count, but we did not
+                 * increment it when we found the reg. If PERSIST was set, a
+                 * superfluous ref is present, so all is fine. If not, we must
+                 * manually adjust before calling unmap_peer_region, to avoid
+                 * deallocation while someone is still using the reg. */
                 if (!(MCA_RCACHE_FLAGS_PERSIST & reg->flags))
                     opal_atomic_add(&reg->ref_count, 1);
 
+                printf("set invalid, ref count before unmap call %d\n", (int) reg->ref_count);
+
                 mca_smsc_xpmem_unmap_peer_region(reg);
             }
-
+
             reg = NULL;
-        }
+        } else
+            printf("no overlapping\n");
     }
-
+
     if (NULL == reg) {
         reg = OBJ_NEW(mca_rcache_base_registration_t);
         if (OPAL_UNLIKELY(NULL == reg)) {
             return NULL;
         }
 
         reg->ref_count = ((flags & MCA_RCACHE_FLAGS_PERSIST)
-            && !(flags & MCA_RCACHE_FLAGS_CACHE_BYPASS) ? 2 : 1);
+                          && !(flags & MCA_RCACHE_FLAGS_CACHE_BYPASS) ? 2 : 1);
         reg->flags = flags;
         reg->base = (unsigned char *) base;
         reg->bound = (unsigned char *) bound;
@@ -192,24 +208,39 @@ void *mca_smsc_xpmem_map_peer_region(mca_smsc_endpoint_t *endpoint, uint64_t flags,
192208 "for endpoint %p address range %p-%p" ,
193209 (void * ) endpoint , reg -> base , reg -> bound );
194210
195- reg -> rcache_context = xpmem_attach (xpmem_addr , bound - base , NULL );
211+ reg -> rcache_context = xpmem_attach (xpmem_addr , region_size , NULL );
212+ printf ("xpmem attach(%p, 0x%lx) -> %p\n" , base , region_size , reg -> rcache_context );
213+
196214 if (OPAL_UNLIKELY ((void * ) -1 == reg -> rcache_context )) {
215+ uintptr_t old_bound = bound ;
216+
197217 /* retry with the page as upper bound */
198- bound = OPAL_ALIGN ((uintptr_t ) remote_ptr + size , opal_getpagesize (), uintptr_t );
218+ bound = OPAL_ALIGN ((uintptr_t ) remote_ptr + size , opal_getpagesize (), uintptr_t ) - 1 ;
199219 reg -> bound = (unsigned char * ) bound ;
200- reg -> rcache_context = xpmem_attach (xpmem_addr , bound - base , NULL );
220+ region_size = bound - base + 1 ;
221+
222+ opal_output_verbose (MCA_BASE_VERBOSE_INFO , opal_smsc_base_framework .framework_output ,
223+ "mca_smsc_xpmem_map_peer_region: region mapping "
224+ "for endpoint %p address range %p-%p failed. "
225+ "retrying with range %p-%p" ,
226+ (void * ) endpoint , reg -> base , (void * ) old_bound ,
227+ reg -> base , reg -> bound );
228+
229+ reg -> rcache_context = xpmem_attach (xpmem_addr , region_size , NULL );
201230 if (OPAL_UNLIKELY ((void * ) -1 == reg -> rcache_context )) {
202231 OBJ_RELEASE (reg );
203232 return NULL ;
204233 }
205234 }
206235
207- opal_memchecker_base_mem_defined (reg -> rcache_context , bound - base );
208-
236+ printf ("new reg %p-%p ref count %d\n" , reg -> base , reg -> bound , reg -> ref_count );
237+
238+ opal_memchecker_base_mem_defined (reg -> rcache_context , region_size );
239+
209240 if (!(reg -> flags & MCA_RCACHE_FLAGS_CACHE_BYPASS )) {
210241 rc = mca_rcache_base_vma_insert (vma_module , reg , 0 );
211242 assert (OPAL_SUCCESS == rc );
212-
243+
213244 if (OPAL_SUCCESS != rc ) {
214245 reg -> flags |= MCA_RCACHE_FLAGS_CACHE_BYPASS ;
215246 }
@@ -231,6 +262,8 @@ void mca_smsc_xpmem_unmap_peer_region(void *ctx)
 
     ref_count = opal_atomic_add_fetch_32(&reg->ref_count, -1);
     if (OPAL_UNLIKELY(0 == ref_count)) {
+        printf("UNMAP reg %p-%p\n", (void *) reg->base, (void *) reg->bound);
+
         opal_output_verbose(MCA_BASE_VERBOSE_INFO, opal_smsc_base_framework.framework_output,
                             "mca_smsc_xpmem_unmap_peer_region: deleting region mapping for "
                             "endpoint %p address range %p-%p",
@@ -244,7 +277,7 @@ void mca_smsc_xpmem_unmap_peer_region(void *ctx)
 #endif
     }
 
-    opal_memchecker_base_mem_noaccess(reg->rcache_context, (uintptr_t)(reg->bound - reg->base));
+    opal_memchecker_base_mem_noaccess(reg->rcache_context, (uintptr_t)(reg->bound - reg->base + 1));
     (void) xpmem_detach(reg->rcache_context);
 
     OBJ_RELEASE(reg);
@@ -253,6 +286,12 @@ void mca_smsc_xpmem_unmap_peer_region(void *ctx)
 
 static int mca_smsc_xpmem_endpoint_rcache_cleanup(mca_rcache_base_registration_t *reg, void *ctx)
 {
+    /* See the respective comment in mca_smsc_xpmem_map_peer_region */
+    if (!(MCA_RCACHE_FLAGS_PERSIST & reg->flags))
+        opal_atomic_add(&reg->ref_count, 1);
+
+    printf("cleanup reg %p-%p count %d\n", (void *) reg->base, (void *) reg->bound, (int) reg->ref_count);
+
     mca_smsc_xpmem_unmap_peer_region(reg);
     return OPAL_SUCCESS;
 }
@@ -284,7 +323,7 @@ void mca_smsc_xpmem_return_endpoint(mca_smsc_endpoint_t *endpoint)
 }
 
 /* memcpy is faster at larger sizes but is undefined if the
-   pointers are aliased (TODO -- readd alias check) */
+   pointers are aliased (TODO -- re-add alias check) */
 static inline void mca_smsc_xpmem_memmove(void *dst, void *src, size_t size)
 {
     while (size > 0) {
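
A minimal sketch of the alias check the TODO refers to, under the usual
half-open interval definition (the helper is illustrative, not part of this
commit):

    #include <stdbool.h>
    #include <stdint.h>

    /* True when [src, src + size) and [dst, dst + size) overlap, in which
     * case plain memcpy is undefined and the chunked copy must be kept. */
    static inline bool regions_alias(const void *dst, const void *src, size_t size)
    {
        uintptr_t d = (uintptr_t) dst, s = (uintptr_t) src;
        return (d < s + size) && (s < d + size);
    }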