316 ire_list = ire_unlink(irb);
317 rw_exit(&irb->irb_lock);
318 ASSERT(ire_list != NULL);
319 ire_cleanup(ire_list);
320 } else {
321 rw_exit(&irb->irb_lock);
322 }
323 }
324 }
325
326
327 /*
328 * Bump up the reference count on the IRE.  We cannot assert that the
329 * bucket lock is being held, as it is legal to bump up the reference
330 * count after the first lookup has returned the IRE without
331 * holding the lock.
332 */
333 void
334 ire_refhold(ire_t *ire)
335 {
336 atomic_inc_32(&(ire)->ire_refcnt);
337 ASSERT((ire)->ire_refcnt != 0);
338 #ifdef DEBUG
339 ire_trace_ref(ire);
340 #endif
341 }
342
343 void
344 ire_refhold_notr(ire_t *ire)
345 {
346 atomic_inc_32(&(ire)->ire_refcnt);
347 ASSERT((ire)->ire_refcnt != 0);
348 }
349
350 void
351 ire_refhold_locked(ire_t *ire)
352 {
353 #ifdef DEBUG
354 ire_trace_ref(ire);
355 #endif
356 ire->ire_refcnt++;
357 }
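
/*
 * Illustrative sketch (not part of the original file): the usage that the
 * comment above ire_refhold() describes.  A lookup returns the IRE already
 * held, so the caller may take additional holds with ire_refhold() without
 * holding the bucket lock, as long as each hold is balanced by an
 * ire_refrele().  The guard macro and helper are hypothetical; the lookup
 * itself is elided.
 */
#ifdef IRE_REFCNT_EXAMPLE	/* sketch only; never compiled */
static void
ire_hold_example(ire_t *ire)	/* ire: held by the lookup that found it */
{
	ire_refhold(ire);	/* extra hold; no irb_lock required */
	/* ... hand the extra reference to another consumer ... */
	ire_refrele(ire);	/* the consumer drops its hold when done */
	ire_refrele(ire);	/* drop the lookup's original hold */
}
#endif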
358
359 /*
360 * Release a ref on an IRE.
361 *
362 * Must not be called while holding any locks.  Otherwise, if this is
363 * the last reference to be released, there is a chance of a recursive
364 * mutex panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip
365 * trying to restart an ioctl.  The one exception is when the caller is
366 * sure that this is not the last reference to be released, e.g., if the
367 * caller is sure that the ire has not been deleted and won't be deleted.
368 *
369 * On architectures such as sun4u, where atomic_dec_32_nv is just
370 * a cas, we need to maintain the same memory barrier semantics as
371 * mutex_exit, i.e., all loads and stores should complete before the
372 * cas is executed.  membar_exit() does that here.
373 */
374 void
375 ire_refrele(ire_t *ire)
376 {
377 #ifdef DEBUG
378 ire_untrace_ref(ire);
379 #endif
380 ASSERT((ire)->ire_refcnt != 0);
381 membar_exit();
382 if (atomic_dec_32_nv(&(ire)->ire_refcnt) == 0)
383 ire_inactive(ire);
384 }
385
386 void
387 ire_refrele_notr(ire_t *ire)
388 {
389 ASSERT((ire)->ire_refcnt != 0);
390 membar_exit();
391 if (atomic_dec_32_nv(&(ire)->ire_refcnt) == 0)
392 ire_inactive(ire);
393 }
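
/*
 * Illustrative sketch (not part of the original file): the caller-side
 * rule from the block comment above ire_refrele().  Drop any locks before
 * a potentially-final release, since the teardown path (ire_refrele ->
 * ipif_ill_refrele_tail -> qwriter_ip) may try to restart an ioctl.  The
 * guard macro and the lock argument are hypothetical.
 */
#ifdef IRE_REFCNT_EXAMPLE	/* sketch only; never compiled */
static void
ire_refrele_caller_example(ire_t *ire, kmutex_t *lockp)
{
	mutex_enter(lockp);
	/* ... use the ire; the hold keeps it from going away ... */
	mutex_exit(lockp);	/* drop locks first ... */
	ire_refrele(ire);	/* ... then release; this may be the last ref */
}
#endif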
394
395 /*
396 * This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY]
397 * IOCTL[s]. The NO_REPLY form is used by TCP to tell IP that it is
398 * having problems reaching a particular destination.
399 * This will make IP consider alternate routes (e.g., when there are
400 * multiple default routes), and it will also make IP discard any (potentially)
401 * stale redirect.
402 * Management processes may want to use the version that generates a reply.
403 *
404 * With the use of NUD-like behavior for IPv4/ARP in addition to IPv6,
405 * this function shouldn't be necessary for IP to recover from a bad redirect,
406 * a bad default router (when there are multiple default routers), or
407 * a stale ND/ARP entry. But we retain it in any case.
408 * For instance, this is helpful when TCP suspects a failure before NUD does.
409 */
410 int
411 ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr)
1255 * in the gateway security attributes; such routes are
1256 * considered duplicates.
1257 * To change that we explicitly have to treat them as
1258 * different here.
1259 */
1260 if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask,
1261 ire->ire_gateway_addr, ire->ire_type, ire->ire_ill,
1262 ire->ire_zoneid, NULL, match_flags)) {
1263 /*
1264 * Return the old ire after doing a REFHOLD.
1265 * As most of the callers continue to use the IRE
1266 * after adding, we return a held ire. This will
1267 * avoid another lookup in the caller.  If the callers
1268 * don't want to use it, they need to do a REFRELE.
1269 *
1270 * We only allow exactly one IRE_IF_CLONE for any dst, so,
1271 * if this is an IF_CLONE, return the ire without bumping
1272 * identical_ref, but with an ire_ref held.
1273 */
1274 if (ire->ire_type != IRE_IF_CLONE) {
1275 atomic_inc_32(&ire1->ire_identical_ref);
1276 DTRACE_PROBE2(ire__add__exist, ire_t *, ire1,
1277 ire_t *, ire);
1278 }
1279 ire_refhold(ire1);
1280 ire_atomic_end(irb_ptr, ire);
1281 ire_delete(ire);
1282 irb_refrele(irb_ptr);
1283 return (ire1);
1284 }
1285 }
1286
1287 /*
1288 * Normally we do head insertion since most things do not care about
1289 * the order of the IREs in the bucket. Note that ip_cgtp_bcast_add
1290 * assumes we at least do head insertion so that its IRE_BROADCAST
1291 * entries arrive ahead of any existing IRE_HOST for the same address.
1292 * However, due to shared-IP zones (and restrict_interzone_loopback)
1293 * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same
1294 * address. For that reason we do tail insertion for IRE_IF_CLONE.
1295 * Due to the IRE_BROADCAST on cgtp0, which must be last in the bucket,
1516 parent->ire_ib_pkt_count += ire->ire_ib_pkt_count;
1517 ire->ire_ob_pkt_count = 0;
1518 ire->ire_ib_pkt_count = 0;
1519 }
1520 rw_exit(&ipst->ips_ire_dep_lock);
1521 }
1522
1523 rw_enter(&irb->irb_lock, RW_WRITER);
1524 if (ire->ire_ptpn == NULL) {
1525 /*
1526 * Some other thread has removed us from the list.
1527 * It should have done the REFRELE for us.
1528 */
1529 rw_exit(&irb->irb_lock);
1530 return;
1531 }
1532
1533 if (!IRE_IS_CONDEMNED(ire)) {
1534 /* Is this an IRE representing multiple duplicate entries? */
1535 ASSERT(ire->ire_identical_ref >= 1);
1536 if (atomic_dec_32_nv(&ire->ire_identical_ref) != 0) {
1537 /* Removed one of the identical parties */
1538 rw_exit(&irb->irb_lock);
1539 return;
1540 }
1541
1542 irb->irb_ire_cnt--;
1543 ire_make_condemned(ire);
1544 }
1545
1546 if (irb->irb_refcnt != 0) {
1547 /*
1548 * The last thread to leave this bucket will
1549 * delete this ire.
1550 */
1551 irb->irb_marks |= IRB_MARK_CONDEMNED;
1552 rw_exit(&irb->irb_lock);
1553 return;
1554 }
1555
1556 /*
2601 return (ire_nce_init(ill, addr6, ire_type));
2602 }
2603
2604 /*
2605 * The caller should hold irb_lock as a writer if the ire is in a bucket.
2606 * This routine clears ire_nce_cache, and we ensure that ire_nce_cache
2607 * can never be set again once the ire is marked condemned.
2608 */
2609 void
2610 ire_make_condemned(ire_t *ire)
2611 {
2612 ip_stack_t *ipst = ire->ire_ipst;
2613 nce_t *nce;
2614
2615 mutex_enter(&ire->ire_lock);
2616 ASSERT(ire->ire_bucket == NULL ||
2617 RW_WRITE_HELD(&ire->ire_bucket->irb_lock));
2618 ASSERT(!IRE_IS_CONDEMNED(ire));
2619 ire->ire_generation = IRE_GENERATION_CONDEMNED;
2620 /* Count how many condemned ires for kmem_cache callback */
2621 atomic_inc_32(&ipst->ips_num_ire_condemned);
2622 nce = ire->ire_nce_cache;
2623 ire->ire_nce_cache = NULL;
2624 mutex_exit(&ire->ire_lock);
2625 if (nce != NULL)
2626 nce_refrele(nce);
2627 }
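
/*
 * Illustrative sketch (not part of the original file): how a setter
 * honors the rule above that ire_nce_cache can never be set once the
 * ire is condemned -- test IRE_IS_CONDEMNED under ire_lock before
 * caching, mirroring the lock/clear sequence in ire_make_condemned().
 * The guard macro and helper are hypothetical.
 */
#ifdef IRE_REFCNT_EXAMPLE	/* sketch only; never compiled */
static void
ire_nce_cache_set_example(ire_t *ire, nce_t *nce)
{
	mutex_enter(&ire->ire_lock);
	if (!IRE_IS_CONDEMNED(ire) && ire->ire_nce_cache == NULL) {
		nce_refhold(nce);	/* the cache keeps its own hold */
		ire->ire_nce_cache = nce;
	}
	mutex_exit(&ire->ire_lock);
}
#endif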
2628
2629 /*
2630 * Increment the generation, avoiding the special condemned value.
2631 */
2632 void
2633 ire_increment_generation(ire_t *ire)
2634 {
2635 uint_t generation;
2636
2637 mutex_enter(&ire->ire_lock);
2638 /*
2639 * Even though the caller has a hold, it can't prevent a concurrent
2640 * ire_delete from marking the IRE condemned.
2641 */