/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/inttypes.h>
#include <sys/cmn_err.h>
#include <sys/time.h>
#include <sys/ksynch.h>
#include <sys/systm.h>
#include <sys/kcpc.h>
#include <sys/cpc_impl.h>
#include <sys/cpc_pcbe.h>
#include <sys/atomic.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/sdt.h>
#include <sys/archsystm.h>
#include <sys/promif.h>
#include <sys/x_call.h>
#include <sys/cap_util.h>
#if defined(__x86)
#include <asm/clock.h>
#include <sys/xc_levels.h>
#endif

static kmutex_t kcpc_ctx_llock[CPC_HASH_BUCKETS];	/* protects ctx_list */
static kcpc_ctx_t *kcpc_ctx_list[CPC_HASH_BUCKETS];	/* head of list */


krwlock_t	kcpc_cpuctx_lock;	/* lock for 'kcpc_cpuctx' below */
int		kcpc_cpuctx;		/* number of cpu-specific contexts */

int kcpc_counts_include_idle = 1; /* Project Private /etc/system variable */

/*
 * These are set when a PCBE module is loaded.
 */
uint_t		cpc_ncounters = 0;
pcbe_ops_t	*pcbe_ops = NULL;

/*
 * Statistics on (mis)behavior
 */
static uint32_t kcpc_intrctx_count;    /* # overflows in an interrupt handler */
static uint32_t kcpc_nullctx_count;    /* # overflows in a thread with no ctx */
/*
 * If 'kcpc_nullctx_panic' is set to 1, any overflow interrupt in a thread
 * with no valid context will result in a panic.
 */
static int kcpc_nullctx_panic = 0;

static void kcpc_lwp_create(kthread_t *t, kthread_t *ct);
static void kcpc_restore(kcpc_ctx_t *ctx);
static void kcpc_save(kcpc_ctx_t *ctx);
static void kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx);
static int kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch);
static kcpc_set_t *kcpc_dup_set(kcpc_set_t *set);
static kcpc_set_t *kcpc_set_create(kcpc_request_t *reqs, int nreqs,
    int set_flags, int kmem_flags);

/*
 * Macros to manipulate context flags. All flag updates should use one of
 * these two macros.
 *
 * Flags should always be updated atomically, since some of the updates are
 * not protected by locks.
 */
#define	KCPC_CTX_FLAG_SET(ctx, flag) atomic_or_uint(&(ctx)->kc_flags, (flag))
#define	KCPC_CTX_FLAG_CLR(ctx, flag) atomic_and_uint(&(ctx)->kc_flags, ~(flag))
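
/*
 * For example, a consumer that wants to suspend counting for a context (as
 * kcpc_enable() does for CPC_DISABLE below) marks it frozen with:
 *
 *	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
 */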

/*
 * The IS_HIPIL() macro verifies that the code is executed either from a
 * cross-call or from a high-PIL interrupt.
 */
#ifdef DEBUG
#define	IS_HIPIL() (getpil() >= XCALL_PIL)
#else
#define	IS_HIPIL()
#endif	/* DEBUG */


extern int kcpc_hw_load_pcbe(void);

/*
 * Return value from kcpc_hw_load_pcbe()
 */
static int kcpc_pcbe_error = 0;

/*
 * Perform one-time initialization of kcpc framework.
 * This function performs the initialization only the first time it is called.
 * It is safe to call it multiple times.
 */
int
kcpc_init(void)
{
	long hash;
	static uint32_t kcpc_initialized = 0;

	/*
	 * We already tried to load the platform PCBE module and failed.
	 */
	if (kcpc_pcbe_error != 0)
		return (-1);

	/*
	 * The kcpc framework should be initialized at most once
	 */
	if (atomic_cas_32(&kcpc_initialized, 0, 1) != 0)
		return (0);

	rw_init(&kcpc_cpuctx_lock, NULL, RW_DEFAULT, NULL);
	for (hash = 0; hash < CPC_HASH_BUCKETS; hash++)
		mutex_init(&kcpc_ctx_llock[hash],
		    NULL, MUTEX_DRIVER, (void *)(uintptr_t)15);

	/*
	 * Load platform-specific pcbe module
	 */
	kcpc_pcbe_error = kcpc_hw_load_pcbe();

	return (kcpc_pcbe_error == 0 ? 0 : -1);
}

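/*
 * Called by a PCBE (Performance Counter BackEnd) module when it loads:
 * record its ops vector and the number of counters it provides.
 */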
void
kcpc_register_pcbe(pcbe_ops_t *ops)
{
	pcbe_ops = ops;
	cpc_ncounters = pcbe_ops->pcbe_ncounters();
}

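/*
 * Register (and, below, unregister) the DTrace cpc provider's overflow
 * handler; kcpc_hw_overflow_intr() forwards overflow bitmaps to it while
 * dtrace_cpc_in_use is set.
 */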
void
kcpc_register_dcpc(void (*func)(uint64_t))
{
	dtrace_cpc_fire = func;
}

void
kcpc_unregister_dcpc(void)
{
	dtrace_cpc_fire = NULL;
}

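/*
 * Bind a request set to the counters of the given CPU. The calling thread
 * must be bound to that CPU; EAGAIN is returned if it isn't, if the CPU has
 * been DR'd out, or if the CPU's counters are already in use.
 */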
int
kcpc_bind_cpu(kcpc_set_t *set, processorid_t cpuid, int *subcode)
{
	cpu_t		*cp;
	kcpc_ctx_t	*ctx;
	int		error;
	int		save_spl;

	ctx = kcpc_ctx_alloc(KM_SLEEP);

	if (kcpc_assign_reqs(set, ctx) != 0) {
		kcpc_ctx_free(ctx);
		*subcode = CPC_RESOURCE_UNAVAIL;
		return (EINVAL);
	}

	ctx->kc_cpuid = cpuid;
	ctx->kc_thread = curthread;

	set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);

	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
		kcpc_ctx_free(ctx);
		return (error);
	}

	set->ks_ctx = ctx;
	ctx->kc_set = set;

	/*
	 * We must hold cpu_lock to prevent DR, offlining, or unbinding while
	 * we are manipulating the cpu_t and programming the hardware;
	 * otherwise the cpu_t could go away while we're looking at it.
	 */
	mutex_enter(&cpu_lock);
	cp = cpu_get(cpuid);

	if (cp == NULL)
		/*
		 * The CPU could have been DRd out while we were getting set up.
		 */
		goto unbound;

	mutex_enter(&cp->cpu_cpc_ctxlock);
	kpreempt_disable();
	save_spl = spl_xcall();

	/*
	 * Check whether this CPU's counters are already in use by someone
	 * other than the kernel's capacity-and-utilization framework (the
	 * kernel will let go of its counters for the user in kcpc_program()
	 * below).
	 */
	if (cp->cpu_cpc_ctx != NULL && !CU_CPC_ON(cp)) {
		/*
		 * If this CPU already has a bound set, return an error.
		 */
		splx(save_spl);
		kpreempt_enable();
		mutex_exit(&cp->cpu_cpc_ctxlock);
		goto unbound;
	}

	if (curthread->t_bind_cpu != cpuid) {
		splx(save_spl);
		kpreempt_enable();
		mutex_exit(&cp->cpu_cpc_ctxlock);
		goto unbound;
	}

	kcpc_program(ctx, B_FALSE, B_TRUE);

	splx(save_spl);
	kpreempt_enable();

	mutex_exit(&cp->cpu_cpc_ctxlock);
	mutex_exit(&cpu_lock);

	mutex_enter(&set->ks_lock);
	set->ks_state |= KCPC_SET_BOUND;
	cv_signal(&set->ks_condv);
	mutex_exit(&set->ks_lock);

	return (0);

unbound:
	mutex_exit(&cpu_lock);
	set->ks_ctx = NULL;
	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
	kcpc_ctx_free(ctx);
	return (EAGAIN);
}

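/*
 * Bind a request set to an LWP. The context begins life frozen and is
 * thawed once it is safe to program the hardware: here, if t is curthread,
 * or via kcpc_restore() when the stopped victim LWP next runs.
 */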
int
kcpc_bind_thread(kcpc_set_t *set, kthread_t *t, int *subcode)
{
	kcpc_ctx_t	*ctx;
	int		error;

	/*
	 * Only one set is allowed per context, so ensure there is no
	 * existing context.
	 */

	if (t->t_cpc_ctx != NULL)
		return (EEXIST);

	ctx = kcpc_ctx_alloc(KM_SLEEP);

	/*
	 * The context must begin life frozen until it has been properly
	 * programmed onto the hardware. This prevents the context ops from
	 * worrying about it until we're ready.
	 */
	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
	ctx->kc_hrtime = gethrtime();

	if (kcpc_assign_reqs(set, ctx) != 0) {
		kcpc_ctx_free(ctx);
		*subcode = CPC_RESOURCE_UNAVAIL;
		return (EINVAL);
	}

	ctx->kc_cpuid = -1;
	if (set->ks_flags & CPC_BIND_LWP_INHERIT)
		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_LWPINHERIT);
	ctx->kc_thread = t;
	t->t_cpc_ctx = ctx;
	/*
	 * Permit threads to look at their own hardware counters from userland.
	 */
	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_NONPRIV);

	/*
	 * Create the data store for this set.
	 */
	set->ks_data = kmem_alloc(set->ks_nreqs * sizeof (uint64_t), KM_SLEEP);

	if ((error = kcpc_configure_reqs(ctx, set, subcode)) != 0) {
		kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
		kcpc_ctx_free(ctx);
		t->t_cpc_ctx = NULL;
		return (error);
	}

	set->ks_ctx = ctx;
	ctx->kc_set = set;

	/*
	 * Add a device context to the subject thread.
	 */
	installctx(t, ctx, kcpc_save, kcpc_restore, NULL,
	    kcpc_lwp_create, NULL, kcpc_free);

	/*
	 * Ask the backend to program the hardware.
	 */
	if (t == curthread) {
		int save_spl;

		kpreempt_disable();
		save_spl = spl_xcall();
		kcpc_program(ctx, B_TRUE, B_TRUE);
		splx(save_spl);
		kpreempt_enable();
	} else {
		/*
		 * Since we are the agent LWP, we know the victim LWP is stopped
		 * until we're done here; no need to worry about preemption or
		 * migration here. We still use an atomic op to clear the flag
		 * to ensure the flags are always self-consistent; they can
		 * still be accessed from, for instance, another CPU doing a
		 * kcpc_invalidate_all().
		 */
		KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
	}

	mutex_enter(&set->ks_lock);
	set->ks_state |= KCPC_SET_BOUND;
	cv_signal(&set->ks_condv);
	mutex_exit(&set->ks_lock);

	return (0);
}

/*
 * Walk through each request in the set and ask the PCBE to configure a
 * corresponding counter.
 */
int
kcpc_configure_reqs(kcpc_ctx_t *ctx, kcpc_set_t *set, int *subcode)
{
	int		i;
	int		ret;
	kcpc_request_t	*rp;

	for (i = 0; i < set->ks_nreqs; i++) {
		int n;
		rp = &set->ks_req[i];

		n = rp->kr_picnum;

		ASSERT(n >= 0 && n < cpc_ncounters);

		ASSERT(ctx->kc_pics[n].kp_req == NULL);

		if (rp->kr_flags & CPC_OVF_NOTIFY_EMT) {
			if ((pcbe_ops->pcbe_caps & CPC_CAP_OVERFLOW_INTERRUPT)
			    == 0) {
				*subcode = -1;
				return (ENOTSUP);
			}
			/*
			 * If any of the counters have requested overflow
			 * notification, we flag the context as being one that
			 * cares about overflow.
			 */
			KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_SIGOVF);
		}

		rp->kr_config = NULL;
		if ((ret = pcbe_ops->pcbe_configure(n, rp->kr_event,
		    rp->kr_preset, rp->kr_flags, rp->kr_nattrs, rp->kr_attr,
		    &(rp->kr_config), (void *)ctx)) != 0) {
			kcpc_free_configs(set);
			*subcode = ret;
			switch (ret) {
			case CPC_ATTR_REQUIRES_PRIVILEGE:
			case CPC_HV_NO_ACCESS:
				return (EACCES);
			default:
				return (EINVAL);
			}
		}

		ctx->kc_pics[n].kp_req = rp;
		rp->kr_picp = &ctx->kc_pics[n];
		rp->kr_data = set->ks_data + rp->kr_index;
		*rp->kr_data = rp->kr_preset;
	}

	return (0);
}

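/*
 * Release any PCBE configurations that have been created for this set's
 * requests.
 */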
void
kcpc_free_configs(kcpc_set_t *set)
{
	int i;

	for (i = 0; i < set->ks_nreqs; i++)
		if (set->ks_req[i].kr_config != NULL)
			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
}

/*
 * buf points to a user address and the data should be copied out to that
 * address in the current process.
 */
int
kcpc_sample(kcpc_set_t *set, uint64_t *buf, hrtime_t *hrtime, uint64_t *tick)
{
	kcpc_ctx_t	*ctx = set->ks_ctx;
	int		save_spl;

	mutex_enter(&set->ks_lock);
	if ((set->ks_state & KCPC_SET_BOUND) == 0) {
		mutex_exit(&set->ks_lock);
		return (EINVAL);
	}
	mutex_exit(&set->ks_lock);

	/*
	 * Kernel preemption must be disabled while reading the hardware regs,
	 * and if this is a CPU-bound context, while checking the CPU binding
	 * of the current thread.
	 */
	kpreempt_disable();
	save_spl = spl_xcall();

	if (ctx->kc_flags & KCPC_CTX_INVALID) {
		splx(save_spl);
		kpreempt_enable();
		return (EAGAIN);
	}

	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0) {
		if (ctx->kc_cpuid != -1) {
			if (curthread->t_bind_cpu != ctx->kc_cpuid) {
				splx(save_spl);
				kpreempt_enable();
				return (EAGAIN);
			}
		}

		if (ctx->kc_thread == curthread) {
			uint64_t curtick = KCPC_GET_TICK();

			ctx->kc_hrtime = gethrtime_waitfree();
			pcbe_ops->pcbe_sample(ctx);
			ctx->kc_vtick += curtick - ctx->kc_rawtick;
			ctx->kc_rawtick = curtick;
		}

		/*
		 * The config may have been invalidated by
		 * the pcbe_sample op.
		 */
		if (ctx->kc_flags & KCPC_CTX_INVALID) {
			splx(save_spl);
			kpreempt_enable();
			return (EAGAIN);
		}

	}

	splx(save_spl);
	kpreempt_enable();

	if (copyout(set->ks_data, buf,
	    set->ks_nreqs * sizeof (uint64_t)) == -1)
		return (EFAULT);
	if (copyout(&ctx->kc_hrtime, hrtime, sizeof (uint64_t)) == -1)
		return (EFAULT);
	if (copyout(&ctx->kc_vtick, tick, sizeof (uint64_t)) == -1)
		return (EFAULT);

	return (0);
}

/*
 * Stop the counters on the CPU this context is bound to.
 */
static void
kcpc_stop_hw(kcpc_ctx_t *ctx)
{
	cpu_t *cp;

	kpreempt_disable();

	if (ctx->kc_cpuid == CPU->cpu_id) {
		cp = CPU;
	} else {
		cp = cpu_get(ctx->kc_cpuid);
	}

	ASSERT(cp != NULL && cp->cpu_cpc_ctx == ctx);
	kcpc_cpu_stop(cp, B_FALSE);

	kpreempt_enable();
}

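/*
 * Tear down a bound set: invalidate its context, stop the hardware, and,
 * for a thread-bound set, remove the context ops from the owning thread.
 */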
int
kcpc_unbind(kcpc_set_t *set)
{
	kcpc_ctx_t	*ctx;
	kthread_t	*t;

	/*
	 * We could be racing with the process's agent thread as it
	 * binds the set; we must wait for the set to finish binding
	 * before attempting to tear it down.
	 */
	mutex_enter(&set->ks_lock);
	while ((set->ks_state & KCPC_SET_BOUND) == 0)
		cv_wait(&set->ks_condv, &set->ks_lock);
	mutex_exit(&set->ks_lock);

	ctx = set->ks_ctx;

	/*
	 * Use kc_lock to synchronize with kcpc_restore().
	 */
	mutex_enter(&ctx->kc_lock);
	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
	mutex_exit(&ctx->kc_lock);

	if (ctx->kc_cpuid == -1) {
		t = ctx->kc_thread;
		/*
		 * The context is thread-bound and therefore has a device
		 * context.  It will be freed via removectx() calling
		 * freectx() calling kcpc_free().
		 */
		if (t == curthread) {
			int save_spl;

			kpreempt_disable();
			save_spl = spl_xcall();
			if (!(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED))
				kcpc_unprogram(ctx, B_TRUE);
			splx(save_spl);
			kpreempt_enable();
		}
#ifdef DEBUG
		if (removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
		    kcpc_lwp_create, NULL, kcpc_free) == 0)
			panic("kcpc_unbind: context %p not present on "
			    "thread %p",
			    (void *)ctx, (void *)t);
#else
		(void) removectx(t, ctx, kcpc_save, kcpc_restore, NULL,
		    kcpc_lwp_create, NULL, kcpc_free);
#endif /* DEBUG */
		t->t_cpc_set = NULL;
		t->t_cpc_ctx = NULL;
	} else {
		/*
		 * If we are unbinding a CPU-bound set from a remote CPU, the
		 * native CPU's idle thread could be in the midst of programming
		 * this context onto the CPU. We grab the context's lock here to
		 * ensure that the idle thread is done with it. When we release
		 * the lock, the CPU no longer has a context and the idle thread
		 * will move on.
		 *
		 * cpu_lock must be held to prevent the CPU from being DR'd out
		 * while we disassociate the context from the cpu_t.
		 */
		cpu_t *cp;
		mutex_enter(&cpu_lock);
		cp = cpu_get(ctx->kc_cpuid);
		if (cp != NULL) {
			/*
			 * The CPU may have been DR'd out of the system.
			 */
			mutex_enter(&cp->cpu_cpc_ctxlock);
			if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0)
				kcpc_stop_hw(ctx);
			ASSERT(ctx->kc_flags & KCPC_CTX_INVALID_STOPPED);
			mutex_exit(&cp->cpu_cpc_ctxlock);
		}
		mutex_exit(&cpu_lock);
		if (ctx->kc_thread == curthread) {
			kcpc_free(ctx, 0);
			curthread->t_cpc_set = NULL;
		}
	}

	return (0);
}

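/*
 * Update the preset (starting value) of the request with the given index in
 * a thread-bound set. The new value takes effect the next time the set is
 * restarted via kcpc_restart().
 */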
int
kcpc_preset(kcpc_set_t *set, int index, uint64_t preset)
{
	int i;

	ASSERT(set != NULL);
	ASSERT(set->ks_state & KCPC_SET_BOUND);
	ASSERT(set->ks_ctx->kc_thread == curthread);
	ASSERT(set->ks_ctx->kc_cpuid == -1);

	if (index < 0 || index >= set->ks_nreqs)
		return (EINVAL);

	for (i = 0; i < set->ks_nreqs; i++)
		if (set->ks_req[i].kr_index == index)
			break;
	ASSERT(i != set->ks_nreqs);

	set->ks_req[i].kr_preset = preset;
	return (0);
}

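/*
 * Restart a thread-bound set on the current CPU: reset each request's data
 * to its preset, rebuild its configuration, and reprogram the hardware.
 */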
int
kcpc_restart(kcpc_set_t *set)
{
	kcpc_ctx_t	*ctx = set->ks_ctx;
	int		i;
	int		save_spl;

	ASSERT(set->ks_state & KCPC_SET_BOUND);
	ASSERT(ctx->kc_thread == curthread);
	ASSERT(ctx->kc_cpuid == -1);

	for (i = 0; i < set->ks_nreqs; i++) {
		*(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;
		pcbe_ops->pcbe_configure(0, NULL, set->ks_req[i].kr_preset,
		    0, 0, NULL, &set->ks_req[i].kr_config, NULL);
	}

	kpreempt_disable();
	save_spl = spl_xcall();

	/*
	 * If the user is doing this on a running set, make sure the counters
	 * are stopped first.
	 */
	if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
		pcbe_ops->pcbe_allstop();

	/*
	 * Ask the backend to program the hardware.
	 */
	ctx->kc_rawtick = KCPC_GET_TICK();
	KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
	pcbe_ops->pcbe_program(ctx);
	splx(save_spl);
	kpreempt_enable();

	return (0);
}

/*
 * Caller must hold kcpc_cpuctx_lock.
 */
int
kcpc_enable(kthread_t *t, int cmd, int enable)
{
	kcpc_ctx_t	*ctx = t->t_cpc_ctx;
	kcpc_set_t	*set = t->t_cpc_set;
	kcpc_set_t	*newset;
	int		i;
	int		flag;
	int		err;

	ASSERT(RW_READ_HELD(&kcpc_cpuctx_lock));

	if (ctx == NULL) {
		/*
		 * This thread has a set but no context; it must be a
		 * CPU-bound set.
		 */
		ASSERT(t->t_cpc_set != NULL);
		ASSERT(t->t_cpc_set->ks_ctx->kc_cpuid != -1);
		return (EINVAL);
	} else if (ctx->kc_flags & KCPC_CTX_INVALID)
		return (EAGAIN);

	if (cmd == CPC_ENABLE) {
		if ((ctx->kc_flags & KCPC_CTX_FREEZE) == 0)
			return (EINVAL);
		kpreempt_disable();
		KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
		kcpc_restore(ctx);
		kpreempt_enable();
	} else if (cmd == CPC_DISABLE) {
		if (ctx->kc_flags & KCPC_CTX_FREEZE)
			return (EINVAL);
		kpreempt_disable();
		kcpc_save(ctx);
		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
		kpreempt_enable();
	} else if (cmd == CPC_USR_EVENTS || cmd == CPC_SYS_EVENTS) {
		/*
		 * Strategy for usr/sys: stop counters and update set's presets
		 * with current counter values, unbind, update requests with
		 * new config, then re-bind.
		 */
		flag = (cmd == CPC_USR_EVENTS) ?
		    CPC_COUNT_USER: CPC_COUNT_SYSTEM;

		kpreempt_disable();
		KCPC_CTX_FLAG_SET(ctx,
		    KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
		pcbe_ops->pcbe_allstop();
		kpreempt_enable();

		for (i = 0; i < set->ks_nreqs; i++) {
			set->ks_req[i].kr_preset = *(set->ks_req[i].kr_data);
			if (enable)
				set->ks_req[i].kr_flags |= flag;
			else
				set->ks_req[i].kr_flags &= ~flag;
		}
		newset = kcpc_dup_set(set);
		if (kcpc_unbind(set) != 0)
			return (EINVAL);
		t->t_cpc_set = newset;
		if (kcpc_bind_thread(newset, t, &err) != 0) {
			t->t_cpc_set = NULL;
			kcpc_free_set(newset);
			return (EINVAL);
		}
	} else
		return (EINVAL);

	return (0);
}

/*
 * Provide PCBEs with a way of obtaining the configs of every counter which will
 * be programmed together.
 *
 * If current is NULL, provide the first config.
 *
 * If data != NULL, caller wants to know where the data store associated with
 * the config we return is located.
 */
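/*
 * A PCBE would typically iterate over the configs like this (a sketch, not
 * taken from any particular backend):
 *
 *	uint64_t *data;
 *	void *cfg = NULL;
 *
 *	while ((cfg = kcpc_next_config(token, cfg, &data)) != NULL) {
 *		... program cfg and remember data for later sampling ...
 *	}
 */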
void *
kcpc_next_config(void *token, void *current, uint64_t **data)
{
	int		i;
	kcpc_pic_t	*pic;
	kcpc_ctx_t *ctx = (kcpc_ctx_t *)token;

	if (current == NULL) {
		/*
		 * Client would like the first config, which may not be in
		 * counter 0; we need to search through the counters for the
		 * first config.
		 */
		for (i = 0; i < cpc_ncounters; i++)
			if (ctx->kc_pics[i].kp_req != NULL)
				break;
		/*
		 * There are no counters configured for the given context.
		 */
		if (i == cpc_ncounters)
			return (NULL);
	} else {
		/*
		 * There surely is a faster way to do this.
		 */
		for (i = 0; i < cpc_ncounters; i++) {
			pic = &ctx->kc_pics[i];

			if (pic->kp_req != NULL &&
			    current == pic->kp_req->kr_config)
				break;
		}

		/*
		 * We found the current config at picnum i. Now search for the
		 * next configured PIC.
		 */
		for (i++; i < cpc_ncounters; i++) {
			pic = &ctx->kc_pics[i];
			if (pic->kp_req != NULL)
				break;
		}

		if (i == cpc_ncounters)
			return (NULL);
	}

	if (data != NULL) {
		*data = ctx->kc_pics[i].kp_req->kr_data;
	}

	return (ctx->kc_pics[i].kp_req->kr_config);
}


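/*
 * Allocate a context, link it onto its context-list hash chain, and allocate
 * its array of per-counter PIC structures.
 */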
kcpc_ctx_t *
kcpc_ctx_alloc(int kmem_flags)
{
	kcpc_ctx_t	*ctx;
	long		hash;

	ctx = (kcpc_ctx_t *)kmem_zalloc(sizeof (kcpc_ctx_t), kmem_flags);
	if (ctx == NULL)
		return (NULL);

	hash = CPC_HASH_CTX(ctx);
	mutex_enter(&kcpc_ctx_llock[hash]);
	ctx->kc_next = kcpc_ctx_list[hash];
	kcpc_ctx_list[hash] = ctx;
	mutex_exit(&kcpc_ctx_llock[hash]);

	ctx->kc_pics = (kcpc_pic_t *)kmem_zalloc(sizeof (kcpc_pic_t) *
	    cpc_ncounters, KM_SLEEP);

	ctx->kc_cpuid = -1;

	return (ctx);
}

/*
 * Copy set from ctx to the child context, cctx, if it has CPC_BIND_LWP_INHERIT
 * in the flags.
 */
static void
kcpc_ctx_clone(kcpc_ctx_t *ctx, kcpc_ctx_t *cctx)
{
	kcpc_set_t	*ks = ctx->kc_set, *cks;
	int		i, j;
	int		code;

	ASSERT(ks != NULL);

	if ((ks->ks_flags & CPC_BIND_LWP_INHERIT) == 0)
		return;

	cks = kmem_zalloc(sizeof (*cks), KM_SLEEP);
	cks->ks_state &= ~KCPC_SET_BOUND;
	cctx->kc_set = cks;
	cks->ks_flags = ks->ks_flags;
	cks->ks_nreqs = ks->ks_nreqs;
	cks->ks_req = kmem_alloc(cks->ks_nreqs *
	    sizeof (kcpc_request_t), KM_SLEEP);
	cks->ks_data = kmem_alloc(cks->ks_nreqs * sizeof (uint64_t),
	    KM_SLEEP);
	cks->ks_ctx = cctx;

	for (i = 0; i < cks->ks_nreqs; i++) {
		cks->ks_req[i].kr_index = ks->ks_req[i].kr_index;
		cks->ks_req[i].kr_picnum = ks->ks_req[i].kr_picnum;
		(void) strncpy(cks->ks_req[i].kr_event,
		    ks->ks_req[i].kr_event, CPC_MAX_EVENT_LEN);
		cks->ks_req[i].kr_preset = ks->ks_req[i].kr_preset;
		cks->ks_req[i].kr_flags = ks->ks_req[i].kr_flags;
		cks->ks_req[i].kr_nattrs = ks->ks_req[i].kr_nattrs;
		if (ks->ks_req[i].kr_nattrs > 0) {
			cks->ks_req[i].kr_attr =
			    kmem_alloc(ks->ks_req[i].kr_nattrs *
			    sizeof (kcpc_attr_t), KM_SLEEP);
		}
		for (j = 0; j < ks->ks_req[i].kr_nattrs; j++) {
			(void) strncpy(cks->ks_req[i].kr_attr[j].ka_name,
			    ks->ks_req[i].kr_attr[j].ka_name,
			    CPC_MAX_ATTR_LEN);
			cks->ks_req[i].kr_attr[j].ka_val =
			    ks->ks_req[i].kr_attr[j].ka_val;
		}
	}
	if (kcpc_configure_reqs(cctx, cks, &code) != 0)
		kcpc_invalidate_config(cctx);

	mutex_enter(&cks->ks_lock);
	cks->ks_state |= KCPC_SET_BOUND;
	cv_signal(&cks->ks_condv);
	mutex_exit(&cks->ks_lock);
}


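/*
 * Unlink a context from its hash chain and free it, along with its PIC
 * array.
 */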
void
kcpc_ctx_free(kcpc_ctx_t *ctx)
{
	kcpc_ctx_t	**loc;
	long		hash = CPC_HASH_CTX(ctx);

	mutex_enter(&kcpc_ctx_llock[hash]);
	loc = &kcpc_ctx_list[hash];
	ASSERT(*loc != NULL);
	while (*loc != ctx)
		loc = &(*loc)->kc_next;
	*loc = ctx->kc_next;
	mutex_exit(&kcpc_ctx_llock[hash]);

	kmem_free(ctx->kc_pics, cpc_ncounters * sizeof (kcpc_pic_t));
	cv_destroy(&ctx->kc_condv);
	mutex_destroy(&ctx->kc_lock);
	kmem_free(ctx, sizeof (*ctx));
}

/*
 * Generic interrupt handler used on hardware that generates
 * overflow interrupts.
 *
 * Note: executed at high-level interrupt context!
 */
/*ARGSUSED*/
kcpc_ctx_t *
kcpc_overflow_intr(caddr_t arg, uint64_t bitmap)
{
	kcpc_ctx_t	*ctx;
	kthread_t	*t = curthread;
	int		i;

	/*
	 * On both x86 and UltraSPARC, we may deliver the high-level
	 * interrupt in kernel mode, just after we've started to run an
	 * interrupt thread.  (That's because the hardware helpfully
	 * delivers the overflow interrupt some random number of cycles
	 * after the instruction that caused the overflow, by which time
	 * we're in some part of the kernel, not necessarily running on
	 * the right thread).
	 *
	 * Check for this case here -- find the pinned thread
	 * that was running when the interrupt went off.
	 */
	if (t->t_flag & T_INTR_THREAD) {
		klwp_t *lwp;

		atomic_inc_32(&kcpc_intrctx_count);

		/*
		 * Note that t_lwp is always set to point at the underlying
		 * thread, thus this will work in the presence of nested
		 * interrupts.
		 */
		ctx = NULL;
		if ((lwp = t->t_lwp) != NULL) {
			t = lwptot(lwp);
			ctx = t->t_cpc_ctx;
		}
	} else
		ctx = t->t_cpc_ctx;

	if (ctx == NULL) {
		/*
		 * This can easily happen if we're using the counters in
		 * "shared" mode, for example, and an overflow interrupt
		 * occurs while we are running cpustat.  In that case, the
		 * bound thread that has the context that belongs to this
		 * CPU is almost certainly sleeping (if it was running on
		 * the CPU we'd have found it above), and the actual
		 * interrupted thread has no knowledge of performance counters!
		 */
		ctx = curthread->t_cpu->cpu_cpc_ctx;
		if (ctx != NULL) {
			/*
			 * Return the bound context for this CPU to
			 * the interrupt handler so that it can synchronously
			 * sample the hardware counters and restart them.
			 */
			return (ctx);
		}

		/*
		 * As long as the overflow interrupt really is delivered early
		 * enough after trapping into the kernel to avoid switching
		 * threads, we must always be able to find the cpc context,
		 * or something went terribly wrong, i.e., we ended up
		 * running a passivated interrupt thread or a kernel
		 * thread, or we interrupted idle, all of which are Very Bad.
		 *
		 * We could also end up here owing to an incredibly unlikely
		 * race condition that exists on x86-based architectures when
		 * the cpc provider is in use; overflow interrupts are directed
		 * to the cpc provider if the 'dtrace_cpc_in_use' variable is
		 * set when we enter the handler. This variable is unset after
		 * overflow interrupts have been disabled on all CPUs and all
		 * contexts have been torn down. To stop interrupts, the cpc
		 * provider issues a xcall to the remote CPU before it tears
		 * down that CPU's context. As high-priority xcalls execute at
		 * a higher PIL than this handler on x86, it is possible
		 * (though extremely unlikely) that the xcall could interrupt
		 * the overflow handler before the handler has checked the
		 * 'dtrace_cpc_in_use' variable, stop the counters, and return
		 * to the cpc provider, which could then rip down contexts and
		 * unset 'dtrace_cpc_in_use' *before* the CPU's overflow
		 * handler has had a chance to check the variable. In that
		 * case, the handler would direct the overflow into this code
		 * and no valid context would be found. The default behavior
		 * when no valid context is found is to shout a warning to the
		 * console and bump the 'kcpc_nullctx_count' variable.
		 */
		if (kcpc_nullctx_panic)
			panic("null cpc context, thread %p", (void *)t);
#ifdef DEBUG
		cmn_err(CE_NOTE,
		    "null cpc context found in overflow handler!\n");
#endif
		atomic_inc_32(&kcpc_nullctx_count);
	} else if ((ctx->kc_flags & KCPC_CTX_INVALID) == 0) {
		/*
		 * Schedule an ast to sample the counters, which will
		 * propagate any overflow into the virtualized performance
		 * counter(s), and may deliver a signal.
		 */
		ttolwp(t)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
		/*
		 * If a counter has overflowed which was counting on behalf of
		 * a request which specified CPC_OVF_NOTIFY_EMT, send the
		 * process a signal.
		 */
		for (i = 0; i < cpc_ncounters; i++) {
			if (ctx->kc_pics[i].kp_req != NULL &&
			    bitmap & (1 << i) &&
			    ctx->kc_pics[i].kp_req->kr_flags &
			    CPC_OVF_NOTIFY_EMT) {
				/*
				 * A signal has been requested for this PIC,
				 * so freeze the context. The interrupt handler
				 * has already stopped the counter hardware.
				 */
				KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
				atomic_or_uint(&ctx->kc_pics[i].kp_flags,
				    KCPC_PIC_OVERFLOWED);
			}
		}
		aston(t);
	} else if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) {
		/*
		 * Thread context is no longer valid, but there may be a valid
		 * CPU context.
		 */
		return (curthread->t_cpu->cpu_cpc_ctx);
	}

	return (NULL);
}

/*
 * The current thread context had an overflow interrupt; we're
 * executing here in high-level interrupt context.
 */
/*ARGSUSED*/
uint_t
kcpc_hw_overflow_intr(caddr_t arg1, caddr_t arg2)
{
	kcpc_ctx_t *ctx;
	uint64_t bitmap;
	uint8_t *state;
	int	save_spl;

	if (pcbe_ops == NULL ||
	    (bitmap = pcbe_ops->pcbe_overflow_bitmap()) == 0)
		return (DDI_INTR_UNCLAIMED);

	/*
	 * Prevent any further interrupts.
	 */
	pcbe_ops->pcbe_allstop();

	if (dtrace_cpc_in_use) {
		state = &cpu_core[CPU->cpu_id].cpuc_dcpc_intr_state;

		/*
		 * Set the per-CPU state bit to indicate that we are currently
		 * processing an interrupt if it is currently free. Drop the
		 * interrupt if the state isn't free (i.e. a configuration
		 * event is taking place).
		 */
		if (atomic_cas_8(state, DCPC_INTR_FREE,
		    DCPC_INTR_PROCESSING) == DCPC_INTR_FREE) {
			int i;
			kcpc_request_t req;

			ASSERT(dtrace_cpc_fire != NULL);

			(*dtrace_cpc_fire)(bitmap);

			ctx = curthread->t_cpu->cpu_cpc_ctx;
			if (ctx == NULL) {
#ifdef DEBUG
				cmn_err(CE_NOTE, "null cpc context in "
				    "hardware overflow handler!\n");
#endif
				return (DDI_INTR_CLAIMED);
			}

			/* Reset any counters that have overflowed */
			for (i = 0; i < ctx->kc_set->ks_nreqs; i++) {
				req = ctx->kc_set->ks_req[i];

				if (bitmap & (1 << req.kr_picnum)) {
					pcbe_ops->pcbe_configure(req.kr_picnum,
					    req.kr_event, req.kr_preset,
					    req.kr_flags, req.kr_nattrs,
					    req.kr_attr, &(req.kr_config),
					    (void *)ctx);
				}
			}
			pcbe_ops->pcbe_program(ctx);

			/*
			 * We've finished processing the interrupt so set
			 * the state back to free.
			 */
			cpu_core[CPU->cpu_id].cpuc_dcpc_intr_state =
			    DCPC_INTR_FREE;
			membar_producer();
		}
		return (DDI_INTR_CLAIMED);
	}

	/*
	 * DTrace isn't involved so pass on accordingly.
	 *
	 * If the interrupt has occurred in the context of an lwp owning
	 * the counters, then the handler posts an AST to the lwp to
	 * trigger the actual sampling, and optionally deliver a signal or
	 * restart the counters, on the way out of the kernel using
	 * kcpc_hw_overflow_ast() (see below).
	 *
	 * On the other hand, if the handler returns the context to us
	 * directly, then it means that there are no other threads in
	 * the middle of updating it, no AST has been posted, and so we
	 * should sample the counters here, and restart them with no
	 * further fuss.
	 *
	 * The CPU's CPC context may disappear as a result of a cross-call,
	 * which has a higher PIL on x86, so protect the context by raising
	 * the PIL to the cross-call level.
	 */
	save_spl = spl_xcall();
	if ((ctx = kcpc_overflow_intr(arg1, bitmap)) != NULL) {
		uint64_t curtick = KCPC_GET_TICK();

		ctx->kc_hrtime = gethrtime_waitfree();
		ctx->kc_vtick += curtick - ctx->kc_rawtick;
		ctx->kc_rawtick = curtick;
		pcbe_ops->pcbe_sample(ctx);
		pcbe_ops->pcbe_program(ctx);
	}
	splx(save_spl);

	return (DDI_INTR_CLAIMED);
}

/*
 * Called from trap() when processing the ast posted by the high-level
 * interrupt handler.
 */
int
kcpc_overflow_ast()
{
	kcpc_ctx_t	*ctx = curthread->t_cpc_ctx;
	int		i;
	int		found = 0;
	uint64_t	curtick = KCPC_GET_TICK();

	ASSERT(ctx != NULL);	/* Beware of interrupt skid. */

	/*
	 * An overflow happened: sample the context to ensure that
	 * the overflow is propagated into the upper bits of the
	 * virtualized 64-bit counter(s).
	 */
	kpreempt_disable();
	ctx->kc_hrtime = gethrtime_waitfree();
	pcbe_ops->pcbe_sample(ctx);
	kpreempt_enable();

	ctx->kc_vtick += curtick - ctx->kc_rawtick;

	/*
	 * The interrupt handler has marked any pics with KCPC_PIC_OVERFLOWED
	 * if that pic generated an overflow and if the request it was counting
	 * on behalf of had CPC_OVF_NOTIFY_EMT specified. We go through all
	 * pics in the context and clear the KCPC_PIC_OVERFLOWED flags. If we
	 * found any overflowed pics, keep the context frozen and return true
	 * (thus causing a signal to be sent).
	 */
	for (i = 0; i < cpc_ncounters; i++) {
		if (ctx->kc_pics[i].kp_flags & KCPC_PIC_OVERFLOWED) {
			atomic_and_uint(&ctx->kc_pics[i].kp_flags,
			    ~KCPC_PIC_OVERFLOWED);
			found = 1;
		}
	}
	if (found)
		return (1);

	/*
	 * Otherwise, re-enable the counters and continue life as before.
	 */
	kpreempt_disable();
	KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
	pcbe_ops->pcbe_program(ctx);
	kpreempt_enable();
	return (0);
}

/*
 * Called when switching away from current thread.
 */
static void
kcpc_save(kcpc_ctx_t *ctx)
{
	int err;
	int save_spl;

	kpreempt_disable();
	save_spl = spl_xcall();

	if (ctx->kc_flags & KCPC_CTX_INVALID) {
		if (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) {
			splx(save_spl);
			kpreempt_enable();
			return;
		}
		/*
		 * This context has been invalidated but the counters have not
		 * been stopped. Stop them here and mark the context stopped.
		 */
		kcpc_unprogram(ctx, B_TRUE);
		splx(save_spl);
		kpreempt_enable();
		return;
	}

	pcbe_ops->pcbe_allstop();
	if (ctx->kc_flags & KCPC_CTX_FREEZE) {
		splx(save_spl);
		kpreempt_enable();
		return;
	}

	/*
	 * Need to sample for all reqs into each req's current mpic.
	 */
	ctx->kc_hrtime = gethrtime_waitfree();
	ctx->kc_vtick += KCPC_GET_TICK() - ctx->kc_rawtick;
	pcbe_ops->pcbe_sample(ctx);

	/*
	 * Program the counters for capacity and utilization measurement,
	 * since the user thread is no longer using them.
	 */
	ASSERT(ctx->kc_cpuid == -1);
	cu_cpc_program(CPU, &err);
	splx(save_spl);
	kpreempt_enable();
}

static void
kcpc_restore(kcpc_ctx_t *ctx)
{
	int save_spl;

	mutex_enter(&ctx->kc_lock);

	if ((ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED)) ==
	    KCPC_CTX_INVALID) {
		/*
		 * The context is invalidated but has not been marked stopped.
		 * We mark it as such here because we will not start the
		 * counters during this context switch.
		 */
		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID_STOPPED);
	}

	if (ctx->kc_flags & (KCPC_CTX_INVALID | KCPC_CTX_FREEZE)) {
		mutex_exit(&ctx->kc_lock);
		return;
	}

	/*
	 * Set kc_flags to show that a kcpc_restore() is in progress to avoid
	 * ctx & set related memory objects being freed without us knowing.
	 * This can happen if an agent thread is executing a kcpc_unbind(),
	 * with this thread as the target, whilst we're concurrently doing a
	 * restorectx() during, for example, a proc_exit().  Effectively, by
	 * doing this, we're asking kcpc_free() to cv_wait() until
	 * kcpc_restore() has completed.
	 */
	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_RESTORE);
	mutex_exit(&ctx->kc_lock);

	/*
	 * While programming the hardware, the counters should be stopped. We
	 * don't do an explicit pcbe_allstop() here because they should have
	 * been stopped already by the last consumer.
	 */
	kpreempt_disable();
	save_spl = spl_xcall();
	kcpc_program(ctx, B_TRUE, B_TRUE);
	splx(save_spl);
	kpreempt_enable();

	/*
	 * Wake the agent thread if it's waiting in kcpc_free().
	 */
	mutex_enter(&ctx->kc_lock);
	KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_RESTORE);
	cv_signal(&ctx->kc_condv);
	mutex_exit(&ctx->kc_lock);
}

/*
 * If kcpc_counts_include_idle is set to 0 by the sys admin, we add the
 * following context operators to the idle thread on each CPU. They stop the
 * counters when the idle thread is switched on, and they start them again
 * when it is switched off.
 */
/*ARGSUSED*/
void
kcpc_idle_save(struct cpu *cp)
{
	/*
	 * The idle thread shouldn't be run anywhere else.
	 */
	ASSERT(CPU == cp);

	/*
	 * We must hold the CPU's context lock to ensure the context isn't freed
	 * while we're looking at it.
	 */
	mutex_enter(&cp->cpu_cpc_ctxlock);

	if ((cp->cpu_cpc_ctx == NULL) ||
	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
		mutex_exit(&cp->cpu_cpc_ctxlock);
		return;
	}

	pcbe_ops->pcbe_program(cp->cpu_cpc_ctx);
	mutex_exit(&cp->cpu_cpc_ctxlock);
}

void
kcpc_idle_restore(struct cpu *cp)
{
	/*
	 * The idle thread shouldn't be run anywhere else.
	 */
	ASSERT(CPU == cp);

	/*
	 * We must hold the CPU's context lock to ensure the context isn't freed
	 * while we're looking at it.
	 */
	mutex_enter(&cp->cpu_cpc_ctxlock);

	if ((cp->cpu_cpc_ctx == NULL) ||
	    (cp->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID)) {
		mutex_exit(&cp->cpu_cpc_ctxlock);
		return;
	}

	pcbe_ops->pcbe_allstop();
	mutex_exit(&cp->cpu_cpc_ctxlock);
}

/*ARGSUSED*/
static void
kcpc_lwp_create(kthread_t *t, kthread_t *ct)
{
	kcpc_ctx_t	*ctx = t->t_cpc_ctx, *cctx;
	int		i;

	if (ctx == NULL || (ctx->kc_flags & KCPC_CTX_LWPINHERIT) == 0)
		return;

	rw_enter(&kcpc_cpuctx_lock, RW_READER);
	if (ctx->kc_flags & KCPC_CTX_INVALID) {
		rw_exit(&kcpc_cpuctx_lock);
		return;
	}
	cctx = kcpc_ctx_alloc(KM_SLEEP);
	kcpc_ctx_clone(ctx, cctx);
	rw_exit(&kcpc_cpuctx_lock);

	/*
	 * Copy the parent context's kc_flags field, but don't overwrite
	 * the child's in case it was modified during kcpc_ctx_clone.
	 */
	KCPC_CTX_FLAG_SET(cctx, ctx->kc_flags);
	cctx->kc_thread = ct;
	cctx->kc_cpuid = -1;
	ct->t_cpc_set = cctx->kc_set;
	ct->t_cpc_ctx = cctx;

	if (cctx->kc_flags & KCPC_CTX_SIGOVF) {
		kcpc_set_t *ks = cctx->kc_set;
		/*
		 * Our contract with the user requires us to immediately send an
		 * overflow signal to all children if we have the LWPINHERIT
		 * and SIGOVF flags set. In addition, all counters should be
		 * set to UINT64_MAX, and their pic's overflow flag turned on
		 * so that our trap() processing knows to send a signal.
		 */
		KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_FREEZE);
		for (i = 0; i < ks->ks_nreqs; i++) {
			kcpc_request_t *kr = &ks->ks_req[i];

			if (kr->kr_flags & CPC_OVF_NOTIFY_EMT) {
				*(kr->kr_data) = UINT64_MAX;
				atomic_or_uint(&kr->kr_picp->kp_flags,
				    KCPC_PIC_OVERFLOWED);
			}
		}
		ttolwp(ct)->lwp_pcb.pcb_flags |= CPC_OVERFLOW;
		aston(ct);
	}

	installctx(ct, cctx, kcpc_save, kcpc_restore,
	    NULL, kcpc_lwp_create, NULL, kcpc_free);
}

/*
 * Counter Stoppage Theory
 *
 * The counters may need to be stopped properly at the following occasions:
 *
 * 1) An LWP exits.
 * 2) A thread exits.
 * 3) An LWP performs an exec().
 * 4) A bound set is unbound.
 *
 * In addition to stopping the counters, the CPC context (a kcpc_ctx_t) may need
 * to be freed as well.
 *
 * Case 1: kcpc_passivate(), called via lwp_exit(), stops the counters. Later on
 * when the thread is freed, kcpc_free(), called by freectx(), frees the
 * context.
 *
 * Case 2: same as case 1 except kcpc_passivate is called from thread_exit().
 *
 * Case 3: kcpc_free(), called via freectx() via exec(), recognizes that it has
 * been called from exec. It stops the counters _and_ frees the context.
 *
 * Case 4: kcpc_unbind() stops the hardware _and_ frees the context.
 *
 * CPU-bound counters are always stopped via kcpc_unbind().
 */

/*
 * We're being called to delete the context; we ensure that all associated data
 * structures are freed, and that the hardware is passivated if this is an exec.
 */

/*ARGSUSED*/
void
kcpc_free(kcpc_ctx_t *ctx, int isexec)
{
	int		i;
	kcpc_set_t	*set = ctx->kc_set;

	ASSERT(set != NULL);

	/*
	 * Wait for kcpc_restore() to finish before we tear things down.
	 */
	mutex_enter(&ctx->kc_lock);
	while (ctx->kc_flags & KCPC_CTX_RESTORE)
		cv_wait(&ctx->kc_condv, &ctx->kc_lock);
	KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
	mutex_exit(&ctx->kc_lock);

	if (isexec) {
		/*
		 * This thread is execing, and after the exec it should not have
		 * any performance counter context. Stop the counters properly
		 * here so the system isn't surprised by an overflow interrupt
		 * later.
		 */
		if (ctx->kc_cpuid != -1) {
			cpu_t *cp;
			/*
			 * CPU-bound context; stop the appropriate CPU's ctrs.
			 * Hold cpu_lock while examining the CPU to ensure it
			 * doesn't go away.
			 */
			mutex_enter(&cpu_lock);
			cp = cpu_get(ctx->kc_cpuid);
			/*
			 * The CPU could have been DR'd out, so only stop the
			 * CPU and clear its context pointer if the CPU still
			 * exists.
			 */
			if (cp != NULL) {
				mutex_enter(&cp->cpu_cpc_ctxlock);
				kcpc_stop_hw(ctx);
				mutex_exit(&cp->cpu_cpc_ctxlock);
			}
			mutex_exit(&cpu_lock);
			ASSERT(curthread->t_cpc_ctx == NULL);
		} else {
			int save_spl;

			/*
			 * Thread-bound context; stop _this_ CPU's counters.
			 */
			kpreempt_disable();
			save_spl = spl_xcall();
			kcpc_unprogram(ctx, B_TRUE);
			curthread->t_cpc_ctx = NULL;
			splx(save_spl);
			kpreempt_enable();
		}

		/*
		 * Since we are being called from an exec and we know that
		 * exec is not permitted via the agent thread, we should clean
		 * up this thread's CPC state completely, and not leave dangling
		 * CPC pointers behind.
		 */
		ASSERT(ctx->kc_thread == curthread);
		curthread->t_cpc_set = NULL;
	}

	/*
	 * Walk through each request in this context's set and free the PCBE's
	 * configuration if it exists.
	 */
	for (i = 0; i < set->ks_nreqs; i++) {
		if (set->ks_req[i].kr_config != NULL)
			pcbe_ops->pcbe_free(set->ks_req[i].kr_config);
	}

	kmem_free(set->ks_data, set->ks_nreqs * sizeof (uint64_t));
	kcpc_ctx_free(ctx);
	kcpc_free_set(set);
}

/*
 * Free the memory associated with a request set.
 */
void
kcpc_free_set(kcpc_set_t *set)
1551 {
1552         int             i;
1553         kcpc_request_t  *req;
1554 
1555         ASSERT(set->ks_req != NULL);
1556 
1557         for (i = 0; i < set->ks_nreqs; i++) {
1558                 req = &set->ks_req[i];
1559 
1560                 if (req->kr_nattrs != 0) {
1561                         kmem_free(req->kr_attr,
1562                             req->kr_nattrs * sizeof (kcpc_attr_t));
1563                 }
1564         }
1565 
1566         kmem_free(set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs);
1567         cv_destroy(&set->ks_condv);
1568         mutex_destroy(&set->ks_lock);
1569         kmem_free(set, sizeof (kcpc_set_t));
1570 }
1571 
1572 /*
1573  * Grab every existing context and mark it as invalid.
1574  */
1575 void
1576 kcpc_invalidate_all(void)
1577 {
1578         kcpc_ctx_t *ctx;
1579         long hash;
1580 
1581         for (hash = 0; hash < CPC_HASH_BUCKETS; hash++) {
1582                 mutex_enter(&kcpc_ctx_llock[hash]);
1583                 for (ctx = kcpc_ctx_list[hash]; ctx; ctx = ctx->kc_next)
1584                         KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
1585                 mutex_exit(&kcpc_ctx_llock[hash]);
1586         }
1587 }
1588 
1589 /*
1590  * Interface for PCBEs to signal that an existing configuration has suddenly
1591  * become invalid.
1592  */
1593 void
1594 kcpc_invalidate_config(void *token)
1595 {
1596         kcpc_ctx_t *ctx = token;
1597 
1598         ASSERT(ctx != NULL);
1599 
1600         KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
1601 }
1602 
1603 /*
1604  * Called from lwp_exit() and thread_exit()
1605  */
1606 void
1607 kcpc_passivate(void)
1608 {
1609         kcpc_ctx_t *ctx = curthread->t_cpc_ctx;
1610         kcpc_set_t *set = curthread->t_cpc_set;
1611         int     save_spl;
1612 
1613         if (set == NULL)
1614                 return;
1615 
1616         if (ctx == NULL) {
1617                 /*
1618                  * This thread has a set but no context; it must be a CPU-bound
1619                  * set. The hardware will be stopped via kcpc_unbind() when the
1620                  * process exits and closes its file descriptors with
1621                  * kcpc_close(). Our only job here is to clean up this thread's
1622                  * state; the set will be freed with the unbind().
1623                  */
1624                 (void) kcpc_unbind(set);
1625                 /*
1626                  * Unbinding a set belonging to the current thread should clear
1627                  * its set pointer.
1628                  */
1629                 ASSERT(curthread->t_cpc_set == NULL);
1630                 return;
1631         }
1632 
1633         kpreempt_disable();
1634         save_spl = spl_xcall();
1635         curthread->t_cpc_set = NULL;
1636 
1637         /*
1638          * This thread/LWP is exiting but context switches will continue to
1639          * happen for a bit as the exit proceeds.  Kernel preemption must be
1640          * disabled here to prevent a race between checking or setting the
1641          * INVALID_STOPPED flag here and kcpc_restore() setting the flag during
1642          * a context switch.
1643          */
1644         if ((ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) == 0) {
1645                 kcpc_unprogram(ctx, B_TRUE);
1646                 KCPC_CTX_FLAG_SET(ctx,
1647                     KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);
1648         }
1649 
1650         /*
1651          * We're cleaning up after this thread; ensure there are no dangling
1652          * CPC pointers left behind. The context and set will be freed by
1653          * freectx().
1654          */
1655         curthread->t_cpc_ctx = NULL;
1656 
1657         splx(save_spl);
1658         kpreempt_enable();
1659 }
1660 
1661 /*
1662  * Assign the requests in the given set to the PICs in the context.
1663  * Returns 0 if successful, -1 on failure.
1664  */
1665 /*ARGSUSED*/
1666 int
1667 kcpc_assign_reqs(kcpc_set_t *set, kcpc_ctx_t *ctx)
1668 {
1669         int i;
1670         int *picnum_save;
1671 
1672         ASSERT(set->ks_nreqs <= cpc_ncounters);
1673 
1674         /*
1675          * Provide kcpc_tryassign() with scratch space to avoid doing an
1676          * alloc/free with every invocation.
1677          */
1678         picnum_save = kmem_alloc(set->ks_nreqs * sizeof (int), KM_SLEEP);
1679         /*
1680          * kcpc_tryassign() blindly walks through each request in the set,
1681          * seeing if a counter can count its event. If yes, it assigns that
1682          * counter. However, that counter may have been the only capable counter
 * for _another_ request's event. The solution is to retry the whole
 * assignment, starting from each request in turn. Note that this does not
 * explore every possibility, as that would require trying all unique
 * orderings of the requests, an O(n!) operation that would be unacceptable
 * for architectures with many counters.
1687          */
1688         for (i = 0; i < set->ks_nreqs; i++)
1689                 if (kcpc_tryassign(set, i, picnum_save) == 0)
1690                         break;
1691 
1692         kmem_free(picnum_save, set->ks_nreqs * sizeof (int));
1693         if (i == set->ks_nreqs)
1694                 return (-1);
1695         return (0);
1696 }
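
/*
 * A small worked example of why every starting request is tried: consider a
 * hypothetical PCBE with two counters, where PIC0 can count events A and B
 * but PIC1 can count only A, and a set requesting (A, B). Starting with A,
 * the greedy pass assigns A to PIC0 (the first capable counter), leaving B
 * with no usable counter. Starting with B instead assigns B to PIC0 and then
 * A to PIC1, which succeeds.
 */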
1697 
1698 static int
1699 kcpc_tryassign(kcpc_set_t *set, int starting_req, int *scratch)
1700 {
1701         int             i;
1702         int             j;
1703         uint64_t        bitmap = 0, resmap = 0;
1704         uint64_t        ctrmap;
1705 
1706         /*
1707          * We are attempting to assign the reqs to pics, but we may fail. If we
1708          * fail, we need to restore the state of the requests to what it was
1709          * when we found it, as some reqs may have been explicitly assigned to
1710          * a specific PIC beforehand. We do this by snapshotting the assignments
1711          * now and restoring from it later if we fail.
1712          *
1713          * Also we note here which counters have already been claimed by
1714          * requests with explicit counter assignments.
1715          */
1716         for (i = 0; i < set->ks_nreqs; i++) {
1717                 scratch[i] = set->ks_req[i].kr_picnum;
1718                 if (set->ks_req[i].kr_picnum != -1)
                        resmap |= (1ULL << set->ks_req[i].kr_picnum);
1720         }
1721 
1722         /*
1723          * Walk through requests assigning them to the first PIC that is
1724          * capable.
1725          */
1726         i = starting_req;
1727         do {
1728                 if (set->ks_req[i].kr_picnum != -1) {
                        ASSERT((bitmap &
                            (1ULL << set->ks_req[i].kr_picnum)) == 0);
                        bitmap |= (1ULL << set->ks_req[i].kr_picnum);
1731                         if (++i == set->ks_nreqs)
1732                                 i = 0;
1733                         continue;
1734                 }
1735 
1736                 ctrmap = pcbe_ops->pcbe_event_coverage(set->ks_req[i].kr_event);
1737                 for (j = 0; j < cpc_ncounters; j++) {
                        if ((ctrmap & (1ULL << j)) != 0 &&
                            (bitmap & (1ULL << j)) == 0 &&
                            (resmap & (1ULL << j)) == 0) {
1740                                 /*
1741                                  * We can assign this counter because:
1742                                  *
1743                                  * 1. It can count the event (ctrmap)
1744                                  * 2. It hasn't been assigned yet (bitmap)
1745                                  * 3. It wasn't reserved by a request (resmap)
1746                                  */
                                bitmap |= (1ULL << j);
1748                                 break;
1749                         }
1750                 }
1751                 if (j == cpc_ncounters) {
1752                         for (i = 0; i < set->ks_nreqs; i++)
1753                                 set->ks_req[i].kr_picnum = scratch[i];
1754                         return (-1);
1755                 }
1756                 set->ks_req[i].kr_picnum = j;
1757 
1758                 if (++i == set->ks_nreqs)
1759                         i = 0;
1760         } while (i != starting_req);
1761 
1762         return (0);
1763 }
1764 
1765 kcpc_set_t *
1766 kcpc_dup_set(kcpc_set_t *set)
1767 {
1768         kcpc_set_t      *new;
1769         int             i;
1770         int             j;
1771 
1772         new = kmem_zalloc(sizeof (*new), KM_SLEEP);
1773         new->ks_state &= ~KCPC_SET_BOUND;
1774         new->ks_flags = set->ks_flags;
1775         new->ks_nreqs = set->ks_nreqs;
1776         new->ks_req = kmem_alloc(set->ks_nreqs * sizeof (kcpc_request_t),
1777             KM_SLEEP);
1778         new->ks_data = NULL;
1779         new->ks_ctx = NULL;
1780 
1781         for (i = 0; i < new->ks_nreqs; i++) {
1782                 new->ks_req[i].kr_config = NULL;
1783                 new->ks_req[i].kr_index = set->ks_req[i].kr_index;
1784                 new->ks_req[i].kr_picnum = set->ks_req[i].kr_picnum;
1785                 new->ks_req[i].kr_picp = NULL;
1786                 new->ks_req[i].kr_data = NULL;
1787                 (void) strncpy(new->ks_req[i].kr_event, set->ks_req[i].kr_event,
1788                     CPC_MAX_EVENT_LEN);
1789                 new->ks_req[i].kr_preset = set->ks_req[i].kr_preset;
1790                 new->ks_req[i].kr_flags = set->ks_req[i].kr_flags;
1791                 new->ks_req[i].kr_nattrs = set->ks_req[i].kr_nattrs;
1792                 new->ks_req[i].kr_attr = kmem_alloc(new->ks_req[i].kr_nattrs *
1793                     sizeof (kcpc_attr_t), KM_SLEEP);
1794                 for (j = 0; j < new->ks_req[i].kr_nattrs; j++) {
1795                         new->ks_req[i].kr_attr[j].ka_val =
1796                             set->ks_req[i].kr_attr[j].ka_val;
1797                         (void) strncpy(new->ks_req[i].kr_attr[j].ka_name,
1798                             set->ks_req[i].kr_attr[j].ka_name,
1799                             CPC_MAX_ATTR_LEN);
1800                 }
1801         }
1802 
1803         return (new);
1804 }
1805 
1806 int
1807 kcpc_allow_nonpriv(void *token)
1808 {
1809         return (((kcpc_ctx_t *)token)->kc_flags & KCPC_CTX_NONPRIV);
1810 }
1811 
1812 void
1813 kcpc_invalidate(kthread_t *t)
1814 {
1815         kcpc_ctx_t *ctx = t->t_cpc_ctx;
1816 
1817         if (ctx != NULL)
1818                 KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID);
1819 }
1820 
1821 /*
 * Given a PCBE ID, attempt to load a matching PCBE module. The arguments
 * given are used to construct PCBE module names, starting with the most
 * specific, "pcbe.first.second.third.fourth", and ending with the least
 * specific, "pcbe.first".
1826  *
1827  * Returns 0 if a PCBE was successfully loaded and -1 upon error.
1828  */
1829 int
1830 kcpc_pcbe_tryload(const char *prefix, uint_t first, uint_t second, uint_t third)
1831 {
1832         uint_t s[3];
1833 
1834         s[0] = first;
1835         s[1] = second;
1836         s[2] = third;
1837 
1838         return (modload_qualified("pcbe",
1839             "pcbe", prefix, ".", s, 3, NULL) < 0 ? -1 : 0);
1840 }
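
/*
 * For example (with hypothetical values), on an x86 CPU with vendor string
 * "GenuineIntel", family 6, model 26 and stepping 4, a call such as
 *
 *	(void) kcpc_pcbe_tryload("GenuineIntel", 6, 26, 4);
 *
 * tries "pcbe.GenuineIntel.6.26.4" first, then "pcbe.GenuineIntel.6.26",
 * then "pcbe.GenuineIntel.6", and finally "pcbe.GenuineIntel".
 */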
1841 
1842 /*
 * Create one or more CPC contexts for the given CPU with the specified
 * counter event requests
 *
 * If the number of requested counter events is less than or equal to the
 * number of hardware counters on the CPU and they can all be assigned to the
 * counters at the same time, then one CPC context is made.
 *
 * Otherwise, multiple CPC contexts are created so that more counter events
 * than there are hardware counters can be multiplexed onto the counters: the
 * caller iterates through all of the CPC contexts, programming the counters
 * with each CPC context one at a time and measuring the resulting counter
 * values.  Each of the resulting CPC contexts contains some number of
 * requested counter events, less than or equal to the number of counters on
 * the CPU, depending on whether all of the counter events can be programmed
 * on all of the counters at the same time or not.  (A usage sketch follows
 * the function body below.)
 *
 * Flags to kmem_{,z}alloc() are passed in as an argument to allow specifying
 * whether memory allocation should be non-blocking or not.  The code will try
 * to allocate *whole* CPC contexts if possible.  If there is any memory
 * allocation failure during the allocations needed for a given CPC context,
 * it will skip allocating that CPC context because it cannot allocate the
 * whole thing.  Thus, the only time that it will end up allocating none (i.e.
 * no CPC contexts whatsoever) is when it cannot even allocate *one* whole CPC
 * context without a memory allocation failure occurring.
1866  */
1867 int
1868 kcpc_cpu_ctx_create(cpu_t *cp, kcpc_request_list_t *req_list, int kmem_flags,
1869     kcpc_ctx_t ***ctx_ptr_array, size_t *ctx_ptr_array_sz)
1870 {
1871         kcpc_ctx_t      **ctx_ptrs;
1872         int             nctx;
1873         int             nctx_ptrs;
1874         int             nreqs;
1875         kcpc_request_t  *reqs;
1876 
1877         if (cp == NULL || ctx_ptr_array == NULL || ctx_ptr_array_sz == NULL ||
1878             req_list == NULL || req_list->krl_cnt < 1)
1879                 return (-1);
1880 
1881         /*
         * Size the array of context pointers assuming that each set will
         * contain exactly one counter event request for each counter on a CPU
1884          */
1885         nreqs = req_list->krl_cnt;
1886         nctx_ptrs = (nreqs + cpc_ncounters - 1) / cpc_ncounters;
1887         ctx_ptrs = kmem_zalloc(nctx_ptrs * sizeof (kcpc_ctx_t *), kmem_flags);
1888         if (ctx_ptrs == NULL)
1889                 return (-2);
1890 
1891         /*
1892          * Fill in sets of requests
1893          */
1894         nctx = 0;
1895         reqs = req_list->krl_list;
1896         while (nreqs > 0) {
1897                 kcpc_ctx_t      *ctx;
1898                 kcpc_set_t      *set;
1899                 int             subcode;
1900 
1901                 /*
1902                  * Allocate CPC context and set for requested counter events
1903                  */
1904                 ctx = kcpc_ctx_alloc(kmem_flags);
1905                 set = kcpc_set_create(reqs, nreqs, 0, kmem_flags);
1906                 if (set == NULL) {
1907                         kcpc_ctx_free(ctx);
1908                         break;
1909                 }
1910 
1911                 /*
1912                  * Determine assignment of requested counter events to specific
1913                  * counters
1914                  */
1915                 if (kcpc_assign_reqs(set, ctx) != 0) {
                        /*
                         * The requested counter events may not all be
                         * assignable at once since not every counter can
                         * count every event. When this happens, fall back to
                         * a set containing just one counter event request,
                         * since at least one of the counters must be able to
                         * count that event.
                         */
1924                         kcpc_free_set(set);
1925                         set = kcpc_set_create(reqs, 1, 0, kmem_flags);
1926                         if (set == NULL) {
1927                                 kcpc_ctx_free(ctx);
1928                                 break;
1929                         }
1930                         if (kcpc_assign_reqs(set, ctx) != 0) {
1931 #ifdef DEBUG
1932                                 cmn_err(CE_NOTE, "!kcpc_cpu_ctx_create: can't "
1933                                     "assign counter event %s!\n",
1934                                     set->ks_req->kr_event);
1935 #endif
1936                                 kcpc_free_set(set);
1937                                 kcpc_ctx_free(ctx);
1938                                 reqs++;
1939                                 nreqs--;
1940                                 continue;
1941                         }
1942                 }
1943 
1944                 /*
1945                  * Allocate memory needed to hold requested counter event data
1946                  */
1947                 set->ks_data = kmem_zalloc(set->ks_nreqs * sizeof (uint64_t),
1948                     kmem_flags);
1949                 if (set->ks_data == NULL) {
1950                         kcpc_free_set(set);
1951                         kcpc_ctx_free(ctx);
1952                         break;
1953                 }
1954 
1955                 /*
1956                  * Configure requested counter events
1957                  */
1958                 if (kcpc_configure_reqs(ctx, set, &subcode) != 0) {
1959 #ifdef DEBUG
1960                         cmn_err(CE_NOTE,
1961                             "!kcpc_cpu_ctx_create: can't configure "
1962                             "set of counter event requests!\n");
1963 #endif
1964                         reqs += set->ks_nreqs;
1965                         nreqs -= set->ks_nreqs;
1966                         kmem_free(set->ks_data,
1967                             set->ks_nreqs * sizeof (uint64_t));
1968                         kcpc_free_set(set);
1969                         kcpc_ctx_free(ctx);
1970                         continue;
1971                 }
1972 
1973                 /*
1974                  * Point set of counter event requests at this context and fill
1975                  * in CPC context
1976                  */
1977                 set->ks_ctx = ctx;
1978                 ctx->kc_set = set;
1979                 ctx->kc_cpuid = cp->cpu_id;
1980                 ctx->kc_thread = curthread;
1981 
1982                 ctx_ptrs[nctx] = ctx;
1983 
1984                 /*
1985                  * Update requests and how many are left to be assigned to sets
1986                  */
1987                 reqs += set->ks_nreqs;
1988                 nreqs -= set->ks_nreqs;
1989 
1990                 /*
1991                  * Increment number of CPC contexts and allocate bigger array
1992                  * for context pointers as needed
1993                  */
1994                 nctx++;
1995                 if (nctx >= nctx_ptrs) {
1996                         kcpc_ctx_t      **new;
1997                         int             new_cnt;
1998 
1999                         /*
2000                          * Allocate more CPC contexts based on how many
2001                          * contexts allocated so far and how many counter
2002                          * requests left to assign
2003                          */
2004                         new_cnt = nctx_ptrs +
2005                             ((nreqs + cpc_ncounters - 1) / cpc_ncounters);
2006                         new = kmem_zalloc(new_cnt * sizeof (kcpc_ctx_t *),
2007                             kmem_flags);
2008                         if (new == NULL)
2009                                 break;
2010 
2011                         /*
2012                          * Copy contents of old sets into new ones
2013                          */
2014                         bcopy(ctx_ptrs, new,
2015                             nctx_ptrs * sizeof (kcpc_ctx_t *));
2016 
2017                         /*
2018                          * Free old array of context pointers and use newly
2019                          * allocated one instead now
2020                          */
2021                         kmem_free(ctx_ptrs, nctx_ptrs * sizeof (kcpc_ctx_t *));
2022                         ctx_ptrs = new;
2023                         nctx_ptrs = new_cnt;
2024                 }
2025         }
2026 
        /*
         * Fail, handing back a NULL array, if no CPC contexts were filled in
         */
2030         if (nctx == 0) {
2031                 kmem_free(ctx_ptrs, nctx_ptrs * sizeof (kcpc_ctx_t *));
2032                 *ctx_ptr_array = NULL;
2033                 *ctx_ptr_array_sz = 0;
2034                 return (-2);
2035         }
2036 
2037         *ctx_ptr_array = ctx_ptrs;
2038         *ctx_ptr_array_sz = nctx_ptrs * sizeof (kcpc_ctx_t *);
2039         return (nctx);
2040 }
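
/*
 * A hedged usage sketch of the multiplexing scheme described above (error
 * handling omitted; the event names and data array are hypothetical):
 *
 *	kcpc_request_list_t *rl = kcpc_reqs_init(nevents, KM_SLEEP);
 *	kcpc_ctx_t **ctxs;
 *	size_t sz;
 *	int i, nctx;
 *
 *	for (i = 0; i < nevents; i++)
 *		(void) kcpc_reqs_add(rl, event_name[i], 0, CPC_COUNT_SYSTEM,
 *		    0, NULL, &data[i], KM_SLEEP);
 *	nctx = kcpc_cpu_ctx_create(cp, rl, KM_SLEEP, &ctxs, &sz);
 *
 * The caller then rotates through the nctx contexts, programming one at a
 * time with kcpc_cpu_program(cp, ctxs[i]), letting it count for an interval,
 * and harvesting the counts with kcpc_read() before moving on to the next.
 */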
2041 
2042 /*
2043  * Return whether PCBE supports given counter event
2044  */
2045 boolean_t
2046 kcpc_event_supported(char *event)
2047 {
2048         if (pcbe_ops == NULL || pcbe_ops->pcbe_event_coverage(event) == 0)
2049                 return (B_FALSE);
2050 
2051         return (B_TRUE);
2052 }
2053 
2054 /*
2055  * Program counters on current CPU with given CPC context
2056  *
2057  * If kernel is interposing on counters to measure hardware capacity and
2058  * utilization, then unprogram counters for kernel *before* programming them
2059  * with specified CPC context.
2060  *
2061  * kcpc_{program,unprogram}() may be called either directly by a thread running
2062  * on the target CPU or from a cross-call from another CPU. To protect
2063  * programming and unprogramming from being interrupted by cross-calls, callers
2064  * who execute kcpc_{program,unprogram} should raise PIL to the level used by
2065  * cross-calls.
2066  */
2067 void
2068 kcpc_program(kcpc_ctx_t *ctx, boolean_t for_thread, boolean_t cu_interpose)
2069 {
2070         int     error;
2071 
2072         ASSERT(IS_HIPIL());
2073 
2074         /*
2075          * CPC context shouldn't be NULL, its CPU field should specify current
2076          * CPU or be -1 to specify any CPU when the context is bound to a
2077          * thread, and preemption should be disabled
2078          */
2079         ASSERT(ctx != NULL && (ctx->kc_cpuid == CPU->cpu_id ||
2080             ctx->kc_cpuid == -1) && curthread->t_preempt > 0);
2081         if (ctx == NULL || (ctx->kc_cpuid != CPU->cpu_id &&
2082             ctx->kc_cpuid != -1) || curthread->t_preempt < 1)
2083                 return;
2084 
2085         /*
2086          * Unprogram counters for kernel measuring hardware capacity and
2087          * utilization
2088          */
2089         if (cu_interpose == B_TRUE) {
2090                 cu_cpc_unprogram(CPU, &error);
2091         } else {
2092                 kcpc_set_t *set = ctx->kc_set;
2093                 int i;
2094 
2095                 ASSERT(set != NULL);
2096 
                /*
                 * Since cu_interpose is false, we are programming the CU
                 * context. In general, the PCBE could continue from the state
                 * saved in the set, but that is not very reliable, so we start
                 * again from the preset value.
                 */
2103                 for (i = 0; i < set->ks_nreqs; i++) {
2104                         /*
2105                          * Reset the virtual counter value to the preset value.
2106                          */
2107                         *(set->ks_req[i].kr_data) = set->ks_req[i].kr_preset;
2108 
2109                         /*
2110                          * Reset PCBE to the preset value.
2111                          */
2112                         pcbe_ops->pcbe_configure(0, NULL,
2113                             set->ks_req[i].kr_preset,
2114                             0, 0, NULL, &set->ks_req[i].kr_config, NULL);
2115                 }
2116         }
2117 
2118         /*
2119          * Program counters with specified CPC context
2120          */
2121         ctx->kc_rawtick = KCPC_GET_TICK();
2122         pcbe_ops->pcbe_program(ctx);
2123 
        /*
         * Record that the counters are programmed; this is done differently
         * for thread and CPU CPC contexts: a thread context is thawed, while
         * a CPU context is made the CPU's current context
         */
2128         if (for_thread == B_TRUE)
2129                 KCPC_CTX_FLAG_CLR(ctx, KCPC_CTX_FREEZE);
2130         else
2131                 CPU->cpu_cpc_ctx = ctx;
2132 }
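
/*
 * A minimal sketch of a direct caller raising the PIL as required above,
 * mirroring the pattern this file already uses around kcpc_unprogram():
 *
 *	int save_spl;
 *
 *	kpreempt_disable();
 *	save_spl = spl_xcall();
 *	kcpc_program(ctx, B_TRUE, B_TRUE);
 *	splx(save_spl);
 *	kpreempt_enable();
 */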
2133 
2134 /*
2135  * Unprogram counters with given CPC context on current CPU
2136  *
2137  * If kernel is interposing on counters to measure hardware capacity and
2138  * utilization, then program counters for the kernel capacity and utilization
2139  * *after* unprogramming them for given CPC context.
2140  *
2141  * See the comment for kcpc_program regarding the synchronization with
2142  * cross-calls.
2143  */
2144 void
2145 kcpc_unprogram(kcpc_ctx_t *ctx, boolean_t cu_interpose)
2146 {
2147         int     error;
2148 
2149         ASSERT(IS_HIPIL());
2150 
2151         /*
2152          * CPC context shouldn't be NULL, its CPU field should specify current
2153          * CPU or be -1 to specify any CPU when the context is bound to a
2154          * thread, and preemption should be disabled
2155          */
2156         ASSERT(ctx != NULL && (ctx->kc_cpuid == CPU->cpu_id ||
2157             ctx->kc_cpuid == -1) && curthread->t_preempt > 0);
2158 
2159         if (ctx == NULL || (ctx->kc_cpuid != CPU->cpu_id &&
2160             ctx->kc_cpuid != -1) || curthread->t_preempt < 1 ||
2161             (ctx->kc_flags & KCPC_CTX_INVALID_STOPPED) != 0) {
2162                 return;
2163         }
2164 
2165         /*
2166          * Specified CPC context to be unprogrammed should be bound to current
2167          * CPU or thread
2168          */
2169         ASSERT(CPU->cpu_cpc_ctx == ctx || curthread->t_cpc_ctx == ctx);
2170 
2171         /*
2172          * Stop counters
2173          */
2174         pcbe_ops->pcbe_allstop();
2175         KCPC_CTX_FLAG_SET(ctx, KCPC_CTX_INVALID_STOPPED);
2176 
2177         /*
2178          * Allow kernel to interpose on counters and program them for its own
2179          * use to measure hardware capacity and utilization if cu_interpose
2180          * argument is true
2181          */
2182         if (cu_interpose == B_TRUE)
2183                 cu_cpc_program(CPU, &error);
2184 }
2185 
2186 /*
 * Read the CPU performance counters (CPC) on the current CPU and call the
 * specified update routine with the data for each counter event currently
 * programmed on the CPU
2189  */
2190 int
2191 kcpc_read(kcpc_update_func_t update_func)
2192 {
2193         kcpc_ctx_t      *ctx;
2194         int             i;
2195         kcpc_request_t  *req;
2196         int             retval;
2197         kcpc_set_t      *set;
2198 
2199         ASSERT(IS_HIPIL());
2200 
2201         /*
         * Can't grab locks or block because this may be called from inside
         * the dispatcher
2203          */
2204         kpreempt_disable();
2205 
2206         ctx = CPU->cpu_cpc_ctx;
2207         if (ctx == NULL) {
2208                 kpreempt_enable();
2209                 return (0);
2210         }
2211 
2212         /*
2213          * Read counter data from current CPU
2214          */
2215         pcbe_ops->pcbe_sample(ctx);
2216 
2217         set = ctx->kc_set;
2218         if (set == NULL || set->ks_req == NULL) {
2219                 kpreempt_enable();
2220                 return (0);
2221         }
2222 
2223         /*
         * Call the update function with the caller-supplied pointer and the
         * data for each CPC event request currently programmed on this CPU
2226          */
2227         req = set->ks_req;
2228         retval = 0;
2229         for (i = 0; i < set->ks_nreqs; i++) {
2230                 int     ret;
2231 
2232                 if (req[i].kr_data == NULL)
2233                         break;
2234 
2235                 ret = update_func(req[i].kr_ptr, *req[i].kr_data);
2236                 if (ret < 0)
2237                         retval = ret;
2238         }
2239 
2240         kpreempt_enable();
2241 
2242         return (retval);
2243 }
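
/*
 * A minimal sketch of an update function (the stash-to-pointer policy is
 * hypothetical); kcpc_read() invokes it once per request with the kr_ptr
 * registered via kcpc_reqs_add() and that counter event's current value:
 *
 *	static int
 *	my_update(void *ptr, uint64_t count)
 *	{
 *		*(uint64_t *)ptr = count;	(stash the latest sample)
 *		return (0);
 *	}
 *
 *	(with the PIL raised as described for kcpc_program() above)
 *	(void) kcpc_read(my_update);
 */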
2244 
2245 /*
2246  * Initialize list of counter event requests
2247  */
2248 kcpc_request_list_t *
2249 kcpc_reqs_init(int nreqs, int kmem_flags)
2250 {
2251         kcpc_request_list_t     *req_list;
2252         kcpc_request_t          *reqs;
2253 
2254         if (nreqs < 1)
2255                 return (NULL);
2256 
2257         req_list = kmem_zalloc(sizeof (kcpc_request_list_t), kmem_flags);
2258         if (req_list == NULL)
2259                 return (NULL);
2260 
2261         reqs = kmem_zalloc(nreqs * sizeof (kcpc_request_t), kmem_flags);
2262         if (reqs == NULL) {
2263                 kmem_free(req_list, sizeof (kcpc_request_list_t));
2264                 return (NULL);
2265         }
2266 
2267         req_list->krl_list = reqs;
2268         req_list->krl_cnt = 0;
2269         req_list->krl_max = nreqs;
2270         return (req_list);
2271 }
2272 
2273 
2274 /*
2275  * Add counter event request to given list of counter event requests
2276  */
2277 int
2278 kcpc_reqs_add(kcpc_request_list_t *req_list, char *event, uint64_t preset,
2279     uint_t flags, uint_t nattrs, kcpc_attr_t *attr, void *ptr, int kmem_flags)
2280 {
2281         kcpc_request_t  *req;
2282 
2283         if (req_list == NULL || req_list->krl_list == NULL)
2284                 return (-1);
2285 
2286         ASSERT(req_list->krl_max != 0);
2287 
2288         /*
2289          * Allocate more space (if needed)
2290          */
        if (req_list->krl_cnt >= req_list->krl_max) {
2292                 kcpc_request_t  *new;
2293                 kcpc_request_t  *old;
2294 
2295                 old = req_list->krl_list;
2296                 new = kmem_zalloc((req_list->krl_max +
2297                     cpc_ncounters) * sizeof (kcpc_request_t), kmem_flags);
2298                 if (new == NULL)
2299                         return (-2);
2300 
2301                 req_list->krl_list = new;
2302                 bcopy(old, req_list->krl_list,
2303                     req_list->krl_cnt * sizeof (kcpc_request_t));
2304                 kmem_free(old, req_list->krl_max * sizeof (kcpc_request_t));
2306                 req_list->krl_max += cpc_ncounters;
2307         }
2308 
2309         /*
2310          * Fill in request as much as possible now, but some fields will need
2311          * to be set when request is assigned to a set.
2312          */
2313         req = &req_list->krl_list[req_list->krl_cnt];
2314         req->kr_config = NULL;
2315         req->kr_picnum = -1; /* have CPC pick this */
2316         req->kr_index = -1;  /* set when assigning request to set */
2317         req->kr_data = NULL; /* set when configuring request */
        (void) strncpy(req->kr_event, event, CPC_MAX_EVENT_LEN);
2319         req->kr_preset = preset;
2320         req->kr_flags = flags;
2321         req->kr_nattrs = nattrs;
2322         req->kr_attr = attr;
2323         /*
2324          * Keep pointer given by caller to give to update function when this
2325          * counter event is sampled/read
2326          */
2327         req->kr_ptr = ptr;
2328 
2329         req_list->krl_cnt++;
2330 
2331         return (0);
2332 }
2333 
2334 /*
2335  * Reset list of CPC event requests so its space can be used for another set
2336  * of requests
2337  */
2338 int
2339 kcpc_reqs_reset(kcpc_request_list_t *req_list)
2340 {
2341         /*
         * Fail when the pointer to the request list structure or its request
         * array is NULL, or when the maximum number of requests is not
         * positive
2344          */
2345         if (req_list == NULL || req_list->krl_list == NULL ||
2346             req_list->krl_max <= 0)
2347                 return (-1);
2348 
2349         /*
2350          * Zero out requests and number of requests used
2351          */
2352         bzero(req_list->krl_list, req_list->krl_max * sizeof (kcpc_request_t));
2353         req_list->krl_cnt = 0;
2354         return (0);
2355 }
2356 
2357 /*
2358  * Free given list of counter event requests
2359  */
2360 int
2361 kcpc_reqs_fini(kcpc_request_list_t *req_list)
2362 {
2363         kmem_free(req_list->krl_list,
2364             req_list->krl_max * sizeof (kcpc_request_t));
2365         kmem_free(req_list, sizeof (kcpc_request_list_t));
2366         return (0);
2367 }
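
/*
 * The kcpc_reqs_*() functions above are designed to be used together; a
 * hedged sketch of the lifecycle for a caller that reuses one list to set up
 * counters on several CPUs:
 *
 *	kcpc_request_list_t *rl = kcpc_reqs_init(cpc_ncounters, KM_SLEEP);
 *
 *	for each CPU of interest:
 *		... kcpc_reqs_add() each event, kcpc_cpu_ctx_create() ...
 *		(void) kcpc_reqs_reset(rl);	(reuse space for the next CPU)
 *
 *	(void) kcpc_reqs_fini(rl);
 */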
2368 
2369 /*
2370  * Create set of given counter event requests
2371  */
2372 static kcpc_set_t *
2373 kcpc_set_create(kcpc_request_t *reqs, int nreqs, int set_flags, int kmem_flags)
2374 {
2375         int             i;
2376         kcpc_set_t      *set;
2377 
2378         /*
2379          * Allocate set and assign number of requests in set and flags
2380          */
2381         set = kmem_zalloc(sizeof (kcpc_set_t), kmem_flags);
2382         if (set == NULL)
2383                 return (NULL);
2384 
2385         if (nreqs < cpc_ncounters)
2386                 set->ks_nreqs = nreqs;
2387         else
2388                 set->ks_nreqs = cpc_ncounters;
2389 
2390         set->ks_flags = set_flags;
2391 
2392         /*
2393          * Allocate requests needed, copy requests into set, and set index into
2394          * data for each request (which may change when we assign requested
2395          * counter events to counters)
2396          */
2397         set->ks_req = (kcpc_request_t *)kmem_zalloc(sizeof (kcpc_request_t) *
2398             set->ks_nreqs, kmem_flags);
2399         if (set->ks_req == NULL) {
2400                 kmem_free(set, sizeof (kcpc_set_t));
2401                 return (NULL);
2402         }
2403 
2404         bcopy(reqs, set->ks_req, sizeof (kcpc_request_t) * set->ks_nreqs);
2405 
2406         for (i = 0; i < set->ks_nreqs; i++)
2407                 set->ks_req[i].kr_index = i;
2408 
2409         return (set);
2410 }
2411 
2412 
2413 /*
2414  * Stop counters on current CPU.
2415  *
2416  * If preserve_context is true, the caller is interested in the CPU's CPC
2417  * context and wants it to be preserved.
2418  *
2419  * If preserve_context is false, the caller does not need the CPU's CPC context
2420  * to be preserved, so it is set to NULL.
2421  */
2422 static void
2423 kcpc_cpustop_func(boolean_t preserve_context)
2424 {
2425         kpreempt_disable();
2426 
2427         /*
2428          * Someone already stopped this context before us, so there is nothing
2429          * to do.
2430          */
2431         if (CPU->cpu_cpc_ctx == NULL) {
2432                 kpreempt_enable();
2433                 return;
2434         }
2435 
2436         kcpc_unprogram(CPU->cpu_cpc_ctx, B_TRUE);
2437         /*
         * If CU does not use the counters, then clear the CPU's CPC context.
         * If the caller requested that the context be preserved, it should
         * have disabled CU first, so there should be no CU context now.
2441          */
2442         ASSERT(!preserve_context || !CU_CPC_ON(CPU));
2443         if (!preserve_context && CPU->cpu_cpc_ctx != NULL && !CU_CPC_ON(CPU))
2444                 CPU->cpu_cpc_ctx = NULL;
2445 
2446         kpreempt_enable();
2447 }
2448 
2449 /*
2450  * Stop counters on given CPU and set its CPC context to NULL unless
2451  * preserve_context is true.
2452  */
2453 void
2454 kcpc_cpu_stop(cpu_t *cp, boolean_t preserve_context)
2455 {
2456         cpu_call(cp, (cpu_call_func_t)kcpc_cpustop_func,
2457             preserve_context, 0);
2458 }
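
/*
 * Together, kcpc_cpu_stop() and kcpc_cpu_program() (below) let a management
 * thread drive the counters on another CPU; a hedged sketch of swapping the
 * context being measured on CPU cp:
 *
 *	kcpc_cpu_stop(cp, B_FALSE);	(stop and forget the old context)
 *	kcpc_cpu_program(cp, new_ctx);	(program the new one via cpu_call())
 */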
2459 
2460 /*
2461  * Program the context on the current CPU
2462  */
2463 static void
2464 kcpc_remoteprogram_func(kcpc_ctx_t *ctx, uintptr_t arg)
2465 {
2466         boolean_t for_thread = (boolean_t)arg;
2467 
2468         ASSERT(ctx != NULL);
2469 
2470         kpreempt_disable();
2471         kcpc_program(ctx, for_thread, B_TRUE);
2472         kpreempt_enable();
2473 }
2474 
2475 /*
2476  * Program counters on given CPU
2477  */
2478 void
2479 kcpc_cpu_program(cpu_t *cp, kcpc_ctx_t *ctx)
2480 {
2481         cpu_call(cp, (cpu_call_func_t)kcpc_remoteprogram_func, (uintptr_t)ctx,
2482             (uintptr_t)B_FALSE);
2483 }
2484 
2485 char *
2486 kcpc_list_attrs(void)
2487 {
2488         ASSERT(pcbe_ops != NULL);
2489 
2490         return (pcbe_ops->pcbe_list_attrs());
2491 }
2492 
2493 char *
2494 kcpc_list_events(uint_t pic)
2495 {
2496         ASSERT(pcbe_ops != NULL);
2497 
2498         return (pcbe_ops->pcbe_list_events(pic));
2499 }
2500 
2501 uint_t
2502 kcpc_pcbe_capabilities(void)
2503 {
2504         ASSERT(pcbe_ops != NULL);
2505 
2506         return (pcbe_ops->pcbe_caps);
2507 }
2508 
2509 int
2510 kcpc_pcbe_loaded(void)
2511 {
2512         return (pcbe_ops == NULL ? -1 : 0);
2513 }