1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  * Fill in and write out the cpr state file
  28  *      1. Allocate and write headers, ELF and cpr dump header
  29  *      2. Allocate bitmaps according to phys_install
  30  *      3. Tag kernel pages into corresponding bitmap
  31  *      4. Write bitmaps to state file
  32  *      5. Write actual physical page data to state file
  33  */
  34 
  35 #include <sys/types.h>
  36 #include <sys/systm.h>
  37 #include <sys/vm.h>
  38 #include <sys/memlist.h>
  39 #include <sys/kmem.h>
  40 #include <sys/vnode.h>
  41 #include <sys/fs/ufs_inode.h>
  42 #include <sys/errno.h>
  43 #include <sys/cmn_err.h>
  44 #include <sys/debug.h>
  45 #include <vm/page.h>
  46 #include <vm/seg.h>
  47 #include <vm/seg_kmem.h>
  48 #include <vm/seg_kpm.h>
  49 #include <vm/hat.h>
  50 #include <sys/cpr.h>
  51 #include <sys/conf.h>
  52 #include <sys/ddi.h>
  53 #include <sys/panic.h>
  54 #include <sys/thread.h>
  55 #include <sys/note.h>
  56 
  57 /* Local defines and variables */
  58 #define BTOb(bytes)     ((bytes) << 3)            /* Bytes to bits, log2(NBBY) */
  59 #define bTOB(bits)      ((bits) >> 3)             /* bits to Bytes, log2(NBBY) */
  60 
  61 #if defined(__sparc)
  62 static uint_t cpr_pages_tobe_dumped;
  63 static uint_t cpr_regular_pgs_dumped;
  64 static int cpr_dump_regular_pages(vnode_t *);
  65 static int cpr_count_upages(int, bitfunc_t);
  66 static int cpr_compress_and_write(vnode_t *, uint_t, pfn_t, pgcnt_t);
  67 #endif
  68 
  69 int cpr_flush_write(vnode_t *);
  70 
  71 int cpr_contig_pages(vnode_t *, int);
  72 
  73 void cpr_clear_bitmaps();
  74 
  75 extern size_t cpr_get_devsize(dev_t);
  76 extern int i_cpr_dump_setup(vnode_t *);
  77 extern int i_cpr_blockzero(char *, char **, int *, vnode_t *);
  78 extern int cpr_test_mode;
  79 int cpr_setbit(pfn_t, int);
  80 int cpr_clrbit(pfn_t, int);
  81 
  82 ctrm_t cpr_term;
  83 
  84 char *cpr_buf, *cpr_buf_end;
  85 int cpr_buf_blocks;             /* size of cpr_buf in blocks */
  86 size_t cpr_buf_size;            /* size of cpr_buf in bytes */
  87 size_t cpr_bitmap_size;
  88 int cpr_nbitmaps;
  89 
  90 char *cpr_pagedata;             /* page buffer for compression / tmp copy */
  91 size_t cpr_pagedata_size;       /* page buffer size in bytes */
  92 
  93 #if defined(__sparc)
  94 static char *cpr_wptr;          /* keep track of where to write to next */
  95 static int cpr_file_bn;         /* cpr state-file block offset */
  96 static int cpr_disk_writes_ok;
  97 static size_t cpr_dev_space = 0;
  98 #endif
  99 
 100 char cpr_pagecopy[CPR_MAXCONTIG * MMU_PAGESIZE];
 101 
 102 #if defined(__sparc)
 103 /*
 104  * On some platforms bcopy may modify the thread structure
 105  * during bcopy (eg, to prevent cpu migration).  If the
 106  * range we are currently writing out includes our own
 107  * thread structure then it will be snapshotted by bcopy
 108  * including those modified members - and the updates made
 109  * on exit from bcopy will no longer be seen when we later
 110  * restore the mid-bcopy kthread_t.  So if the range we
 111  * need to copy overlaps with our thread structure we will
 112  * use a simple byte copy.
 113  */
 114 void
 115 cprbcopy(void *from, void *to, size_t bytes)
 116 {
 117         extern int curthreadremapped;
 118         caddr_t kthrend;
 119 
 120         kthrend = (caddr_t)curthread + sizeof (kthread_t) - 1;
 121         if (curthreadremapped || (kthrend >= (caddr_t)from &&
 122             kthrend < (caddr_t)from + bytes + sizeof (kthread_t) - 1)) {
 123                 caddr_t src = from, dst = to;
 124 
 125                 while (bytes-- > 0)
 126                         *dst++ = *src++;
 127         } else {
 128                 bcopy(from, to, bytes);
 129         }
 130 }
 131 
 132 /*
 133  * Allocate pages for buffers used in writing out the statefile
 134  */
 135 static int
 136 cpr_alloc_bufs(void)
 137 {
 138         char *allocerr = "Unable to allocate memory for cpr buffer";
 139         size_t size;
 140 
 141         /*
 142          * set the cpr write buffer size to at least the historic
 143          * size (128k) or large enough to store the both the early
 144          * set of statefile structures (well under 0x800) plus the
 145          * bitmaps, and roundup to the next pagesize.
 146          */
 147         size = PAGE_ROUNDUP(dbtob(4) + cpr_bitmap_size);
 148         cpr_buf_size = MAX(size, CPRBUFSZ);
 149         cpr_buf_blocks = btodb(cpr_buf_size);
 150         cpr_buf = kmem_alloc(cpr_buf_size, KM_NOSLEEP);
 151         if (cpr_buf == NULL) {
 152                 cpr_err(CE_WARN, allocerr);
 153                 return (ENOMEM);
 154         }
 155         cpr_buf_end = cpr_buf + cpr_buf_size;
 156 
 157         cpr_pagedata_size = mmu_ptob(CPR_MAXCONTIG + 1);
 158         cpr_pagedata = kmem_alloc(cpr_pagedata_size, KM_NOSLEEP);
 159         if (cpr_pagedata == NULL) {
 160                 kmem_free(cpr_buf, cpr_buf_size);
 161                 cpr_buf = NULL;
 162                 cpr_err(CE_WARN, allocerr);
 163                 return (ENOMEM);
 164         }
 165 
 166         return (0);
 167 }
 168 
 169 
 170 /*
 171  * Set bitmap size in bytes based on phys_install.
 172  */
 173 void
 174 cpr_set_bitmap_size(void)
 175 {
 176         struct memlist *pmem;
 177         size_t size = 0;
 178 
 179         memlist_read_lock();
 180         for (pmem = phys_install; pmem; pmem = pmem->ml_next)
 181                 size += pmem->ml_size;
 182         memlist_read_unlock();
 183         cpr_bitmap_size = BITMAP_BYTES(size);
 184 }
 185 
 186 
 187 /*
 188  * CPR dump header contains the following information:
 189  *      1. header magic -- unique to cpr state file
 190  *      2. kernel return pc & ppn for resume
 191  *      3. current thread info
 192  *      4. debug level and test mode
 193  *      5. number of bitmaps allocated
 194  *      6. number of page records
 195  */
 196 static int
 197 cpr_write_header(vnode_t *vp)
 198 {
 199         extern ushort_t cpr_mach_type;
 200         struct cpr_dump_desc cdump;
 201         pgcnt_t bitmap_pages;
 202         pgcnt_t kpages, vpages, upages;
 203         pgcnt_t cpr_count_kpages(int mapflag, bitfunc_t bitfunc);
 204 
 205         cdump.cdd_magic = (uint_t)CPR_DUMP_MAGIC;
 206         cdump.cdd_version = CPR_VERSION;
 207         cdump.cdd_machine = cpr_mach_type;
 208         cdump.cdd_debug = cpr_debug;
 209         cdump.cdd_test_mode = cpr_test_mode;
 210         cdump.cdd_bitmaprec = cpr_nbitmaps;
 211 
 212         cpr_clear_bitmaps();
 213 
 214         /*
 215          * Remember how many pages we plan to save to statefile.
 216          * This information will be used for sanity checks.
 217          * Untag those pages that will not be saved to statefile.
 218          */
 219         kpages = cpr_count_kpages(REGULAR_BITMAP, cpr_setbit);
 220         vpages = cpr_count_volatile_pages(REGULAR_BITMAP, cpr_clrbit);
 221         upages = cpr_count_upages(REGULAR_BITMAP, cpr_setbit);
 222         cdump.cdd_dumppgsize = kpages - vpages + upages;
 223         cpr_pages_tobe_dumped = cdump.cdd_dumppgsize;
 224         CPR_DEBUG(CPR_DEBUG7,
 225             "\ncpr_write_header: kpages %ld - vpages %ld + upages %ld = %d\n",
 226             kpages, vpages, upages, cdump.cdd_dumppgsize);
 227 
 228         /*
 229          * Some pages contain volatile data (cpr_buf and storage area for
 230          * sensitive kpages), which are no longer needed after the statefile
 231          * is dumped to disk.  We have already untagged them from regular
 232          * bitmaps.  Now tag them into the volatile bitmaps.  The pages in
 233          * volatile bitmaps will be claimed during resume, and the resumed
 234          * kernel will free them.
 235          */
 236         (void) cpr_count_volatile_pages(VOLATILE_BITMAP, cpr_setbit);
 237 
 238         bitmap_pages = mmu_btopr(cpr_bitmap_size);
 239 
 240         /*
 241          * Export accurate statefile size for statefile allocation retry.
 242          * statefile_size = all the headers + total pages +
 243          * number of pages used by the bitmaps.
 244          * Roundup will be done in the file allocation code.
 245          */
 246         STAT->cs_nocomp_statefsz = sizeof (cdd_t) + sizeof (cmd_t) +
 247             (sizeof (cbd_t) * cdump.cdd_bitmaprec) +
 248             (sizeof (cpd_t) * cdump.cdd_dumppgsize) +
 249             mmu_ptob(cdump.cdd_dumppgsize + bitmap_pages);
 250 
 251         /*
 252          * If the estimated statefile is not big enough,
 253          * go retry now to save un-necessary operations.
 254          */
 255         if (!(CPR->c_flags & C_COMPRESSING) &&
 256             (STAT->cs_nocomp_statefsz > STAT->cs_est_statefsz)) {
 257                 if (cpr_debug & (CPR_DEBUG1 | CPR_DEBUG7))
 258                         prom_printf("cpr_write_header: "
 259                             "STAT->cs_nocomp_statefsz > "
 260                             "STAT->cs_est_statefsz\n");
 261                 return (ENOSPC);
 262         }
 263 
 264         /* now write cpr dump descriptor */
 265         return (cpr_write(vp, (caddr_t)&cdump, sizeof (cdd_t)));
 266 }
 267 
 268 
 269 /*
 270  * CPR dump tail record contains the following information:
 271  *      1. header magic -- unique to cpr state file
 272  *      2. all misc info that needs to be passed to cprboot or resumed kernel
 273  */
 274 static int
 275 cpr_write_terminator(vnode_t *vp)
 276 {
 277         cpr_term.magic = (uint_t)CPR_TERM_MAGIC;
 278         cpr_term.va = (cpr_ptr)&cpr_term;
 279         cpr_term.pfn = (cpr_ext)va_to_pfn(&cpr_term);
 280 
 281         /* count the last one (flush) */
 282         cpr_term.real_statef_size = STAT->cs_real_statefsz +
 283             btod(cpr_wptr - cpr_buf) * DEV_BSIZE;
 284 
 285         CPR_DEBUG(CPR_DEBUG9, "cpr_dump: Real Statefile Size: %ld\n",
 286             STAT->cs_real_statefsz);
 287 
 288         cpr_tod_get(&cpr_term.tm_shutdown);
 289 
 290         return (cpr_write(vp, (caddr_t)&cpr_term, sizeof (cpr_term)));
 291 }
 292 
 293 /*
 294  * Write bitmap descriptor array, followed by merged bitmaps.
 295  */
 296 static int
 297 cpr_write_bitmap(vnode_t *vp)
 298 {
 299         char *rmap, *vmap, *dst, *tail;
 300         size_t size, bytes;
 301         cbd_t *dp;
 302         int err;
 303 
 304         dp = CPR->c_bmda;
 305         if (err = cpr_write(vp, (caddr_t)dp, cpr_nbitmaps * sizeof (*dp)))
 306                 return (err);
 307 
 308         /*
 309          * merge regular and volatile bitmaps into tmp space
 310          * and write to disk
 311          */
 312         for (; dp->cbd_size; dp++) {
 313                 rmap = (char *)dp->cbd_reg_bitmap;
 314                 vmap = (char *)dp->cbd_vlt_bitmap;
 315                 for (size = dp->cbd_size; size; size -= bytes) {
 316                         bytes = min(size, sizeof (cpr_pagecopy));
 317                         tail = &cpr_pagecopy[bytes];
 318                         for (dst = cpr_pagecopy; dst < tail; dst++)
 319                                 *dst = *rmap++ | *vmap++;
 320                         if (err = cpr_write(vp, cpr_pagecopy, bytes))
 321                                 break;
 322                 }
 323         }
 324 
 325         return (err);
 326 }
 327 
 328 
/*
 * Write the page data portion of the statefile: first the clean
 * sensitive kpages (saved aside earlier), then the regular pages,
 * followed by a sanity check that the dumped page count matches the
 * count recorded by cpr_write_header().  Returns 0 or an error code.
 */
static int
cpr_write_statefile(vnode_t *vp)
{
        uint_t error = 0;
        extern  int     i_cpr_check_pgs_dumped();
        void flush_windows(void);
        pgcnt_t spages;
        char *str;

        /* flush sparc register windows so stack state hits memory */
        flush_windows();

        /*
         * to get an accurate view of kas, we need to untag sensitive
         * pages *before* dumping them because the disk driver makes
         * allocations and changes kas along the way.  The remaining
         * pages referenced in the bitmaps are dumped out later as
         * regular kpages.
         */
        str = "cpr_write_statefile:";
        spages = i_cpr_count_sensitive_kpages(REGULAR_BITMAP, cpr_clrbit);
        CPR_DEBUG(CPR_DEBUG7, "%s untag %ld sens pages\n", str, spages);

        /*
         * now it's OK to call a driver that makes allocations
         */
        cpr_disk_writes_ok = 1;

        /*
         * now write out the clean sensitive kpages
         * according to the sensitive descriptors
         */
        error = i_cpr_dump_sensitive_kpages(vp);
        if (error) {
                CPR_DEBUG(CPR_DEBUG7,
                    "%s cpr_dump_sensitive_kpages() failed!\n", str);
                return (error);
        }

        /*
         * cpr_dump_regular_pages() counts cpr_regular_pgs_dumped
         */
        error = cpr_dump_regular_pages(vp);
        if (error) {
                CPR_DEBUG(CPR_DEBUG7,
                    "%s cpr_dump_regular_pages() failed!\n", str);
                return (error);
        }

        /*
         * sanity check to verify the right number of pages were dumped
         */
        error = i_cpr_check_pgs_dumped(cpr_pages_tobe_dumped,
            cpr_regular_pgs_dumped);

        if (error) {
                prom_printf("\n%s page count mismatch!\n", str);
#ifdef DEBUG
                /* in test mode drop to the debugger for inspection */
                if (cpr_test_mode)
                        debug_enter(NULL);
#endif
        }

        return (error);
}
 393 #endif
 394 
 395 
 396 /*
 397  * creates the CPR state file, the following sections are
 398  * written out in sequence:
 399  *    - writes the cpr dump header
 400  *    - writes the memory usage bitmaps
 401  *    - writes the platform dependent info
 402  *    - writes the remaining user pages
 403  *    - writes the kernel pages
 404  */
 405 #if defined(__x86)
 406         _NOTE(ARGSUSED(0))
 407 #endif
 408 int
 409 cpr_dump(vnode_t *vp)
 410 {
 411 #if defined(__sparc)
 412         int error;
 413 
 414         if (cpr_buf == NULL) {
 415                 ASSERT(cpr_pagedata == NULL);
 416                 if (error = cpr_alloc_bufs())
 417                         return (error);
 418         }
 419         /* point to top of internal buffer */
 420         cpr_wptr = cpr_buf;
 421 
 422         /* initialize global variables used by the write operation */
 423         cpr_file_bn = cpr_statefile_offset();
 424         cpr_dev_space = 0;
 425 
 426         /* allocate bitmaps */
 427         if (CPR->c_bmda == NULL) {
 428                 if (error = i_cpr_alloc_bitmaps()) {
 429                         cpr_err(CE_WARN, "cannot allocate bitmaps");
 430                         return (error);
 431                 }
 432         }
 433 
 434         if (error = i_cpr_prom_pages(CPR_PROM_SAVE))
 435                 return (error);
 436 
 437         if (error = i_cpr_dump_setup(vp))
 438                 return (error);
 439 
 440         /*
 441          * set internal cross checking; we dont want to call
 442          * a disk driver that makes allocations until after
 443          * sensitive pages are saved
 444          */
 445         cpr_disk_writes_ok = 0;
 446 
 447         /*
 448          * 1253112: heap corruption due to memory allocation when dumpping
 449          *          statefile.
 450          * Theoretically on Sun4u only the kernel data nucleus, kvalloc and
 451          * kvseg segments can be contaminated should memory allocations happen
 452          * during sddump, which is not supposed to happen after the system
 453          * is quiesced. Let's call the kernel pages that tend to be affected
 454          * 'sensitive kpages' here. To avoid saving inconsistent pages, we
 455          * will allocate some storage space to save the clean sensitive pages
 456          * aside before statefile dumping takes place. Since there may not be
 457          * much memory left at this stage, the sensitive pages will be
 458          * compressed before they are saved into the storage area.
 459          */
 460         if (error = i_cpr_save_sensitive_kpages()) {
 461                 CPR_DEBUG(CPR_DEBUG7,
 462                     "cpr_dump: save_sensitive_kpages failed!\n");
 463                 return (error);
 464         }
 465 
 466         /*
 467          * since all cpr allocations are done (space for sensitive kpages,
 468          * bitmaps, cpr_buf), kas is stable, and now we can accurately
 469          * count regular and sensitive kpages.
 470          */
 471         if (error = cpr_write_header(vp)) {
 472                 CPR_DEBUG(CPR_DEBUG7,
 473                     "cpr_dump: cpr_write_header() failed!\n");
 474                 return (error);
 475         }
 476 
 477         if (error = i_cpr_write_machdep(vp))
 478                 return (error);
 479 
 480         if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, NULL, NULL))
 481                 return (error);
 482 
 483         if (error = cpr_write_bitmap(vp))
 484                 return (error);
 485 
 486         if (error = cpr_write_statefile(vp)) {
 487                 CPR_DEBUG(CPR_DEBUG7,
 488                     "cpr_dump: cpr_write_statefile() failed!\n");
 489                 return (error);
 490         }
 491 
 492         if (error = cpr_write_terminator(vp))
 493                 return (error);
 494 
 495         if (error = cpr_flush_write(vp))
 496                 return (error);
 497 
 498         if (error = i_cpr_blockzero(cpr_buf, &cpr_wptr, &cpr_file_bn, vp))
 499                 return (error);
 500 #endif
 501 
 502         return (0);
 503 }
 504 
 505 
 506 #if defined(__sparc)
 507 /*
 508  * cpr_xwalk() is called many 100x with a range within kvseg or kvseg_reloc;
 509  * a page-count from each range is accumulated at arg->pages.
 510  */
 511 static void
 512 cpr_xwalk(void *arg, void *base, size_t size)
 513 {
 514         struct cpr_walkinfo *cwip = arg;
 515 
 516         cwip->pages += cpr_count_pages(base, size,
 517             cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
 518         cwip->size += size;
 519         cwip->ranges++;
 520 }
 521 
 522 /*
 523  * cpr_walk() is called many 100x with a range within kvseg or kvseg_reloc;
 524  * a page-count from each range is accumulated at arg->pages.
 525  */
 526 static void
 527 cpr_walk(void *arg, void *base, size_t size)
 528 {
 529         caddr_t addr = base;
 530         caddr_t addr_end = addr + size;
 531 
 532         /*
 533          * If we are about to start walking the range of addresses we
 534          * carved out of the kernel heap for the large page heap walk
 535          * heap_lp_arena to find what segments are actually populated
 536          */
 537         if (SEGKMEM_USE_LARGEPAGES &&
 538             addr == heap_lp_base && addr_end == heap_lp_end &&
 539             vmem_size(heap_lp_arena, VMEM_ALLOC) < size) {
 540                 vmem_walk(heap_lp_arena, VMEM_ALLOC, cpr_xwalk, arg);
 541         } else {
 542                 cpr_xwalk(arg, base, size);
 543         }
 544 }
 545 
 546 
 547 /*
 548  * faster scan of kvseg using vmem_walk() to visit
 549  * allocated ranges.
 550  */
 551 pgcnt_t
 552 cpr_scan_kvseg(int mapflag, bitfunc_t bitfunc, struct seg *seg)
 553 {
 554         struct cpr_walkinfo cwinfo;
 555 
 556         bzero(&cwinfo, sizeof (cwinfo));
 557         cwinfo.mapflag = mapflag;
 558         cwinfo.bitfunc = bitfunc;
 559 
 560         vmem_walk(heap_arena, VMEM_ALLOC, cpr_walk, &cwinfo);
 561 
 562         if (cpr_debug & CPR_DEBUG7) {
 563                 prom_printf("walked %d sub-ranges, total pages %ld\n",
 564                     cwinfo.ranges, mmu_btop(cwinfo.size));
 565                 cpr_show_range(seg->s_base, seg->s_size,
 566                     mapflag, bitfunc, cwinfo.pages);
 567         }
 568 
 569         return (cwinfo.pages);
 570 }
 571 
 572 
 573 /*
 574  * cpr_walk_kpm() is called for every used area within the large
 575  * segkpm virtual address window. A page-count is accumulated at
 576  * arg->pages.
 577  */
 578 static void
 579 cpr_walk_kpm(void *arg, void *base, size_t size)
 580 {
 581         struct cpr_walkinfo *cwip = arg;
 582 
 583         cwip->pages += cpr_count_pages(base, size,
 584             cwip->mapflag, cwip->bitfunc, DBG_DONTSHOWRANGE);
 585         cwip->size += size;
 586         cwip->ranges++;
 587 }
 588 
 589 
 590 /*
 591  * faster scan of segkpm using hat_kpm_walk() to visit only used ranges.
 592  */
 593 /*ARGSUSED*/
 594 static pgcnt_t
 595 cpr_scan_segkpm(int mapflag, bitfunc_t bitfunc, struct seg *seg)
 596 {
 597         struct cpr_walkinfo cwinfo;
 598 
 599         if (kpm_enable == 0)
 600                 return (0);
 601 
 602         bzero(&cwinfo, sizeof (cwinfo));
 603         cwinfo.mapflag = mapflag;
 604         cwinfo.bitfunc = bitfunc;
 605         hat_kpm_walk(cpr_walk_kpm, &cwinfo);
 606 
 607         if (cpr_debug & CPR_DEBUG7) {
 608                 prom_printf("walked %d sub-ranges, total pages %ld\n",
 609                     cwinfo.ranges, mmu_btop(cwinfo.size));
 610                 cpr_show_range(segkpm->s_base, segkpm->s_size,
 611                     mapflag, bitfunc, cwinfo.pages);
 612         }
 613 
 614         return (cwinfo.pages);
 615 }
 616 
 617 
 618 /*
 619  * Sparsely filled kernel segments are registered in kseg_table for
 620  * easier lookup. See also block comment for cpr_count_seg_pages.
 621  */
 622 
 623 #define KSEG_SEG_ADDR   0       /* address of struct seg */
 624 #define KSEG_PTR_ADDR   1       /* address of pointer to struct seg */
 625 
 626 typedef struct {
 627         struct seg **st_seg;            /* segment pointer or segment address */
 628         pgcnt_t (*st_fcn)(int, bitfunc_t, struct seg *); /* function to call */
 629         int     st_addrtype;            /* address type in st_seg */
 630 } ksegtbl_entry_t;
 631 
 632 ksegtbl_entry_t kseg_table[] = {
 633         {(struct seg **)&kvseg,             cpr_scan_kvseg,         KSEG_SEG_ADDR},
 634         {&segkpm,                   cpr_scan_segkpm,        KSEG_PTR_ADDR},
 635         {NULL,                          0,                      0}
 636 };
 637 
 638 
 639 /*
 640  * Compare seg with each entry in kseg_table; when there is a match
 641  * return the entry pointer, otherwise return NULL.
 642  */
 643 static ksegtbl_entry_t *
 644 cpr_sparse_seg_check(struct seg *seg)
 645 {
 646         ksegtbl_entry_t *ste = &kseg_table[0];
 647         struct seg *tseg;
 648 
 649         for (; ste->st_seg; ste++) {
 650                 tseg = (ste->st_addrtype == KSEG_PTR_ADDR) ?
 651                     *ste->st_seg : (struct seg *)ste->st_seg;
 652 
 653                 if (seg == tseg)
 654                         return (ste);
 655         }
 656 
 657         return ((ksegtbl_entry_t *)NULL);
 658 }
 659 
 660 
 661 /*
 662  * Count pages within each kernel segment; call cpr_sparse_seg_check()
 663  * to find out whether a sparsely filled segment needs special
 664  * treatment (e.g. kvseg).
 665  * Todo: A "SEGOP_CPR" like SEGOP_DUMP should be introduced, the cpr
 666  *       module shouldn't need to know segment details like if it is
 667  *       sparsely filled or not (makes kseg_table obsolete).
 668  */
 669 pgcnt_t
 670 cpr_count_seg_pages(int mapflag, bitfunc_t bitfunc)
 671 {
 672         struct seg *segp;
 673         pgcnt_t pages;
 674         ksegtbl_entry_t *ste;
 675 
 676         pages = 0;
 677         for (segp = AS_SEGFIRST(&kas); segp; segp = AS_SEGNEXT(&kas, segp)) {
 678                 if (ste = cpr_sparse_seg_check(segp)) {
 679                         pages += (ste->st_fcn)(mapflag, bitfunc, segp);
 680                 } else {
 681                         pages += cpr_count_pages(segp->s_base,
 682                             segp->s_size, mapflag, bitfunc, DBG_SHOWRANGE);
 683                 }
 684         }
 685 
 686         return (pages);
 687 }
 688 
 689 
 690 /*
 691  * count kernel pages within kas and any special ranges
 692  */
 693 pgcnt_t
 694 cpr_count_kpages(int mapflag, bitfunc_t bitfunc)
 695 {
 696         pgcnt_t kas_cnt;
 697 
 698         /*
 699          * Some pages need to be taken care of differently.
 700          * eg: panicbuf pages of sun4m are not in kas but they need
 701          * to be saved.  On sun4u, the physical pages of panicbuf are
 702          * allocated via prom_retain().
 703          */
 704         kas_cnt = i_cpr_count_special_kpages(mapflag, bitfunc);
 705         kas_cnt += cpr_count_seg_pages(mapflag, bitfunc);
 706 
 707         CPR_DEBUG(CPR_DEBUG9, "cpr_count_kpages: kas_cnt=%ld\n", kas_cnt);
 708         CPR_DEBUG(CPR_DEBUG7, "\ncpr_count_kpages: %ld pages, 0x%lx bytes\n",
 709             kas_cnt, mmu_ptob(kas_cnt));
 710 
 711         return (kas_cnt);
 712 }
 713 
 714 
 715 /*
 716  * Set a bit corresponding to the arg phys page number;
 717  * returns 0 when the ppn is valid and the corresponding
 718  * map bit was clear, otherwise returns 1.
 719  */
 720 int
 721 cpr_setbit(pfn_t ppn, int mapflag)
 722 {
 723         char *bitmap;
 724         cbd_t *dp;
 725         pfn_t rel;
 726         int clr;
 727 
 728         for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
 729                 if (PPN_IN_RANGE(ppn, dp)) {
 730                         bitmap = DESC_TO_MAP(dp, mapflag);
 731                         rel = ppn - dp->cbd_spfn;
 732                         if ((clr = isclr(bitmap, rel)) != 0)
 733                                 setbit(bitmap, rel);
 734                         return (clr == 0);
 735                 }
 736         }
 737 
 738         return (1);
 739 }
 740 
 741 
 742 /*
 743  * Clear a bit corresponding to the arg phys page number.
 744  */
 745 int
 746 cpr_clrbit(pfn_t ppn, int mapflag)
 747 {
 748         char *bitmap;
 749         cbd_t *dp;
 750         pfn_t rel;
 751         int set;
 752 
 753         for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
 754                 if (PPN_IN_RANGE(ppn, dp)) {
 755                         bitmap = DESC_TO_MAP(dp, mapflag);
 756                         rel = ppn - dp->cbd_spfn;
 757                         if ((set = isset(bitmap, rel)) != 0)
 758                                 clrbit(bitmap, rel);
 759                         return (set == 0);
 760                 }
 761         }
 762 
 763         return (1);
 764 }
 765 
 766 
 767 /* ARGSUSED */
 768 int
 769 cpr_nobit(pfn_t ppn, int mapflag)
 770 {
 771         return (0);
 772 }
 773 
 774 
 775 /*
 776  * Lookup a bit corresponding to the arg phys page number.
 777  */
 778 int
 779 cpr_isset(pfn_t ppn, int mapflag)
 780 {
 781         char *bitmap;
 782         cbd_t *dp;
 783         pfn_t rel;
 784 
 785         for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
 786                 if (PPN_IN_RANGE(ppn, dp)) {
 787                         bitmap = DESC_TO_MAP(dp, mapflag);
 788                         rel = ppn - dp->cbd_spfn;
 789                         return (isset(bitmap, rel));
 790                 }
 791         }
 792 
 793         return (0);
 794 }
 795 
 796 
 797 /*
 798  * Go thru all pages and pick up any page not caught during the invalidation
 799  * stage. This is also used to save pages with cow lock or phys page lock held
 800  * (none zero p_lckcnt or p_cowcnt)
 801  */
 802 static  int
 803 cpr_count_upages(int mapflag, bitfunc_t bitfunc)
 804 {
 805         page_t *pp, *page0;
 806         pgcnt_t dcnt = 0, tcnt = 0;
 807         pfn_t pfn;
 808 
 809         page0 = pp = page_first();
 810 
 811         do {
 812                 if (pp->p_vnode == NULL || PP_ISKAS(pp) ||
 813                     PP_ISFREE(pp) && PP_ISAGED(pp))
 814                         continue;
 815 
 816                 pfn = page_pptonum(pp);
 817                 if (pf_is_memory(pfn)) {
 818                         tcnt++;
 819                         if ((*bitfunc)(pfn, mapflag) == 0)
 820                                 dcnt++; /* dirty count */
 821                 }
 822         } while ((pp = page_next(pp)) != page0);
 823 
 824         STAT->cs_upage2statef = dcnt;
 825         CPR_DEBUG(CPR_DEBUG9, "cpr_count_upages: dirty=%ld total=%ld\n",
 826             dcnt, tcnt);
 827         CPR_DEBUG(CPR_DEBUG7, "cpr_count_upages: %ld pages, 0x%lx bytes\n",
 828             dcnt, mmu_ptob(dcnt));
 829         page0 = NULL; /* for Lint */
 830         return (dcnt);
 831 }
 832 
 833 
 834 /*
 835  * try compressing pages based on cflag,
 836  * and for DEBUG kernels, verify uncompressed data checksum;
 837  *
 838  * this routine replaces common code from
 839  * i_cpr_compress_and_save() and cpr_compress_and_write()
 840  */
 841 char *
 842 cpr_compress_pages(cpd_t *dp, pgcnt_t pages, int cflag)
 843 {
 844         size_t nbytes, clen, len;
 845         uint32_t test_sum;
 846         char *datap;
 847 
 848         nbytes = mmu_ptob(pages);
 849 
 850         /*
 851          * set length to the original uncompressed data size;
 852          * always init cpd_flag to zero
 853          */
 854         dp->cpd_length = nbytes;
 855         dp->cpd_flag = 0;
 856 
 857 #ifdef  DEBUG
 858         /*
 859          * Make a copy of the uncompressed data so we can checksum it.
 860          * Compress that copy so the checksum works at the other end
 861          */
 862         cprbcopy(CPR->c_mapping_area, cpr_pagecopy, nbytes);
 863         dp->cpd_usum = checksum32(cpr_pagecopy, nbytes);
 864         dp->cpd_flag |= CPD_USUM;
 865         datap = cpr_pagecopy;
 866 #else
 867         datap = CPR->c_mapping_area;
 868         dp->cpd_usum = 0;
 869 #endif
 870 
 871         /*
 872          * try compressing the raw data to cpr_pagedata;
 873          * if there was a size reduction: record the new length,
 874          * flag the compression, and point to the compressed data.
 875          */
 876         dp->cpd_csum = 0;
 877         if (cflag) {
 878                 clen = compress(datap, cpr_pagedata, nbytes);
 879                 if (clen < nbytes) {
 880                         dp->cpd_flag |= CPD_COMPRESS;
 881                         dp->cpd_length = clen;
 882                         datap = cpr_pagedata;
 883 #ifdef  DEBUG
 884                         dp->cpd_csum = checksum32(datap, clen);
 885                         dp->cpd_flag |= CPD_CSUM;
 886 
 887                         /*
 888                          * decompress the data back to a scratch area
 889                          * and compare the new checksum with the original
 890                          * checksum to verify the compression.
 891                          */
 892                         bzero(cpr_pagecopy, sizeof (cpr_pagecopy));
 893                         len = decompress(datap, cpr_pagecopy,
 894                             clen, sizeof (cpr_pagecopy));
 895                         test_sum = checksum32(cpr_pagecopy, len);
 896                         ASSERT(test_sum == dp->cpd_usum);
 897 #endif
 898                 }
 899         }
 900 
 901         return (datap);
 902 }
 903 
 904 
 905 /*
 906  * 1. Prepare cpr page descriptor and write it to file
 907  * 2. Compress page data and write it out
 908  */
 909 static int
 910 cpr_compress_and_write(vnode_t *vp, uint_t va, pfn_t pfn, pgcnt_t npg)
 911 {
 912         int error = 0;
 913         char *datap;
 914         cpd_t cpd;      /* cpr page descriptor */
 915         extern void i_cpr_mapin(caddr_t, uint_t, pfn_t);
 916         extern void i_cpr_mapout(caddr_t, uint_t);
 917 
 918         i_cpr_mapin(CPR->c_mapping_area, npg, pfn);
 919 
 920         CPR_DEBUG(CPR_DEBUG3, "mapped-in %ld pages, vaddr 0x%p, pfn 0x%lx\n",
 921             npg, (void *)CPR->c_mapping_area, pfn);
 922 
 923         /*
 924          * Fill cpr page descriptor.
 925          */
 926         cpd.cpd_magic = (uint_t)CPR_PAGE_MAGIC;
 927         cpd.cpd_pfn = pfn;
 928         cpd.cpd_pages = npg;
 929 
 930         STAT->cs_dumped_statefsz += mmu_ptob(npg);
 931 
 932         datap = cpr_compress_pages(&cpd, npg, CPR->c_flags & C_COMPRESSING);
 933 
 934         /* Write cpr page descriptor */
 935         error = cpr_write(vp, (caddr_t)&cpd, sizeof (cpd_t));
 936 
 937         /* Write compressed page data */
 938         error = cpr_write(vp, (caddr_t)datap, cpd.cpd_length);
 939 
 940         /*
 941          * Unmap the pages for tlb and vac flushing
 942          */
 943         i_cpr_mapout(CPR->c_mapping_area, npg);
 944 
 945         if (error) {
 946                 CPR_DEBUG(CPR_DEBUG1,
 947                     "cpr_compress_and_write: vp 0x%p va 0x%x ", (void *)vp, va);
 948                 CPR_DEBUG(CPR_DEBUG1, "pfn 0x%lx blk %d err %d\n",
 949                     pfn, cpr_file_bn, error);
 950         } else {
 951                 cpr_regular_pgs_dumped += npg;
 952         }
 953 
 954         return (error);
 955 }
 956 
 957 
/*
 * Buffered write to the statefile.  Data is accumulated in cpr_buf;
 * whenever the buffer fills completely it is flushed to the statefile
 * with a polled VOP_DUMP and the disk-block cursor (cpr_file_bn) is
 * advanced.  A partially filled buffer is left in place for later
 * calls (see also cpr_flush_write).  Returns 0 on success, ENOSPC if
 * the next flush would run past the device/file size, EINVAL if
 * called before disk writes are permitted, or the VOP_DUMP error.
 */
int
cpr_write(vnode_t *vp, caddr_t buffer, size_t size)
{
	caddr_t fromp = buffer;
	size_t bytes, wbytes;
	int error;

	/* on first use, learn the capacity of the statefile target */
	if (cpr_dev_space == 0) {
		if (vp->v_type == VBLK) {
			cpr_dev_space = cpr_get_devsize(vp->v_rdev);
			ASSERT(cpr_dev_space);
		} else
			cpr_dev_space = 1;	/* not used in this case */
	}

	/*
	 * break the write into multiple part if request is large,
	 * calculate count up to buf page boundary, then write it out.
	 * repeat until done.
	 */
	while (size) {
		/* copy as much as fits in the remaining buffer space */
		bytes = MIN(size, cpr_buf_end - cpr_wptr);
		cprbcopy(fromp, cpr_wptr, bytes);
		cpr_wptr += bytes;
		fromp += bytes;
		size -= bytes;
		if (cpr_wptr < cpr_buf_end)
			return (0);	/* buffer not full yet */
		ASSERT(cpr_wptr == cpr_buf_end);

		/* total bytes on disk once this buffer has been written */
		wbytes = dbtob(cpr_file_bn + cpr_buf_blocks);
		if (vp->v_type == VBLK) {
			if (wbytes > cpr_dev_space)
				return (ENOSPC);
		} else {
			if (wbytes > VTOI(vp)->i_size)
				return (ENOSPC);
		}

		CPR_DEBUG(CPR_DEBUG3,
		    "cpr_write: frmp=%p wptr=%p cnt=%lx...",
		    (void *)fromp, (void *)cpr_wptr, bytes);
		/*
		 * cross check, this should not happen!
		 */
		if (cpr_disk_writes_ok == 0) {
			prom_printf("cpr_write: disk write too early!\n");
			return (EINVAL);
		}

		/* flush the full buffer using polled i/o */
		do_polled_io = 1;
		error = VOP_DUMP(vp, cpr_buf, cpr_file_bn, cpr_buf_blocks,
		    NULL);
		do_polled_io = 0;
		CPR_DEBUG(CPR_DEBUG3, "done\n");

		STAT->cs_real_statefsz += cpr_buf_size;

		if (error) {
			cpr_err(CE_WARN, "cpr_write error %d", error);
			return (error);
		}
		cpr_file_bn += cpr_buf_blocks;	/* Increment block count */
		cpr_wptr = cpr_buf;		/* back to top of buffer */
	}
	return (0);
}
1025 
1026 
1027 int
1028 cpr_flush_write(vnode_t *vp)
1029 {
1030         int     nblk;
1031         int     error;
1032 
1033         /*
1034          * Calculate remaining blocks in buffer, rounded up to nearest
1035          * disk block
1036          */
1037         nblk = btod(cpr_wptr - cpr_buf);
1038 
1039         do_polled_io = 1;
1040         error = VOP_DUMP(vp, (caddr_t)cpr_buf, cpr_file_bn, nblk, NULL);
1041         do_polled_io = 0;
1042 
1043         cpr_file_bn += nblk;
1044         if (error)
1045                 CPR_DEBUG(CPR_DEBUG2, "cpr_flush_write: error (%d)\n",
1046                     error);
1047         return (error);
1048 }
1049 
1050 void
1051 cpr_clear_bitmaps(void)
1052 {
1053         cbd_t *dp;
1054 
1055         for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
1056                 bzero((void *)dp->cbd_reg_bitmap,
1057                     (size_t)dp->cbd_size * 2);
1058         }
1059         CPR_DEBUG(CPR_DEBUG7, "\ncleared reg and vlt bitmaps\n");
1060 }
1061 
/*
 * Scan the regular bitmaps for runs of contiguous tagged pages,
 * at most CPR_MAXCONTIG pages per chunk.  Behavior depends on flag:
 *   STORAGE_DESC_ALLOC   - only count chunks; return the chunk count
 *   SAVE_TO_STORAGE      - pass each chunk to i_cpr_compress_and_save()
 *   WRITE_TO_STATEFILE   - write each chunk via cpr_compress_and_write()
 * For the latter two, returns 0 on success or the first error from
 * the per-chunk callback.
 */
int
cpr_contig_pages(vnode_t *vp, int flag)
{
	int chunks = 0, error = 0;
	pgcnt_t i, j, totbit;	/* i: bit position; j: run length */
	pfn_t spfn;		/* first pfn covered by this bitmap */
	cbd_t *dp;
	uint_t	spin_cnt = 0;
	extern	int i_cpr_compress_and_save();

	/* descriptor array is terminated by a zero-size entry */
	for (dp = CPR->c_bmda; dp->cbd_size; dp++) {
		spfn = dp->cbd_spfn;
		totbit = BTOb(dp->cbd_size);
		i = 0; /* Beginning of bitmap */
		j = 0;
		while (i < totbit) {
			/* measure the run of set bits starting at bit i */
			while ((j < CPR_MAXCONTIG) && ((j + i) < totbit)) {
				if (isset((char *)dp->cbd_reg_bitmap, j+i))
					j++;
				else /* not contiguous anymore */
					break;
			}

			if (j) {
				chunks++;
				if (flag == SAVE_TO_STORAGE) {
					error = i_cpr_compress_and_save(
					    chunks, spfn + i, j);
					if (error)
						return (error);
				} else if (flag == WRITE_TO_STATEFILE) {
					error = cpr_compress_and_write(vp, 0,
					    spfn + i, j);
					if (error)
						return (error);
					else {
						/* periodic progress marker */
						spin_cnt++;
						if ((spin_cnt & 0x5F) == 1)
							cpr_spinning_bar();
					}
				}
			}

			/*
			 * Advance past the run; if the run ended on a
			 * clear bit (rather than hitting CPR_MAXCONTIG),
			 * also step over that untagged page.
			 */
			i += j;
			if (j != CPR_MAXCONTIG) {
				/* Stopped on a non-tagged page */
				i++;
			}

			j = 0;
		}
	}

	if (flag == STORAGE_DESC_ALLOC)
		return (chunks);
	else
		return (0);
}
1120 
1121 
1122 void
1123 cpr_show_range(caddr_t vaddr, size_t size,
1124     int mapflag, bitfunc_t bitfunc, pgcnt_t count)
1125 {
1126         char *action, *bname;
1127 
1128         bname = (mapflag == REGULAR_BITMAP) ? "regular" : "volatile";
1129         if (bitfunc == cpr_setbit)
1130                 action = "tag";
1131         else if (bitfunc == cpr_clrbit)
1132                 action = "untag";
1133         else
1134                 action = "none";
1135         prom_printf("range (0x%p, 0x%p), %s bitmap, %s %ld\n",
1136             (void *)vaddr, (void *)(vaddr + size), bname, action, count);
1137 }
1138 
1139 
1140 pgcnt_t
1141 cpr_count_pages(caddr_t sva, size_t size,
1142     int mapflag, bitfunc_t bitfunc, int showrange)
1143 {
1144         caddr_t va, eva;
1145         pfn_t pfn;
1146         pgcnt_t count = 0;
1147 
1148         eva = sva + PAGE_ROUNDUP(size);
1149         for (va = sva; va < eva; va += MMU_PAGESIZE) {
1150                 pfn = va_to_pfn(va);
1151                 if (pfn != PFN_INVALID && pf_is_memory(pfn)) {
1152                         if ((*bitfunc)(pfn, mapflag) == 0)
1153                                 count++;
1154                 }
1155         }
1156 
1157         if ((cpr_debug & CPR_DEBUG7) && showrange == DBG_SHOWRANGE)
1158                 cpr_show_range(sva, size, mapflag, bitfunc, count);
1159 
1160         return (count);
1161 }
1162 
1163 
1164 pgcnt_t
1165 cpr_count_volatile_pages(int mapflag, bitfunc_t bitfunc)
1166 {
1167         pgcnt_t count = 0;
1168 
1169         if (cpr_buf) {
1170                 count += cpr_count_pages(cpr_buf, cpr_buf_size,
1171                     mapflag, bitfunc, DBG_SHOWRANGE);
1172         }
1173         if (cpr_pagedata) {
1174                 count += cpr_count_pages(cpr_pagedata, cpr_pagedata_size,
1175                     mapflag, bitfunc, DBG_SHOWRANGE);
1176         }
1177         count += i_cpr_count_storage_pages(mapflag, bitfunc);
1178 
1179         CPR_DEBUG(CPR_DEBUG7, "cpr_count_vpages: %ld pages, 0x%lx bytes\n",
1180             count, mmu_ptob(count));
1181         return (count);
1182 }
1183 
1184 
1185 static int
1186 cpr_dump_regular_pages(vnode_t *vp)
1187 {
1188         int error;
1189 
1190         cpr_regular_pgs_dumped = 0;
1191         error = cpr_contig_pages(vp, WRITE_TO_STATEFILE);
1192         if (!error)
1193                 CPR_DEBUG(CPR_DEBUG7, "cpr_dump_regular_pages() done.\n");
1194         return (error);
1195 }
1196 #endif