// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Garbage collector: type and heap bitmaps.
//
// Stack, data, and bss bitmaps
//
// Stack frames and global variables in the data and bss sections are described
// by 1-bit bitmaps in which 0 means uninteresting and 1 means live pointer
// to be visited during GC. The bits in each byte are consumed starting with
// the low bit: 1<<0, 1<<1, and so on.
//
// Heap bitmap
//
// The allocated heap comes from a subset of the memory in the range [start, used),
// where start == mheap_.arena_start and used == mheap_.arena_used.
// The heap bitmap comprises 2 bits for each pointer-sized word in that range,
// stored in bytes indexed backward in memory from start.
// That is, the byte at address start-1 holds the 2-bit entries for the four words
// start through start+3*ptrSize, the byte at start-2 holds the entries for
// start+4*ptrSize through start+7*ptrSize, and so on.
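//
// For example, on a 64-bit system the word holding address a has index
// i = (a - start) / 8; its 2-bit entry is in the byte at address
// start - i/4 - 1, at bit-pair position i%4 within that byte
// (cf. heapBitsForAddr below).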
//
// In each 2-bit entry, the lower bit holds the same information as in the 1-bit
// bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
// The meaning of the high bit depends on the position of the word being described
// in its allocated object. In all words *except* the second word, the
// high bit indicates that the object is still being described. In
// these words, if a bit pair with a high bit 0 is encountered, the
// low bit can also be assumed to be 0, and the object description is
// over. This 00 is called the ``dead'' encoding: it signals that the
// rest of the words in the object are uninteresting to the garbage
// collector.
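//
// For example, a 4-word object whose first and third words hold pointers
// and whose remaining words are scalars is encoded, word by word, as
// 11 (scan, pointer), c0 (checkmark bit, scalar), 11 (scan, pointer),
// 00 (dead: neither this nor any later word holds a pointer).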
//
// In the second word, the high bit is the GC ``checkmarked'' bit (see below).
//
// The 2-bit entries are split when written into the byte, so that the top half
// of the byte contains 4 high bits and the bottom half contains 4 low (pointer)
// bits.
// This form allows a copy from the 1-bit to the 4-bit form to keep the
// pointer bits contiguous, instead of having to space them out.
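//
// Concretely, the bitmap byte describing words w, w+1, w+2, w+3 is laid
// out as scan(w+3) scan(w+2) scan(w+1) scan(w) in its high nibble and
// ptr(w+3) ptr(w+2) ptr(w+1) ptr(w) in its low nibble, so the four
// pointer bits can be copied from a 1-bit bitmap without interleaving.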
//
// The code makes use of the fact that the zero value for a heap bitmap
// has no live pointer bit set and is, depending on position, not used,
// not checkmarked, or the dead encoding.
// These properties must be preserved when modifying the encoding.
//
// The bitmap for noscan spans is not maintained. Code must ensure
// that an object is scannable before consulting its bitmap by
// checking either the noscan bit in the span or by consulting its
// type's information.
//
// Checkmarks
//
// In a concurrent garbage collector, one worries about failing to mark
// a live object due to mutations without write barriers or bugs in the
// collector implementation. As a sanity check, the GC has a 'checkmark'
// mode that retraverses the object graph with the world stopped, to make
// sure that everything that should be marked is marked.
// In checkmark mode, in the heap bitmap, the high bit of the 2-bit entry
// for the second word of the object holds the checkmark bit.
// When not in checkmark mode, this bit is set to 1.
//
// The smallest possible allocation is 8 bytes. On a 32-bit machine, that
// means every allocated object has two words, so there is room for the
// checkmark bit. On a 64-bit machine, however, the 8-byte allocation is
// just one word, so the second bit pair is not available for encoding the
// checkmark. However, because non-pointer allocations are combined
// into larger 16-byte (maxTinySize) allocations, a plain 8-byte allocation
// must be a pointer, so the type bit in the first word is not actually needed.
// It is still used in general, except that in checkmark mode the type bit is
// repurposed as the checkmark bit and then reinitialized (to 1) as the type
// bit when finished.
//

package runtime

import (
	"runtime/internal/atomic"
	"runtime/internal/sys"
	"unsafe"
)

const (
	bitPointer = 1 << 0
	bitScan    = 1 << 4

	heapBitsShift   = 1                     // shift offset between successive bitPointer or bitScan entries
	heapBitmapScale = sys.PtrSize * (8 / 2) // number of data bytes described by one heap bitmap byte

	// all scan/pointer bits in a byte
	bitScanAll    = bitScan | bitScan<<heapBitsShift | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
	bitPointerAll = bitPointer | bitPointer<<heapBitsShift | bitPointer<<(2*heapBitsShift) | bitPointer<<(3*heapBitsShift)
)
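
// In binary, bitPointerAll is 0x0f (the low nibble of a bitmap byte) and
// bitScanAll is 0xf0 (the high nibble), matching the split form described
// in the comment at the top of this file.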

// addb returns the byte pointer p+n.
//go:nowritebarrier
//go:nosplit
func addb(p *byte, n uintptr) *byte {
	// Note: wrote out full expression instead of calling add(p, n)
	// to reduce the number of temporaries generated by the
	// compiler for this trivial expression during inlining.
	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + n))
}

// subtractb returns the byte pointer p-n.
// subtractb is typically used when traversing the pointer tables referred to by hbits
// which are arranged in reverse order.
//go:nowritebarrier
//go:nosplit
func subtractb(p *byte, n uintptr) *byte {
	// Note: wrote out full expression instead of calling add(p, -n)
	// to reduce the number of temporaries generated by the
	// compiler for this trivial expression during inlining.
	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - n))
}

// add1 returns the byte pointer p+1.
//go:nowritebarrier
//go:nosplit
func add1(p *byte) *byte {
	// Note: wrote out full expression instead of calling addb(p, 1)
	// to reduce the number of temporaries generated by the
	// compiler for this trivial expression during inlining.
	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + 1))
}

// subtract1 returns the byte pointer p-1.
// subtract1 is typically used when traversing the pointer tables referred to by hbits
// which are arranged in reverse order.
//go:nowritebarrier
//
// nosplit because it is used during write barriers and must not be preempted.
//go:nosplit
func subtract1(p *byte) *byte {
	// Note: wrote out full expression instead of calling subtractb(p, 1)
	// to reduce the number of temporaries generated by the
	// compiler for this trivial expression during inlining.
	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - 1))
}

// mapBits maps any additional bitmap memory needed for the new arena memory.
//
// Don't call this directly. Call mheap.setArenaUsed.
//
//go:nowritebarrier
func (h *mheap) mapBits(arena_used uintptr) {
	// Caller has added extra mappings to the arena.
	// Add extra mappings of bitmap words as needed.
	// We allocate extra bitmap pieces in chunks of bitmapChunk.
	const bitmapChunk = 8192

	n := (arena_used - mheap_.arena_start) / heapBitmapScale
	n = round(n, bitmapChunk)
	n = round(n, physPageSize)
	if h.bitmap_mapped >= n {
		return
	}

	sysMap(unsafe.Pointer(h.bitmap-n), n-h.bitmap_mapped, h.arena_reserved, &memstats.gc_sys)
	h.bitmap_mapped = n
}

// heapBits provides access to the bitmap bits for a single heap word.
// The methods on heapBits take value receivers so that the compiler
// can more easily inline calls to those methods and registerize the
// struct fields independently.
type heapBits struct {
	bitp  *uint8
	shift uint32
}

// markBits provides access to the mark bit for an object in the heap.
// bytep points to the byte holding the mark bit.
// mask is a byte with a single bit set that can be &ed with *bytep
// to see if the bit has been set.
// *m.bytep&m.mask != 0 indicates the mark bit is set.
// index can be used along with span information to generate
// the address of the object in the heap.
// We maintain one set of mark bits for allocation and one for
// marking purposes.
type markBits struct {
	bytep *uint8
	mask  uint8
	index uintptr
}

//go:nosplit
func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
	bytep, mask := s.allocBits.bitp(allocBitIndex)
	return markBits{bytep, mask, allocBitIndex}
}

// refillAllocCache takes 8 bytes of s.allocBits starting at whichByte
// and negates them so that ctz (count trailing zeros) instructions
// can be used. It then places these 8 bytes into the cached 64 bit
// s.allocCache.
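//
// For example, if the byte at whichByte is 0x07 (the first three objects
// of that group allocated, the rest free), the low bits of s.allocCache
// become ...11111000 and Ctz64 reports bit 3 as the first free slot.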
func (s *mspan) refillAllocCache(whichByte uintptr) {
	bytes := (*[8]uint8)(unsafe.Pointer(s.allocBits.bytep(whichByte)))
	aCache := uint64(0)
	aCache |= uint64(bytes[0])
	aCache |= uint64(bytes[1]) << (1 * 8)
	aCache |= uint64(bytes[2]) << (2 * 8)
	aCache |= uint64(bytes[3]) << (3 * 8)
	aCache |= uint64(bytes[4]) << (4 * 8)
	aCache |= uint64(bytes[5]) << (5 * 8)
	aCache |= uint64(bytes[6]) << (6 * 8)
	aCache |= uint64(bytes[7]) << (7 * 8)
	s.allocCache = ^aCache
}

// nextFreeIndex returns the index of the next free object in s at
// or after s.freeindex.
// There are hardware instructions that can be used to make this
// faster if profiling warrants it.
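//
// For example, with s.freeindex == 0 and s.allocCache == ^uint64(0x07)
// (objects 0 through 2 allocated), Ctz64 returns 3, so nextFreeIndex
// returns 3, advances s.freeindex to 4, and shifts s.allocCache right
// by 4 so that its bit 0 again describes the object at s.freeindex.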
func (s *mspan) nextFreeIndex() uintptr {
	sfreeindex := s.freeindex
	snelems := s.nelems
	if sfreeindex == snelems {
		return sfreeindex
	}
	if sfreeindex > snelems {
		throw("s.freeindex > s.nelems")
	}

	aCache := s.allocCache

	bitIndex := sys.Ctz64(aCache)
	for bitIndex == 64 {
		// Move index to start of next cached bits.
		sfreeindex = (sfreeindex + 64) &^ (64 - 1)
		if sfreeindex >= snelems {
			s.freeindex = snelems
			return snelems
		}
		whichByte := sfreeindex / 8
		// Refill s.allocCache with the next 64 alloc bits.
		s.refillAllocCache(whichByte)
		aCache = s.allocCache
		bitIndex = sys.Ctz64(aCache)
		// nothing available in cached bits
		// grab the next 8 bytes and try again.
	}
	result := sfreeindex + uintptr(bitIndex)
	if result >= snelems {
		s.freeindex = snelems
		return snelems
	}

	s.allocCache >>= uint(bitIndex + 1)
	sfreeindex = result + 1

	if sfreeindex%64 == 0 && sfreeindex != snelems {
		// We just incremented s.freeindex so it isn't 0.
		// As each 1 in s.allocCache was encountered and used for allocation
		// it was shifted away. At this point s.allocCache contains all 0s.
		// Refill s.allocCache so that it corresponds
		// to the bits at s.allocBits starting at s.freeindex.
		whichByte := sfreeindex / 8
		s.refillAllocCache(whichByte)
	}
	s.freeindex = sfreeindex
	return result
}

// isFree returns whether the index'th object in s is unallocated.
func (s *mspan) isFree(index uintptr) bool {
	if index < s.freeindex {
		return false
	}
	bytep, mask := s.allocBits.bitp(index)
	return *bytep&mask == 0
}

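// objIndex returns the index within s of the object containing the
// address p. Both branches divide the byte offset p - s.base() by
// s.elemsize: power-of-two size classes use a plain shift, while the
// others divide by multiplication with the span's precomputed
// divMul/divShift constants.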
func (s *mspan) objIndex(p uintptr) uintptr {
	byteOffset := p - s.base()
	if byteOffset == 0 {
		return 0
	}
	if s.baseMask != 0 {
		// s.baseMask is non-0, elemsize is a power of two, so shift by s.divShift
		return byteOffset >> s.divShift
	}
	return uintptr(((uint64(byteOffset) >> s.divShift) * uint64(s.divMul)) >> s.divShift2)
}

func markBitsForAddr(p uintptr) markBits {
	s := spanOf(p)
	objIndex := s.objIndex(p)
	return s.markBitsForIndex(objIndex)
}

func (s *mspan) markBitsForIndex(objIndex uintptr) markBits {
	bytep, mask := s.gcmarkBits.bitp(objIndex)
	return markBits{bytep, mask, objIndex}
}

func (s *mspan) markBitsForBase() markBits {
	return markBits{(*uint8)(s.gcmarkBits), uint8(1), 0}
}

// isMarked reports whether mark bit m is set.
func (m markBits) isMarked() bool {
	return *m.bytep&m.mask != 0
}

// setMarked sets the marked bit in the markbits, atomically. Some compilers
// are not able to inline the atomic.Or8 function, so if it appears as a hot
// spot consider inlining it manually.
func (m markBits) setMarked() {
	// Might be racing with other updates, so use atomic update always.
	// We used to be clever here and use a non-atomic update in certain
	// cases, but it's not worth the risk.
	atomic.Or8(m.bytep, m.mask)
}

// setMarkedNonAtomic sets the marked bit in the markbits, non-atomically.
func (m markBits) setMarkedNonAtomic() {
	*m.bytep |= m.mask
}

// clearMarked clears the marked bit in the markbits, atomically.
func (m markBits) clearMarked() {
	// Might be racing with other updates, so use atomic update always.
	// We used to be clever here and use a non-atomic update in certain
	// cases, but it's not worth the risk.
	atomic.And8(m.bytep, ^m.mask)
}

// markBitsForSpan returns the markBits for the span base address base.
func markBitsForSpan(base uintptr) (mbits markBits) {
	if base < mheap_.arena_start || base >= mheap_.arena_used {
		throw("markBitsForSpan: base out of range")
	}
	mbits = markBitsForAddr(base)
	if mbits.mask != 1 {
		throw("markBitsForSpan: unaligned start")
	}
	return mbits
}

// advance advances the markBits to the next object in the span.
func (m *markBits) advance() {
	if m.mask == 1<<7 {
		m.bytep = (*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(m.bytep)) + 1))
		m.mask = 1
	} else {
		m.mask = m.mask << 1
	}
	m.index++
}

// heapBitsForAddr returns the heapBits for the address addr.
// The caller must have already checked that addr is in the range [mheap_.arena_start, mheap_.arena_used).
//
// nosplit because it is used during write barriers and must not be preempted.
//go:nosplit
func heapBitsForAddr(addr uintptr) heapBits {
	// 2 bits per word, 4 pairs per byte, and a mask is hard coded.
	off := (addr - mheap_.arena_start) / sys.PtrSize
	return heapBits{(*uint8)(unsafe.Pointer(mheap_.bitmap - off/4 - 1)), uint32(off & 3)}
}

// heapBitsForSpan returns the heapBits for the span base address base.
func heapBitsForSpan(base uintptr) (hbits heapBits) {
	if base < mheap_.arena_start || base >= mheap_.arena_used {
		print("runtime: base ", hex(base), " not in range [", hex(mheap_.arena_start), ",", hex(mheap_.arena_used), ")\n")
		throw("heapBitsForSpan: base out of range")
	}
	return heapBitsForAddr(base)
}

// heapBitsForObject returns the base address for the heap object
// containing the address p, the heapBits for base,
// the object's span, and the index of the object in s.
// If p does not point into a heap object, it returns base == 0;
// otherwise it returns the base of the object.
//
// refBase and refOff optionally give the base address of the object
// in which the pointer p was found and the byte offset at which it
// was found. These are used for error reporting.
func heapBitsForObject(p, refBase, refOff uintptr) (base uintptr, hbits heapBits, s *mspan, objIndex uintptr) {
	arenaStart := mheap_.arena_start
	if p < arenaStart || p >= mheap_.arena_used {
		return
	}
	off := p - arenaStart
	idx := off >> _PageShift
	// p points into the heap, but possibly to the middle of an object.
	// Consult the span table to find the block beginning.
	s = mheap_.spans[idx]
	if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse {
		if s == nil || s.state == _MSpanManual {
			// If s is nil, the virtual address has never been part of the heap.
			// This pointer may be to some mmap'd region, so we allow it.
			// Pointers into stacks are also ok, the runtime manages these explicitly.
			return
		}

		// The following ensures that we are rigorous about what data
		// structures hold valid pointers.
		if debug.invalidptr != 0 {
			// Typically this indicates an incorrect use
			// of unsafe or cgo to store a bad pointer in
			// the Go heap. It may also indicate a runtime
			// bug.
			//
			// TODO(austin): We could be more aggressive
			// and detect pointers to unallocated objects
			// in allocated spans.
			printlock()
			print("runtime: pointer ", hex(p))
			if s.state != mSpanInUse {
				print(" to unallocated span")
			} else {
				print(" to unused region of span")
			}
			print(" idx=", hex(idx), " span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", s.state, "\n")
			if refBase != 0 {
				print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n")
				gcDumpObject("object", refBase, refOff)
			}
			getg().m.traceback = 2
			throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)")
		}
		return
	}
	// If this span holds objects of a power of 2 size, just mask off the bits to
	// the interior of the object. Otherwise use the size to get the base.
	if s.baseMask != 0 {
		// optimize for power of 2 sized objects.
		base = s.base()
		base = base + (p-base)&uintptr(s.baseMask)
		objIndex = (base - s.base()) >> s.divShift
		// base = p & s.baseMask is faster for small spans,
		// but doesn't work for large spans.
		// Overall, it's faster to use the more general computation above.
	} else {
		base = s.base()
		if p-base >= s.elemsize {
			// n := (p - base) / s.elemsize, using division by multiplication
			objIndex = uintptr(p-base) >> s.divShift * uintptr(s.divMul) >> s.divShift2
			base += objIndex * s.elemsize
		}
	}
	// Now that we know the actual base, compute heapBits to return to caller.
	hbits = heapBitsForAddr(base)
	return
}

// next returns the heapBits describing the next pointer-sized word in memory.
// That is, if h describes address p, h.next() describes p+ptrSize.
// Note that next does not modify h. The caller must record the result.
//
// nosplit because it is used during write barriers and must not be preempted.
//go:nosplit
func (h heapBits) next() heapBits {
	if h.shift < 3*heapBitsShift {
		return heapBits{h.bitp, h.shift + heapBitsShift}
	}
	return heapBits{subtract1(h.bitp), 0}
}

// forward returns the heapBits describing n pointer-sized words ahead of h in memory.
// That is, if h describes address p, h.forward(n) describes p+n*ptrSize.
// h.forward(1) is equivalent to h.next(), just slower.
// Note that forward does not modify h. The caller must record the result.
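//
// For example, when h.shift is 3*heapBitsShift (the last pair in its
// byte), h.next() moves to pair 0 of the byte at h.bitp-1, because the
// bitmap grows downward in memory; h.forward(n) takes the same walk in
// one step, moving n/4 bytes back and n%4 pairs in, after folding
// h.shift into n.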
func (h heapBits) forward(n uintptr) heapBits {
	n += uintptr(h.shift) / heapBitsShift
	return heapBits{subtractb(h.bitp, n/4), uint32(n%4) * heapBitsShift}
}

// bits returns the heap bits for the current word.
// The caller can test morePointers and isPointer by &-ing with bitScan and bitPointer.
// The result includes in its higher bits the bits for subsequent words
// described by the same bitmap byte.
//
// nosplit because it is used during write barriers and must not be preempted.
//go:nosplit
func (h heapBits) bits() uint32 {
	// The (shift & 31) eliminates a test and conditional branch
	// from the generated code.
	return uint32(*h.bitp) >> (h.shift & 31)
}

// morePointers reports whether the object may contain pointers at or
// beyond the word described by h. If it reports false, this word and
// all remaining words in the object are scalars.
// h must not describe the second word of the object.
func (h heapBits) morePointers() bool {
	return h.bits()&bitScan != 0
}

// isPointer reports whether the heap bits describe a pointer word.
//
// nosplit because it is used during write barriers and must not be preempted.
//go:nosplit
func (h heapBits) isPointer() bool {
	return h.bits()&bitPointer != 0
}

// isCheckmarked reports whether the heap bits have the checkmarked bit set.
// It must be told how large the object at h is, because the encoding of the
// checkmark bit varies by size.
// h must describe the initial word of the object.
func (h heapBits) isCheckmarked(size uintptr) bool {
	if size == sys.PtrSize {
		return (*h.bitp>>h.shift)&bitPointer != 0
	}
	// All multiword objects are 2-word aligned,
	// so we know that the initial word's 2-bit pair
	// and the second word's 2-bit pair are in the
	// same heap bitmap byte, *h.bitp.
	return (*h.bitp>>(heapBitsShift+h.shift))&bitScan != 0
}

// setCheckmarked sets the checkmarked bit.
// It must be told how large the object at h is, because the encoding of the
// checkmark bit varies by size.
// h must describe the initial word of the object.
func (h heapBits) setCheckmarked(size uintptr) {
	if size == sys.PtrSize {
		atomic.Or8(h.bitp, bitPointer<<h.shift)
		return
	}
	atomic.Or8(h.bitp, bitScan<<(heapBitsShift+h.shift))
}

// bulkBarrierPreWrite executes a write barrier
// for every pointer slot in the memory range [src, src+size),
// using pointer/scalar information from [dst, dst+size).
// This executes the write barriers necessary before a memmove.
// src, dst, and size must be pointer-aligned.
// The range [dst, dst+size) must lie within a single object.
// It does not perform the actual writes.
//
// As a special case, src == 0 indicates that this is being used for a
// memclr. bulkBarrierPreWrite will pass 0 for the src of each write
// barrier.
//
// Callers should call bulkBarrierPreWrite immediately before
// calling memmove(dst, src, size). This function is marked nosplit
// to avoid being preempted; the GC must not stop the goroutine
// between the memmove and the execution of the barriers.
// The caller is also responsible for cgo pointer checks if this
// may be writing Go pointers into non-Go memory.
//
// The pointer bitmap is not maintained for allocations containing
// no pointers at all; any caller of bulkBarrierPreWrite must first
// make sure the underlying allocation contains pointers, usually
// by checking typ.kind&kindNoPointers.
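//
// A typical caller pairs the barrier with the move, along these lines
// (roughly what typedmemmove does for a type with pointers):
//
//	bulkBarrierPreWrite(uintptr(dst), uintptr(src), typ.size)
//	memmove(dst, src, typ.size)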
//
//go:nosplit
func bulkBarrierPreWrite(dst, src, size uintptr) {
	if (dst|src|size)&(sys.PtrSize-1) != 0 {
		throw("bulkBarrierPreWrite: unaligned arguments")
	}
	if !writeBarrier.needed {
		return
	}
	if !inheap(dst) {
		gp := getg().m.curg
		if gp != nil && gp.stack.lo <= dst && dst < gp.stack.hi {
			// Destination is our own stack. No need for barriers.
			return
		}

		// If dst is a global, use the data or BSS bitmaps to
		// execute write barriers.
		for _, datap := range activeModules() {
			if datap.data <= dst && dst < datap.edata {
				bulkBarrierBitmap(dst, src, size, dst-datap.data, datap.gcdatamask.bytedata)
				return
			}
		}
		for _, datap := range activeModules() {
			if datap.bss <= dst && dst < datap.ebss {
				bulkBarrierBitmap(dst, src, size, dst-datap.bss, datap.gcbssmask.bytedata)
				return
			}
		}
		return
	}

	buf := &getg().m.p.ptr().wbBuf
	h := heapBitsForAddr(dst)
	if src == 0 {
		for i := uintptr(0); i < size; i += sys.PtrSize {
			if h.isPointer() {
				dstx := (*uintptr)(unsafe.Pointer(dst + i))
				if !buf.putFast(*dstx, 0) {
					wbBufFlush(nil, 0)
				}
			}
			h = h.next()
		}
	} else {
		for i := uintptr(0); i < size; i += sys.PtrSize {
			if h.isPointer() {
				dstx := (*uintptr)(unsafe.Pointer(dst + i))
				srcx := (*uintptr)(unsafe.Pointer(src + i))
				if !buf.putFast(*dstx, *srcx) {
					wbBufFlush(nil, 0)
				}
			}
			h = h.next()
		}
	}
}

// bulkBarrierBitmap executes write barriers for copying from [src,
// src+size) to [dst, dst+size) using a 1-bit pointer bitmap. dst is
// assumed to start maskOffset bytes into the data covered by the
// bitmap in bits (which may not be a multiple of 8).
//
// This is used by bulkBarrierPreWrite for writes to data and BSS.
//
//go:nosplit
func bulkBarrierBitmap(dst, src, size, maskOffset uintptr, bits *uint8) {
	word := maskOffset / sys.PtrSize
	bits = addb(bits, word/8)
	mask := uint8(1) << (word % 8)

	buf := &getg().m.p.ptr().wbBuf
	for i := uintptr(0); i < size; i += sys.PtrSize {
		if mask == 0 {
			bits = addb(bits, 1)
			if *bits == 0 {
				// Skip 8 words.
				i += 7 * sys.PtrSize
				continue
			}
			mask = 1
		}
		if *bits&mask != 0 {
			dstx := (*uintptr)(unsafe.Pointer(dst + i))
			if src == 0 {
				if !buf.putFast(*dstx, 0) {
					wbBufFlush(nil, 0)
				}
			} else {
				srcx := (*uintptr)(unsafe.Pointer(src + i))
				if !buf.putFast(*dstx, *srcx) {
					wbBufFlush(nil, 0)
				}
			}
		}
		mask <<= 1
	}
}

// typeBitsBulkBarrier executes writebarrierptr_prewrite for every
// pointer that would be copied from [src, src+size) to [dst,
// dst+size) by a memmove using the type bitmap to locate those
// pointer slots.
//
// The type typ must correspond exactly to [src, src+size) and [dst, dst+size).
// dst, src, and size must be pointer-aligned.
// The type typ must have a plain bitmap, not a GC program.
// The only use of this function is in channel sends, and the
// 64 kB channel element limit takes care of this for us.
//
// Must not be preempted because it typically runs right before memmove,
// and the GC must observe them as an atomic action.
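//
// The channel send path uses it along these lines (a rough sketch of
// sendDirect):
//
//	typeBitsBulkBarrier(t, uintptr(dst), uintptr(src), t.size)
//	memmove(dst, src, t.size)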
//
//go:nosplit
func typeBitsBulkBarrier(typ *_type, dst, src, size uintptr) {
	if typ == nil {
		throw("runtime: typeBitsBulkBarrier without type")
	}
	if typ.size != size {
		println("runtime: typeBitsBulkBarrier with type ", typ.string(), " of size ", typ.size, " but memory size", size)
		throw("runtime: invalid typeBitsBulkBarrier")
	}
	if typ.kind&kindGCProg != 0 {
		println("runtime: typeBitsBulkBarrier with type ", typ.string(), " with GC prog")
		throw("runtime: invalid typeBitsBulkBarrier")
	}
	if !writeBarrier.needed {
		return
	}
	ptrmask := typ.gcdata
	var bits uint32
	for i := uintptr(0); i < typ.ptrdata; i += sys.PtrSize {
		if i&(sys.PtrSize*8-1) == 0 {
			bits = uint32(*ptrmask)
			ptrmask = addb(ptrmask, 1)
		} else {
			bits = bits >> 1
		}
		if bits&1 != 0 {
			dstx := (*uintptr)(unsafe.Pointer(dst + i))
			srcx := (*uintptr)(unsafe.Pointer(src + i))
			writebarrierptr_prewrite(dstx, *srcx)
		}
	}
}

// The methods operating on spans all require that h has been returned
// by heapBitsForSpan and that size, n, total are the span layout description
// returned by the mspan's layout method.
// If total > size*n, it means that there is extra leftover memory in the span,
// usually due to rounding.
//
// TODO(rsc): Perhaps introduce a different heapBitsSpan type.

// initSpan initializes the heap bitmap for a span.
// It clears all checkmark bits.
// If this is a span of pointer-sized objects, it initializes all
// words to pointer/scan.
// Otherwise, it initializes all words to scalar/dead.
func (h heapBits) initSpan(s *mspan) {
	size, n, total := s.layout()

	// Init the markbit structures
	s.freeindex = 0
	s.allocCache = ^uint64(0) // all 1s indicating all free.
	s.nelems = n
	s.allocBits = nil
	s.gcmarkBits = nil
	s.gcmarkBits = newMarkBits(s.nelems)
	s.allocBits = newAllocBits(s.nelems)

	// Clear bits corresponding to objects.
	if total%heapBitmapScale != 0 {
		throw("initSpan: unaligned length")
	}
	nbyte := total / heapBitmapScale
	if sys.PtrSize == 8 && size == sys.PtrSize {
		end := h.bitp
		bitp := subtractb(end, nbyte-1)
		for {
			*bitp = bitPointerAll | bitScanAll
			if bitp == end {
				break
			}
			bitp = add1(bitp)
		}
		return
	}
	memclrNoHeapPointers(unsafe.Pointer(subtractb(h.bitp, nbyte-1)), nbyte)
}

// initCheckmarkSpan initializes a span for being checkmarked.
// It clears the checkmark bits, which are set to 1 in normal operation.
func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
	// On 32-bit, sys.PtrSize == 8 is a compile-time constant false,
	// so the compiler eliminates this code entirely.
	if sys.PtrSize == 8 && size == sys.PtrSize {
		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
		// Only possible on 64-bit system, since minimum size is 8.
		// Must clear type bit (checkmark bit) of every word.
		// The type bit is the lower of every two-bit pair.
		bitp := h.bitp
		for i := uintptr(0); i < n; i += 4 {
			*bitp &^= bitPointerAll
			bitp = subtract1(bitp)
		}
		return
	}
	for i := uintptr(0); i < n; i++ {
		*h.bitp &^= bitScan << (heapBitsShift + h.shift)
		h = h.forward(size / sys.PtrSize)
	}
}

// clearCheckmarkSpan undoes all the checkmarking in a span.
// The actual checkmark bits are ignored, so the only work to do
// is to fix the pointer bits. (Pointer bits are ignored by scanobject
// but consulted by typedmemmove.)
func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
	// On 32-bit, sys.PtrSize == 8 is a compile-time constant false,
	// so the compiler eliminates this code entirely.
	if sys.PtrSize == 8 && size == sys.PtrSize {
		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
		// Only possible on 64-bit system, since minimum size is 8.
		// Must clear type bit (checkmark bit) of every word.
		// The type bit is the lower of every two-bit pair.
		bitp := h.bitp
		for i := uintptr(0); i < n; i += 4 {
			*bitp |= bitPointerAll
			bitp = subtract1(bitp)
		}
	}
}

// oneBitCount is indexed by byte and produces the
// number of 1 bits in that byte. For example 128 has 1 bit set
// and oneBitCount[128] holds 1.
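// The table is a byte-wide population count: oneBitCount[b] equals
// bits.OnesCount8(b); for instance oneBitCount[0x0b] is 3, since
// 0x0b == 00001011 in binary.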
var oneBitCount = [256]uint8{
	0, 1, 1, 2, 1, 2, 2, 3,
	1, 2, 2, 3, 2, 3, 3, 4,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	1, 2, 2, 3, 2, 3, 3, 4,
	2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	2, 3, 3, 4, 3, 4, 4, 5,
	3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	3, 4, 4, 5, 4, 5, 5, 6,
	4, 5, 5, 6, 5, 6, 6, 7,
	4, 5, 5, 6, 5, 6, 6, 7,
	5, 6, 6, 7, 6, 7, 7, 8}

// countAlloc returns the number of objects allocated in span s by
// scanning the mark bitmap.
// TODO(rlh): Use popcount intrinsic.
func (s *mspan) countAlloc() int {
	count := 0
	maxIndex := s.nelems / 8
	for i := uintptr(0); i < maxIndex; i++ {
		mrkBits := *s.gcmarkBits.bytep(i)
		count += int(oneBitCount[mrkBits])
	}
	if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 {
		mrkBits := *s.gcmarkBits.bytep(maxIndex)
		mask := uint8((1 << bitsInLastByte) - 1)
		bits := mrkBits & mask
		count += int(oneBitCount[bits])
	}
	return count
}

// heapBitsSetType records that the new allocation [x, x+size)
// holds in [x, x+dataSize) one or more values of type typ.
// (The number of values is given by dataSize / typ.size.)
// If dataSize < size, the fragment [x+dataSize, x+size) is
// recorded as non-pointer data.
// It is known that the type has pointers somewhere;
// malloc does not call heapBitsSetType when there are no pointers,
// because all free objects are marked as noscan during
// heapBitsSweepSpan.
//
// There can only be one allocation from a given span active at a time,
// and the bitmap for a span always falls on byte boundaries,
// so there are no write-write races for access to the heap bitmap.
// Hence, heapBitsSetType can access the bitmap without atomics.
//
// There can be read-write races between heapBitsSetType and things
// that read the heap bitmap like scanobject. However, since
// heapBitsSetType is only used for objects that have not yet been
// made reachable, readers will ignore bits being modified by this
// function. This does mean this function cannot transiently modify
// bits that belong to neighboring objects. Also, on weakly-ordered
// machines, callers must execute a store/store (publication) barrier
// between calling this function and making the object reachable.
func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
	const doubleCheck = false // slow but helpful; enable to test modifications to this code

	// size is always dataSize rounded up to the next malloc size class,
	// except in the case of allocating a defer block, in which case
	// typ.size is sizeof(_defer{}) (at least 6 words) and dataSize may be
	// arbitrarily larger.
	//
	// The checks for size == sys.PtrSize and size == 2*sys.PtrSize can therefore
	// assume that dataSize == size without checking it explicitly.

	if sys.PtrSize == 8 && size == sys.PtrSize {
		// It's one word and it has pointers, so it must be a pointer.
		// Since all allocated one-word objects are pointers
		// (non-pointers are aggregated into tinySize allocations),
		// initSpan sets the pointer bits for us. Nothing to do here.
		if doubleCheck {
			h := heapBitsForAddr(x)
			if !h.isPointer() {
				throw("heapBitsSetType: pointer bit missing")
			}
			if !h.morePointers() {
				throw("heapBitsSetType: scan bit missing")
			}
		}
		return
	}

	h := heapBitsForAddr(x)
	ptrmask := typ.gcdata // start of 1-bit pointer mask (or GC program, handled below)

	// Heap bitmap bits for 2-word object are only 4 bits,
	// so also shared with objects next to it.
	// This is called out as a special case primarily for 32-bit systems,
	// so that on 32-bit systems the code below can assume all objects
	// are 4-word aligned (because they're all 16-byte aligned).
	if size == 2*sys.PtrSize {
		if typ.size == sys.PtrSize {
			// We're allocating a block big enough to hold two pointers.
			// On 64-bit, that means the actual object must be two pointers,
			// or else we'd have used the one-pointer-sized block.
			// On 32-bit, however, this is the 8-byte block, the smallest one.
			// So it could be that we're allocating one pointer and this was
			// just the smallest block available. Distinguish by checking dataSize.
			// (In general the number of instances of typ being allocated is
			// dataSize/typ.size.)
			if sys.PtrSize == 4 && dataSize == sys.PtrSize {
				// 1 pointer object. On 32-bit machines clear the bit for the
				// unused second word.
				*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
				*h.bitp |= (bitPointer | bitScan) << h.shift
			} else {
				// 2-element slice of pointer.
				*h.bitp |= (bitPointer | bitScan | bitPointer<<heapBitsShift) << h.shift
			}
			return
		}
		// Otherwise typ.size must be 2*sys.PtrSize,
		// and typ.kind&kindGCProg == 0.
		if doubleCheck {
			if typ.size != 2*sys.PtrSize || typ.kind&kindGCProg != 0 {
				print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, " gcprog=", typ.kind&kindGCProg != 0, "\n")
				throw("heapBitsSetType")
			}
		}
		b := uint32(*ptrmask)
		hb := (b & 3) | bitScan
		// bitPointer == 1, bitScan is 1 << 4, heapBitsShift is 1.
		// 110011 is shifted h.shift and complemented.
		// This clears out the bits that are about to be
		// ored into *h.hbitp in the next instructions.
		*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
		*h.bitp |= uint8(hb << h.shift)
		return
	}

	// Copy from 1-bit ptrmask into 2-bit bitmap.
	// The basic approach is to use a single uintptr as a bit buffer,
	// alternating between reloading the buffer and writing bitmap bytes.
	// In general, one load can supply two bitmap byte writes.
	// This is a lot of lines of code, but it compiles into relatively few
	// machine instructions.

	var (
		// Ptrmask input.
		p     *byte   // last ptrmask byte read
		b     uintptr // ptrmask bits already loaded
		nb    uintptr // number of bits in b at next read
		endp  *byte   // final ptrmask byte to read (then repeat)
		endnb uintptr // number of valid bits in *endp
		pbits uintptr // alternate source of bits

		// Heap bitmap output.
		w     uintptr // words processed
		nw    uintptr // number of words to process
		hbitp *byte   // next heap bitmap byte to write
		hb    uintptr // bits being prepared for *hbitp
	)

	hbitp = h.bitp

	// Handle GC program. Delayed until this part of the code
	// so that we can use the same double-checking mechanism
	// as the 1-bit case. Nothing above could have encountered
	// GC programs: the cases were all too small.
	if typ.kind&kindGCProg != 0 {
		heapBitsSetTypeGCProg(h, typ.ptrdata, typ.size, dataSize, size, addb(typ.gcdata, 4))
		if doubleCheck {
			// Double-check the heap bits written by GC program
			// by running the GC program to create a 1-bit pointer mask
			// and then jumping to the double-check code below.
			// This doesn't catch bugs shared between the 1-bit and 4-bit
			// GC program execution, but it does catch mistakes specific
			// to just one of those and bugs in heapBitsSetTypeGCProg's
			// implementation of arrays.
			lock(&debugPtrmask.lock)
			if debugPtrmask.data == nil {
				debugPtrmask.data = (*byte)(persistentalloc(1<<20, 1, &memstats.other_sys))
			}
			ptrmask = debugPtrmask.data
			runGCProg(addb(typ.gcdata, 4), nil, ptrmask, 1)
			goto Phase4
		}
		return
	}

	// Note about sizes:
	//
	// typ.size is the size of the object in bytes,
	// and typ.ptrdata is the size in bytes of the prefix
	// of the object that contains pointers. That is, the final
	// typ.size - typ.ptrdata bytes contain no pointers.
	// This allows optimization of a common pattern where
	// an object has a small header followed by a large scalar
	// buffer. If we know the pointers are over, we don't have
	// to scan the buffer's heap bitmap at all.
	// The 1-bit ptrmasks are sized to contain only bits for
	// the typ.ptrdata prefix, zero padded out to a full byte
	// of bitmap. This code sets nw (below) so that heap bitmap
	// bits are only written for the typ.ptrdata prefix; if there is
	// more room in the allocated object, the next heap bitmap
	// entry is a 00, indicating that there are no more pointers
	// to scan. So only the ptrmask for the ptrdata bytes is needed.
	//
	// Replicated copies are not as nice: if there is an array of
	// objects with scalar tails, all but the last tail does have to
	// be initialized, because there is no way to say "skip forward".
	// However, because of the possibility of a repeated type with
	// size not a multiple of 4 pointers (one heap bitmap byte),
	// the code already must handle the last ptrmask byte specially
	// by treating it as containing only the bits for endnb pointers,
	// where endnb <= 4. We represent large scalar tails that must
	// be expanded in the replication by setting endnb larger than 4.
	// This will have the effect of reading many bits out of b,
	// but once the real bits are shifted out, b will supply as many
	// zero bits as we try to read, which is exactly what we need.
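	//
	// For example, an array of 10 elements whose type occupies 4 words
	// with pointers only in its first word needs heap bitmap entries for
	// just the first 9*4 + 1 = 37 words (see the computation of nw
	// below); everything past the last pointer word can be left as the
	// 00 ("dead") encoding.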

	p = ptrmask
	if typ.size < dataSize {
		// Filling in bits for an array of typ.
		// Set up for repetition of ptrmask during main loop.
		// Note that ptrmask describes only a prefix of the array's memory,
		// namely the pointer-bearing words of a single element.
		const maxBits = sys.PtrSize*8 - 7
		if typ.ptrdata/sys.PtrSize <= maxBits {
			// Entire ptrmask fits in uintptr with room for a byte fragment.
			// Load into pbits and never read from ptrmask again.
			// This is especially important when the ptrmask has
			// fewer than 8 bits in it; otherwise the reload in the middle
			// of the Phase 2 loop would itself need to loop to gather
			// at least 8 bits.

			// Accumulate ptrmask into b.
			// ptrmask is sized to describe only typ.ptrdata, but we record
			// it as describing typ.size bytes, since all the high bits are zero.
			nb = typ.ptrdata / sys.PtrSize
			for i := uintptr(0); i < nb; i += 8 {
				b |= uintptr(*p) << i
				p = add1(p)
			}
			nb = typ.size / sys.PtrSize

			// Replicate ptrmask to fill entire pbits uintptr.
			// Doubling and truncating is fewer steps than
			// iterating by nb each time. (nb could be 1.)
			// Since we loaded typ.ptrdata/sys.PtrSize bits
			// but are pretending to have typ.size/sys.PtrSize,
			// there might be no replication necessary/possible.
			pbits = b
			endnb = nb
			if nb+nb <= maxBits {
				for endnb <= sys.PtrSize*8 {
					pbits |= pbits << endnb
					endnb += endnb
				}
				// Truncate to a multiple of original ptrmask.
				// Because nb+nb <= maxBits, nb fits in a byte.
				// Byte division is cheaper than uintptr division.
				endnb = uintptr(maxBits/byte(nb)) * nb
				pbits &= 1<<endnb - 1
				b = pbits
				nb = endnb
			}

			// Clear p and endp as sentinel for using pbits.
			// Checked during Phase 2 loop.
			p = nil
			endp = nil
		} else {
			// Ptrmask is larger. Read it multiple times.
			n := (typ.ptrdata/sys.PtrSize+7)/8 - 1
			endp = addb(ptrmask, n)
			endnb = typ.size/sys.PtrSize - n*8
		}
	}
	if p != nil {
		b = uintptr(*p)
		p = add1(p)
		nb = 8
	}

	if typ.size == dataSize {
		// Single entry: can stop once we reach the non-pointer data.
		nw = typ.ptrdata / sys.PtrSize
	} else {
		// Repeated instances of typ in an array.
		// Have to process first N-1 entries in full, but can stop
		// once we reach the non-pointer data in the final entry.
		nw = ((dataSize/typ.size-1)*typ.size + typ.ptrdata) / sys.PtrSize
	}
	if nw == 0 {
		// No pointers! Caller was supposed to check.
		println("runtime: invalid type ", typ.string())
		throw("heapBitsSetType: called with non-pointer type")
		return
	}
	if nw < 2 {
		// Must write at least 2 words, because the "no scan"
		// encoding doesn't take effect until the third word.
		nw = 2
	}

	// Phase 1: Special case for leading byte (shift==0) or half-byte (shift==2).
	// The leading byte is special because it contains the bits for word 1,
	// which does not have the scan bit set.
	// The leading half-byte is special because it's half a byte,
	// so we have to be careful with the bits already there.
	switch {
	default:
		throw("heapBitsSetType: unexpected shift")

	case h.shift == 0:
		// Ptrmask and heap bitmap are aligned.
		// Handle first byte of bitmap specially.
		//
		// The first byte we write out covers the first four
		// words of the object. The scan/dead bit on the first
		// word must be set to scan since there are pointers
		// somewhere in the object. The scan/dead bit on the
		// second word is the checkmark, so we don't set it.
		// In all following words, we set the scan/dead bit
		// appropriately to indicate that the object continues
		// to the next 2-bit entry in the bitmap.
		//
		// TODO: It doesn't matter if we set the checkmark, so
		// maybe this case isn't needed any more.
		hb = b & bitPointerAll
		hb |= bitScan | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
		if w += 4; w >= nw {
			goto Phase3
		}
		*hbitp = uint8(hb)
		hbitp = subtract1(hbitp)
		b >>= 4
		nb -= 4

	case sys.PtrSize == 8 && h.shift == 2:
		// Ptrmask and heap bitmap are misaligned.
		// The bits for the first two words are in a byte shared
		// with another object, so we must be careful with the bits
		// already there.
		// We took care of 1-word and 2-word objects above,
		// so this is at least a 6-word object.
		hb = (b & (bitPointer | bitPointer<<heapBitsShift)) << (2 * heapBitsShift)
		// This is not noscan, so set the scan bit in the
		// first word.
		hb |= bitScan << (2 * heapBitsShift)
		b >>= 2
		nb -= 2
		// Note: no bitScan for second word because that's
		// the checkmark.
		*hbitp &^= uint8((bitPointer | bitScan | (bitPointer << heapBitsShift)) << (2 * heapBitsShift))
		*hbitp |= uint8(hb)
		hbitp = subtract1(hbitp)
		if w += 2; w >= nw {
			// We know that there is more data, because we handled 2-word objects above.
			// This must be at least a 6-word object. If we're out of pointer words,
			// mark no scan in next bitmap byte and finish.
			hb = 0
			w += 4
			goto Phase3
		}
	}

	// Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap.
	// The loop computes the bits for that last write but does not execute the write;
	// it leaves the bits in hb for processing by phase 3.
	// To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to
	// use in the first half of the loop right now, and then we only adjust nb explicitly
	// if the 8 bits used by each iteration isn't balanced by 8 bits loaded mid-loop.
	nb -= 4
	for {
		// Emit bitmap byte.
		// b has at least nb+4 bits, with one exception:
		// if w+4 >= nw, then b has only nw-w bits,
		// but we'll stop at the break and then truncate
		// appropriately in Phase 3.
		hb = b & bitPointerAll
		hb |= bitScanAll
		if w += 4; w >= nw {
			break
		}
		*hbitp = uint8(hb)
		hbitp = subtract1(hbitp)
		b >>= 4

		// Load more bits. b has nb right now.
		if p != endp {
			// Fast path: keep reading from ptrmask.
			// nb unmodified: we just loaded 8 bits,
			// and the next iteration will consume 8 bits,
			// leaving us with the same nb the next time we're here.
			if nb < 8 {
				b |= uintptr(*p) << nb
				p = add1(p)
			} else {
				// Reduce the number of bits in b.
				// This is important if we skipped
				// over a scalar tail, since nb could
				// be larger than the bit width of b.
				nb -= 8
			}
		} else if p == nil {
			// Almost as fast path: track bit count and refill from pbits.
			// For short repetitions.
			if nb < 8 {
				b |= pbits << nb
				nb += endnb
			}
			nb -= 8 // for next iteration
		} else {
			// Slow path: reached end of ptrmask.
			// Process final partial byte and rewind to start.
			b |= uintptr(*p) << nb
			nb += endnb
			if nb < 8 {
				b |= uintptr(*ptrmask) << nb
				p = add1(ptrmask)
			} else {
				nb -= 8
				p = ptrmask
			}
		}

		// Emit bitmap byte.
		hb = b & bitPointerAll
		hb |= bitScanAll
		if w += 4; w >= nw {
			break
		}
		*hbitp = uint8(hb)
		hbitp = subtract1(hbitp)
		b >>= 4
	}

Phase3:
	// Phase 3: Write last byte or partial byte and zero the rest of the bitmap entries.
	if w > nw {
		// Counting the 4 entries in hb not yet written to memory,
		// there are more entries than possible pointer slots.
		// Discard the excess entries (can't be more than 3).
		mask := uintptr(1)<<(4-(w-nw)) - 1
		hb &= mask | mask<<4 // apply mask to both pointer bits and scan bits
	}

	// Change nw from counting possibly-pointer words to total words in allocation.
	nw = size / sys.PtrSize

	// Write whole bitmap bytes.
	// The first is hb, the rest are zero.
	if w <= nw {
		*hbitp = uint8(hb)
		hbitp = subtract1(hbitp)
		hb = 0 // for possible final half-byte below
		for w += 4; w <= nw; w += 4 {
			*hbitp = 0
			hbitp = subtract1(hbitp)
		}
	}

	// Write final partial bitmap byte if any.
	// We know w > nw, or else we'd still be in the loop above.
	// It can be bigger only due to the 4 entries in hb that it counts.
	// If w == nw+4 then there's nothing left to do: we wrote all nw entries
	// and can discard the 4 sitting in hb.
	// But if w == nw+2, we need to write the first two entries in hb.
	// The byte is shared with the next object, so be careful with
	// existing bits.
	if w == nw+2 {
		*hbitp = *hbitp&^(bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift) | uint8(hb)
	}
   1275 Phase4:
   1276 	// Phase 4: all done, but perhaps double check.
   1277 	if doubleCheck {
   1278 		end := heapBitsForAddr(x + size)
   1279 		if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
   1280 			println("ended at wrong bitmap byte for", typ.string(), "x", dataSize/typ.size)
   1281 			print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
   1282 			print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
   1283 			h0 := heapBitsForAddr(x)
   1284 			print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
   1285 			print("ended at hbitp=", hbitp, " but next starts at bitp=", end.bitp, " shift=", end.shift, "\n")
   1286 			throw("bad heapBitsSetType")
   1287 		}
   1288 
   1289 		// Double-check that bits to be written were written correctly.
   1290 		// Does not check that other bits were not written, unfortunately.
   1291 		h := heapBitsForAddr(x)
   1292 		nptr := typ.ptrdata / sys.PtrSize
   1293 		ndata := typ.size / sys.PtrSize
   1294 		count := dataSize / typ.size
   1295 		totalptr := ((count-1)*typ.size + typ.ptrdata) / sys.PtrSize
   1296 		for i := uintptr(0); i < size/sys.PtrSize; i++ {
   1297 			j := i % ndata
   1298 			var have, want uint8
   1299 			have = (*h.bitp >> h.shift) & (bitPointer | bitScan)
   1300 			if i >= totalptr {
    1301 				want = 0 // dead encoding
   1302 				if typ.kind&kindGCProg != 0 && i < (totalptr+3)/4*4 {
   1303 					want = bitScan
   1304 				}
   1305 			} else {
   1306 				if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
   1307 					want |= bitPointer
   1308 				}
   1309 				if i != 1 {
   1310 					want |= bitScan
   1311 				} else {
   1312 					have &^= bitScan
   1313 				}
   1314 			}
   1315 			if have != want {
   1316 				println("mismatch writing bits for", typ.string(), "x", dataSize/typ.size)
   1317 				print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
   1318 				print("kindGCProg=", typ.kind&kindGCProg != 0, "\n")
   1319 				print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
   1320 				h0 := heapBitsForAddr(x)
   1321 				print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
   1322 				print("current bits h.bitp=", h.bitp, " h.shift=", h.shift, " *h.bitp=", hex(*h.bitp), "\n")
   1323 				print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
   1324 				println("at word", i, "offset", i*sys.PtrSize, "have", have, "want", want)
   1325 				if typ.kind&kindGCProg != 0 {
   1326 					println("GC program:")
   1327 					dumpGCProg(addb(typ.gcdata, 4))
   1328 				}
   1329 				throw("bad heapBitsSetType")
   1330 			}
   1331 			h = h.next()
   1332 		}
   1333 		if ptrmask == debugPtrmask.data {
   1334 			unlock(&debugPtrmask.lock)
   1335 		}
   1336 	}
   1337 }
   1338 
   1339 var debugPtrmask struct {
   1340 	lock mutex
   1341 	data *byte
   1342 }
   1343 
   1344 // heapBitsSetTypeGCProg implements heapBitsSetType using a GC program.
   1345 // progSize is the size of the memory described by the program.
    1346 // elemSize is the size of the element whose initial progSize bytes the GC program describes.
   1347 // dataSize is the total size of the intended data, a multiple of elemSize.
   1348 // allocSize is the total size of the allocated memory.
   1349 //
   1350 // GC programs are only used for large allocations.
   1351 // heapBitsSetType requires that allocSize is a multiple of 4 words,
   1352 // so that the relevant bitmap bytes are not shared with surrounding
   1353 // objects.
   1354 func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize uintptr, prog *byte) {
   1355 	if sys.PtrSize == 8 && allocSize%(4*sys.PtrSize) != 0 {
   1356 		// Alignment will be wrong.
   1357 		throw("heapBitsSetTypeGCProg: small allocation")
   1358 	}
   1359 	var totalBits uintptr
   1360 	if elemSize == dataSize {
   1361 		totalBits = runGCProg(prog, nil, h.bitp, 2)
   1362 		if totalBits*sys.PtrSize != progSize {
   1363 			println("runtime: heapBitsSetTypeGCProg: total bits", totalBits, "but progSize", progSize)
   1364 			throw("heapBitsSetTypeGCProg: unexpected bit count")
   1365 		}
   1366 	} else {
   1367 		count := dataSize / elemSize
   1368 
   1369 		// Piece together program trailer to run after prog that does:
   1370 		//	literal(0)
   1371 		//	repeat(1, elemSize-progSize-1) // zeros to fill element size
   1372 		//	repeat(elemSize, count-1) // repeat that element for count
   1373 		// This zero-pads the data remaining in the first element and then
   1374 		// repeats that first element to fill the array.
   1375 		var trailer [40]byte // 3 varints (max 10 each) + some bytes
   1376 		i := 0
   1377 		if n := elemSize/sys.PtrSize - progSize/sys.PtrSize; n > 0 {
   1378 			// literal(0)
   1379 			trailer[i] = 0x01
   1380 			i++
   1381 			trailer[i] = 0
   1382 			i++
   1383 			if n > 1 {
   1384 				// repeat(1, n-1)
   1385 				trailer[i] = 0x81
   1386 				i++
   1387 				n--
   1388 				for ; n >= 0x80; n >>= 7 {
   1389 					trailer[i] = byte(n | 0x80)
   1390 					i++
   1391 				}
   1392 				trailer[i] = byte(n)
   1393 				i++
   1394 			}
   1395 		}
   1396 		// repeat(elemSize/ptrSize, count-1)
   1397 		trailer[i] = 0x80
   1398 		i++
   1399 		n := elemSize / sys.PtrSize
   1400 		for ; n >= 0x80; n >>= 7 {
   1401 			trailer[i] = byte(n | 0x80)
   1402 			i++
   1403 		}
   1404 		trailer[i] = byte(n)
   1405 		i++
   1406 		n = count - 1
   1407 		for ; n >= 0x80; n >>= 7 {
   1408 			trailer[i] = byte(n | 0x80)
   1409 			i++
   1410 		}
   1411 		trailer[i] = byte(n)
   1412 		i++
   1413 		trailer[i] = 0
   1414 		i++
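         		// For illustration (hypothetical sizes, 8-byte words): with
         		// elemSize == 48 (6 words), progSize == 16 (2 words), and count == 4,
         		// the trailer assembled above would be
         		//	01 00 81 03 80 06 03 00
         		// that is, literal(0), repeat(1, 3) to zero-fill the element,
         		// repeat(6, 3) to replicate it, then stop.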
   1415 
   1416 		runGCProg(prog, &trailer[0], h.bitp, 2)
   1417 
   1418 		// Even though we filled in the full array just now,
   1419 		// record that we only filled in up to the ptrdata of the
   1420 		// last element. This will cause the code below to
   1421 		// memclr the dead section of the final array element,
   1422 		// so that scanobject can stop early in the final element.
   1423 		totalBits = (elemSize*(count-1) + progSize) / sys.PtrSize
   1424 	}
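         	// The memclr below zeroes the bitmap bytes for the words past totalBits,
         	// up to the end of the allocation, so the unused tail reads as the dead
         	// (00) encoding.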
   1425 	endProg := unsafe.Pointer(subtractb(h.bitp, (totalBits+3)/4))
   1426 	endAlloc := unsafe.Pointer(subtractb(h.bitp, allocSize/heapBitmapScale))
   1427 	memclrNoHeapPointers(add(endAlloc, 1), uintptr(endProg)-uintptr(endAlloc))
   1428 }
   1429 
   1430 // progToPointerMask returns the 1-bit pointer mask output by the GC program prog.
    1431 // size is the size of the region described by prog, in bytes.
   1432 // The resulting bitvector will have no more than size/sys.PtrSize bits.
   1433 func progToPointerMask(prog *byte, size uintptr) bitvector {
   1434 	n := (size/sys.PtrSize + 7) / 8
   1435 	x := (*[1 << 30]byte)(persistentalloc(n+1, 1, &memstats.buckhash_sys))[:n+1]
   1436 	x[len(x)-1] = 0xa1 // overflow check sentinel
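         	// A correct program writes at most size/sys.PtrSize bits, which fit in
         	// the first n bytes; a runaway write would clobber the sentinel and be
         	// caught by the check below.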
   1437 	n = runGCProg(prog, nil, &x[0], 1)
   1438 	if x[len(x)-1] != 0xa1 {
   1439 		throw("progToPointerMask: overflow")
   1440 	}
   1441 	return bitvector{int32(n), &x[0]}
   1442 }
   1443 
   1444 // Packed GC pointer bitmaps, aka GC programs.
   1445 //
   1446 // For large types containing arrays, the type information has a
   1447 // natural repetition that can be encoded to save space in the
   1448 // binary and in the memory representation of the type information.
   1449 //
   1450 // The encoding is a simple Lempel-Ziv style bytecode machine
   1451 // with the following instructions:
   1452 //
   1453 //	00000000: stop
   1454 //	0nnnnnnn: emit n bits copied from the next (n+7)/8 bytes
   1455 //	10000000 n c: repeat the previous n bits c times; n, c are varints
   1456 //	1nnnnnnn c: repeat the previous n bits c times; c is a varint
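         //
         // For illustration, the program 02 03 00 emits two bits from the byte 0x03
         // (consumed low bit first: 1, then 1) and stops, describing two consecutive
         // pointer words. The program 01 01 81 07 00 emits one pointer bit and then
         // repeats that previous 1 bit 7 times: eight consecutive pointer words.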
   1457 
   1458 // runGCProg executes the GC program prog, and then trailer if non-nil,
   1459 // writing to dst with entries of the given size.
   1460 // If size == 1, dst is a 1-bit pointer mask laid out moving forward from dst.
    1461 // If size == 2, dst is the 2-bit heap bitmap, and writes move backward
    1462 // from dst, mirroring the heap bitmap's backward layout. In this case, the caller guarantees
   1463 // that only whole bytes in dst need to be written.
   1464 //
   1465 // runGCProg returns the number of 1- or 2-bit entries written to memory.
   1466 func runGCProg(prog, trailer, dst *byte, size int) uintptr {
   1467 	dstStart := dst
   1468 
   1469 	// Bits waiting to be written to memory.
   1470 	var bits uintptr
   1471 	var nbits uintptr
   1472 
   1473 	p := prog
   1474 Run:
   1475 	for {
   1476 		// Flush accumulated full bytes.
   1477 		// The rest of the loop assumes that nbits <= 7.
   1478 		for ; nbits >= 8; nbits -= 8 {
   1479 			if size == 1 {
   1480 				*dst = uint8(bits)
   1481 				dst = add1(dst)
   1482 				bits >>= 8
   1483 			} else {
   1484 				v := bits&bitPointerAll | bitScanAll
   1485 				*dst = uint8(v)
   1486 				dst = subtract1(dst)
   1487 				bits >>= 4
   1488 				v = bits&bitPointerAll | bitScanAll
   1489 				*dst = uint8(v)
   1490 				dst = subtract1(dst)
   1491 				bits >>= 4
   1492 			}
   1493 		}
   1494 
   1495 		// Process one instruction.
   1496 		inst := uintptr(*p)
   1497 		p = add1(p)
   1498 		n := inst & 0x7F
   1499 		if inst&0x80 == 0 {
   1500 			// Literal bits; n == 0 means end of program.
   1501 			if n == 0 {
   1502 				// Program is over; continue in trailer if present.
   1503 				if trailer != nil {
   1504 					//println("trailer")
   1505 					p = trailer
   1506 					trailer = nil
   1507 					continue
   1508 				}
   1509 				//println("done")
   1510 				break Run
   1511 			}
   1512 			//println("lit", n, dst)
   1513 			nbyte := n / 8
   1514 			for i := uintptr(0); i < nbyte; i++ {
   1515 				bits |= uintptr(*p) << nbits
   1516 				p = add1(p)
   1517 				if size == 1 {
   1518 					*dst = uint8(bits)
   1519 					dst = add1(dst)
   1520 					bits >>= 8
   1521 				} else {
   1522 					v := bits&0xf | bitScanAll
   1523 					*dst = uint8(v)
   1524 					dst = subtract1(dst)
   1525 					bits >>= 4
   1526 					v = bits&0xf | bitScanAll
   1527 					*dst = uint8(v)
   1528 					dst = subtract1(dst)
   1529 					bits >>= 4
   1530 				}
   1531 			}
   1532 			if n %= 8; n > 0 {
   1533 				bits |= uintptr(*p) << nbits
   1534 				p = add1(p)
   1535 				nbits += n
   1536 			}
   1537 			continue Run
   1538 		}
   1539 
   1540 		// Repeat. If n == 0, it is encoded in a varint in the next bytes.
   1541 		if n == 0 {
   1542 			for off := uint(0); ; off += 7 {
   1543 				x := uintptr(*p)
   1544 				p = add1(p)
   1545 				n |= (x & 0x7F) << off
   1546 				if x&0x80 == 0 {
   1547 					break
   1548 				}
   1549 			}
   1550 		}
   1551 
   1552 		// Count is encoded in a varint in the next bytes.
   1553 		c := uintptr(0)
   1554 		for off := uint(0); ; off += 7 {
   1555 			x := uintptr(*p)
   1556 			p = add1(p)
   1557 			c |= (x & 0x7F) << off
   1558 			if x&0x80 == 0 {
   1559 				break
   1560 			}
   1561 		}
   1562 		c *= n // now total number of bits to copy
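         		// Varint example, for illustration: bytes 0x96 0x01 decode to
         		// (0x96&0x7F) | 0x01<<7 == 22 + 128 == 150. Each byte carries 7 bits,
         		// least significant group first, with the high bit set on every byte
         		// but the last.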
   1563 
   1564 		// If the number of bits being repeated is small, load them
   1565 		// into a register and use that register for the entire loop
   1566 		// instead of repeatedly reading from memory.
   1567 		// Handling fewer than 8 bits here makes the general loop simpler.
   1568 		// The cutoff is sys.PtrSize*8 - 7 to guarantee that when we add
   1569 		// the pattern to a bit buffer holding at most 7 bits (a partial byte)
   1570 		// it will not overflow.
   1571 		src := dst
   1572 		const maxBits = sys.PtrSize*8 - 7
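         		// On 64-bit systems maxBits == 57, so the pattern plus a bit buffer
         		// holding up to 7 bits still fits in a 64-bit uintptr.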
   1573 		if n <= maxBits {
   1574 			// Start with bits in output buffer.
   1575 			pattern := bits
   1576 			npattern := nbits
   1577 
   1578 			// If we need more bits, fetch them from memory.
   1579 			if size == 1 {
   1580 				src = subtract1(src)
   1581 				for npattern < n {
   1582 					pattern <<= 8
   1583 					pattern |= uintptr(*src)
   1584 					src = subtract1(src)
   1585 					npattern += 8
   1586 				}
   1587 			} else {
   1588 				src = add1(src)
   1589 				for npattern < n {
   1590 					pattern <<= 4
   1591 					pattern |= uintptr(*src) & 0xf
   1592 					src = add1(src)
   1593 					npattern += 4
   1594 				}
   1595 			}
   1596 
   1597 			// We started with the whole bit output buffer,
   1598 			// and then we loaded bits from whole bytes.
   1599 			// Either way, we might now have too many instead of too few.
   1600 			// Discard the extra.
   1601 			if npattern > n {
   1602 				pattern >>= npattern - n
   1603 				npattern = n
   1604 			}
   1605 
   1606 			// Replicate pattern to at most maxBits.
   1607 			if npattern == 1 {
   1608 				// One bit being repeated.
   1609 				// If the bit is 1, make the pattern all 1s.
   1610 				// If the bit is 0, the pattern is already all 0s,
   1611 				// but we can claim that the number of bits
   1612 				// in the word is equal to the number we need (c),
   1613 				// because right shift of bits will zero fill.
   1614 				if pattern == 1 {
   1615 					pattern = 1<<maxBits - 1
   1616 					npattern = maxBits
   1617 				} else {
   1618 					npattern = c
   1619 				}
   1620 			} else {
   1621 				b := pattern
   1622 				nb := npattern
   1623 				if nb+nb <= maxBits {
   1624 					// Double pattern until the whole uintptr is filled.
   1625 					for nb <= sys.PtrSize*8 {
   1626 						b |= b << nb
   1627 						nb += nb
   1628 					}
   1629 					// Trim away incomplete copy of original pattern in high bits.
   1630 					// TODO(rsc): Replace with table lookup or loop on systems without divide?
   1631 					nb = maxBits / npattern * npattern
   1632 					b &= 1<<nb - 1
   1633 					pattern = b
   1634 					npattern = nb
   1635 				}
   1636 			}
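         			// For illustration (64-bit): a 3-bit pattern such as 101 doubles
         			// until the word is full; the trim then keeps 57/3*3 == 57 bits,
         			// i.e. 19 whole copies, masking away the incomplete copy that
         			// doubling left in the high bits.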
   1637 
   1638 			// Add pattern to bit buffer and flush bit buffer, c/npattern times.
   1639 			// Since pattern contains >8 bits, there will be full bytes to flush
   1640 			// on each iteration.
   1641 			for ; c >= npattern; c -= npattern {
   1642 				bits |= pattern << nbits
   1643 				nbits += npattern
   1644 				if size == 1 {
   1645 					for nbits >= 8 {
   1646 						*dst = uint8(bits)
   1647 						dst = add1(dst)
   1648 						bits >>= 8
   1649 						nbits -= 8
   1650 					}
   1651 				} else {
   1652 					for nbits >= 4 {
   1653 						*dst = uint8(bits&0xf | bitScanAll)
   1654 						dst = subtract1(dst)
   1655 						bits >>= 4
   1656 						nbits -= 4
   1657 					}
   1658 				}
   1659 			}
   1660 
   1661 			// Add final fragment to bit buffer.
   1662 			if c > 0 {
   1663 				pattern &= 1<<c - 1
   1664 				bits |= pattern << nbits
   1665 				nbits += c
   1666 			}
   1667 			continue Run
   1668 		}
   1669 
   1670 		// Repeat; n too large to fit in a register.
   1671 		// Since nbits <= 7, we know the first few bytes of repeated data
   1672 		// are already written to memory.
   1673 		off := n - nbits // n > nbits because n > maxBits and nbits <= 7
   1674 		if size == 1 {
   1675 			// Leading src fragment.
   1676 			src = subtractb(src, (off+7)/8)
   1677 			if frag := off & 7; frag != 0 {
   1678 				bits |= uintptr(*src) >> (8 - frag) << nbits
   1679 				src = add1(src)
   1680 				nbits += frag
   1681 				c -= frag
   1682 			}
   1683 			// Main loop: load one byte, write another.
   1684 			// The bits are rotating through the bit buffer.
   1685 			for i := c / 8; i > 0; i-- {
   1686 				bits |= uintptr(*src) << nbits
   1687 				src = add1(src)
   1688 				*dst = uint8(bits)
   1689 				dst = add1(dst)
   1690 				bits >>= 8
   1691 			}
   1692 			// Final src fragment.
   1693 			if c %= 8; c > 0 {
   1694 				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
   1695 				nbits += c
   1696 			}
   1697 		} else {
   1698 			// Leading src fragment.
   1699 			src = addb(src, (off+3)/4)
   1700 			if frag := off & 3; frag != 0 {
   1701 				bits |= (uintptr(*src) & 0xf) >> (4 - frag) << nbits
   1702 				src = subtract1(src)
   1703 				nbits += frag
   1704 				c -= frag
   1705 			}
   1706 			// Main loop: load one byte, write another.
   1707 			// The bits are rotating through the bit buffer.
   1708 			for i := c / 4; i > 0; i-- {
   1709 				bits |= (uintptr(*src) & 0xf) << nbits
   1710 				src = subtract1(src)
   1711 				*dst = uint8(bits&0xf | bitScanAll)
   1712 				dst = subtract1(dst)
   1713 				bits >>= 4
   1714 			}
   1715 			// Final src fragment.
   1716 			if c %= 4; c > 0 {
   1717 				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
   1718 				nbits += c
   1719 			}
   1720 		}
   1721 	}
   1722 
   1723 	// Write any final bits out, using full-byte writes, even for the final byte.
   1724 	var totalBits uintptr
   1725 	if size == 1 {
   1726 		totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*8 + nbits
   1727 		nbits += -nbits & 7
   1728 		for ; nbits > 0; nbits -= 8 {
   1729 			*dst = uint8(bits)
   1730 			dst = add1(dst)
   1731 			bits >>= 8
   1732 		}
   1733 	} else {
   1734 		totalBits = (uintptr(unsafe.Pointer(dstStart))-uintptr(unsafe.Pointer(dst)))*4 + nbits
   1735 		nbits += -nbits & 3
   1736 		for ; nbits > 0; nbits -= 4 {
   1737 			v := bits&0xf | bitScanAll
   1738 			*dst = uint8(v)
   1739 			dst = subtract1(dst)
   1740 			bits >>= 4
   1741 		}
   1742 	}
   1743 	return totalBits
   1744 }
   1745 
   1746 func dumpGCProg(p *byte) {
   1747 	nptr := 0
   1748 	for {
   1749 		x := *p
   1750 		p = add1(p)
   1751 		if x == 0 {
   1752 			print("\t", nptr, " end\n")
   1753 			break
   1754 		}
   1755 		if x&0x80 == 0 {
   1756 			print("\t", nptr, " lit ", x, ":")
   1757 			n := int(x+7) / 8
   1758 			for i := 0; i < n; i++ {
   1759 				print(" ", hex(*p))
   1760 				p = add1(p)
   1761 			}
   1762 			print("\n")
   1763 			nptr += int(x)
   1764 		} else {
   1765 			nbit := int(x &^ 0x80)
   1766 			if nbit == 0 {
   1767 				for nb := uint(0); ; nb += 7 {
   1768 					x := *p
   1769 					p = add1(p)
   1770 					nbit |= int(x&0x7f) << nb
   1771 					if x&0x80 == 0 {
   1772 						break
   1773 					}
   1774 				}
   1775 			}
   1776 			count := 0
   1777 			for nb := uint(0); ; nb += 7 {
   1778 				x := *p
   1779 				p = add1(p)
   1780 				count |= int(x&0x7f) << nb
   1781 				if x&0x80 == 0 {
   1782 					break
   1783 				}
   1784 			}
   1785 			print("\t", nptr, " repeat ", nbit, "  ", count, "\n")
   1786 			nptr += nbit * count
   1787 		}
   1788 	}
   1789 }
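         // For the illustrative program 01 01 81 07 00 mentioned above, dumpGCProg
         // would print roughly:
         //	0 lit 1: 0x1
         //	1 repeat 1  7
         //	8 end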
   1790 
   1791 // Testing.
   1792 
   1793 func getgcmaskcb(frame *stkframe, ctxt unsafe.Pointer) bool {
   1794 	target := (*stkframe)(ctxt)
   1795 	if frame.sp <= target.sp && target.sp < frame.varp {
   1796 		*target = *frame
   1797 		return false
   1798 	}
   1799 	return true
   1800 }
   1801 
   1802 // gcbits returns the GC type info for x, for testing.
   1803 // The result is the bitmap entries (0 or 1), one entry per byte.
   1804 //go:linkname reflect_gcbits reflect.gcbits
   1805 func reflect_gcbits(x interface{}) []byte {
   1806 	ret := getgcmask(x)
   1807 	typ := (*ptrtype)(unsafe.Pointer(efaceOf(&x)._type)).elem
   1808 	nptr := typ.ptrdata / sys.PtrSize
   1809 	for uintptr(len(ret)) > nptr && ret[len(ret)-1] == 0 {
   1810 		ret = ret[:len(ret)-1]
   1811 	}
   1812 	return ret
   1813 }
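         // For illustration, given a hypothetical type
         //	type T struct{ n uintptr; p *int }
         // reflect_gcbits would report [0 1]: the second word is a pointer. For the
         // reversed layout struct{ p *int; n uintptr }, the trailing zero entry past
         // ptrdata would be trimmed, leaving [1].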
   1814 
    1815 // getgcmask returns the GC type info for the object that ep points to, for testing.
   1816 func getgcmask(ep interface{}) (mask []byte) {
   1817 	e := *efaceOf(&ep)
   1818 	p := e.data
   1819 	t := e._type
   1820 	// data or bss
   1821 	for _, datap := range activeModules() {
   1822 		// data
   1823 		if datap.data <= uintptr(p) && uintptr(p) < datap.edata {
   1824 			bitmap := datap.gcdatamask.bytedata
   1825 			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
   1826 			mask = make([]byte, n/sys.PtrSize)
   1827 			for i := uintptr(0); i < n; i += sys.PtrSize {
   1828 				off := (uintptr(p) + i - datap.data) / sys.PtrSize
   1829 				mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
   1830 			}
   1831 			return
   1832 		}
   1833 
   1834 		// bss
   1835 		if datap.bss <= uintptr(p) && uintptr(p) < datap.ebss {
   1836 			bitmap := datap.gcbssmask.bytedata
   1837 			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
   1838 			mask = make([]byte, n/sys.PtrSize)
   1839 			for i := uintptr(0); i < n; i += sys.PtrSize {
   1840 				off := (uintptr(p) + i - datap.bss) / sys.PtrSize
   1841 				mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
   1842 			}
   1843 			return
   1844 		}
   1845 	}
   1846 
   1847 	// heap
   1848 	var n uintptr
   1849 	var base uintptr
   1850 	if mlookup(uintptr(p), &base, &n, nil) != 0 {
   1851 		mask = make([]byte, n/sys.PtrSize)
   1852 		for i := uintptr(0); i < n; i += sys.PtrSize {
   1853 			hbits := heapBitsForAddr(base + i)
   1854 			if hbits.isPointer() {
   1855 				mask[i/sys.PtrSize] = 1
   1856 			}
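         			// Word 1's high bit is the checkmark bit, not part of the object
         			// description, so morePointers is not meaningful there; the check
         			// below skips that word.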
   1857 			if i != 1*sys.PtrSize && !hbits.morePointers() {
   1858 				mask = mask[:i/sys.PtrSize]
   1859 				break
   1860 			}
   1861 		}
   1862 		return
   1863 	}
   1864 
   1865 	// stack
   1866 	if _g_ := getg(); _g_.m.curg.stack.lo <= uintptr(p) && uintptr(p) < _g_.m.curg.stack.hi {
   1867 		var frame stkframe
   1868 		frame.sp = uintptr(p)
   1869 		_g_ := getg()
   1870 		gentraceback(_g_.m.curg.sched.pc, _g_.m.curg.sched.sp, 0, _g_.m.curg, 0, nil, 1000, getgcmaskcb, noescape(unsafe.Pointer(&frame)), 0)
   1871 		if frame.fn.valid() {
   1872 			f := frame.fn
   1873 			targetpc := frame.continpc
   1874 			if targetpc == 0 {
   1875 				return
   1876 			}
   1877 			if targetpc != f.entry {
   1878 				targetpc--
   1879 			}
   1880 			pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, nil)
   1881 			if pcdata == -1 {
   1882 				return
   1883 			}
   1884 			stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
   1885 			if stkmap == nil || stkmap.n <= 0 {
   1886 				return
   1887 			}
   1888 			bv := stackmapdata(stkmap, pcdata)
   1889 			size := uintptr(bv.n) * sys.PtrSize
   1890 			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
   1891 			mask = make([]byte, n/sys.PtrSize)
   1892 			for i := uintptr(0); i < n; i += sys.PtrSize {
   1893 				bitmap := bv.bytedata
   1894 				off := (uintptr(p) + i - frame.varp + size) / sys.PtrSize
   1895 				mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
   1896 			}
   1897 		}
   1898 		return
   1899 	}
   1900 
    1901 	// Otherwise, the pointer is not something the GC knows about:
    1902 	// possibly read-only data, like malloc(0).
    1903 	// Such memory must not contain pointers.
   1904 	return
   1905 }
   1906