// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime

import (
	"internal/abi"
	"internal/goarch"
	"runtime/internal/atomic"
	"runtime/internal/syscall"
	"unsafe"
)

// sigPerThreadSyscall is the same signal (SIGSETXID) used by glibc for
// per-thread syscalls on Linux. We use it for the same purpose in non-cgo
// binaries.
const sigPerThreadSyscall = _SIGRTMIN + 1

type mOS struct {
	// profileTimer holds the ID of the POSIX interval timer for profiling CPU
	// usage on this thread.
	//
	// It is valid when the profileTimerValid field is true. A thread
	// creates and manages its own timer, and these fields are read and written
	// only by this thread. But because some of the reads on profileTimerValid
	// are in signal handling code, this field should be of an atomic type.
	profileTimer      int32
	profileTimerValid atomic.Bool

	// needPerThreadSyscall indicates that a per-thread syscall is required
	// for doAllThreadsSyscall.
	needPerThreadSyscall atomic.Uint8
}

//go:noescape
func futex(addr unsafe.Pointer, op int32, val uint32, ts, addr2 unsafe.Pointer, val3 uint32) int32

// Linux futex.
//
//	futexsleep(uint32 *addr, uint32 val)
//	futexwakeup(uint32 *addr)
//
// Futexsleep atomically checks if *addr == val and if so, sleeps on addr.
// Futexwakeup wakes up threads sleeping on addr.
// Futexsleep is allowed to wake up spuriously.

const (
	_FUTEX_PRIVATE_FLAG = 128
	_FUTEX_WAIT_PRIVATE = 0 | _FUTEX_PRIVATE_FLAG
	_FUTEX_WAKE_PRIVATE = 1 | _FUTEX_PRIVATE_FLAG
)

// Atomically,
//
//	if(*addr == val) sleep
//
// Might be woken up spuriously; that's allowed.
// Don't sleep longer than ns; ns < 0 means forever.
//
//go:nosplit
func futexsleep(addr *uint32, val uint32, ns int64) {
	// Some Linux kernels have a bug where futex of
	// FUTEX_WAIT returns an internal error code
	// as an errno. Libpthread ignores the return value
	// here, and so can we: as it says a few lines up,
	// spurious wakeups are allowed.
	if ns < 0 {
		futex(unsafe.Pointer(addr), _FUTEX_WAIT_PRIVATE, val, nil, nil, 0)
		return
	}

	var ts timespec
	ts.setNsec(ns)
	futex(unsafe.Pointer(addr), _FUTEX_WAIT_PRIVATE, val, unsafe.Pointer(&ts), nil, 0)
}

// If any procs are sleeping on addr, wake up at most cnt.
//
//go:nosplit
func futexwakeup(addr *uint32, cnt uint32) {
	ret := futex(unsafe.Pointer(addr), _FUTEX_WAKE_PRIVATE, cnt, nil, nil, 0)
	if ret >= 0 {
		return
	}

	// I don't know that futex wakeup can return
	// EAGAIN or EINTR, but if it does, it would be
	// safe to loop and call futex again.
	systemstack(func() {
		print("futexwakeup addr=", addr, " returned ", ret, "\n")
	})

	*(*int32)(unsafe.Pointer(uintptr(0x1006))) = 0x1006
}
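
// Illustrative sketch (not part of this file): the runtime's futex-based
// locks and notes (see lock_futex.go) layer logic like the following over the
// two primitives above. The event type and its methods are hypothetical.
//
//	type event struct{ state uint32 } // 0 = not signaled, 1 = signaled
//
//	func (e *event) wait() {
//		// Re-check after every wakeup: futexsleep may return spuriously.
//		for atomic.Load(&e.state) == 0 {
//			futexsleep(&e.state, 0, -1) // sleep while state is still 0
//		}
//	}
//
//	func (e *event) signal() {
//		atomic.Store(&e.state, 1)
//		futexwakeup(&e.state, 1) // wake at most one sleeping thread
//	}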

func getproccount() int32 {
	// This buffer is huge (8 kB) but we are on the system stack
	// and there should be plenty of space (64 kB).
	// Also this is a leaf, so we're not holding up the memory for long.
	// See golang.org/issue/11823.
	// The suggested behavior here is to keep trying with ever-larger
	// buffers, but we don't have a dynamic memory allocator at the
	// moment, so that's a bit tricky and seems like overkill.
	const maxCPUs = 64 * 1024
	var buf [maxCPUs / 8]byte
	r := sched_getaffinity(0, unsafe.Sizeof(buf), &buf[0])
	if r < 0 {
		return 1
	}
	n := int32(0)
	for _, v := range buf[:r] {
		for v != 0 {
			n += int32(v & 1)
			v >>= 1
		}
	}
	if n == 0 {
		n = 1
	}
	return n
}
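
// Outside the runtime, ordinary Go code can derive the same count from the
// thread's affinity mask. A minimal sketch, assuming the
// golang.org/x/sys/unix package:
//
//	var set unix.CPUSet
//	if err := unix.SchedGetaffinity(0, &set); err == nil {
//		println(set.Count()) // number of CPUs this thread may run on
//	}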

// Clone, the Linux rfork.
const (
	_CLONE_VM             = 0x100
	_CLONE_FS             = 0x200
	_CLONE_FILES          = 0x400
	_CLONE_SIGHAND        = 0x800
	_CLONE_PTRACE         = 0x2000
	_CLONE_VFORK          = 0x4000
	_CLONE_PARENT         = 0x8000
	_CLONE_THREAD         = 0x10000
	_CLONE_NEWNS          = 0x20000
	_CLONE_SYSVSEM        = 0x40000
	_CLONE_SETTLS         = 0x80000
	_CLONE_PARENT_SETTID  = 0x100000
	_CLONE_CHILD_CLEARTID = 0x200000
	_CLONE_UNTRACED       = 0x800000
	_CLONE_CHILD_SETTID   = 0x1000000
	_CLONE_STOPPED        = 0x2000000
	_CLONE_NEWUTS         = 0x4000000
	_CLONE_NEWIPC         = 0x8000000

	// As of QEMU 2.8.0 (5ea2fc84d), user emulation requires all six of these
	// flags to be set when creating a thread; attempts to share the other
	// five but leave SYSVSEM unshared will fail with -EINVAL.
	//
	// In non-QEMU environments CLONE_SYSVSEM is inconsequential as we do not
	// use System V semaphores.
	cloneFlags = _CLONE_VM | /* share memory */
		_CLONE_FS | /* share cwd, etc */
		_CLONE_FILES | /* share fd table */
		_CLONE_SIGHAND | /* share sig handler table */
		_CLONE_SYSVSEM | /* share SysV semaphore undo lists (see issue #20763) */
		_CLONE_THREAD /* revisit - okay for now */
)

//go:noescape
func clone(flags int32, stk, mp, gp, fn unsafe.Pointer) int32

// May run with m.p==nil, so write barriers are not allowed.
//
//go:nowritebarrier
func newosproc(mp *m) {
	stk := unsafe.Pointer(mp.g0.stack.hi)
	/*
	 * note: strace gets confused if we use CLONE_PTRACE here.
	 */
	if false {
		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " clone=", abi.FuncPCABI0(clone), " id=", mp.id, " ostk=", &mp, "\n")
	}

	// Disable signals during clone, so that the new thread starts
	// with signals disabled. It will enable them in minit.
	var oset sigset
	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
	ret := retryOnEAGAIN(func() int32 {
		r := clone(cloneFlags, stk, unsafe.Pointer(mp), unsafe.Pointer(mp.g0), unsafe.Pointer(abi.FuncPCABI0(mstart)))
		// clone returns positive TID, negative errno.
		// We don't care about the TID.
		if r >= 0 {
			return 0
		}
		return -r
	})
	sigprocmask(_SIG_SETMASK, &oset, nil)

	if ret != 0 {
		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", ret, ")\n")
		if ret == _EAGAIN {
			println("runtime: may need to increase max user processes (ulimit -u)")
		}
		throw("newosproc")
	}
}

// Version of newosproc that doesn't require a valid G.
//
//go:nosplit
func newosproc0(stacksize uintptr, fn unsafe.Pointer) {
	stack := sysAlloc(stacksize, &memstats.stacks_sys)
	if stack == nil {
		writeErrStr(failallocatestack)
		exit(1)
	}
	ret := clone(cloneFlags, unsafe.Pointer(uintptr(stack)+stacksize), nil, nil, fn)
	if ret < 0 {
		writeErrStr(failthreadcreate)
		exit(1)
	}
}

const (
	_AT_NULL     = 0  // End of vector
	_AT_PAGESZ   = 6  // System physical page size
	_AT_PLATFORM = 15 // string identifying platform
	_AT_HWCAP    = 16 // hardware capability bit vector
	_AT_SECURE   = 23 // secure mode boolean
	_AT_RANDOM   = 25 // introduced in 2.6.29
	_AT_HWCAP2   = 26 // hardware capability bit vector 2
)

var procAuxv = []byte("/proc/self/auxv\x00")

var addrspace_vec [1]byte

func mincore(addr unsafe.Pointer, n uintptr, dst *byte) int32

var auxvreadbuf [128]uintptr

func sysargs(argc int32, argv **byte) {
	n := argc + 1

	// skip over argv, envp to get to auxv
	for argv_index(argv, n) != nil {
		n++
	}

	// skip NULL separator
	n++

	// now argv+n is auxv
	auxvp := (*[1 << 28]uintptr)(add(unsafe.Pointer(argv), uintptr(n)*goarch.PtrSize))

	if pairs := sysauxv(auxvp[:]); pairs != 0 {
		auxv = auxvp[: pairs*2 : pairs*2]
		return
	}
	// In some situations we don't get a loader-provided
	// auxv, such as when loaded as a library on Android.
	// Fall back to /proc/self/auxv.
	fd := open(&procAuxv[0], 0 /* O_RDONLY */, 0)
	if fd < 0 {
		// On Android, /proc/self/auxv might be unreadable (issue 9229), so we
		// fall back to using mincore to detect the physical page size.
		// mincore should return EINVAL when address is not a multiple of system page size.
		const size = 256 << 10 // size of memory region to allocate
		p, err := mmap(nil, size, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
		if err != 0 {
			return
		}
		var n uintptr
		for n = 4 << 10; n < size; n <<= 1 {
			err := mincore(unsafe.Pointer(uintptr(p)+n), 1, &addrspace_vec[0])
			if err == 0 {
				physPageSize = n
				break
			}
		}
		if physPageSize == 0 {
			physPageSize = size
		}
		munmap(p, size)
		return
	}

	n = read(fd, noescape(unsafe.Pointer(&auxvreadbuf[0])), int32(unsafe.Sizeof(auxvreadbuf)))
	closefd(fd)
	if n < 0 {
		return
	}
	// Make sure buf is terminated, even if we didn't read
	// the whole file.
	auxvreadbuf[len(auxvreadbuf)-2] = _AT_NULL
	pairs := sysauxv(auxvreadbuf[:])
	auxv = auxvreadbuf[: pairs*2 : pairs*2]
}

// secureMode holds the value of AT_SECURE passed in the auxiliary vector.
var secureMode bool

func sysauxv(auxv []uintptr) (pairs int) {
	var i int
	for ; auxv[i] != _AT_NULL; i += 2 {
		tag, val := auxv[i], auxv[i+1]
		switch tag {
		case _AT_RANDOM:
			// The kernel provides a pointer to 16 bytes
			// worth of random data.
			startupRand = (*[16]byte)(unsafe.Pointer(val))[:]

		case _AT_PAGESZ:
			physPageSize = val

		case _AT_SECURE:
			secureMode = val == 1
		}

		archauxv(tag, val)
		vdsoauxv(tag, val)
	}
	return i / 2
}
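
// The auxiliary vector scanned above is a flat array of (tag, value) pairs
// terminated by an _AT_NULL tag, so any later lookup is a linear scan. An
// illustrative sketch (auxvLookup is hypothetical, not a runtime function):
//
//	func auxvLookup(tag uintptr) (uintptr, bool) {
//		for i := 0; i+1 < len(auxv); i += 2 {
//			if auxv[i] == tag {
//				return auxv[i+1], true
//			}
//		}
//		return 0, false
//	}
//
//	pageSize, ok := auxvLookup(_AT_PAGESZ) // same value sysauxv stores in physPageSize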

var sysTHPSizePath = []byte("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size\x00")

func getHugePageSize() uintptr {
	var numbuf [20]byte
	fd := open(&sysTHPSizePath[0], 0 /* O_RDONLY */, 0)
	if fd < 0 {
		return 0
	}
	ptr := noescape(unsafe.Pointer(&numbuf[0]))
	n := read(fd, ptr, int32(len(numbuf)))
	closefd(fd)
	if n <= 0 {
		return 0
	}
	n-- // remove trailing newline
	v, ok := atoi(slicebytetostringtmp((*byte)(ptr), int(n)))
	if !ok || v < 0 {
		v = 0
	}
	if v&(v-1) != 0 {
		// v is not a power of 2
		return 0
	}
	return uintptr(v)
}

func osinit() {
	ncpu = getproccount()
	physHugePageSize = getHugePageSize()
	osArchInit()
}

var urandom_dev = []byte("/dev/urandom\x00")

func readRandom(r []byte) int {
	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
	closefd(fd)
	return int(n)
}

func goenvs() {
	goenvs_unix()
}

// Called to do synchronous initialization of Go code built with
// -buildmode=c-archive or -buildmode=c-shared.
// None of the Go runtime is initialized.
//
//go:nosplit
//go:nowritebarrierrec
func libpreinit() {
	initsig(true)
}

// Called to initialize a new m (including the bootstrap m).
// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
func mpreinit(mp *m) {
	mp.gsignal = malg(32 * 1024) // Linux wants >= 2K
	mp.gsignal.m = mp
}

func gettid() uint32

// Called to initialize a new m (including the bootstrap m).
// Called on the new thread, cannot allocate memory.
func minit() {
	minitSignals()

	// Cgo-created threads and the bootstrap m are missing a
	// procid. We need this for asynchronous preemption and it's
	// useful in debuggers.
	getg().m.procid = uint64(gettid())
}

// Called from dropm to undo the effect of an minit.
//
//go:nosplit
func unminit() {
	unminitSignals()
	getg().m.procid = 0
}

// Called from exitm, but not from dropm, to undo the effect of thread-owned
// resources in minit, semacreate, or elsewhere. Do not take locks after
// calling this.
func mdestroy(mp *m) {
}

// #ifdef GOARCH_386
// #define sa_handler k_sa_handler
// #endif

func sigreturn__sigaction()
func sigtramp() // Called via C ABI
func cgoSigtramp()

//go:noescape
func sigaltstack(new, old *stackt)

//go:noescape
func setitimer(mode int32, new, old *itimerval)

//go:noescape
func timer_create(clockid int32, sevp *sigevent, timerid *int32) int32

//go:noescape
func timer_settime(timerid int32, flags int32, new, old *itimerspec) int32

//go:noescape
func timer_delete(timerid int32) int32

//go:noescape
func rtsigprocmask(how int32, new, old *sigset, size int32)

//go:nosplit
//go:nowritebarrierrec
func sigprocmask(how int32, new, old *sigset) {
	rtsigprocmask(how, new, old, int32(unsafe.Sizeof(*new)))
}

func raise(sig uint32)
func raiseproc(sig uint32)

//go:noescape
func sched_getaffinity(pid, len uintptr, buf *byte) int32

func osyield()

//go:nosplit
func osyield_no_g() {
	osyield()
}

func pipe2(flags int32) (r, w int32, errno int32)

//go:nosplit
func fcntl(fd, cmd, arg int32) (ret int32, errno int32) {
	r, _, err := syscall.Syscall6(syscall.SYS_FCNTL, uintptr(fd), uintptr(cmd), uintptr(arg), 0, 0, 0)
	return int32(r), int32(err)
}

const (
	_si_max_size    = 128
	_sigev_max_size = 64
)

//go:nosplit
//go:nowritebarrierrec
func setsig(i uint32, fn uintptr) {
	var sa sigactiont
	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK | _SA_RESTORER | _SA_RESTART
	sigfillset(&sa.sa_mask)
	// Although the Linux manpage says the sa_restorer element is "obsolete
	// and should not be used", the x86_64 kernel requires it. Only use it
	// on x86.
	if GOARCH == "386" || GOARCH == "amd64" {
		sa.sa_restorer = abi.FuncPCABI0(sigreturn__sigaction)
	}
	if fn == abi.FuncPCABIInternal(sighandler) { // abi.FuncPCABIInternal(sighandler) matches the callers in signal_unix.go
		if iscgo {
			fn = abi.FuncPCABI0(cgoSigtramp)
		} else {
			fn = abi.FuncPCABI0(sigtramp)
		}
	}
	sa.sa_handler = fn
	sigaction(i, &sa, nil)
}

//go:nosplit
//go:nowritebarrierrec
func setsigstack(i uint32) {
	var sa sigactiont
	sigaction(i, nil, &sa)
	if sa.sa_flags&_SA_ONSTACK != 0 {
		return
	}
	sa.sa_flags |= _SA_ONSTACK
	sigaction(i, &sa, nil)
}

//go:nosplit
//go:nowritebarrierrec
func getsig(i uint32) uintptr {
	var sa sigactiont
	sigaction(i, nil, &sa)
	return sa.sa_handler
}

// setSignalstackSP sets the ss_sp field of a stackt.
//
//go:nosplit
func setSignalstackSP(s *stackt, sp uintptr) {
	*(*uintptr)(unsafe.Pointer(&s.ss_sp)) = sp
}

//go:nosplit
func (c *sigctxt) fixsigcode(sig uint32) {
}

// sysSigaction calls the rt_sigaction system call.
//
//go:nosplit
func sysSigaction(sig uint32, new, old *sigactiont) {
	if rt_sigaction(uintptr(sig), new, old, unsafe.Sizeof(sigactiont{}.sa_mask)) != 0 {
		// Workaround for bugs in QEMU user mode emulation.
		//
		// QEMU turns calls to the sigaction system call into
		// calls to the C library sigaction call; the C
		// library call rejects attempts to call sigaction for
		// SIGCANCEL (32) or SIGSETXID (33).
		//
		// QEMU rejects calling sigaction on SIGRTMAX (64).
		//
		// Just ignore the error in these cases. There isn't
		// anything we can do about it anyhow.
		if sig != 32 && sig != 33 && sig != 64 {
			// Use system stack to avoid split stack overflow on ppc64/ppc64le.
			systemstack(func() {
				throw("sigaction failed")
			})
		}
	}
}

// rt_sigaction is implemented in assembly.
//
//go:noescape
func rt_sigaction(sig uintptr, new, old *sigactiont, size uintptr) int32

func getpid() int
func tgkill(tgid, tid, sig int)

// signalM sends a signal to mp.
func signalM(mp *m, sig int) {
	tgkill(getpid(), int(mp.procid), sig)
}
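
// Thread-directed delivery with tgkill(2), as used by signalM above, is also
// available to ordinary Go code. A minimal sketch, assuming the
// golang.org/x/sys/unix package:
//
//	tid := unix.Gettid()
//	// Unlike kill(2), which targets a whole process, tgkill(2) targets a
//	// single thread of a given thread group.
//	_ = unix.Tgkill(unix.Getpid(), tid, unix.SIGURG)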

// validSIGPROF compares this signal delivery's code against the signal sources
// that the profiler uses, returning whether the delivery should be processed.
// To be processed, a signal delivery from a known profiling mechanism should
// correspond to the best profiling mechanism available to this thread. Signals
// from other sources are always considered valid.
//
//go:nosplit
func validSIGPROF(mp *m, c *sigctxt) bool {
	code := int32(c.sigcode())
	setitimer := code == _SI_KERNEL
	timer_create := code == _SI_TIMER

	if !(setitimer || timer_create) {
		// The signal doesn't correspond to a profiling mechanism that the
		// runtime enables itself. There's no reason to process it, but there's
		// no reason to ignore it either.
		return true
	}

	if mp == nil {
		// Since we don't have an M, we can't check if there's an active
		// per-thread timer for this thread. We don't know how long this thread
		// has been around, and if it happened to interact with the Go scheduler
		// at a time when profiling was active (causing it to have a per-thread
		// timer). But it may have never interacted with the Go scheduler, or
		// never while profiling was active. To avoid double-counting, process
		// only signals from setitimer.
		//
		// When a custom cgo traceback function has been registered (on
		// platforms that support runtime.SetCgoTraceback), SIGPROF signals
		// delivered to a thread that cannot find a matching M do this check in
		// the assembly implementations of runtime.cgoSigtramp.
		return setitimer
	}

	// Having an M means the thread interacts with the Go scheduler, and we can
	// check whether there's an active per-thread timer for this thread.
	if mp.profileTimerValid.Load() {
		// If this M has its own per-thread CPU profiling interval timer, we
		// should track the SIGPROF signals that come from that timer (for
		// accurate reporting of its CPU usage; see issue 35057) and ignore any
		// that it gets from the process-wide setitimer (to not over-count its
		// CPU consumption).
		return timer_create
	}

	// No active per-thread timer means the only valid profiler is setitimer.
	return setitimer
}

func setProcessCPUProfiler(hz int32) {
	setProcessCPUProfilerTimer(hz)
}

func setThreadCPUProfiler(hz int32) {
	mp := getg().m
	mp.profilehz = hz

	// destroy any active timer
	if mp.profileTimerValid.Load() {
		timerid := mp.profileTimer
		mp.profileTimerValid.Store(false)
		mp.profileTimer = 0

		ret := timer_delete(timerid)
		if ret != 0 {
			print("runtime: failed to disable profiling timer; timer_delete(", timerid, ") errno=", -ret, "\n")
			throw("timer_delete")
		}
	}

	if hz == 0 {
		// If the goal was to disable profiling for this thread, then the job's done.
		return
	}

	// The period of the timer should be 1/Hz. For every "1/Hz" of additional
	// work, the user should expect one additional sample in the profile.
	//
	// But to scale down to very small amounts of application work, to observe
	// even CPU usage of "one tenth" of the requested period, set the initial
	// timing delay in a different way: So that "one tenth" of a period of CPU
	// spend shows up as a 10% chance of one sample (for an expected value of
	// 0.1 samples), and so that "two and six tenths" periods of CPU spend show
	// up as a 60% chance of 3 samples and a 40% chance of 2 samples (for an
	// expected value of 2.6). Set the initial delay to a value in the uniform
	// random distribution between 0 and the desired period. And because "0"
	// means "disable timer", add 1 so the half-open interval [0,period) turns
	// into (0,period].
	//
	// Otherwise, this would show up as a bias away from short-lived threads and
	// from threads that are only occasionally active: for example, when the
	// garbage collector runs on a mostly-idle system, the additional threads it
	// activates may do a couple milliseconds of GC-related work and nothing
	// else in the few seconds that the profiler observes.
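	//
	// Worked example (added for illustration): with hz = 100 the period is
	// 1e9/100 ns = 10ms. A thread that accumulates only 1ms of CPU time
	// while profiled crosses an initial deadline placed uniformly in
	// (0, 10ms] with probability 1ms/10ms = 10%, matching the expected
	// value of 0.1 samples described above.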
	spec := new(itimerspec)
	spec.it_value.setNsec(1 + int64(cheaprandn(uint32(1e9/hz))))
	spec.it_interval.setNsec(1e9 / int64(hz))

	var timerid int32
	var sevp sigevent
	sevp.notify = _SIGEV_THREAD_ID
	sevp.signo = _SIGPROF
	sevp.sigev_notify_thread_id = int32(mp.procid)
	ret := timer_create(_CLOCK_THREAD_CPUTIME_ID, &sevp, &timerid)
	if ret != 0 {
		// If we cannot create a timer for this M, leave profileTimerValid false
		// to fall back to the process-wide setitimer profiler.
		return
	}

	ret = timer_settime(timerid, 0, spec, nil)
	if ret != 0 {
		print("runtime: failed to configure profiling timer; timer_settime(", timerid,
			", 0, {interval: {",
			spec.it_interval.tv_sec, "s + ", spec.it_interval.tv_nsec, "ns} value: {",
			spec.it_value.tv_sec, "s + ", spec.it_value.tv_nsec, "ns}}, nil) errno=", -ret, "\n")
		throw("timer_settime")
	}

	mp.profileTimer = timerid
	mp.profileTimerValid.Store(true)
}

// perThreadSyscallArgs contains the system call number, arguments, and
// expected return values for a system call to be executed on all threads.
type perThreadSyscallArgs struct {
	trap uintptr
	a1   uintptr
	a2   uintptr
	a3   uintptr
	a4   uintptr
	a5   uintptr
	a6   uintptr
	r1   uintptr
	r2   uintptr
}

// perThreadSyscall is the system call to execute for the ongoing
// doAllThreadsSyscall.
//
// perThreadSyscall may only be written while mp.needPerThreadSyscall == 0 on
// all Ms.
var perThreadSyscall perThreadSyscallArgs

// syscall_runtime_doAllThreadsSyscall executes a specified system call on
// all Ms.
//
// The system call is expected to succeed and return the same value on every
// thread. If any threads do not match, the runtime throws.
//
//go:linkname syscall_runtime_doAllThreadsSyscall syscall.runtime_doAllThreadsSyscall
//go:uintptrescapes
func syscall_runtime_doAllThreadsSyscall(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) {
	if iscgo {
		// In cgo, we are not aware of threads created in C, so this approach will not work.
		panic("doAllThreadsSyscall not supported with cgo enabled")
	}

	// STW to guarantee that user goroutines see an atomic change to thread
	// state. Without STW, goroutines could migrate Ms while change is in
	// progress and e.g., see state old -> new -> old -> new.
	//
	// N.B. Internally, this function does not depend on STW to
	// successfully change every thread. It is only needed for user
	// expectations, per above.
	stw := stopTheWorld(stwAllThreadsSyscall)

	// This function depends on several properties:
	//
	// 1. All OS threads that already exist are associated with an M in
	//    allm. i.e., we won't miss any pre-existing threads.
	// 2. All Ms listed in allm will eventually have an OS thread exist.
	//    i.e., they will set procid and be able to receive signals.
	// 3. OS threads created after we read allm will clone from a thread
	//    that has executed the system call. i.e., they inherit the
	//    modified state.
	//
	// We achieve these through different mechanisms:
	//
	// 1. Addition of new Ms to allm in allocm happens before clone of its
	//    OS thread later in newm.
	// 2. newm does acquirem to avoid being preempted, ensuring that new Ms
	//    created in allocm will eventually reach OS thread clone later in
	//    newm.
	// 3. We take allocmLock for write here to prevent allocation of new Ms
	//    while this function runs. Per (1), this prevents clone of OS
	//    threads that are not yet in allm.
	allocmLock.lock()

	// Disable preemption, preventing us from changing Ms, as we handle
	// this M specially.
	//
	// N.B. STW and lock() above do this as well, this is added for extra
	// clarity.
	acquirem()

	// N.B. allocmLock also prevents concurrent execution of this function,
	// serializing use of perThreadSyscall, mp.needPerThreadSyscall, and
	// ensuring all threads execute system calls from multiple calls in the
	// same order.

	r1, r2, errno := syscall.Syscall6(trap, a1, a2, a3, a4, a5, a6)
	if GOARCH == "ppc64" || GOARCH == "ppc64le" {
		// TODO(https://go.dev/issue/51192): ppc64 doesn't use r2.
		r2 = 0
	}
	if errno != 0 {
		releasem(getg().m)
		allocmLock.unlock()
		startTheWorld(stw)
		return r1, r2, errno
	}

	perThreadSyscall = perThreadSyscallArgs{
		trap: trap,
		a1:   a1,
		a2:   a2,
		a3:   a3,
		a4:   a4,
		a5:   a5,
		a6:   a6,
		r1:   r1,
		r2:   r2,
	}

	// Wait for all threads to start.
	//
	// As described above, some Ms have been added to allm prior to
	// allocmLock, but not yet completed OS clone and set procid.
	//
	// At minimum we must wait for a thread to set procid before we can
	// send it a signal.
	//
	// We take this one step further and wait for all threads to start
	// before sending any signals. This prevents system calls from getting
	// applied twice: once in the parent and once in the child, like so:
	//
	//          A                     B                  C
	//                         add C to allm
	// doAllThreadsSyscall
	//   allocmLock.lock()
	//   signal B
	//                         <receive signal>
	//                         execute syscall
	//                         <signal return>
	//                         clone C
	//                                             <thread start>
	//                                             set procid
	//   signal C
	//                                             <receive signal>
	//                                             execute syscall
	//                                             <signal return>
	//
	// In this case, thread C inherited the syscall-modified state from
	// thread B and did not need to execute the syscall, but did anyway
	// because doAllThreadsSyscall could not be sure whether it was
	// required.
	//
	// Some system calls may not be idempotent, so we ensure each thread
	// executes the system call exactly once.
	for mp := allm; mp != nil; mp = mp.alllink {
		for atomic.Load64(&mp.procid) == 0 {
			// Thread is starting.
			osyield()
		}
	}

	// Signal every other thread, where they will execute perThreadSyscall
	// from the signal handler.
	gp := getg()
	tid := gp.m.procid
	for mp := allm; mp != nil; mp = mp.alllink {
		if atomic.Load64(&mp.procid) == tid {
			// Our thread already performed the syscall.
			continue
		}
		mp.needPerThreadSyscall.Store(1)
		signalM(mp, sigPerThreadSyscall)
	}

	// Wait for all threads to complete.
	for mp := allm; mp != nil; mp = mp.alllink {
		if mp.procid == tid {
			continue
		}
		for mp.needPerThreadSyscall.Load() != 0 {
			osyield()
		}
	}

	perThreadSyscall = perThreadSyscallArgs{}

	releasem(getg().m)
	allocmLock.unlock()
	startTheWorld(stw)

	return r1, r2, errno
}
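
// This mechanism backs syscall.AllThreadsSyscall and
// syscall.AllThreadsSyscall6 in non-cgo builds. A hedged usage sketch from
// ordinary Go code, using prctl(PR_SET_NAME) purely as an example of
// per-thread state that must be changed on every thread:
//
//	const prSetName = 15 // PR_SET_NAME
//	name := []byte("worker\x00")
//	_, _, errno := syscall.AllThreadsSyscall(
//		syscall.SYS_PRCTL, prSetName,
//		uintptr(unsafe.Pointer(&name[0])), 0)
//	if errno != 0 {
//		// The syscall failed on the calling thread before fan-out; a
//		// cross-thread mismatch instead makes the runtime throw.
//	}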

// runPerThreadSyscall runs perThreadSyscall for this M if required.
//
// This function throws if the system call returns with anything other than the
// expected values.
//
//go:nosplit
func runPerThreadSyscall() {
	gp := getg()
	if gp.m.needPerThreadSyscall.Load() == 0 {
		return
	}

	args := perThreadSyscall
	r1, r2, errno := syscall.Syscall6(args.trap, args.a1, args.a2, args.a3, args.a4, args.a5, args.a6)
	if GOARCH == "ppc64" || GOARCH == "ppc64le" {
		// TODO(https://go.dev/issue/51192): ppc64 doesn't use r2.
		r2 = 0
	}
	if errno != 0 || r1 != args.r1 || r2 != args.r2 {
		print("trap:", args.trap, ", a123456=[", args.a1, ",", args.a2, ",", args.a3, ",", args.a4, ",", args.a5, ",", args.a6, "]\n")
		print("results: got {r1=", r1, ",r2=", r2, ",errno=", errno, "}, want {r1=", args.r1, ",r2=", args.r2, ",errno=0}\n")
		fatal("AllThreadsSyscall6 results differ between threads; runtime corrupted")
	}

	gp.m.needPerThreadSyscall.Store(0)
}

const (
	_SI_USER     = 0
	_SI_TKILL    = -6
	_SYS_SECCOMP = 1
)

// sigFromUser reports whether the signal was sent because of a call
// to kill or tgkill.
//
//go:nosplit
func (c *sigctxt) sigFromUser() bool {
	code := int32(c.sigcode())
	return code == _SI_USER || code == _SI_TKILL
}

// sigFromSeccomp reports whether the signal was sent from seccomp.
//
//go:nosplit
func (c *sigctxt) sigFromSeccomp() bool {
	code := int32(c.sigcode())
	return code == _SYS_SECCOMP
}

//go:nosplit
func mprotect(addr unsafe.Pointer, n uintptr, prot int32) (ret int32, errno int32) {
	r, _, err := syscall.Syscall6(syscall.SYS_MPROTECT, uintptr(addr), n, uintptr(prot), 0, 0, 0)
	return int32(r), int32(err)
}
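
// A hedged usage sketch for the wrapper above (p is a hypothetical
// page-aligned address; callers inside the runtime pass page-aligned
// regions):
//
//	if _, errno := mprotect(p, physPageSize, _PROT_READ|_PROT_WRITE); errno != 0 {
//		print("runtime: mprotect failed, errno=", errno, "\n")
//	}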