package  syscallimport  (	errpkg  "errors" 	"internal/itoa" 	"runtime" 	"unsafe" )const  (	CLONE_VM              = 0x00000100  	CLONE_FS              = 0x00000200  	CLONE_FILES           = 0x00000400  	CLONE_SIGHAND         = 0x00000800  	CLONE_PIDFD           = 0x00001000  	CLONE_PTRACE          = 0x00002000  	CLONE_VFORK           = 0x00004000  	CLONE_PARENT          = 0x00008000  	CLONE_THREAD          = 0x00010000  	CLONE_NEWNS           = 0x00020000  	CLONE_SYSVSEM         = 0x00040000  	CLONE_SETTLS          = 0x00080000  	CLONE_PARENT_SETTID   = 0x00100000  	CLONE_CHILD_CLEARTID  = 0x00200000  	CLONE_DETACHED        = 0x00400000  	CLONE_UNTRACED        = 0x00800000  	CLONE_CHILD_SETTID    = 0x01000000  	CLONE_NEWCGROUP       = 0x02000000  	CLONE_NEWUTS          = 0x04000000  	CLONE_NEWIPC          = 0x08000000  	CLONE_NEWUSER         = 0x10000000  	CLONE_NEWPID          = 0x20000000  	CLONE_NEWNET          = 0x40000000  	CLONE_IO              = 0x80000000  		CLONE_CLEAR_SIGHAND  = 0x100000000  	CLONE_INTO_CGROUP    = 0x200000000  		CLONE_NEWTIME  = 0x00000080  )type  SysProcIDMap  struct  {	ContainerID  int  	HostID       int  	Size         int  }type  SysProcAttr  struct  {	Chroot      string       	Credential  *Credential  		Ptrace  bool 	Setsid  bool  		Setpgid  bool 		Setctty  bool 	Noctty   bool  	Ctty     int   		Foreground  bool 	Pgid        int  		Pdeathsig     Signal 	Cloneflags    uintptr         	Unshareflags  uintptr         	UidMappings   []SysProcIDMap  	GidMappings   []SysProcIDMap  		GidMappingsEnableSetgroups  bool 	AmbientCaps                 []uintptr  	UseCgroupFD                 bool       	CgroupFD                    int        		PidFD  *int }var  (	none   = [...]byte {'n' , 'o' , 'n' , 'e' , 0 }	slash  = [...]byte {'/' , 0 }	forceClone3  = false  )func  runtime_BeforeFork func  runtime_AfterFork func  runtime_AfterForkInChild func  forkAndExecInChild argv0  *byte , argv , envv  []*byte , chroot , dir  *byte , attr  *ProcAttr , sys  
*SysProcAttr , pipe  int ) (pid  int , err  Errno ) {		upid , pidfd , err , mapPipe , locked  := forkAndExecInChild1 (argv0 , argv , envv , chroot , dir , attr , sys , pipe )	if  locked  {		runtime_AfterFork ()	}	if  err  != 0  {		return  0 , err 	}		pid  = int (upid )	if  sys .PidFD  != nil  {		*sys .PidFD  = int (pidfd )	}	if  sys .UidMappings  != nil  || sys .GidMappings  != nil  {		Close (mapPipe [0 ])		var  err2  Errno 				if  sys .Unshareflags &CLONE_NEWUSER  == 0  {			if  err  := writeUidGidMappings (pid , sys ); err  != nil  {				err2  = err .(Errno )			}		}		RawSyscall (SYS_WRITE , uintptr (mapPipe [1 ]), uintptr (unsafe .Pointer (&err2 )), unsafe .Sizeof (err2 ))		Close (mapPipe [1 ])	}	return  pid , 0 }const  _LINUX_CAPABILITY_VERSION_3  = 0x20080522 type  capHeader  struct  {	version  uint32 	pid      int32 }type  capData  struct  {	effective    uint32 	permitted    uint32 	inheritable  uint32 }type  caps  struct  {	hdr   capHeader 	data  [2 ]capData }func  capToIndex cap  uintptr ) uintptr  { return  cap  >> 5  }func  capToMask cap  uintptr ) uint32  { return  1  << uint (cap &31 ) }type  cloneArgs  struct  {	flags       uint64  	pidFD       uint64  	childTID    uint64  	parentTID   uint64  	exitSignal  uint64  	stack       uint64  	stackSize   uint64  	tls         uint64  	setTID      uint64  	setTIDSize  uint64  	cgroup      uint64  }func  forkAndExecInChild1 argv0  *byte , argv , envv  []*byte , chroot , dir  *byte , attr  *ProcAttr , sys  *SysProcAttr , pipe  int ) (pid  uintptr , pidfd  int32 , err1  Errno , mapPipe  [2 ]int , locked  bool ) {		const  (		PR_CAP_AMBIENT        = 0x2f 		PR_CAP_AMBIENT_RAISE  = 0x2 	)		var  (		err2                       Errno 		nextfd                     int 		i                          int 		caps                       caps 		fd1 , flags                 uintptr 		puid , psetgroups , pgid     []byte 		uidmap , setgroups , gidmap  []byte 		clone3                     *cloneArgs 		pgrp                       int32 		dirfd 
                     int 		cred                       *Credential 		ngroups , groups            uintptr 		c                          uintptr 	)	pidfd  = -1 	rlim  := origRlimitNofile .Load ()	if  sys .UidMappings  != nil  {		puid  = []byte ("/proc/self/uid_map\000" )		uidmap  = formatIDMappings (sys .UidMappings )	}	if  sys .GidMappings  != nil  {		psetgroups  = []byte ("/proc/self/setgroups\000" )		pgid  = []byte ("/proc/self/gid_map\000" )		if  sys .GidMappingsEnableSetgroups  {			setgroups  = []byte ("allow\000" )		} else  {			setgroups  = []byte ("deny\000" )		}		gidmap  = formatIDMappings (sys .GidMappings )	}		ppid , _  := rawSyscallNoError (SYS_GETPID , 0 , 0 , 0 )		fd  := make ([]int , len (attr .Files ))	nextfd  = len (attr .Files )	for  i , ufd  := range  attr .Files  {		if  nextfd  < int (ufd ) {			nextfd  = int (ufd )		}		fd [i ] = int (ufd )	}	nextfd ++		if  sys .UidMappings  != nil  || sys .GidMappings  != nil  {		if  err  := forkExecPipe (mapPipe [:]); err  != nil  {			err1  = err .(Errno )			return 		}	}	flags  = sys .Cloneflags 	if  sys .Cloneflags &CLONE_NEWUSER  == 0  && sys .Unshareflags &CLONE_NEWUSER  == 0  {		flags  |= CLONE_VFORK  | CLONE_VM 	}	if  sys .PidFD  != nil  {		flags  |= CLONE_PIDFD 	}		if  sys .UseCgroupFD  || flags &CLONE_NEWTIME  != 0  || forceClone3  {		clone3  = &cloneArgs {			flags :      uint64 (flags ),			exitSignal : uint64 (SIGCHLD ),		}		if  sys .UseCgroupFD  {			clone3 .flags  |= CLONE_INTO_CGROUP 			clone3 .cgroup  = uint64 (sys .CgroupFD )		}		if  sys .PidFD  != nil  {			clone3 .pidFD  = uint64 (uintptr (unsafe .Pointer (&pidfd )))		}	}		runtime_BeforeFork ()	locked  = true 	if  clone3  != nil  {		pid , err1  = rawVforkSyscall (_SYS_clone3 , uintptr (unsafe .Pointer (clone3 )), unsafe .Sizeof (*clone3 ), 0 )	} else  {				flags  |= uintptr (SIGCHLD )		if  runtime .GOARCH  == "s390x"  {						pid , err1  = rawVforkSyscall (SYS_CLONE , 0 , flags , uintptr (unsafe .Pointer (&pidfd )))		} else  {			pid , err1  = 
rawVforkSyscall (SYS_CLONE , flags , 0 , uintptr (unsafe .Pointer (&pidfd )))		}	}	if  err1  != 0  || pid  != 0  {				return 	}			if  len (sys .AmbientCaps ) > 0  {		_, _, err1  = RawSyscall6 (SYS_PRCTL , PR_SET_KEEPCAPS , 1 , 0 , 0 , 0 , 0 )		if  err1  != 0  {			goto  childerror 		}	}		if  sys .UidMappings  != nil  || sys .GidMappings  != nil  {		if  _, _, err1  = RawSyscall (SYS_CLOSE , uintptr (mapPipe [1 ]), 0 , 0 ); err1  != 0  {			goto  childerror 		}		pid , _, err1  = RawSyscall (SYS_READ , uintptr (mapPipe [0 ]), uintptr (unsafe .Pointer (&err2 )), unsafe .Sizeof (err2 ))		if  err1  != 0  {			goto  childerror 		}		if  pid  != unsafe .Sizeof (err2 ) {			err1  = EINVAL 			goto  childerror 		}		if  err2  != 0  {			err1  = err2 			goto  childerror 		}	}		if  sys .Setsid  {		_, _, err1  = RawSyscall (SYS_SETSID , 0 , 0 , 0 )		if  err1  != 0  {			goto  childerror 		}	}		if  sys .Setpgid  || sys .Foreground  {				_, _, err1  = RawSyscall (SYS_SETPGID , 0 , uintptr (sys .Pgid ), 0 )		if  err1  != 0  {			goto  childerror 		}	}	if  sys .Foreground  {		pgrp  = int32 (sys .Pgid )		if  pgrp  == 0  {			pid , _ = rawSyscallNoError (SYS_GETPID , 0 , 0 , 0 )			pgrp  = int32 (pid )		}				_, _, err1  = RawSyscall (SYS_IOCTL , uintptr (sys .Ctty ), uintptr (TIOCSPGRP ), uintptr (unsafe .Pointer (&pgrp )))		if  err1  != 0  {			goto  childerror 		}	}		runtime_AfterForkInChild ()		if  sys .Unshareflags  != 0  {		_, _, err1  = RawSyscall (SYS_UNSHARE , sys .Unshareflags , 0 , 0 )		if  err1  != 0  {			goto  childerror 		}		if  sys .Unshareflags &CLONE_NEWUSER  != 0  && sys .GidMappings  != nil  {			dirfd  = int (_AT_FDCWD )			if  fd1 , _, err1  = RawSyscall6 (SYS_OPENAT , uintptr (dirfd ), uintptr (unsafe .Pointer (&psetgroups [0 ])), uintptr (O_WRONLY ), 0 , 0 , 0 ); err1  != 0  {				goto  childerror 			}			pid , _, err1  = RawSyscall (SYS_WRITE , fd1 , uintptr (unsafe .Pointer (&setgroups [0 ])), uintptr (len (setgroups )))			if  err1  != 0  {				goto  childerror 			}			if  _, _, 
err1  = RawSyscall (SYS_CLOSE , fd1 , 0 , 0 ); err1  != 0  {				goto  childerror 			}			if  fd1 , _, err1  = RawSyscall6 (SYS_OPENAT , uintptr (dirfd ), uintptr (unsafe .Pointer (&pgid [0 ])), uintptr (O_WRONLY ), 0 , 0 , 0 ); err1  != 0  {				goto  childerror 			}			pid , _, err1  = RawSyscall (SYS_WRITE , fd1 , uintptr (unsafe .Pointer (&gidmap [0 ])), uintptr (len (gidmap )))			if  err1  != 0  {				goto  childerror 			}			if  _, _, err1  = RawSyscall (SYS_CLOSE , fd1 , 0 , 0 ); err1  != 0  {				goto  childerror 			}		}		if  sys .Unshareflags &CLONE_NEWUSER  != 0  && sys .UidMappings  != nil  {			dirfd  = int (_AT_FDCWD )			if  fd1 , _, err1  = RawSyscall6 (SYS_OPENAT , uintptr (dirfd ), uintptr (unsafe .Pointer (&puid [0 ])), uintptr (O_WRONLY ), 0 , 0 , 0 ); err1  != 0  {				goto  childerror 			}			pid , _, err1  = RawSyscall (SYS_WRITE , fd1 , uintptr (unsafe .Pointer (&uidmap [0 ])), uintptr (len (uidmap )))			if  err1  != 0  {				goto  childerror 			}			if  _, _, err1  = RawSyscall (SYS_CLOSE , fd1 , 0 , 0 ); err1  != 0  {				goto  childerror 			}		}				if  sys .Unshareflags &CLONE_NEWNS  == CLONE_NEWNS  {			_, _, err1  = RawSyscall6 (SYS_MOUNT , uintptr (unsafe .Pointer (&none [0 ])), uintptr (unsafe .Pointer (&slash [0 ])), 0 , MS_REC |MS_PRIVATE , 0 , 0 )			if  err1  != 0  {				goto  childerror 			}		}	}		if  chroot  != nil  {		_, _, err1  = RawSyscall (SYS_CHROOT , uintptr (unsafe .Pointer (chroot )), 0 , 0 )		if  err1  != 0  {			goto  childerror 		}	}		if  cred  = sys .Credential ; cred  != nil  {		ngroups  = uintptr (len (cred .Groups ))		groups  = uintptr (0 )		if  ngroups  > 0  {			groups  = uintptr (unsafe .Pointer (&cred .Groups [0 ]))		}		if  !(sys .GidMappings  != nil  && !sys .GidMappingsEnableSetgroups  && ngroups  == 0 ) && !cred .NoSetGroups  {			_, _, err1  = RawSyscall (_SYS_setgroups , ngroups , groups , 0 )			if  err1  != 0  {				goto  childerror 			}		}		_, _, err1  = RawSyscall (sys_SETGID , uintptr (cred .Gid ), 0 , 0 )		if  err1  != 0 
 {			goto  childerror 		}		_, _, err1  = RawSyscall (sys_SETUID , uintptr (cred .Uid ), 0 , 0 )		if  err1  != 0  {			goto  childerror 		}	}	if  len (sys .AmbientCaps ) != 0  {				caps .hdr .version  = _LINUX_CAPABILITY_VERSION_3 		if  _, _, err1  = RawSyscall (SYS_CAPGET , uintptr (unsafe .Pointer (&caps .hdr )), uintptr (unsafe .Pointer (&caps .data [0 ])), 0 ); err1  != 0  {			goto  childerror 		}		for  _, c  = range  sys .AmbientCaps  {						caps .data [capToIndex (c )].permitted  |= capToMask (c )			caps .data [capToIndex (c )].inheritable  |= capToMask (c )		}		if  _, _, err1  = RawSyscall (SYS_CAPSET , uintptr (unsafe .Pointer (&caps .hdr )), uintptr (unsafe .Pointer (&caps .data [0 ])), 0 ); err1  != 0  {			goto  childerror 		}		for  _, c  = range  sys .AmbientCaps  {			_, _, err1  = RawSyscall6 (SYS_PRCTL , PR_CAP_AMBIENT , uintptr (PR_CAP_AMBIENT_RAISE ), c , 0 , 0 , 0 )			if  err1  != 0  {				goto  childerror 			}		}	}		if  dir  != nil  {		_, _, err1  = RawSyscall (SYS_CHDIR , uintptr (unsafe .Pointer (dir )), 0 , 0 )		if  err1  != 0  {			goto  childerror 		}	}		if  sys .Pdeathsig  != 0  {		_, _, err1  = RawSyscall6 (SYS_PRCTL , PR_SET_PDEATHSIG , uintptr (sys .Pdeathsig ), 0 , 0 , 0 , 0 )		if  err1  != 0  {			goto  childerror 		}				pid , _ = rawSyscallNoError (SYS_GETPPID , 0 , 0 , 0 )		if  pid  != ppid  {			pid , _ = rawSyscallNoError (SYS_GETPID , 0 , 0 , 0 )			_, _, err1  = RawSyscall (SYS_KILL , pid , uintptr (sys .Pdeathsig ), 0 )			if  err1  != 0  {				goto  childerror 			}		}	}		if  pipe  < nextfd  {		_, _, err1  = RawSyscall (SYS_DUP3 , uintptr (pipe ), uintptr (nextfd ), O_CLOEXEC )		if  err1  != 0  {			goto  childerror 		}		pipe  = nextfd 		nextfd ++	}	for  i  = 0 ; i  < len (fd ); i ++ {		if  fd [i ] >= 0  && fd [i ] < i  {			if  nextfd  == pipe  { 				nextfd ++			}			_, _, err1  = RawSyscall (SYS_DUP3 , uintptr (fd [i ]), uintptr (nextfd ), O_CLOEXEC )			if  err1  != 0  {				goto  childerror 			}			fd [i ] = nextfd 			nextfd ++		}	}		for  i  
= 0 ; i  < len (fd ); i ++ {		if  fd [i ] == -1  {			RawSyscall (SYS_CLOSE , uintptr (i ), 0 , 0 )			continue 		}		if  fd [i ] == i  {						_, _, err1  = RawSyscall (fcntl64Syscall , uintptr (fd [i ]), F_SETFD , 0 )			if  err1  != 0  {				goto  childerror 			}			continue 		}				_, _, err1  = RawSyscall (SYS_DUP3 , uintptr (fd [i ]), uintptr (i ), 0 )		if  err1  != 0  {			goto  childerror 		}	}		for  i  = len (fd ); i  < 3 ; i ++ {		RawSyscall (SYS_CLOSE , uintptr (i ), 0 , 0 )	}		if  sys .Noctty  {		_, _, err1  = RawSyscall (SYS_IOCTL , 0 , uintptr (TIOCNOTTY ), 0 )		if  err1  != 0  {			goto  childerror 		}	}		if  sys .Setctty  {		_, _, err1  = RawSyscall (SYS_IOCTL , uintptr (sys .Ctty ), uintptr (TIOCSCTTY ), 1 )		if  err1  != 0  {			goto  childerror 		}	}		if  rlim  != nil  {		rawSetrlimit (RLIMIT_NOFILE , rlim )	}		if  sys .Ptrace  {		_, _, err1  = RawSyscall (SYS_PTRACE , uintptr (PTRACE_TRACEME ), 0 , 0 )		if  err1  != 0  {			goto  childerror 		}	}		_, _, err1  = RawSyscall (SYS_EXECVE ,		uintptr (unsafe .Pointer (argv0 )),		uintptr (unsafe .Pointer (&argv [0 ])),		uintptr (unsafe .Pointer (&envv [0 ])))childerror :		RawSyscall (SYS_WRITE , uintptr (pipe ), uintptr (unsafe .Pointer (&err1 )), unsafe .Sizeof (err1 ))	for  {		RawSyscall (SYS_EXIT , 253 , 0 , 0 )	}}func  formatIDMappings idMap  []SysProcIDMap ) []byte  {	var  data  []byte 	for  _ , im  := range  idMap  {		data  = append (data , itoa .Itoa (im .ContainerID )+" " +itoa .Itoa (im .HostID )+" " +itoa .Itoa (im .Size )+"\n" ...)	
}	return  data }func  writeIDMappings path  string , idMap  []SysProcIDMap ) error  {	fd , err  := Open (path , O_RDWR , 0 )	if  err  != nil  {		return  err 	}	if  _ , err  := Write (fd , formatIDMappings (idMap )); err  != nil  {		Close (fd )		return  err 	}	if  err  := Close (fd ); err  != nil  {		return  err 	}	return  nil }func  writeSetgroups pid  int , enable  bool ) error  {	sgf  := "/proc/"  + itoa .Itoa (pid ) + "/setgroups" 	fd , err  := Open (sgf , O_RDWR , 0 )	if  err  != nil  {		return  err 	}	var  data  []byte 	if  enable  {		data  = []byte ("allow" )	} else  {		data  = []byte ("deny" )	}	if  _ , err  := Write (fd , data ); err  != nil  {		Close (fd )		return  err 	}	return  Close (fd )}func  writeUidGidMappings pid  int , sys  *SysProcAttr ) error  {	if  sys .UidMappings  != nil  {		uidf  := "/proc/"  + itoa .Itoa (pid ) + "/uid_map" 		if  err  := writeIDMappings (uidf , sys .UidMappings ); err  != nil  {			return  err 		}	}	if  sys .GidMappings  != nil  {				if  err  := writeSetgroups (pid , sys .GidMappingsEnableSetgroups ); err  != nil  && err  != ENOENT  {			return  err 		}		gidf  := "/proc/"  + itoa .Itoa (pid ) + "/gid_map" 		if  err  := writeIDMappings (gidf , sys .GidMappings ); err  != nil  {			return  err 		}	}	return  nil }func  forkAndExecFailureCleanup attr  *ProcAttr , sys  *SysProcAttr ) {	if  sys .PidFD  != nil  && *sys .PidFD  != -1  {		Close (*sys .PidFD )		*sys .PidFD  = -1 	}}func  os_checkClonePidfd error  {	pidfd  := int32 (-1 )	pid , errno  := doCheckClonePidfd (&pidfd )	if  errno  != 0  {		return  errno 	}	if  pidfd  == -1  {				var  err  error 		for  {			var  status  WaitStatus 						flags  := uint (WCLONE )			_, err  = Wait4 (int (pid ), &status , int (flags ), nil )			if  err  != EINTR  {				break 			}		}		if  err  != nil  {			return  err 		}		return  errpkg .New ("clone(CLONE_PIDFD) failed to return pidfd" )	}		defer  Close (int (pidfd ))	for  {		const  _P_PIDFD  = 3 		_, _, errno  = Syscall6 (SYS_WAITID , _P_PIDFD , 
uintptr (pidfd ), 0 , WEXITED  | WCLONE , 0 , 0 )		if  errno  != EINTR  {			break 		}	}	if  errno  != 0  {		return  errno 	}	return  nil }func  doCheckClonePidfd pidfd  *int32 ) (pid  uintptr , errno  Errno ) {	flags  := uintptr (CLONE_VFORK  | CLONE_VM  | CLONE_PIDFD )	if  runtime .GOARCH  == "s390x"  {				pid , errno  = rawVforkSyscall (SYS_CLONE , 0 , flags , uintptr (unsafe .Pointer (pidfd )))	} else  {		pid , errno  = rawVforkSyscall (SYS_CLONE , flags , 0 , uintptr (unsafe .Pointer (pidfd )))	}	if  errno  != 0  || pid  != 0  {				return 	}	for  {		RawSyscall (SYS_EXIT_GROUP , 0 , 0 , 0 )	}} The pages are generated with Golds v0.7.6 . (GOOS=linux GOARCH=amd64)
Golds is a Go 101 project developed by Tapir Liu.
PRs and bug reports are welcome and can be submitted to the issue list.
Please follow @zigo_101 (reachable from the left QR code) to get the latest news of Golds.