forked from Shiloh/githaven
c88547ce71
Continues on from #19202. Following the addition of pprof labels we can now more easily understand the relationship between a goroutine and the requests that spawn them. This PR takes advantage of the labels and adds a few others, then provides a mechanism for the monitoring page to query the pprof goroutine profile. The binary profile that results is immediately piped into the Google pprof library for parsing, and stack traces are then formed for the goroutines. If a goroutine is within a process context, or has been created from a goroutine within a process context, it will acquire the process description labels for that process. The goroutines are mapped to their associated pids, and any that do not have an associated pid are placed in a group at the bottom as unbound. In this way we should be able to more easily examine goroutines that have been stuck. A manager command `gitea manager processes` is also provided that can export the processes (with or without stacktraces) to the command line. Signed-off-by: Andrew Thornton <art27@cantab.net>
356 lines
10 KiB
Go
356 lines
10 KiB
Go
// Copyright 2022 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
|
|
|
|
package process
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"runtime/pprof"
|
|
"sort"
|
|
"time"
|
|
|
|
"github.com/google/pprof/profile"
|
|
)
|
|
|
|
// StackEntry is an entry on a stacktrace
type StackEntry struct {
	// Function is the name of the function for this frame.
	Function string
	// File is the source file name recorded for the function.
	File string
	// Line is the line number of this frame.
	Line int
}
|
|
|
|
// Label represents a pprof label assigned to goroutine stack
type Label struct {
	// Name is the label key.
	Name string
	// Value is the single value carried by the label.
	Value string
}
|
|
|
|
// Stack is a stacktrace relating to a goroutine. (Multiple goroutines may have the same stacktrace)
|
|
type Stack struct {
|
|
Count int64 // Number of goroutines with this stack trace
|
|
Description string
|
|
Labels []*Label `json:",omitempty"`
|
|
Entry []*StackEntry `json:",omitempty"`
|
|
}
|
|
|
|
// A Process is a combined representation of a Process and a Stacktrace for the goroutines associated with it
|
|
type Process struct {
|
|
PID IDType
|
|
ParentPID IDType
|
|
Description string
|
|
Start time.Time
|
|
Type string
|
|
|
|
Children []*Process `json:",omitempty"`
|
|
Stacks []*Stack `json:",omitempty"`
|
|
}
|
|
|
|
// Processes gets the processes in a thread safe manner
|
|
func (pm *Manager) Processes(flat, noSystem bool) ([]*Process, int) {
|
|
pm.mutex.Lock()
|
|
processCount := len(pm.processMap)
|
|
processes := make([]*Process, 0, len(pm.processMap))
|
|
if flat {
|
|
for _, process := range pm.processMap {
|
|
if noSystem && process.Type == SystemProcessType {
|
|
continue
|
|
}
|
|
processes = append(processes, process.toProcess())
|
|
}
|
|
} else {
|
|
// We need our own processMap
|
|
processMap := map[IDType]*Process{}
|
|
for _, internalProcess := range pm.processMap {
|
|
process, ok := processMap[internalProcess.PID]
|
|
if !ok {
|
|
process = internalProcess.toProcess()
|
|
processMap[process.PID] = process
|
|
}
|
|
|
|
// Check its parent
|
|
if process.ParentPID == "" {
|
|
processes = append(processes, process)
|
|
continue
|
|
}
|
|
|
|
internalParentProcess, ok := pm.processMap[internalProcess.ParentPID]
|
|
if ok {
|
|
parentProcess, ok := processMap[process.ParentPID]
|
|
if !ok {
|
|
parentProcess = internalParentProcess.toProcess()
|
|
processMap[parentProcess.PID] = parentProcess
|
|
}
|
|
parentProcess.Children = append(parentProcess.Children, process)
|
|
continue
|
|
}
|
|
|
|
processes = append(processes, process)
|
|
}
|
|
}
|
|
pm.mutex.Unlock()
|
|
|
|
if !flat && noSystem {
|
|
for i := 0; i < len(processes); i++ {
|
|
process := processes[i]
|
|
if process.Type != SystemProcessType {
|
|
continue
|
|
}
|
|
processes[len(processes)-1], processes[i] = processes[i], processes[len(processes)-1]
|
|
processes = append(processes[:len(processes)-1], process.Children...)
|
|
i--
|
|
}
|
|
}
|
|
|
|
// Sort by process' start time. Oldest process appears first.
|
|
sort.Slice(processes, func(i, j int) bool {
|
|
left, right := processes[i], processes[j]
|
|
|
|
return left.Start.Before(right.Start)
|
|
})
|
|
|
|
return processes, processCount
|
|
}
|
|
|
|
// ProcessStacktraces gets the processes and stacktraces in a thread safe manner
|
|
func (pm *Manager) ProcessStacktraces(flat, noSystem bool) ([]*Process, int, int64, error) {
|
|
var stacks *profile.Profile
|
|
var err error
|
|
|
|
// We cannot use the pm.ProcessMap here because we will release the mutex ...
|
|
processMap := map[IDType]*Process{}
|
|
processCount := 0
|
|
|
|
// Lock the manager
|
|
pm.mutex.Lock()
|
|
processCount = len(pm.processMap)
|
|
|
|
// Add a defer to unlock in case there is a panic
|
|
unlocked := false
|
|
defer func() {
|
|
if !unlocked {
|
|
pm.mutex.Unlock()
|
|
}
|
|
}()
|
|
|
|
processes := make([]*Process, 0, len(pm.processMap))
|
|
if flat {
|
|
for _, internalProcess := range pm.processMap {
|
|
process := internalProcess.toProcess()
|
|
processMap[process.PID] = process
|
|
if noSystem && internalProcess.Type == SystemProcessType {
|
|
continue
|
|
}
|
|
processes = append(processes, process)
|
|
}
|
|
} else {
|
|
for _, internalProcess := range pm.processMap {
|
|
process, ok := processMap[internalProcess.PID]
|
|
if !ok {
|
|
process = internalProcess.toProcess()
|
|
processMap[process.PID] = process
|
|
}
|
|
|
|
// Check its parent
|
|
if process.ParentPID == "" {
|
|
processes = append(processes, process)
|
|
continue
|
|
}
|
|
|
|
internalParentProcess, ok := pm.processMap[internalProcess.ParentPID]
|
|
if ok {
|
|
parentProcess, ok := processMap[process.ParentPID]
|
|
if !ok {
|
|
parentProcess = internalParentProcess.toProcess()
|
|
processMap[parentProcess.PID] = parentProcess
|
|
}
|
|
parentProcess.Children = append(parentProcess.Children, process)
|
|
continue
|
|
}
|
|
|
|
processes = append(processes, process)
|
|
}
|
|
}
|
|
|
|
// Now from within the lock we need to get the goroutines.
|
|
// Why? If we release the lock then between between filling the above map and getting
|
|
// the stacktraces another process could be created which would then look like a dead process below
|
|
reader, writer := io.Pipe()
|
|
defer reader.Close()
|
|
go func() {
|
|
err := pprof.Lookup("goroutine").WriteTo(writer, 0)
|
|
_ = writer.CloseWithError(err)
|
|
}()
|
|
stacks, err = profile.Parse(reader)
|
|
if err != nil {
|
|
return nil, 0, 0, err
|
|
}
|
|
|
|
// Unlock the mutex
|
|
pm.mutex.Unlock()
|
|
unlocked = true
|
|
|
|
goroutineCount := int64(0)
|
|
|
|
// Now walk through the "Sample" slice in the goroutines stack
|
|
for _, sample := range stacks.Sample {
|
|
// In the "goroutine" pprof profile each sample represents one or more goroutines
|
|
// with the same labels and stacktraces.
|
|
|
|
// We will represent each goroutine by a `Stack`
|
|
stack := &Stack{}
|
|
|
|
// Add the non-process associated labels from the goroutine sample to the Stack
|
|
for name, value := range sample.Label {
|
|
if name == DescriptionPProfLabel || name == PIDPProfLabel || (!flat && name == PPIDPProfLabel) || name == ProcessTypePProfLabel {
|
|
continue
|
|
}
|
|
|
|
// Labels from the "goroutine" pprof profile only have one value.
|
|
// This is because the underlying representation is a map[string]string
|
|
if len(value) != 1 {
|
|
// Unexpected...
|
|
return nil, 0, 0, fmt.Errorf("label: %s in goroutine stack with unexpected number of values: %v", name, value)
|
|
}
|
|
|
|
stack.Labels = append(stack.Labels, &Label{Name: name, Value: value[0]})
|
|
}
|
|
|
|
// The number of goroutines that this sample represents is the `stack.Value[0]`
|
|
stack.Count = sample.Value[0]
|
|
goroutineCount += stack.Count
|
|
|
|
// Now we want to associate this Stack with a Process.
|
|
var process *Process
|
|
|
|
// Try to get the PID from the goroutine labels
|
|
if pidvalue, ok := sample.Label[PIDPProfLabel]; ok && len(pidvalue) == 1 {
|
|
pid := IDType(pidvalue[0])
|
|
|
|
// Now try to get the process from our map
|
|
process, ok = processMap[pid]
|
|
if !ok && pid != "" {
|
|
// This means that no process has been found in the process map - but there was a process PID
|
|
// Therefore this goroutine belongs to a dead process and it has escaped control of the process as it
|
|
// should have died with the process context cancellation.
|
|
|
|
// We need to create a dead process holder for this process and label it appropriately
|
|
|
|
// get the parent PID
|
|
ppid := IDType("")
|
|
if value, ok := sample.Label[PPIDPProfLabel]; ok && len(value) == 1 {
|
|
ppid = IDType(value[0])
|
|
}
|
|
|
|
// format the description
|
|
description := "(dead process)"
|
|
if value, ok := sample.Label[DescriptionPProfLabel]; ok && len(value) == 1 {
|
|
description = value[0] + " " + description
|
|
}
|
|
|
|
// override the type of the process to "code" but add the old type as a label on the first stack
|
|
ptype := NoneProcessType
|
|
if value, ok := sample.Label[ProcessTypePProfLabel]; ok && len(value) == 1 {
|
|
stack.Labels = append(stack.Labels, &Label{Name: ProcessTypePProfLabel, Value: value[0]})
|
|
}
|
|
process = &Process{
|
|
PID: pid,
|
|
ParentPID: ppid,
|
|
Description: description,
|
|
Type: ptype,
|
|
}
|
|
|
|
// Now add the dead process back to the map and tree so we don't go back through this again.
|
|
processMap[process.PID] = process
|
|
added := false
|
|
if process.ParentPID != "" && !flat {
|
|
if parent, ok := processMap[process.ParentPID]; ok {
|
|
parent.Children = append(parent.Children, process)
|
|
added = true
|
|
}
|
|
}
|
|
if !added {
|
|
processes = append(processes, process)
|
|
}
|
|
}
|
|
}
|
|
|
|
if process == nil {
|
|
// This means that the sample we're looking has no PID label
|
|
var ok bool
|
|
process, ok = processMap[""]
|
|
if !ok {
|
|
// this is the first time we've come acrross an unassociated goroutine so create a "process" to hold them
|
|
process = &Process{
|
|
Description: "(unassociated)",
|
|
Type: NoneProcessType,
|
|
}
|
|
processMap[process.PID] = process
|
|
processes = append(processes, process)
|
|
}
|
|
}
|
|
|
|
// The sample.Location represents a stack trace for this goroutine,
|
|
// however each Location can represent multiple lines (mostly due to inlining)
|
|
// so we need to walk the lines too
|
|
for _, location := range sample.Location {
|
|
for _, line := range location.Line {
|
|
entry := &StackEntry{
|
|
Function: line.Function.Name,
|
|
File: line.Function.Filename,
|
|
Line: int(line.Line),
|
|
}
|
|
stack.Entry = append(stack.Entry, entry)
|
|
}
|
|
}
|
|
|
|
// Now we need a short-descriptive name to call the stack trace if when it is folded and
|
|
// assuming the stack trace has some lines we'll choose the bottom of the stack (i.e. the
|
|
// initial function that started the stack trace.) The top of the stack is unlikely to
|
|
// be very helpful as a lot of the time it will be runtime.select or some other call into
|
|
// a std library.
|
|
stack.Description = "(unknown)"
|
|
if len(stack.Entry) > 0 {
|
|
stack.Description = stack.Entry[len(stack.Entry)-1].Function
|
|
}
|
|
|
|
process.Stacks = append(process.Stacks, stack)
|
|
}
|
|
|
|
// restrict to not show system processes
|
|
if noSystem {
|
|
for i := 0; i < len(processes); i++ {
|
|
process := processes[i]
|
|
if process.Type != SystemProcessType && process.Type != NoneProcessType {
|
|
continue
|
|
}
|
|
processes[len(processes)-1], processes[i] = processes[i], processes[len(processes)-1]
|
|
processes = append(processes[:len(processes)-1], process.Children...)
|
|
i--
|
|
}
|
|
}
|
|
|
|
// Now finally re-sort the processes. Newest process appears first
|
|
after := func(processes []*Process) func(i, j int) bool {
|
|
return func(i, j int) bool {
|
|
left, right := processes[i], processes[j]
|
|
return left.Start.After(right.Start)
|
|
}
|
|
}
|
|
sort.Slice(processes, after(processes))
|
|
if !flat {
|
|
|
|
var sortChildren func(process *Process)
|
|
|
|
sortChildren = func(process *Process) {
|
|
sort.Slice(process.Children, after(process.Children))
|
|
for _, child := range process.Children {
|
|
sortChildren(child)
|
|
}
|
|
}
|
|
}
|
|
|
|
return processes, processCount, goroutineCount, err
|
|
}
|