conn error sub-codes for better classification (#2889)

This commit is contained in:
Mike Sawka 2026-02-19 11:18:52 -08:00 committed by GitHub
parent 0cc6c454a9
commit 1201273bd4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 150 additions and 22 deletions

2
.gitignore vendored
View file

@ -36,3 +36,5 @@ storybook-static/
test-results.xml
docsite/
.kilo-format-temp-*

View file

@ -749,7 +749,8 @@ func (conn *SSHConn) Connect(ctx context.Context, connFlags *wconfig.ConnKeyword
conn.FireConnChangeEvent()
err := conn.connectInternal(ctx, connFlags)
if err != nil {
errorCode := remote.ClassifyConnError(err)
errorCode, subCode := remote.ClassifyConnError(err)
isContextError := errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded)
conn.Infof(ctx, "ERROR [%s] %v\n\n", errorCode, err)
conn.WithLock(func() {
conn.Status = Status_Error
@ -762,8 +763,10 @@ func (conn *SSHConn) Connect(ctx context.Context, connFlags *wconfig.ConnKeyword
telemetry.GoRecordTEventWrap(&telemetrydata.TEvent{
Event: "conn:connecterror",
Props: telemetrydata.TEventProps{
ConnType: "ssh",
ConnErrorCode: errorCode,
ConnType: "ssh",
ConnErrorCode: errorCode,
ConnSubErrorCode: subCode,
ConnContextError: isContextError,
},
})
} else {

View file

@ -52,6 +52,7 @@ const (
ConnErrCode_KnownHostsNone = "knownhosts-none"
ConnErrCode_KnownHostsFmt = "knownhosts-format"
ConnErrCode_Dial = "dial-error"
ConnErrCode_ProxyJumpDial = "dial-proxy-jump"
ConnErrCode_HostKeyRevoked = "hostkey-revoked"
ConnErrCode_HostKeyChanged = "hostkey-changed"
ConnErrCode_HostKeyVerify = "hostkey-verify"
@ -61,6 +62,27 @@ const (
ConnErrCode_Unknown = "unknown"
)
// Dial error sub-codes. ClassifyDialErrorSubCode returns one of these values
// to narrow down why a dial attempt failed.
const (
	// DialSubCode_DNS: a *net.DNSError was found in the error chain.
	DialSubCode_DNS = "dns"
	// DialSubCode_Refused: the error text contains "connection refused".
	DialSubCode_Refused = "refused"
	// DialSubCode_Timeout: a *net.OpError reported Timeout(), or the error
	// text mentions "timed out" / "timeout".
	DialSubCode_Timeout = "timeout"
	// DialSubCode_ContextCanceled: the chain matches context.Canceled or
	// context.DeadlineExceeded.
	DialSubCode_ContextCanceled = "context-canceled"
	// DialSubCode_NoRoute: the error text contains "no route to host".
	DialSubCode_NoRoute = "no-route"
	// DialSubCode_HostUnreach: the error text reports the host as unreachable.
	DialSubCode_HostUnreach = "host-unreachable"
	// DialSubCode_NetUnreach: the error text reports the network as unreachable.
	DialSubCode_NetUnreach = "net-unreachable"
	// DialSubCode_ConnReset: the error text contains "connection reset".
	DialSubCode_ConnReset = "conn-reset"
	// DialSubCode_PermDenied: the error text contains "permission denied".
	DialSubCode_PermDenied = "perm-denied"
	// DialSubCode_ProxyJump is declared for proxy-jump dial failures.
	// NOTE(review): ClassifyDialErrorSubCode never returns it — confirm
	// whether it is used elsewhere.
	DialSubCode_ProxyJump = "proxy-jump"
	// DialSubCode_Other: fallback when no more specific sub-code matched.
	DialSubCode_Other = "other"
)
// Auth error sub-codes. ClassifyConnError pairs these with
// ConnErrCode_AuthFailed when it classifies an error by its message text.
const (
	// AuthSubCode_UnableToAuth: the error text contains "unable to authenticate".
	AuthSubCode_UnableToAuth = "unable-to-auth"
	// AuthSubCode_HandshakeFailed: the error text contains "handshake failed".
	AuthSubCode_HandshakeFailed = "handshake-failed"
)
// waveSshConfigUserSettingsInternal is a package-level *ssh_config.UserSettings.
// NOTE(review): presumably populated lazily under configUserSettingsOnce; the
// code that assigns it is outside this view — confirm.
var waveSshConfigUserSettingsInternal *ssh_config.UserSettings
// configUserSettingsOnce is the *sync.Once declared alongside the settings
// above. NOTE(review): assumed to guard their one-time initialization — confirm
// against the accessor that uses it.
var configUserSettingsOnce = &sync.Once{}
@ -118,33 +140,110 @@ func SimpleMessageFromPossibleConnectionError(err error) string {
return err.Error()
}
// ClassifyConnError maps a connection error to an error code and a sub-code.
//
// Resolution order (first match wins):
//  1. A code found in the error chain by utilds.GetErrorCode is returned
//     together with whatever utilds.GetErrorSubCode reports (possibly "").
//  2. A *net.DNSError or *net.OpError anywhere in the chain yields
//     ConnErrCode_Dial with the sub-code from ClassifyDialErrorSubCode.
//  3. Otherwise the error text is matched: "unable to authenticate" and
//     "handshake failed" yield ConnErrCode_AuthFailed with the matching
//     AuthSubCode_* value; "connection refused", "timed out" and "timeout"
//     yield ConnErrCode_Dial with the sub-code from ClassifyDialErrorSubCode.
//  4. Anything else is ConnErrCode_Unknown with an empty sub-code.
//
// err is expected to be non-nil: the text-matching fallback calls err.Error().
func ClassifyConnError(err error) string {
func ClassifyConnError(err error) (string, string) {
// An explicit code attached upstream takes precedence over any heuristics.
code := utilds.GetErrorCode(err)
subCode := utilds.GetErrorSubCode(err)
if code != "" {
return code
return code, subCode
}
// Typed network errors are treated as dial failures.
var dnsErr *net.DNSError
if errors.As(err, &dnsErr) {
return ConnErrCode_Dial
return ConnErrCode_Dial, ClassifyDialErrorSubCode(err)
}
var opErr *net.OpError
if errors.As(err, &opErr) {
return ConnErrCode_Dial
return ConnErrCode_Dial, ClassifyDialErrorSubCode(err)
}
// Last resort: classify by substrings of the error message.
errStr := err.Error()
if strings.Contains(errStr, "unable to authenticate") {
return ConnErrCode_AuthFailed
return ConnErrCode_AuthFailed, AuthSubCode_UnableToAuth
}
if strings.Contains(errStr, "handshake failed") {
return ConnErrCode_AuthFailed
return ConnErrCode_AuthFailed, AuthSubCode_HandshakeFailed
}
if strings.Contains(errStr, "connection refused") {
return ConnErrCode_Dial
return ConnErrCode_Dial, ClassifyDialErrorSubCode(err)
}
if strings.Contains(errStr, "timed out") || strings.Contains(errStr, "timeout") {
return ConnErrCode_Dial
return ConnErrCode_Dial, ClassifyDialErrorSubCode(err)
}
return ConnErrCode_Unknown
return ConnErrCode_Unknown, ""
}
// ClassifyDialErrorSubCode provides more granular classification of dial errors
// to help identify root causes (DNS, VPN, timeouts, etc.).
//
// Checks run in this order and the first match wins:
//  1. nil error -> "".
//  2. context.Canceled or context.DeadlineExceeded in the chain ->
//     DialSubCode_ContextCanceled.
//  3. *net.DNSError in the chain -> DialSubCode_DNS.
//  4. *net.OpError in the chain: the text of its inner Err is matched against
//     the known patterns (timeout wording excluded), then opErr.Timeout()
//     -> DialSubCode_Timeout.
//  5. The full err.Error() text is matched against the known patterns,
//     including timeout wording.
//  6. DialSubCode_Other.
//
// NOTE(review): because the context check runs first, an error matching
// context.DeadlineExceeded is reported as DialSubCode_ContextCanceled rather
// than DialSubCode_Timeout. Dial errors produced under a context deadline may
// therefore land in that bucket — confirm this is the intended bucketing.
func ClassifyDialErrorSubCode(err error) string {
	if err == nil {
		return ""
	}

	if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
		return DialSubCode_ContextCanceled
	}

	var dnsErr *net.DNSError
	if errors.As(err, &dnsErr) {
		return DialSubCode_DNS
	}

	var opErr *net.OpError
	if errors.As(err, &opErr) {
		// Look at the wrapped cause only; timeout wording is deliberately not
		// matched here because opErr.Timeout() below covers it.
		if opErr.Err != nil {
			if subCode := matchDialErrorText(opErr.Err.Error(), false); subCode != "" {
				return subCode
			}
		}
		if opErr.Timeout() {
			return DialSubCode_Timeout
		}
	}

	// Fall back to the complete message for errors that carry no typed cause.
	if subCode := matchDialErrorText(err.Error(), true); subCode != "" {
		return subCode
	}
	return DialSubCode_Other
}

// matchDialErrorText maps well-known substrings of a dial error message to a
// dial sub-code, returning "" when nothing matches. Timeout wording
// ("timed out" / "timeout") is only considered when matchTimeout is true.
func matchDialErrorText(msg string, matchTimeout bool) string {
	switch {
	case strings.Contains(msg, "connection refused"):
		return DialSubCode_Refused
	case matchTimeout && (strings.Contains(msg, "timed out") || strings.Contains(msg, "timeout")):
		// "i/o timeout" needs no separate check: it already contains "timeout".
		return DialSubCode_Timeout
	case strings.Contains(msg, "no route to host"):
		return DialSubCode_NoRoute
	case strings.Contains(msg, "host is unreachable") || strings.Contains(msg, "host unreachable"):
		return DialSubCode_HostUnreach
	case strings.Contains(msg, "network is unreachable") || strings.Contains(msg, "network unreachable"):
		return DialSubCode_NetUnreach
	case strings.Contains(msg, "connection reset"):
		return DialSubCode_ConnReset
	case strings.Contains(msg, "permission denied"):
		return DialSubCode_PermDenied
	}
	return ""
}
// This exists to trick the ssh library into continuing to try
@ -747,15 +846,17 @@ func connectInternal(ctx context.Context, networkAddr string, clientConfig *ssh.
blocklogger.Infof(ctx, "[conndebug] ssh dial %s\n", networkAddr)
clientConn, err = d.DialContext(ctx, "tcp", networkAddr)
if err != nil {
blocklogger.Infof(ctx, "[conndebug] ERROR dial error: %v\n", err)
return nil, utilds.MakeCodedError(ConnErrCode_Dial, err)
subCode := ClassifyDialErrorSubCode(err)
blocklogger.Infof(ctx, "[conndebug] ERROR dial error [%s]: %v\n", subCode, err)
return nil, utilds.MakeSubCodedError(ConnErrCode_Dial, subCode, err)
}
} else {
blocklogger.Infof(ctx, "[conndebug] ssh dial (from client) %s\n", networkAddr)
clientConn, err = currentClient.DialContext(ctx, "tcp", networkAddr)
if err != nil {
blocklogger.Infof(ctx, "[conndebug] ERROR dial error: %v\n", err)
return nil, utilds.MakeCodedError(ConnErrCode_Dial, err)
subCode := ClassifyDialErrorSubCode(err)
blocklogger.Infof(ctx, "[conndebug] ERROR dial error [%s]: %v\n", subCode, err)
return nil, utilds.MakeSubCodedError(ConnErrCode_ProxyJumpDial, subCode, err)
}
}
c, chans, reqs, err := ssh.NewClientConn(clientConn, networkAddr, clientConfig)

View file

@ -126,9 +126,11 @@ type TEventProps struct {
WshCmd string `json:"wsh:cmd,omitempty"`
WshHadError bool `json:"wsh:haderror,omitempty"`
ConnType string `json:"conn:conntype,omitempty"`
ConnWshErrorCode string `json:"conn:wsherrorcode,omitempty"`
ConnErrorCode string `json:"conn:errorcode,omitempty"`
ConnType string `json:"conn:conntype,omitempty"`
ConnWshErrorCode string `json:"conn:wsherrorcode,omitempty"`
ConnErrorCode string `json:"conn:errorcode,omitempty"`
ConnSubErrorCode string `json:"conn:suberrorcode,omitempty"`
ConnContextError bool `json:"conn:contexterror,omitempty"`
OnboardingFeature string `json:"onboarding:feature,omitempty" tstype:"\"waveai\" | \"durable\" | \"magnify\" | \"wsh\""`
OnboardingVersion string `json:"onboarding:version,omitempty"`

View file

@ -10,9 +10,11 @@ import (
// CodedError wraps an error with a string code for categorization.
// The code can be extracted from anywhere in an error chain using GetErrorCode.
// SubCode provides additional granularity for error classification.
//
// Fields:
//   - Code: the category identifier.
//   - SubCode: optional finer-grained identifier, read back with
//     GetErrorSubCode; it is "" when the value was built by MakeCodedError.
//   - Err: the underlying error being wrapped.
type CodedError struct {
Code string
Err error
Code string
SubCode string
Err error
}
func (e CodedError) Error() string {
@ -25,7 +27,12 @@ func (e CodedError) Unwrap() error {
// MakeCodedError creates a new CodedError with the given code and error.
// The SubCode is left empty; use MakeSubCodedError to attach one.
func MakeCodedError(code string, err error) CodedError {
return CodedError{Code: code, Err: err}
return CodedError{Code: code, SubCode: "", Err: err}
}
// MakeSubCodedError builds a CodedError that carries both a code and a
// sub-code around err. The three arguments are stored as given.
func MakeSubCodedError(code string, subCode string, err error) CodedError {
	var coded CodedError
	coded.Code = code
	coded.SubCode = subCode
	coded.Err = err
	return coded
}
// GetErrorCode extracts the error code from anywhere in the error chain.
@ -41,6 +48,19 @@ func GetErrorCode(err error) string {
return ""
}
// GetErrorSubCode returns the SubCode of the first CodedError found in err's
// chain. It returns "" when err is nil, when the chain holds no CodedError,
// or when that CodedError has no SubCode set.
func GetErrorSubCode(err error) string {
	var found CodedError
	if err != nil && errors.As(err, &found) {
		return found.SubCode
	}
	return ""
}
// Errorf creates a formatted error wrapped in a CodedError.
// This is a convenience function that combines fmt.Errorf with MakeCodedError.
func Errorf(code string, format string, args ...interface{}) error {