diff --git a/cmd/certspotter/main.go b/cmd/certspotter/main.go
index b56c85a..749d1a3 100644
--- a/cmd/certspotter/main.go
+++ b/cmd/certspotter/main.go
@@ -192,7 +192,6 @@ func main() {
 		ScriptDir: defaultScriptDir(),
 		Email:     flags.email,
 		Stdout:    flags.stdout,
-		Quiet:     !flags.verbose,
 	}
 	config := &monitor.Config{
 		LogListSource:       flags.logs,
@@ -241,6 +240,20 @@ func main() {
 	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
 	defer stop()
 
+	// Prune expired error files at startup and then once a day until shutdown.
+	go func() {
+		ticker := time.NewTicker(24 * time.Hour)
+		defer ticker.Stop()
+		for {
+			fsstate.PruneOldErrors()
+			select {
+			case <-ctx.Done():
+				return
+			case <-ticker.C:
+			}
+		}
+	}()
+
 	if err := monitor.Run(ctx, config); ctx.Err() == context.Canceled && errors.Is(err, context.Canceled) {
 		if flags.verbose {
 			fmt.Fprintf(os.Stderr, "%s: exiting due to SIGINT or SIGTERM\n", programName)
diff --git a/monitor/daemon.go b/monitor/daemon.go
index 4dcd65d..c2bc152 100644
--- a/monitor/daemon.go
+++ b/monitor/daemon.go
@@ -50,11 +50,22 @@ type daemon struct {
 
 func (daemon *daemon) healthCheck(ctx context.Context) error {
 	if time.Since(daemon.logsLoadedAt) >= daemon.config.HealthCheckInterval {
+		recentErrors, err := daemon.config.State.GetErrors(ctx, nil, recentErrorCount)
+		if err != nil {
+			return fmt.Errorf("error getting recent errors: %w", err)
+		}
+		// errorsDir stays empty unless the state provider is a *FilesystemState.
+		var errorsDir string
+		if fsstate, ok := daemon.config.State.(*FilesystemState); ok {
+			errorsDir = fsstate.errorDir(nil)
+		}
 		info := &StaleLogListInfo{
 			Source:        daemon.config.LogListSource,
 			LastSuccess:   daemon.logsLoadedAt,
 			LastError:     daemon.logListError,
 			LastErrorTime: daemon.logListErrorAt,
+			RecentErrors:  recentErrors,
+			ErrorsDir:     errorsDir,
 		}
 		if err := daemon.config.State.NotifyHealthCheckFailure(ctx, nil, info); err != nil {
 			return fmt.Errorf("error notifying about stale log list: %w", err)
diff --git a/monitor/fileutils.go b/monitor/fileutils.go
index 26a7919..96826a8 100644
--- a/monitor/fileutils.go
+++ b/monitor/fileutils.go
@@ -14,7 +14,9 @@ import (
 	"encoding/hex"
 	"encoding/json"
 	"fmt"
+	"io"
 	"os"
+	"slices"
 )
 
 func randomFileSuffix() string {
@@ -69,3 +71,64 @@ func fileExists(filename string) bool {
 	_, err := os.Lstat(filename)
 	return err == nil
 }
+
+// tailFile returns the last linesWanted lines of the named file and the
+// number of lines found, which is smaller than linesWanted if the file has
+// fewer lines than that.
+func tailFile(filename string, linesWanted int) ([]byte, int, error) {
+	file, err := os.Open(filename)
+	if err != nil {
+		return nil, 0, err
+	}
+	defer file.Close()
+	return tail(file, linesWanted, 4096)
+}
+
+// tail reads r backwards from its end, chunkSize bytes at a time, until it
+// has found linesWanted newline-terminated lines or reached the start of r.
+// It returns the bytes of those lines and the number of lines found. If r
+// does not end in a newline, the unterminated final line is included in the
+// returned bytes but is not counted.
+func tail(r io.ReadSeeker, linesWanted int, chunkSize int) ([]byte, int, error) {
+	if chunkSize <= 0 {
+		// A non-positive chunk size would never make progress towards the start of r.
+		return nil, 0, fmt.Errorf("tail: invalid chunk size %d", chunkSize)
+	}
+
+	var buf []byte
+	linesGot := 0
+
+	offset, err := r.Seek(0, io.SeekEnd)
+	if err != nil {
+		return nil, 0, err
+	}
+	for offset > 0 {
+		readSize := chunkSize
+		if offset < int64(readSize) {
+			readSize = int(offset)
+		}
+		offset -= int64(readSize)
+		if _, err := r.Seek(offset, io.SeekStart); err != nil {
+			return nil, 0, err
+		}
+		// Shift what has been read so far to the right, making room at the
+		// front of buf for this earlier chunk.
+		buf = slices.Grow(buf, readSize)
+		copy(buf[readSize:len(buf)+readSize], buf)
+		buf = buf[:len(buf)+readSize]
+		if _, err := io.ReadFull(r, buf[:readSize]); err != nil {
+			return nil, 0, err
+		}
+		// Scan the new chunk from its end. Once linesWanted newlines have been
+		// counted, the next newline found marks the start of the result.
+		for i := readSize; i > 0; i-- {
+			if buf[i-1] == '\n' {
+				if linesGot == linesWanted {
+					return buf[i:], linesGot, nil
+				}
+				linesGot++
+			}
+		}
+	}
+	return buf, linesGot, nil
+}
diff --git a/monitor/fsstate.go b/monitor/fsstate.go
index 8ad7851..3ab0d2c 100644
--- a/monitor/fsstate.go
+++ b/monitor/fsstate.go
@@ -20,12 +20,21 @@ import (
 	"os"
 	"path/filepath"
 	"strings"
+	"sync"
+	"time"
 
 	"software.sslmate.com/src/certspotter/cttypes"
 	"software.sslmate.com/src/certspotter/loglist"
 	"software.sslmate.com/src/certspotter/merkletree"
 )
 
+// keepErrorDays is the number of days for which per-day error files are kept
+// and searched for recent errors.
+const keepErrorDays = 7
+
+// errorDateFormat is the time layout used to name each day's error file.
+const errorDateFormat = "2006-01-02"
+
 type FilesystemState struct {
 	StateDir  string
 	CacheDir  string
@@ -34,7 +43,7 @@ type FilesystemState struct {
 	ScriptDir string
 	Email     []string
 	Stdout    bool
-	Quiet     bool
+	errorMu   sync.Mutex // held by NotifyError while it appends to an error file
 }
 
 func (s *FilesystemState) logStateDir(logID LogID) string {
@@ -57,8 +66,9 @@ func (s *FilesystemState) PrepareLog(ctx context.Context, logID LogID) error {
 		sthsDirPath         = filepath.Join(stateDirPath, "unverified_sths")
 		malformedDirPath    = filepath.Join(stateDirPath, "malformed_entries")
 		healthchecksDirPath = filepath.Join(stateDirPath, "healthchecks")
+		errorsDirPath       = filepath.Join(stateDirPath, "errors")
 	)
-	for _, dirPath := range []string{stateDirPath, sthsDirPath, malformedDirPath, healthchecksDirPath} {
+	for _, dirPath := range []string{stateDirPath, sthsDirPath, malformedDirPath, healthchecksDirPath, errorsDirPath} {
 		if err := os.Mkdir(dirPath, 0777); err != nil && !errors.Is(err, fs.ErrExist) {
 			return err
 		}
@@ -227,6 +237,15 @@ func (s *FilesystemState) healthCheckDir(ctlog *loglist.Log) string {
 	}
 }
 
+// errorDir returns the directory holding the error files for ctlog, or the
+// directory for errors not associated with a log if ctlog is nil.
+func (s *FilesystemState) errorDir(ctlog *loglist.Log) string {
+	if ctlog == nil {
+		return filepath.Join(s.StateDir, "errors")
+	}
+	return filepath.Join(s.logStateDir(ctlog.LogID), "errors")
+}
+
 func (s *FilesystemState) NotifyHealthCheckFailure(ctx context.Context, ctlog *loglist.Log, info HealthCheckFailure) error {
 	textPath := filepath.Join(s.healthCheckDir(ctlog), healthCheckFilename())
 	environ := []string{
@@ -248,13 +267,92 @@ func (s *FilesystemState) NotifyHealthCheckFailure(ctx context.Context, ctlog *l
 	return nil
 }
 
-func (s *FilesystemState) NotifyError(ctx context.Context, ctlog *loglist.Log, err error) error {
-	if !s.Quiet {
-		if ctlog == nil {
-			log.Print(err)
-		} else {
-			log.Print(ctlog.GetMonitoringURL(), ": ", err)
+// NotifyError appends notifyErr, prefixed with the current time, to today's
+// error file for ctlog (or for errors without a log if ctlog is nil).
+func (s *FilesystemState) NotifyError(ctx context.Context, ctlog *loglist.Log, notifyErr error) error {
+	var (
+		now      = time.Now()
+		filePath = filepath.Join(s.errorDir(ctlog), now.Format(errorDateFormat))
+		line     = now.Format(time.RFC3339) + " " + notifyErr.Error() + "\n"
+	)
+
+	s.errorMu.Lock()
+	defer s.errorMu.Unlock()
+	file, err := os.OpenFile(filePath, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666)
+	if err != nil {
+		return err
+	}
+	defer file.Close()
+	if _, err := file.WriteString(line); err != nil {
+		return err
+	}
+	// Close explicitly so that an error from Close is returned to the caller.
+	return file.Close()
+}
+
+// GetErrors returns up to count of the most recent error lines recorded for
+// ctlog (or for errors without a log if ctlog is nil), oldest first, looking
+// back over at most keepErrorDays daily files.
+func (s *FilesystemState) GetErrors(ctx context.Context, ctlog *loglist.Log, count int) (string, error) {
+	dir := s.errorDir(ctlog)
+	now := time.Now()
+	var buf []byte
+	for daysBack := 0; count > 0 && daysBack < keepErrorDays; daysBack++ {
+		datePath := filepath.Join(dir, now.AddDate(0, 0, -daysBack).Format(errorDateFormat))
+		dateBuf, dateLines, err := tailFile(datePath, count)
+		if errors.Is(err, fs.ErrNotExist) {
+			continue
+		} else if err != nil {
+			return "", err
+		}
+		// Older days are visited later, so their lines go in front.
+		buf = append(dateBuf, buf...)
+		count -= dateLines
+	}
+	return string(buf), nil
+}
+
+// PruneOldErrors deletes error files dated more than keepErrorDays days ago
+// from the global errors directory and from every log's errors directory.
+// Failures are logged rather than returned.
+func (s *FilesystemState) PruneOldErrors() {
+	cutoff := time.Now().AddDate(0, 0, -keepErrorDays)
+	pruneDir := func(dir string) {
+		entries, err := os.ReadDir(dir)
+		if errors.Is(err, fs.ErrNotExist) {
+			return
+		} else if err != nil {
+			log.Printf("unable to read error directory: %s", err)
+			return
+		}
+		for _, entry := range entries {
+			if entry.IsDir() {
+				continue
+			}
+			// File names are local dates (see NotifyError), so parse them in
+			// the local zone to compare like with like against cutoff.
+			date, err := time.ParseInLocation(errorDateFormat, entry.Name(), time.Local)
+			if err != nil {
+				continue
+			}
+			if date.Before(cutoff) {
+				if err := os.Remove(filepath.Join(dir, entry.Name())); err != nil && !errors.Is(err, fs.ErrNotExist) {
+					log.Printf("unable to remove old error file: %s", err)
+				}
+			}
 		}
 	}
-	return nil
+	pruneDir(filepath.Join(s.StateDir, "errors"))
+	logsDir := filepath.Join(s.StateDir, "logs")
+	logDirs, err := os.ReadDir(logsDir)
+	if err != nil && !errors.Is(err, fs.ErrNotExist) {
+		log.Printf("unable to read logs directory: %s", err)
+		return
+	}
+	for _, d := range logDirs {
+		if !d.IsDir() {
+			continue
+		}
+		pruneDir(filepath.Join(logsDir, d.Name(), "errors"))
+	}
 }
diff --git a/monitor/healthcheck.go b/monitor/healthcheck.go
index 9b19dfa..5625fe3 100644
--- a/monitor/healthcheck.go
+++ b/monitor/healthcheck.go
@@ -19,6 +19,9 @@ import (
 	"software.sslmate.com/src/certspotter/loglist"
 )
 
+// recentErrorCount is the number of recent errors included in a health check failure.
+const recentErrorCount = 10
+
 func healthCheckFilename() string {
 	return time.Now().UTC().Format(time.RFC3339) + ".txt"
 }
@@ -48,20 +51,35 @@ func healthCheckLog(ctx context.Context, config *Config, ctlog *loglist.Log) err
 		return fmt.Errorf("error loading STHs: %w", err)
 	}
 
+	// errorsDir stays empty unless the state provider is a *FilesystemState.
+	var errorsDir string
+	if fsstate, ok := config.State.(*FilesystemState); ok {
+		errorsDir = fsstate.errorDir(ctlog)
+	}
+	// Both kinds of failure below report the same recent errors, so fetch them once.
+	recentErrors, err := config.State.GetErrors(ctx, ctlog, recentErrorCount)
+	if err != nil {
+		return fmt.Errorf("error getting recent errors: %w", err)
+	}
+
 	if len(sths) == 0 {
 		info := &StaleSTHInfo{
-			Log:         ctlog,
-			LastSuccess: lastSuccess,
-			LatestSTH:   verifiedSTH,
+			Log:          ctlog,
+			LastSuccess:  lastSuccess,
+			LatestSTH:    verifiedSTH,
+			RecentErrors: recentErrors,
+			ErrorsDir:    errorsDir,
 		}
 		if err := config.State.NotifyHealthCheckFailure(ctx, ctlog, info); err != nil {
 			return fmt.Errorf("error notifying about stale STH: %w", err)
 		}
 	} else {
 		info := &BacklogInfo{
-			Log:       ctlog,
-			LatestSTH: sths[len(sths)-1],
-			Position:  position,
+			Log:          ctlog,
+			LatestSTH:    sths[len(sths)-1],
+			Position:     position,
+			RecentErrors: recentErrors,
+			ErrorsDir:    errorsDir,
 		}
 		if err := config.State.NotifyHealthCheckFailure(ctx, ctlog, info); err != nil {
 			return fmt.Errorf("error notifying about backlog: %w", err)
@@ -77,15 +95,19 @@ type HealthCheckFailure interface {
 }
 
 type StaleSTHInfo struct {
-	Log         *loglist.Log
-	LastSuccess time.Time               // may be zero
-	LatestSTH   *cttypes.SignedTreeHead // may be nil
+	Log          *loglist.Log
+	LastSuccess  time.Time               // may be zero
+	LatestSTH    *cttypes.SignedTreeHead // may be nil
+	RecentErrors string
+	ErrorsDir    string
 }
 
 type BacklogInfo struct {
-	Log       *loglist.Log
-	LatestSTH *StoredSTH
-	Position  uint64
+	Log          *loglist.Log
+	LatestSTH    *StoredSTH
+	Position     uint64
+	RecentErrors string
+	ErrorsDir    string
 }
 
 type StaleLogListInfo struct {
@@ -93,6 +115,8 @@ type StaleLogListInfo struct {
 	LastSuccess   time.Time
 	LastError     string
 	LastErrorTime time.Time
+	RecentErrors  string
+	ErrorsDir     string
 }
 
 func (e *StaleSTHInfo) LastSuccessString() string {
@@ -120,33 +144,45 @@ func (e *StaleSTHInfo) Text() string {
 	text := new(strings.Builder)
 	fmt.Fprintf(text, "certspotter has been unable to contact %s since %s. Consequentially, certspotter may fail to notify you about certificates in this log.\n", e.Log.GetMonitoringURL(), e.LastSuccessString())
 	fmt.Fprintf(text, "\n")
-	fmt.Fprintf(text, "For details, enable -verbose and see certspotter's stderr output.\n")
-	fmt.Fprintf(text, "\n")
 	if e.LatestSTH != nil {
 		fmt.Fprintf(text, "Latest known log size = %d\n", e.LatestSTH.TreeSize)
 	} else {
 		fmt.Fprintf(text, "Latest known log size = none\n")
 	}
+	if e.RecentErrors != "" {
+		fmt.Fprintf(text, "\n")
+		fmt.Fprintf(text, "Recent errors (see %s for complete records):\n", e.ErrorsDir)
+		fmt.Fprintf(text, "\n")
+		fmt.Fprint(text, e.RecentErrors)
+	}
 	return text.String()
 }
 func (e *BacklogInfo) Text() string {
 	text := new(strings.Builder)
 	fmt.Fprintf(text, "certspotter has been unable to download entries from %s in a timely manner. Consequentially, certspotter may be slow to notify you about certificates in this log.\n", e.Log.GetMonitoringURL())
 	fmt.Fprintf(text, "\n")
-	fmt.Fprintf(text, "For details, enable -verbose and see certspotter's stderr output.\n")
-	fmt.Fprintf(text, "\n")
 	fmt.Fprintf(text, "Current log size = %d (as of %s)\n", e.LatestSTH.TreeSize, e.LatestSTH.StoredAt)
 	fmt.Fprintf(text, "Current position = %d\n", e.Position)
 	fmt.Fprintf(text, " Backlog = %d\n", e.Backlog())
+	if e.RecentErrors != "" {
+		fmt.Fprintf(text, "\n")
+		fmt.Fprintf(text, "Recent errors (see %s for complete records):\n", e.ErrorsDir)
+		fmt.Fprintf(text, "\n")
+		fmt.Fprint(text, e.RecentErrors)
+	}
 	return text.String()
 }
 func (e *StaleLogListInfo) Text() string {
 	text := new(strings.Builder)
 	fmt.Fprintf(text, "certspotter has been unable to retrieve the log list from %s since %s.\n", e.Source, e.LastSuccess)
 	fmt.Fprintf(text, "\n")
-	fmt.Fprintf(text, "Last error (at %s): %s\n", e.LastErrorTime, e.LastError)
-	fmt.Fprintf(text, "\n")
 	fmt.Fprintf(text, "Consequentially, certspotter may not be monitoring all logs, and might fail to detect certificates.\n")
+	if e.RecentErrors != "" {
+		fmt.Fprintf(text, "\n")
+		fmt.Fprintf(text, "Recent errors (see %s for complete records):\n", e.ErrorsDir)
+		fmt.Fprintf(text, "\n")
+		fmt.Fprint(text, e.RecentErrors)
+	}
 	return text.String()
 }
 
diff --git a/monitor/state.go b/monitor/state.go
index bc1afed..56ddc20 100644
--- a/monitor/state.go
+++ b/monitor/state.go
@@ -85,4 +85,9 @@ type StateProvider interface {
 	// not associated with a log. Note that most errors are transient, and
 	// certspotter will retry the failed operation later.
 	NotifyError(context.Context, *loglist.Log, error) error
+
+	// Retrieve up to the specified number of the most recent errors for the
+	// given log, or of the errors not associated with a log if the log is
+	// nil. The errors are returned as newline-terminated text.
+	GetErrors(context.Context, *loglist.Log, int) (string, error)
 }
diff --git a/monitor/statedir.go b/monitor/statedir.go
index a163a7d..d8d4198 100644
--- a/monitor/statedir.go
+++ b/monitor/statedir.go
@@ -145,7 +145,7 @@ func prepareStateDir(stateDir string) error {
 		return fmt.Errorf("%s was created by a newer version of certspotter; upgrade to the latest version of certspotter or remove this directory to start from scratch", stateDir)
 	}
 
-	for _, subdir := range []string{"certs", "logs", "healthchecks"} {
+	for _, subdir := range []string{"certs", "logs", "healthchecks", "errors"} {
 		if err := os.Mkdir(filepath.Join(stateDir, subdir), 0777); err != nil && !errors.Is(err, fs.ErrExist) {
 			return err
 		}