Store log errors in state directory

Instead of writing log errors to stderr, write them to a file in the state directory. When reporting a health check failure, include the path to the file and the last several lines.

Log files are named by date, and the last 7 days are kept.

Closes #106
This commit is contained in:
Andrew Ayer 2025-06-29 17:21:15 -04:00
parent 5a8dd2ca82
commit 4fbbc5818e
7 changed files with 217 additions and 29 deletions

View File

@ -192,7 +192,6 @@ func main() {
ScriptDir: defaultScriptDir(),
Email: flags.email,
Stdout: flags.stdout,
Quiet: !flags.verbose,
}
config := &monitor.Config{
LogListSource: flags.logs,
@ -241,6 +240,19 @@ func main() {
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
defer stop()
go func() {
ticker := time.NewTicker(24*time.Hour)
defer ticker.Stop()
for {
fsstate.PruneOldErrors()
select {
case <-ctx.Done():
return
case <-ticker.C:
}
}
}()
if err := monitor.Run(ctx, config); ctx.Err() == context.Canceled && errors.Is(err, context.Canceled) {
if flags.verbose {
fmt.Fprintf(os.Stderr, "%s: exiting due to SIGINT or SIGTERM\n", programName)

View File

@ -50,11 +50,21 @@ type daemon struct {
func (daemon *daemon) healthCheck(ctx context.Context) error {
if time.Since(daemon.logsLoadedAt) >= daemon.config.HealthCheckInterval {
errors, err := daemon.config.State.GetErrors(ctx, nil, recentErrorCount)
if err != nil {
return fmt.Errorf("error getting recent errors: %w", err)
}
var errorsDir string
if fsstate, ok := daemon.config.State.(*FilesystemState); ok {
errorsDir = fsstate.errorDir(nil)
}
info := &StaleLogListInfo{
Source: daemon.config.LogListSource,
LastSuccess: daemon.logsLoadedAt,
LastError: daemon.logListError,
LastErrorTime: daemon.logListErrorAt,
RecentErrors: errors,
ErrorsDir: errorsDir,
}
if err := daemon.config.State.NotifyHealthCheckFailure(ctx, nil, info); err != nil {
return fmt.Errorf("error notifying about stale log list: %w", err)

View File

@ -14,7 +14,9 @@ import (
"encoding/hex"
"encoding/json"
"fmt"
"io"
"os"
"slices"
)
func randomFileSuffix() string {
@ -69,3 +71,47 @@ func fileExists(filename string) bool {
_, err := os.Lstat(filename)
return err == nil
}
// tailFile opens filename and hands it to tail, asking for the last
// linesWanted lines and reading the file in 4096-byte chunks.
//
// It returns whatever tail returns (the bytes, the number of lines counted,
// and an error), or the error from os.Open if the file cannot be opened.
func tailFile(filename string, linesWanted int) ([]byte, int, error) {
	const readChunkSize = 4096

	f, openErr := os.Open(filename)
	if openErr != nil {
		return nil, 0, openErr
	}
	defer f.Close()

	return tail(f, linesWanted, readChunkSize)
}
// tail returns the last linesWanted lines of r.
//
// It seeks to the end of r and reads backwards in chunks of chunkSize bytes,
// prepending each chunk to a buffer and counting '\n' bytes from the end. A
// line is counted by its terminating newline; any bytes after the final
// newline are included in the result but not counted. Once one newline more
// than linesWanted has been seen, everything after that newline is returned.
// If r holds fewer lines than requested, its whole content is returned.
//
// The returned int is the number of newlines counted in the result. A
// non-positive chunkSize is replaced by a default of 4096 bytes; without
// that, a zero chunk size would never make progress and a negative one would
// panic in slices.Grow.
func tail(r io.ReadSeeker, linesWanted int, chunkSize int) ([]byte, int, error) {
	const defaultChunkSize = 4096
	if chunkSize <= 0 {
		chunkSize = defaultChunkSize
	}

	var buf []byte
	linesGot := 0

	// offset is the position in r of the first byte currently held in buf.
	offset, err := r.Seek(0, io.SeekEnd)
	if err != nil {
		return nil, 0, err
	}
	for offset > 0 {
		readSize := chunkSize
		if offset < int64(readSize) {
			readSize = int(offset)
		}
		offset -= int64(readSize)
		if _, err := r.Seek(offset, io.SeekStart); err != nil {
			return nil, 0, err
		}

		// Make room at the front of buf: ensure capacity, shift the bytes
		// already collected right by readSize, then read the new (earlier)
		// chunk into the gap.
		buf = slices.Grow(buf, readSize)
		copy(buf[readSize:len(buf)+readSize], buf)
		buf = buf[:len(buf)+readSize]
		if _, err := io.ReadFull(r, buf[:readSize]); err != nil {
			return nil, 0, err
		}

		// Scan only the freshly read chunk, from its end towards its start.
		for i := readSize; i > 0; i-- {
			if buf[i-1] == '\n' {
				if linesGot == linesWanted {
					// This newline ends the line just before the ones we
					// want, so the result starts right after it.
					return buf[i:], linesGot, nil
				}
				linesGot++
			}
		}
	}
	return buf, linesGot, nil
}

View File

@ -20,12 +20,17 @@ import (
"os"
"path/filepath"
"strings"
"sync"
"time"
"software.sslmate.com/src/certspotter/cttypes"
"software.sslmate.com/src/certspotter/loglist"
"software.sslmate.com/src/certspotter/merkletree"
)
// keepErrorDays is the number of days of error files that are kept:
// GetErrors looks back at most this many days, and PruneOldErrors removes
// files dated more than this many days ago.
const keepErrorDays = 7
// errorDateFormat is the time layout (year-month-day) used to name each
// day's error file and to parse those names again when pruning.
const errorDateFormat = "2006-01-02"
type FilesystemState struct {
StateDir string
CacheDir string
@ -34,7 +39,7 @@ type FilesystemState struct {
ScriptDir string
Email []string
Stdout bool
Quiet bool
errorMu sync.Mutex
}
func (s *FilesystemState) logStateDir(logID LogID) string {
@ -57,8 +62,9 @@ func (s *FilesystemState) PrepareLog(ctx context.Context, logID LogID) error {
sthsDirPath = filepath.Join(stateDirPath, "unverified_sths")
malformedDirPath = filepath.Join(stateDirPath, "malformed_entries")
healthchecksDirPath = filepath.Join(stateDirPath, "healthchecks")
errorsDirPath = filepath.Join(stateDirPath, "errors")
)
for _, dirPath := range []string{stateDirPath, sthsDirPath, malformedDirPath, healthchecksDirPath} {
for _, dirPath := range []string{stateDirPath, sthsDirPath, malformedDirPath, healthchecksDirPath, errorsDirPath} {
if err := os.Mkdir(dirPath, 0777); err != nil && !errors.Is(err, fs.ErrExist) {
return err
}
@ -227,6 +233,13 @@ func (s *FilesystemState) healthCheckDir(ctlog *loglist.Log) string {
}
}
// errorDir returns the directory that holds error files. With a nil ctlog
// this is the "errors" directory directly under StateDir; otherwise it is the
// "errors" directory under the state directory of the given log.
func (s *FilesystemState) errorDir(ctlog *loglist.Log) string {
	parent := s.StateDir
	if ctlog != nil {
		parent = s.logStateDir(ctlog.LogID)
	}
	return filepath.Join(parent, "errors")
}
func (s *FilesystemState) NotifyHealthCheckFailure(ctx context.Context, ctlog *loglist.Log, info HealthCheckFailure) error {
textPath := filepath.Join(s.healthCheckDir(ctlog), healthCheckFilename())
environ := []string{
@ -248,13 +261,80 @@ func (s *FilesystemState) NotifyHealthCheckFailure(ctx context.Context, ctlog *l
return nil
}
func (s *FilesystemState) NotifyError(ctx context.Context, ctlog *loglist.Log, err error) error {
if !s.Quiet {
if ctlog == nil {
log.Print(err)
} else {
log.Print(ctlog.GetMonitoringURL(), ": ", err)
// NotifyError records notifyErr by appending one line to the error file for
// the current date inside errorDir(ctlog). The line is the current time in
// RFC 3339 format, a space, the error text, and a newline.
//
// Appends are serialized with errorMu. The returned error is the failure, if
// any, to open, write, or close the file.
func (s *FilesystemState) NotifyError(ctx context.Context, ctlog *loglist.Log, notifyErr error) error {
	timestamp := time.Now()
	path := filepath.Join(s.errorDir(ctlog), timestamp.Format(errorDateFormat))
	entry := timestamp.Format(time.RFC3339) + " " + notifyErr.Error() + "\n"

	s.errorMu.Lock()
	defer s.errorMu.Unlock()

	out, openErr := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666)
	if openErr != nil {
		return openErr
	}
	// Guarantees the descriptor is released on the early-return path below;
	// on success the explicit Close reports any close error to the caller.
	defer out.Close()

	if _, writeErr := out.WriteString(entry); writeErr != nil {
		return writeErr
	}
	return out.Close()
}
// GetErrors returns up to count of the most recently recorded error lines
// from errorDir(ctlog), oldest first.
//
// It starts with today's error file and walks back one day at a time, for at
// most keepErrorDays days, taking the tail of each file until count lines
// have been collected. Days with no error file are skipped; any other
// failure to read a file is returned.
func (s *FilesystemState) GetErrors(ctx context.Context, ctlog *loglist.Log, count int) (string, error) {
	var (
		errorsDir = s.errorDir(ctlog)
		today     = time.Now()
		collected []byte
	)
	for age := 0; age < keepErrorDays; age++ {
		if count <= 0 {
			break
		}
		fileName := today.AddDate(0, 0, -age).Format(errorDateFormat)
		chunk, lines, err := tailFile(filepath.Join(errorsDir, fileName), count)
		switch {
		case errors.Is(err, fs.ErrNotExist):
			continue
		case err != nil:
			return "", err
		}
		// Older days go in front of what has been gathered so far.
		collected = append(chunk, collected...)
		count -= lines
	}
	return string(collected), nil
}
// PruneOldErrors removes error files dated more than keepErrorDays days ago.
//
// It prunes the "errors" directory directly under StateDir and then the
// "errors" directory of every subdirectory of StateDir/logs. Entries that are
// directories, or whose names do not parse as errorDateFormat, are left
// alone. Failures are logged rather than returned; a missing directory is not
// treated as a failure.
func (s *FilesystemState) PruneOldErrors() {
	cutoff := time.Now().AddDate(0, 0, -keepErrorDays)

	// pruneDir removes the outdated error files found directly inside dir.
	pruneDir := func(dir string) {
		entries, err := os.ReadDir(dir)
		if errors.Is(err, fs.ErrNotExist) {
			return
		} else if err != nil {
			log.Printf("unable to read error directory: %s", err)
			return
		}
		for _, entry := range entries {
			if entry.IsDir() {
				continue
			}
			// Error files are named after the local date on which they were
			// written, so the name has to be interpreted in the local zone.
			// time.Parse would read it as UTC midnight and, in zones west of
			// UTC, could delete a file up to a day early.
			date, err := time.ParseInLocation(errorDateFormat, entry.Name(), time.Local)
			if err != nil {
				continue
			}
			if date.Before(cutoff) {
				if err := os.Remove(filepath.Join(dir, entry.Name())); err != nil && !errors.Is(err, fs.ErrNotExist) {
					log.Printf("unable to remove old error file: %s", err)
				}
			}
		}
	}

	pruneDir(filepath.Join(s.StateDir, "errors"))

	logsDir := filepath.Join(s.StateDir, "logs")
	logDirs, err := os.ReadDir(logsDir)
	if err != nil && !errors.Is(err, fs.ErrNotExist) {
		log.Printf("unable to read logs directory: %s", err)
		return
	}
	for _, d := range logDirs {
		if !d.IsDir() {
			continue
		}
		pruneDir(filepath.Join(logsDir, d.Name(), "errors"))
	}
}

View File

@ -19,6 +19,8 @@ import (
"software.sslmate.com/src/certspotter/loglist"
)
// recentErrorCount is how many recent errors are requested from
// State.GetErrors when building a health check failure notification.
const recentErrorCount = 10
// healthCheckFilename returns a file name made of the current UTC time in
// RFC 3339 format followed by ".txt".
func healthCheckFilename() string {
	const extension = ".txt"
	stamp := time.Now().UTC().Format(time.RFC3339)
	return stamp + extension
}
@ -48,20 +50,37 @@ func healthCheckLog(ctx context.Context, config *Config, ctlog *loglist.Log) err
return fmt.Errorf("error loading STHs: %w", err)
}
var errorsDir string
if fsstate, ok := config.State.(*FilesystemState); ok {
errorsDir = fsstate.errorDir(ctlog)
}
if len(sths) == 0 {
errors, err := config.State.GetErrors(ctx, ctlog, recentErrorCount)
if err != nil {
return fmt.Errorf("error getting recent errors: %w", err)
}
info := &StaleSTHInfo{
Log: ctlog,
LastSuccess: lastSuccess,
LatestSTH: verifiedSTH,
Log: ctlog,
LastSuccess: lastSuccess,
LatestSTH: verifiedSTH,
RecentErrors: errors,
ErrorsDir: errorsDir,
}
if err := config.State.NotifyHealthCheckFailure(ctx, ctlog, info); err != nil {
return fmt.Errorf("error notifying about stale STH: %w", err)
}
} else {
errors, err := config.State.GetErrors(ctx, ctlog, recentErrorCount)
if err != nil {
return fmt.Errorf("error getting recent errors: %w", err)
}
info := &BacklogInfo{
Log: ctlog,
LatestSTH: sths[len(sths)-1],
Position: position,
Log: ctlog,
LatestSTH: sths[len(sths)-1],
Position: position,
RecentErrors: errors,
ErrorsDir: errorsDir,
}
if err := config.State.NotifyHealthCheckFailure(ctx, ctlog, info); err != nil {
return fmt.Errorf("error notifying about backlog: %w", err)
@ -77,15 +96,19 @@ type HealthCheckFailure interface {
}
// StaleSTHInfo carries the details of a stale-STH health check failure for a
// single log. Its Text method turns it into the message sent to the user.
type StaleSTHInfo struct {
	Log          *loglist.Log
	LastSuccess  time.Time               // may be zero
	LatestSTH    *cttypes.SignedTreeHead // may be nil
	RecentErrors string                  // recent error lines included in Text; may be empty
	ErrorsDir    string                  // directory Text points to for the complete error records
}
// BacklogInfo carries the details of a backlog health check failure for a
// single log. Its Text method turns it into the message sent to the user.
type BacklogInfo struct {
	Log          *loglist.Log
	LatestSTH    *StoredSTH
	Position     uint64
	RecentErrors string // recent error lines included in Text; may be empty
	ErrorsDir    string // directory Text points to for the complete error records
}
type StaleLogListInfo struct {
@ -93,6 +116,8 @@ type StaleLogListInfo struct {
LastSuccess time.Time
LastError string
LastErrorTime time.Time
RecentErrors string
ErrorsDir string
}
func (e *StaleSTHInfo) LastSuccessString() string {
@ -120,33 +145,45 @@ func (e *StaleSTHInfo) Text() string {
text := new(strings.Builder)
fmt.Fprintf(text, "certspotter has been unable to contact %s since %s. Consequentially, certspotter may fail to notify you about certificates in this log.\n", e.Log.GetMonitoringURL(), e.LastSuccessString())
fmt.Fprintf(text, "\n")
fmt.Fprintf(text, "For details, enable -verbose and see certspotter's stderr output.\n")
fmt.Fprintf(text, "\n")
if e.LatestSTH != nil {
fmt.Fprintf(text, "Latest known log size = %d\n", e.LatestSTH.TreeSize)
} else {
fmt.Fprintf(text, "Latest known log size = none\n")
}
if e.RecentErrors != "" {
fmt.Fprintf(text, "\n")
fmt.Fprintf(text, "Recent errors (see %s for complete records):\n", e.ErrorsDir)
fmt.Fprintf(text, "\n")
fmt.Fprint(text, e.RecentErrors)
}
return text.String()
}
// Text returns the human-readable body of a backlog notification: which log
// is behind, the current log size and when it was stored, the current
// position, and the backlog.
//
// When RecentErrors is non-empty it is appended under a "Recent errors"
// heading. The heading names ErrorsDir as the place to find complete records
// only if ErrorsDir is set, since it is empty for state providers that do not
// keep errors on the filesystem.
//
// The message no longer advises enabling -verbose and reading stderr, because
// errors are written to files in the state directory instead of stderr.
func (e *BacklogInfo) Text() string {
	text := new(strings.Builder)
	fmt.Fprintf(text, "certspotter has been unable to download entries from %s in a timely manner. Consequentially, certspotter may be slow to notify you about certificates in this log.\n", e.Log.GetMonitoringURL())
	fmt.Fprintf(text, "\n")
	fmt.Fprintf(text, "Current log size = %d (as of %s)\n", e.LatestSTH.TreeSize, e.LatestSTH.StoredAt)
	fmt.Fprintf(text, "Current position = %d\n", e.Position)
	fmt.Fprintf(text, " Backlog = %d\n", e.Backlog())
	if e.RecentErrors != "" {
		fmt.Fprintf(text, "\n")
		if e.ErrorsDir != "" {
			fmt.Fprintf(text, "Recent errors (see %s for complete records):\n", e.ErrorsDir)
		} else {
			fmt.Fprintf(text, "Recent errors:\n")
		}
		fmt.Fprintf(text, "\n")
		fmt.Fprint(text, e.RecentErrors)
	}
	return text.String()
}
// Text returns the human-readable body of a stale-log-list notification: the
// log list source, the time of the last successful retrieval, the last error
// and when it occurred, and a warning about the consequences.
//
// When RecentErrors is non-empty it is appended under a "Recent errors"
// heading. The heading names ErrorsDir as the place to find complete records
// only if ErrorsDir is set, since it is empty for state providers that do not
// keep errors on the filesystem.
func (e *StaleLogListInfo) Text() string {
	text := new(strings.Builder)
	fmt.Fprintf(text, "certspotter has been unable to retrieve the log list from %s since %s.\n", e.Source, e.LastSuccess)
	fmt.Fprintf(text, "\n")
	fmt.Fprintf(text, "Last error (at %s): %s\n", e.LastErrorTime, e.LastError)
	fmt.Fprintf(text, "\n")
	fmt.Fprintf(text, "Consequentially, certspotter may not be monitoring all logs, and might fail to detect certificates.\n")
	if e.RecentErrors != "" {
		fmt.Fprintf(text, "\n")
		if e.ErrorsDir != "" {
			fmt.Fprintf(text, "Recent errors (see %s for complete records):\n", e.ErrorsDir)
		} else {
			fmt.Fprintf(text, "Recent errors:\n")
		}
		fmt.Fprintf(text, "\n")
		fmt.Fprint(text, e.RecentErrors)
	}
	return text.String()
}

View File

@ -85,4 +85,7 @@ type StateProvider interface {
// not associated with a log. Note that most errors are transient, and
// certspotter will retry the failed operation later.
NotifyError(context.Context, *loglist.Log, error) error
// Retrieve the specified number of most recent errors.
GetErrors(context.Context, *loglist.Log, int) (string, error)
}

View File

@ -145,7 +145,7 @@ func prepareStateDir(stateDir string) error {
return fmt.Errorf("%s was created by a newer version of certspotter; upgrade to the latest version of certspotter or remove this directory to start from scratch", stateDir)
}
for _, subdir := range []string{"certs", "logs", "healthchecks"} {
for _, subdir := range []string{"certs", "logs", "healthchecks", "errors"} {
if err := os.Mkdir(filepath.Join(stateDir, subdir), 0777); err != nil && !errors.Is(err, fs.ErrExist) {
return err
}