parent
a1b7e4ef3f
commit
802f1c96f8
|
@ -34,27 +34,55 @@ import (
|
||||||
"github.com/sirupsen/logrus"
|
"github.com/sirupsen/logrus"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
// Event was passed to the roomserver
|
||||||
|
MetricsOutcomeOK = "ok"
|
||||||
|
// Event failed to be processed
|
||||||
|
MetricsOutcomeFail = "fail"
|
||||||
|
// Event failed auth checks
|
||||||
|
MetricsOutcomeRejected = "rejected"
|
||||||
|
// Terminated the transaction
|
||||||
|
MetricsOutcomeFatal = "fatal"
|
||||||
|
// The event has missing auth_events we need to fetch
|
||||||
|
MetricsWorkMissingAuthEvents = "missing_auth_events"
|
||||||
|
// No work had to be done as we had all prev/auth events
|
||||||
|
MetricsWorkDirect = "direct"
|
||||||
|
// The event has missing prev_events we need to call /g_m_e for
|
||||||
|
MetricsWorkMissingPrevEvents = "missing_prev_events"
|
||||||
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
pduCountTotal = prometheus.NewCounterVec(
|
pduCountTotal = prometheus.NewCounterVec(
|
||||||
prometheus.CounterOpts{
|
prometheus.CounterOpts{
|
||||||
Namespace: "dendrite",
|
Namespace: "dendrite",
|
||||||
Subsystem: "federationapi",
|
Subsystem: "federationapi",
|
||||||
Name: "recv_pdus",
|
Name: "recv_pdus",
|
||||||
|
Help: "Number of incoming PDUs from remote servers with labels for success",
|
||||||
},
|
},
|
||||||
[]string{"status"},
|
[]string{"status"}, // 'success' or 'total'
|
||||||
)
|
)
|
||||||
eduCountTotal = prometheus.NewCounter(
|
eduCountTotal = prometheus.NewCounter(
|
||||||
prometheus.CounterOpts{
|
prometheus.CounterOpts{
|
||||||
Namespace: "dendrite",
|
Namespace: "dendrite",
|
||||||
Subsystem: "federationapi",
|
Subsystem: "federationapi",
|
||||||
Name: "recv_edus",
|
Name: "recv_edus",
|
||||||
|
Help: "Number of incoming EDUs from remote servers",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
processEventSummary = prometheus.NewSummaryVec(
|
||||||
|
prometheus.SummaryOpts{
|
||||||
|
Namespace: "dendrite",
|
||||||
|
Subsystem: "federationapi",
|
||||||
|
Name: "process_event",
|
||||||
|
Help: "How long it takes to process an incoming event and what work had to be done for it",
|
||||||
|
},
|
||||||
|
[]string{"work", "outcome"},
|
||||||
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
func init() {
|
func init() {
|
||||||
prometheus.MustRegister(
|
prometheus.MustRegister(
|
||||||
pduCountTotal, eduCountTotal,
|
pduCountTotal, eduCountTotal, processEventSummary,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -140,6 +168,7 @@ type txnReq struct {
|
||||||
// new events which the roomserver does not know about
|
// new events which the roomserver does not know about
|
||||||
newEvents map[string]bool
|
newEvents map[string]bool
|
||||||
newEventsMutex sync.RWMutex
|
newEventsMutex sync.RWMutex
|
||||||
|
work string // metrics
|
||||||
}
|
}
|
||||||
|
|
||||||
// A subset of FederationClient functionality that txn requires. Useful for testing.
|
// A subset of FederationClient functionality that txn requires. Useful for testing.
|
||||||
|
@ -212,6 +241,7 @@ func (t *txnReq) processTransaction(ctx context.Context) (*gomatrixserverlib.Res
|
||||||
|
|
||||||
// Process the events.
|
// Process the events.
|
||||||
for _, e := range pdus {
|
for _, e := range pdus {
|
||||||
|
evStart := time.Now()
|
||||||
if err := t.processEvent(ctx, e.Unwrap()); err != nil {
|
if err := t.processEvent(ctx, e.Unwrap()); err != nil {
|
||||||
// If the error is due to the event itself being bad then we skip
|
// If the error is due to the event itself being bad then we skip
|
||||||
// it and move onto the next event. We report an error so that the
|
// it and move onto the next event. We report an error so that the
|
||||||
|
@ -233,17 +263,25 @@ func (t *txnReq) processTransaction(ctx context.Context) (*gomatrixserverlib.Res
|
||||||
// our server so we should bail processing the transaction entirely.
|
// our server so we should bail processing the transaction entirely.
|
||||||
util.GetLogger(ctx).Warnf("Processing %s failed fatally: %s", e.EventID(), err)
|
util.GetLogger(ctx).Warnf("Processing %s failed fatally: %s", e.EventID(), err)
|
||||||
jsonErr := util.ErrorResponse(err)
|
jsonErr := util.ErrorResponse(err)
|
||||||
|
processEventSummary.WithLabelValues(t.work, MetricsOutcomeFatal).Observe(
|
||||||
|
float64(time.Since(evStart).Nanoseconds()) / 1000.,
|
||||||
|
)
|
||||||
return nil, &jsonErr
|
return nil, &jsonErr
|
||||||
} else {
|
} else {
|
||||||
// Auth errors mean the event is 'rejected' which have to be silent to appease sytest
|
// Auth errors mean the event is 'rejected' which have to be silent to appease sytest
|
||||||
errMsg := ""
|
errMsg := ""
|
||||||
|
outcome := MetricsOutcomeRejected
|
||||||
_, rejected := err.(*gomatrixserverlib.NotAllowed)
|
_, rejected := err.(*gomatrixserverlib.NotAllowed)
|
||||||
if !rejected {
|
if !rejected {
|
||||||
errMsg = err.Error()
|
errMsg = err.Error()
|
||||||
|
outcome = MetricsOutcomeFail
|
||||||
}
|
}
|
||||||
util.GetLogger(ctx).WithError(err).WithField("event_id", e.EventID()).WithField("rejected", rejected).Warn(
|
util.GetLogger(ctx).WithError(err).WithField("event_id", e.EventID()).WithField("rejected", rejected).Warn(
|
||||||
"Failed to process incoming federation event, skipping",
|
"Failed to process incoming federation event, skipping",
|
||||||
)
|
)
|
||||||
|
processEventSummary.WithLabelValues(t.work, outcome).Observe(
|
||||||
|
float64(time.Since(evStart).Nanoseconds()) / 1000.,
|
||||||
|
)
|
||||||
results[e.EventID()] = gomatrixserverlib.PDUResult{
|
results[e.EventID()] = gomatrixserverlib.PDUResult{
|
||||||
Error: errMsg,
|
Error: errMsg,
|
||||||
}
|
}
|
||||||
|
@ -251,6 +289,9 @@ func (t *txnReq) processTransaction(ctx context.Context) (*gomatrixserverlib.Res
|
||||||
} else {
|
} else {
|
||||||
results[e.EventID()] = gomatrixserverlib.PDUResult{}
|
results[e.EventID()] = gomatrixserverlib.PDUResult{}
|
||||||
pduCountTotal.WithLabelValues("success").Inc()
|
pduCountTotal.WithLabelValues("success").Inc()
|
||||||
|
processEventSummary.WithLabelValues(t.work, MetricsOutcomeOK).Observe(
|
||||||
|
float64(time.Since(evStart).Nanoseconds()) / 1000.,
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -452,6 +493,7 @@ func (t *txnReq) getServers(ctx context.Context, roomID string) []gomatrixserver
|
||||||
|
|
||||||
func (t *txnReq) processEvent(ctx context.Context, e *gomatrixserverlib.Event) error {
|
func (t *txnReq) processEvent(ctx context.Context, e *gomatrixserverlib.Event) error {
|
||||||
logger := util.GetLogger(ctx).WithField("event_id", e.EventID()).WithField("room_id", e.RoomID())
|
logger := util.GetLogger(ctx).WithField("event_id", e.EventID()).WithField("room_id", e.RoomID())
|
||||||
|
t.work = "" // reset from previous event
|
||||||
|
|
||||||
// Work out if the roomserver knows everything it needs to know to auth
|
// Work out if the roomserver knows everything it needs to know to auth
|
||||||
// the event. This includes the prev_events and auth_events.
|
// the event. This includes the prev_events and auth_events.
|
||||||
|
@ -480,6 +522,7 @@ func (t *txnReq) processEvent(ctx context.Context, e *gomatrixserverlib.Event) e
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(stateResp.MissingAuthEventIDs) > 0 {
|
if len(stateResp.MissingAuthEventIDs) > 0 {
|
||||||
|
t.work = MetricsWorkMissingAuthEvents
|
||||||
logger.Infof("Event refers to %d unknown auth_events", len(stateResp.MissingAuthEventIDs))
|
logger.Infof("Event refers to %d unknown auth_events", len(stateResp.MissingAuthEventIDs))
|
||||||
if err := t.retrieveMissingAuthEvents(ctx, e, &stateResp); err != nil {
|
if err := t.retrieveMissingAuthEvents(ctx, e, &stateResp); err != nil {
|
||||||
return fmt.Errorf("t.retrieveMissingAuthEvents: %w", err)
|
return fmt.Errorf("t.retrieveMissingAuthEvents: %w", err)
|
||||||
|
@ -487,9 +530,11 @@ func (t *txnReq) processEvent(ctx context.Context, e *gomatrixserverlib.Event) e
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(stateResp.MissingPrevEventIDs) > 0 {
|
if len(stateResp.MissingPrevEventIDs) > 0 {
|
||||||
|
t.work = MetricsWorkMissingPrevEvents
|
||||||
logger.Infof("Event refers to %d unknown prev_events", len(stateResp.MissingPrevEventIDs))
|
logger.Infof("Event refers to %d unknown prev_events", len(stateResp.MissingPrevEventIDs))
|
||||||
return t.processEventWithMissingState(ctx, e, stateResp.RoomVersion)
|
return t.processEventWithMissingState(ctx, e, stateResp.RoomVersion)
|
||||||
}
|
}
|
||||||
|
t.work = MetricsWorkDirect
|
||||||
|
|
||||||
// pass the event to the roomserver which will do auth checks
|
// pass the event to the roomserver which will do auth checks
|
||||||
// If the event fail auth checks, gmsl.NotAllowed error will be returned which we be silently
|
// If the event fail auth checks, gmsl.NotAllowed error will be returned which we be silently
|
||||||
|
@ -784,7 +829,7 @@ func (t *txnReq) lookupStateAfterEventLocally(ctx context.Context, roomID, event
|
||||||
queryReq := api.QueryEventsByIDRequest{
|
queryReq := api.QueryEventsByIDRequest{
|
||||||
EventIDs: missingEventList,
|
EventIDs: missingEventList,
|
||||||
}
|
}
|
||||||
util.GetLogger(ctx).Infof("Fetching missing auth events: %v", missingEventList)
|
util.GetLogger(ctx).WithField("count", len(missingEventList)).Infof("Fetching missing auth events")
|
||||||
var queryRes api.QueryEventsByIDResponse
|
var queryRes api.QueryEventsByIDResponse
|
||||||
if err = t.rsAPI.QueryEventsByID(ctx, &queryReq, &queryRes); err != nil {
|
if err = t.rsAPI.QueryEventsByID(ctx, &queryReq, &queryRes); err != nil {
|
||||||
return nil
|
return nil
|
||||||
|
@ -854,10 +899,6 @@ retryAllowedState:
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// getMissingEvents returns a nil backwardsExtremity if missing events were fetched and handled, else returns the new backwards extremity which we should
|
|
||||||
// begin from. Returns an error only if we should terminate the transaction which initiated /get_missing_events
|
|
||||||
// This function recursively calls txnReq.processEvent with the missing events, which will be processed before this function returns.
|
|
||||||
// This means that we may recursively call this function, as we spider back up prev_events.
|
|
||||||
func (t *txnReq) getMissingEvents(ctx context.Context, e *gomatrixserverlib.Event, roomVersion gomatrixserverlib.RoomVersion) (newEvents []*gomatrixserverlib.Event, err error) {
|
func (t *txnReq) getMissingEvents(ctx context.Context, e *gomatrixserverlib.Event, roomVersion gomatrixserverlib.RoomVersion) (newEvents []*gomatrixserverlib.Event, err error) {
|
||||||
logger := util.GetLogger(ctx).WithField("event_id", e.EventID()).WithField("room_id", e.RoomID())
|
logger := util.GetLogger(ctx).WithField("event_id", e.EventID()).WithField("room_id", e.RoomID())
|
||||||
needed := gomatrixserverlib.StateNeededForAuth([]*gomatrixserverlib.Event{e})
|
needed := gomatrixserverlib.StateNeededForAuth([]*gomatrixserverlib.Event{e})
|
||||||
|
|
|
@ -26,9 +26,28 @@ import (
|
||||||
"github.com/matrix-org/dendrite/keyserver/api"
|
"github.com/matrix-org/dendrite/keyserver/api"
|
||||||
"github.com/matrix-org/gomatrixserverlib"
|
"github.com/matrix-org/gomatrixserverlib"
|
||||||
"github.com/matrix-org/util"
|
"github.com/matrix-org/util"
|
||||||
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
"github.com/sirupsen/logrus"
|
"github.com/sirupsen/logrus"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var (
|
||||||
|
deviceListUpdateCount = prometheus.NewCounterVec(
|
||||||
|
prometheus.CounterOpts{
|
||||||
|
Namespace: "dendrite",
|
||||||
|
Subsystem: "keyserver",
|
||||||
|
Name: "device_list_update",
|
||||||
|
Help: "Number of times we have attempted to update device lists from this server",
|
||||||
|
},
|
||||||
|
[]string{"server"},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
prometheus.MustRegister(
|
||||||
|
deviceListUpdateCount,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
// DeviceListUpdater handles device list updates from remote servers.
|
// DeviceListUpdater handles device list updates from remote servers.
|
||||||
//
|
//
|
||||||
// In the case where we have the prev_id for an update, the updater just stores the update (after acquiring a per-user lock).
|
// In the case where we have the prev_id for an update, the updater just stores the update (after acquiring a per-user lock).
|
||||||
|
@ -319,6 +338,7 @@ func (u *DeviceListUpdater) worker(ch chan gomatrixserverlib.ServerName) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerName) (time.Duration, bool) {
|
func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerName) (time.Duration, bool) {
|
||||||
|
deviceListUpdateCount.WithLabelValues(string(serverName)).Inc()
|
||||||
requestTimeout := time.Second * 30 // max amount of time we want to spend on each request
|
requestTimeout := time.Second * 30 // max amount of time we want to spend on each request
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
|
ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
|
|
Loading…
Reference in New Issue