better reconnection better tracing

main
Charlotte Som 2024-11-25 20:35:26 +02:00
parent aefb726109
commit 642abf6aa5
2 changed files with 73 additions and 43 deletions

View File

@ -1,6 +1,6 @@
use std::{io::Cursor, sync::Arc}; use std::{io::Cursor, sync::Arc, time::Duration};
use anyhow::{anyhow, bail, Context, Result}; use anyhow::{bail, Result};
use atrium_api::com::atproto::sync::subscribe_repos; use atrium_api::com::atproto::sync::subscribe_repos;
use bytes::Bytes; use bytes::Bytes;
use fastwebsockets::{FragmentCollector, OpCode, Payload, WebSocketError}; use fastwebsockets::{FragmentCollector, OpCode, Payload, WebSocketError};
@ -8,14 +8,8 @@ use hyper::{header, upgrade::Upgraded, Request, Uri};
use hyper_util::rt::{TokioExecutor, TokioIo}; use hyper_util::rt::{TokioExecutor, TokioIo};
use ipld_core::ipld::Ipld; use ipld_core::ipld::Ipld;
use serde_ipld_dagcbor::DecodeError; use serde_ipld_dagcbor::DecodeError;
use tokio::{ use tokio::{net::TcpStream, sync::mpsc};
net::TcpStream, use tokio_rustls::rustls::pki_types::ServerName;
sync::{
broadcast::{self},
mpsc,
},
};
use tokio_rustls::{rustls::pki_types::ServerName, TlsConnector};
use crate::{ use crate::{
http::body_empty, http::body_empty,
@ -60,22 +54,46 @@ async fn create_ws_client(
struct DataServerSubscription { struct DataServerSubscription {
server: Arc<RelayServer>, server: Arc<RelayServer>,
host: String, host: String,
raw_block_tx: broadcast::Sender<Bytes>,
event_tx: mpsc::Sender<StreamEvent>, event_tx: mpsc::Sender<StreamEvent>,
last_seq: Option<i128>, last_seq: Option<i64>,
} }
impl DataServerSubscription { impl DataServerSubscription {
fn new(server: Arc<RelayServer>, host: String) -> Self { fn new(server: Arc<RelayServer>, host: String) -> Self {
Self { Self {
host, host,
raw_block_tx: server.raw_block_tx.clone(),
event_tx: server.event_tx.clone(), event_tx: server.event_tx.clone(),
last_seq: None, last_seq: None,
server, server,
} }
} }
async fn handle_commit(
&mut self,
payload: subscribe_repos::Commit,
) -> Result<Option<StreamEventPayload>> {
let user = lookup_user(&self.server, &payload.repo).await?;
let Some(pds) = user.pds else {
bail!("user has no associated pds? {:?}", user);
};
let uri: Uri = pds.parse()?;
if uri.authority().map(|a| a.host()) != Some(&self.host) {
bail!(
"commit from non-authoritative pds (got {} expected {})",
self.host,
pds
);
}
if user.takedown {
tracing::debug!(did = %user.did, seq = %payload.seq, "dropping commit event from taken-down user");
return Ok(None);
}
self.last_seq = Some(payload.seq);
Ok(Some(StreamEventPayload::Commit(payload)))
}
async fn handle_event(&mut self, frame: Bytes) -> Result<()> { async fn handle_event(&mut self, frame: Bytes) -> Result<()> {
// TODO: validate if this message is valid to come from this host // TODO: validate if this message is valid to come from this host
@ -92,52 +110,60 @@ impl DataServerSubscription {
Some("#commit") => { Some("#commit") => {
let payload = let payload =
serde_ipld_dagcbor::from_slice::<subscribe_repos::Commit>(payload_buf)?; serde_ipld_dagcbor::from_slice::<subscribe_repos::Commit>(payload_buf)?;
self.handle_commit(payload).await?
}
let user = lookup_user(&self.server, &payload.repo).await?; Some("#handle") => {
let Some(pds) = user.pds else { // TODO
bail!("user has no associated pds? {}", user.did); None
}; }
let uri: Uri = pds.parse()?;
if uri.authority().map(|a| a.host()) != Some(&self.host) { Some("#info") => {
bail!( let payload = serde_ipld_dagcbor::from_slice::<subscribe_repos::Info>(payload_buf)?;
"commit from non-authoritative pds (got {} expected {})", if payload.name == "OutdatedCursor" {
self.host, tracing::warn!(message = ?payload.message, "outdated cursor");
pds
);
} }
StreamEventPayload::Commit(payload) None
} }
Some(t) => { Some(t) => {
tracing::warn!("dropped unknown message type '{}'", t); tracing::warn!("dropped unknown message type '{}'", t);
return Ok(()); None
}
None => {
return Ok(());
// skip ig
} }
None => None,
}; };
self.event_tx.send((header, payload)).await?; if let Some(payload) = payload {
self.event_tx.send((header, payload)).await?;
}
Ok(()) Ok(())
} }
} }
pub async fn subscribe_to_host(server: Arc<RelayServer>, host: String) -> Result<()> { #[tracing::instrument(skip_all, fields(host = %host))]
tracing::debug!(%host, "establishing connection"); async fn host_subscription(server: Arc<RelayServer>, host: String) -> Result<()> {
tracing::debug!("establishing connection");
let mut subscription = DataServerSubscription::new(server, host); let mut subscription = DataServerSubscription::new(server, host);
// TODO: load seq from db ?
'reconnect: loop { 'reconnect: loop {
let mut ws = create_ws_client( let mut ws = create_ws_client(
&subscription.host, &subscription.host,
443, 443,
"/xrpc/com.atproto.sync.subscribeRepos", &format!(
"/xrpc/com.atproto.sync.subscribeRepos{}",
subscription
.last_seq
.map(|c| format!("?cursor={c}"))
.unwrap_or_default()
),
) )
.await?; .await?;
tracing::debug!(seq = ?subscription.last_seq, "listening");
tracing::debug!(host = %subscription.host, "listening");
loop { loop {
match ws.read_frame().await { match ws.read_frame().await {
@ -153,15 +179,19 @@ pub async fn subscribe_to_host(server: Arc<RelayServer>, host: String) -> Result
tracing::error!("error handling event (skipping): {e:?}"); tracing::error!("error handling event (skipping): {e:?}");
} }
} }
Ok(frame) if frame.opcode == OpCode::Close => {
tracing::debug!("got close frame. reconnecting in 10s");
tokio::time::sleep(Duration::from_secs(10)).await;
continue 'reconnect;
}
Ok(frame) => { Ok(frame) => {
tracing::warn!("unexpected frame type {:?}", frame.opcode); tracing::warn!("unexpected frame type {:?}", frame.opcode);
} }
Err(e) => { Err(e) => {
tracing::error!(host = %subscription.host, "{e:?}"); tracing::error!("{e:?}");
// TODO: should we try reconnect in every situation? // TODO: should we try to reconnect in every situation?
if let WebSocketError::UnexpectedEOF = e { if let WebSocketError::UnexpectedEOF = e {
tracing::debug!(host = %subscription.host, "reconnecting"); tracing::debug!("got unexpected EOF. reconnecting immediately");
// TODO: should we sleep at all here
continue 'reconnect; continue 'reconnect;
} else { } else {
break 'reconnect; break 'reconnect;
@ -171,7 +201,7 @@ pub async fn subscribe_to_host(server: Arc<RelayServer>, host: String) -> Result
} }
} }
tracing::debug!(host = %subscription.host, "disconnected"); tracing::debug!("disconnected");
Ok(()) Ok(())
} }
@ -183,7 +213,7 @@ pub fn index_servers(server: Arc<RelayServer>, hosts: &[String]) {
let host = host.to_string(); let host = host.to_string();
let server = Arc::clone(&server); let server = Arc::clone(&server);
tokio::task::spawn(async move { tokio::task::spawn(async move {
if let Err(e) = subscribe_to_host(server, host).await { if let Err(e) = host_subscription(server, host).await {
tracing::warn!("encountered error subscribing to PDS: {e:?}"); tracing::warn!("encountered error subscribing to PDS: {e:?}");
} }
}); });

View File

@ -14,7 +14,7 @@ use crate::{
RelayServer, RelayServer,
}; };
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize, Debug, Clone)]
pub struct User { pub struct User {
pub did: String, pub did: String,
pub pds: Option<String>, pub pds: Option<String>,