refactor(server::client::auth): removed state machine and added approval flow coordination
Some checks failed
ci/woodpecker/pr/server-audit Pipeline was successful
ci/woodpecker/pr/server-vet Pipeline failed
ci/woodpecker/pr/server-lint Pipeline failed
ci/woodpecker/pr/server-test Pipeline was successful

This commit is contained in:
hdbg
2026-03-11 20:18:06 +01:00
parent 606a1f3774
commit ba86d18250
9 changed files with 329 additions and 256 deletions

View File

@@ -4,6 +4,52 @@ This document covers concrete technology choices and dependencies. For the archi
--- ---
## Client Connection Flow
### New Client Approval
When a client whose public key is not yet in the database connects, all connected user agents are asked to approve the connection. The first agent to respond determines the outcome; remaining requests are cancelled via a watch channel.
```mermaid
flowchart TD
A([Client connects]) --> B[Receive AuthChallengeRequest]
B --> C{pubkey in DB?}
C -- yes --> D[Read nonce\nIncrement nonce in DB]
D --> G
C -- no --> E[Ask all UserAgents:\nClientConnectionRequest]
E --> F{First response}
F -- denied --> Z([Reject connection])
F -- approved --> F2[Cancel remaining\nUserAgent requests]
F2 --> F3[INSERT client\nnonce = 1]
F3 --> G[Send AuthChallenge\nwith nonce]
G --> H[Receive AuthChallengeSolution]
H --> I{Signature valid?}
I -- no --> Z
I -- yes --> J([Session started])
```
### Known Issue: Concurrent Registration Race (TOCTOU)
Two connections presenting the same previously-unknown public key can race through the approval flow simultaneously:
1. Both check the DB → neither is registered.
2. Both request approval from user agents → both receive approval.
3. Both `INSERT` the client record → the second insert silently overwrites the first, resetting the nonce.
This means the first connection's nonce is invalidated by the second, causing its challenge verification to fail. A fix requires either serialising new-client registration (e.g. an in-memory lock keyed on pubkey) or replacing the separate check + insert with an `INSERT OR IGNORE` / upsert guarded by a unique constraint on `public_key`.
### Nonce Semantics
The `program_client.nonce` column stores the **next usable nonce** — i.e. it is always one ahead of the nonce last issued in a challenge.
- **New client:** inserted with `nonce = 1`; the first challenge is issued with `nonce = 0`.
- **Existing client:** the current DB value is read and used as the challenge nonce, then immediately incremented within the same exclusive transaction, preventing replay.
---
## Cryptography ## Cryptography
### Authentication ### Authentication

View File

@@ -83,6 +83,23 @@ use async_trait::async_trait;
pub enum Error { pub enum Error {
#[error("Transport channel is closed")] #[error("Transport channel is closed")]
ChannelClosed, ChannelClosed,
#[error("Unexpected message received")]
UnexpectedMessage,
}
/// Receives one message from `transport` and extracts a value from it using
/// `extractor`. Returns [`Error::ChannelClosed`] if the transport closes and
/// [`Error::UnexpectedMessage`] if `extractor` returns `None`.
pub async fn expect_message<T, Inbound, Outbound, Target, F>(
transport: &mut T,
extractor: F,
) -> Result<Target, Error>
where
T: Bi<Inbound, Outbound> + ?Sized,
F: FnOnce(Inbound) -> Option<Target>,
{
let msg = transport.recv().await.ok_or(Error::ChannelClosed)?;
extractor(msg).ok_or(Error::UnexpectedMessage)
} }
/// Minimal bidirectional transport abstraction used by protocol code. /// Minimal bidirectional transport abstraction used by protocol code.

View File

@@ -0,0 +1,251 @@
use arbiter_proto::{
format_challenge,
proto::client::{
AuthChallenge, AuthChallengeSolution, ClientConnectError, ClientRequest, ClientResponse,
client_connect_error::Code as ConnectErrorCode,
client_request::Payload as ClientRequestPayload,
client_response::Payload as ClientResponsePayload,
},
transport::expect_message,
};
use diesel::{
ExpressionMethods as _, OptionalExtension as _, QueryDsl as _, dsl::insert_into, update,
};
use diesel_async::RunQueryDsl as _;
use ed25519_dalek::VerifyingKey;
use kameo::error::SendError;
use tracing::error;
use crate::{
actors::{client::ClientConnection, router::{self, RequestClientApproval}},
db::{self, schema::program_client},
};
use super::session::ClientSession;
#[derive(thiserror::Error, Debug, Clone, PartialEq, Eq)]
pub enum Error {
#[error("Unexpected message payload")]
UnexpectedMessagePayload,
#[error("Invalid client public key length")]
InvalidClientPubkeyLength,
#[error("Invalid client public key encoding")]
InvalidAuthPubkeyEncoding,
#[error("Database pool unavailable")]
DatabasePoolUnavailable,
#[error("Database operation failed")]
DatabaseOperationFailed,
#[error("Invalid challenge solution")]
InvalidChallengeSolution,
#[error("Client approval request failed")]
ApproveError(#[from] ApproveError),
#[error("Internal error")]
InternalError,
#[error("Transport error")]
Transport,
}
#[derive(thiserror::Error, Debug, Clone, PartialEq, Eq)]
pub enum ApproveError {
#[error("Internal error")]
Internal,
#[error("Client connection denied by user agents")]
Denied,
#[error("Upstream error: {0}")]
Upstream(router::ApprovalError),
}
/// Atomically reads and increments the nonce for a known client.
/// Returns `None` if the pubkey is not registered.
async fn get_nonce(db: &db::DatabasePool, pubkey: &VerifyingKey) -> Result<Option<i32>, Error> {
let pubkey_bytes = pubkey.as_bytes().to_vec();
let mut conn = db.get().await.map_err(|e| {
error!(error = ?e, "Database pool error");
Error::DatabasePoolUnavailable
})?;
conn.exclusive_transaction(|conn| {
let pubkey_bytes = pubkey_bytes.clone();
Box::pin(async move {
let Some(current_nonce) = program_client::table
.filter(program_client::public_key.eq(&pubkey_bytes))
.select(program_client::nonce)
.first::<i32>(conn)
.await
.optional()?
else {
return Result::<_, diesel::result::Error>::Ok(None);
};
update(program_client::table)
.filter(program_client::public_key.eq(&pubkey_bytes))
.set(program_client::nonce.eq(current_nonce + 1))
.execute(conn)
.await?;
Ok(Some(current_nonce))
})
})
.await
.map_err(|e| {
error!(error = ?e, "Database error");
Error::DatabaseOperationFailed
})
}
async fn approve_new_client(
actors: &crate::actors::GlobalActors,
pubkey: VerifyingKey,
) -> Result<(), Error> {
let result = actors
.router
.ask(RequestClientApproval { client_pubkey: pubkey })
.await;
match result {
Ok(true) => Ok(()),
Ok(false) => Err(Error::ApproveError(ApproveError::Denied)),
Err(SendError::HandlerError(e)) => {
error!(error = ?e, "Approval upstream error");
Err(Error::ApproveError(ApproveError::Upstream(e)))
}
Err(e) => {
error!(error = ?e, "Approval request to router failed");
Err(Error::ApproveError(ApproveError::Internal))
}
}
}
async fn insert_client(db: &db::DatabasePool, pubkey: &VerifyingKey) -> Result<(), Error> {
let now = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap_or_default()
.as_secs() as i32;
let mut conn = db.get().await.map_err(|e| {
error!(error = ?e, "Database pool error");
Error::DatabasePoolUnavailable
})?;
insert_into(program_client::table)
.values((
program_client::public_key.eq(pubkey.as_bytes().to_vec()),
program_client::nonce.eq(1), // pre-incremented; challenge uses 0
program_client::created_at.eq(now),
program_client::updated_at.eq(now),
))
.execute(&mut conn)
.await
.map_err(|e| {
error!(error = ?e, "Failed to insert new client");
Error::DatabaseOperationFailed
})?;
Ok(())
}
async fn challenge_client(
props: &mut ClientConnection,
pubkey: VerifyingKey,
nonce: i32,
) -> Result<(), Error> {
let challenge = AuthChallenge {
pubkey: pubkey.as_bytes().to_vec(),
nonce,
};
props
.transport
.send(Ok(ClientResponse {
payload: Some(ClientResponsePayload::AuthChallenge(challenge.clone())),
}))
.await
.map_err(|e| {
error!(error = ?e, "Failed to send auth challenge");
Error::Transport
})?;
let AuthChallengeSolution { signature } = expect_message(
&mut *props.transport,
|req: ClientRequest| match req.payload? {
ClientRequestPayload::AuthChallengeSolution(s) => Some(s),
_ => None,
},
)
.await
.map_err(|e| {
error!(error = ?e, "Failed to receive challenge solution");
Error::Transport
})?;
let formatted = format_challenge(nonce, &challenge.pubkey);
let sig = signature.as_slice().try_into().map_err(|_| {
error!("Invalid signature length");
Error::InvalidChallengeSolution
})?;
pubkey.verify_strict(&formatted, &sig).map_err(|_| {
error!("Challenge solution verification failed");
Error::InvalidChallengeSolution
})?;
Ok(())
}
fn connect_error_code(err: &Error) -> ConnectErrorCode {
match err {
Error::ApproveError(ApproveError::Denied) => ConnectErrorCode::ApprovalDenied,
Error::ApproveError(ApproveError::Upstream(router::ApprovalError::NoUserAgentsConnected)) => {
ConnectErrorCode::NoUserAgentsOnline
}
_ => ConnectErrorCode::Unknown,
}
}
async fn authenticate(props: &mut ClientConnection) -> Result<VerifyingKey, Error> {
let Some(ClientRequest {
payload: Some(ClientRequestPayload::AuthChallengeRequest(challenge)),
}) = props.transport.recv().await
else {
return Err(Error::Transport);
};
let pubkey_bytes = challenge
.pubkey
.as_array()
.ok_or(Error::InvalidClientPubkeyLength)?;
let pubkey =
VerifyingKey::from_bytes(pubkey_bytes).map_err(|_| Error::InvalidAuthPubkeyEncoding)?;
let nonce = match get_nonce(&props.db, &pubkey).await? {
Some(nonce) => nonce,
None => {
approve_new_client(&props.actors, pubkey).await?;
insert_client(&props.db, &pubkey).await?;
0
}
};
challenge_client(props, pubkey, nonce).await?;
Ok(pubkey)
}
pub async fn authenticate_and_create(mut props: ClientConnection) -> Result<ClientSession, Error> {
match authenticate(&mut props).await {
Ok(pubkey) => Ok(ClientSession::new(props, pubkey)),
Err(err) => {
let code = connect_error_code(&err);
let _ = props
.transport
.send(Ok(ClientResponse {
payload: Some(ClientResponsePayload::ClientConnectError(
ClientConnectError { code: code.into() },
)),
}))
.await;
Err(err)
}
}
}

View File

@@ -1,101 +0,0 @@
use arbiter_proto::proto::client::{
AuthChallengeRequest, AuthChallengeSolution, ClientRequest,
client_request::Payload as ClientRequestPayload,
};
use ed25519_dalek::VerifyingKey;
use tracing::error;
use crate::actors::client::{
ClientConnection,
auth::state::{AuthContext, AuthStateMachine},
session::ClientSession,
};
#[derive(thiserror::Error, Debug, Clone, PartialEq, Eq)]
pub enum Error {
#[error("Unexpected message payload")]
UnexpectedMessagePayload,
#[error("Invalid client public key length")]
InvalidClientPubkeyLength,
#[error("Invalid client public key encoding")]
InvalidAuthPubkeyEncoding,
#[error("Database pool unavailable")]
DatabasePoolUnavailable,
#[error("Database operation failed")]
DatabaseOperationFailed,
#[error("Public key not registered")]
PublicKeyNotRegistered,
#[error("Invalid signature length")]
InvalidSignatureLength,
#[error("Invalid challenge solution")]
InvalidChallengeSolution,
#[error("Transport error")]
Transport,
}
mod state;
use state::*;
fn parse_auth_event(payload: ClientRequestPayload) -> Result<AuthEvents, Error> {
match payload {
ClientRequestPayload::AuthChallengeRequest(AuthChallengeRequest { pubkey }) => {
let pubkey_bytes = pubkey.as_array().ok_or(Error::InvalidClientPubkeyLength)?;
let pubkey = VerifyingKey::from_bytes(pubkey_bytes)
.map_err(|_| Error::InvalidAuthPubkeyEncoding)?;
Ok(AuthEvents::AuthRequest(ChallengeRequest {
pubkey: pubkey.into(),
}))
}
ClientRequestPayload::AuthChallengeSolution(AuthChallengeSolution { signature }) => {
Ok(AuthEvents::ReceivedSolution(ChallengeSolution {
solution: signature,
}))
}
}
}
pub async fn authenticate(props: &mut ClientConnection) -> Result<VerifyingKey, Error> {
let mut state = AuthStateMachine::new(AuthContext::new(props));
loop {
let transport = state.context_mut().conn.transport.as_mut();
let Some(ClientRequest {
payload: Some(payload),
}) = transport.recv().await
else {
return Err(Error::Transport);
};
let event = parse_auth_event(payload)?;
match state.process_event(event).await {
Ok(AuthStates::AuthOk(key)) => return Ok(key.clone()),
Err(AuthError::ActionFailed(err)) => {
error!(?err, "State machine action failed");
return Err(err);
}
Err(AuthError::GuardFailed(err)) => {
error!(?err, "State machine guard failed");
return Err(err);
}
Err(AuthError::InvalidEvent) => {
error!("Invalid event for current state");
return Err(Error::InvalidChallengeSolution);
}
Err(AuthError::TransitionsFailed) => {
error!("Invalid state transition");
return Err(Error::InvalidChallengeSolution);
}
_ => (),
}
}
}
pub async fn authenticate_and_create(
mut props: ClientConnection,
) -> Result<ClientSession, Error> {
let key = authenticate(&mut props).await?;
let session = ClientSession::new(props, key);
Ok(session)
}

View File

@@ -1,136 +0,0 @@
use arbiter_proto::proto::client::{
AuthChallenge, ClientResponse,
client_response::Payload as ClientResponsePayload,
};
use diesel::{ExpressionMethods as _, OptionalExtension as _, QueryDsl, update};
use diesel_async::RunQueryDsl;
use ed25519_dalek::VerifyingKey;
use tracing::error;
use super::Error;
use crate::{actors::client::ClientConnection, db::schema};
pub struct ChallengeRequest {
pub pubkey: VerifyingKey,
}
pub struct ChallengeContext {
pub challenge: AuthChallenge,
pub key: VerifyingKey,
}
pub struct ChallengeSolution {
pub solution: Vec<u8>,
}
smlang::statemachine!(
name: Auth,
custom_error: true,
transitions: {
*Init + AuthRequest(ChallengeRequest) / async prepare_challenge = SentChallenge(ChallengeContext),
SentChallenge(ChallengeContext) + ReceivedSolution(ChallengeSolution) [async verify_solution] / provide_key = AuthOk(VerifyingKey),
}
);
async fn create_nonce(db: &crate::db::DatabasePool, pubkey_bytes: &[u8]) -> Result<i32, Error> {
let mut db_conn = db.get().await.map_err(|e| {
error!(error = ?e, "Database pool error");
Error::DatabasePoolUnavailable
})?;
db_conn
.exclusive_transaction(|conn| {
Box::pin(async move {
let current_nonce = schema::program_client::table
.filter(schema::program_client::public_key.eq(pubkey_bytes.to_vec()))
.select(schema::program_client::nonce)
.first::<i32>(conn)
.await?;
update(schema::program_client::table)
.filter(schema::program_client::public_key.eq(pubkey_bytes.to_vec()))
.set(schema::program_client::nonce.eq(current_nonce + 1))
.execute(conn)
.await?;
Result::<_, diesel::result::Error>::Ok(current_nonce)
})
})
.await
.optional()
.map_err(|e| {
error!(error = ?e, "Database error");
Error::DatabaseOperationFailed
})?
.ok_or_else(|| {
error!(?pubkey_bytes, "Public key not found in database");
Error::PublicKeyNotRegistered
})
}
pub struct AuthContext<'a> {
pub(super) conn: &'a mut ClientConnection,
}
impl<'a> AuthContext<'a> {
pub fn new(conn: &'a mut ClientConnection) -> Self {
Self { conn }
}
}
impl AuthStateMachineContext for AuthContext<'_> {
type Error = Error;
async fn verify_solution(
&self,
ChallengeContext { challenge, key }: &ChallengeContext,
ChallengeSolution { solution }: &ChallengeSolution,
) -> Result<bool, Self::Error> {
let formatted_challenge =
arbiter_proto::format_challenge(challenge.nonce, &challenge.pubkey);
let signature = solution.as_slice().try_into().map_err(|_| {
error!(?solution, "Invalid signature length");
Error::InvalidChallengeSolution
})?;
let valid = key.verify_strict(&formatted_challenge, &signature).is_ok();
Ok(valid)
}
async fn prepare_challenge(
&mut self,
ChallengeRequest { pubkey }: ChallengeRequest,
) -> Result<ChallengeContext, Self::Error> {
let nonce = create_nonce(&self.conn.db, pubkey.as_bytes()).await?;
let challenge = AuthChallenge {
pubkey: pubkey.as_bytes().to_vec(),
nonce,
};
self.conn
.transport
.send(Ok(ClientResponse {
payload: Some(ClientResponsePayload::AuthChallenge(challenge.clone())),
}))
.await
.map_err(|e| {
error!(?e, "Failed to send auth challenge");
Error::Transport
})?;
Ok(ChallengeContext {
challenge,
key: pubkey,
})
}
fn provide_key(
&mut self,
state_data: &ChallengeContext,
_: ChallengeSolution,
) -> Result<VerifyingKey, Self::Error> {
Ok(state_data.key)
}
}

View File

@@ -56,7 +56,7 @@ impl Actor for MessageRouter {
} }
} }
#[derive(Debug, thiserror::Error)] #[derive(Debug, thiserror::Error, Clone, PartialEq, Eq, Hash)]
pub enum ApprovalError { pub enum ApprovalError {
#[error("No user agents connected")] #[error("No user agents connected")]
NoUserAgentsConnected, NoUserAgentsConnected,

View File

@@ -1,17 +1,14 @@
use std::{ops::DerefMut, sync::Mutex}; use std::{ops::DerefMut, sync::Mutex};
use arbiter_proto::proto::{ use arbiter_proto::proto::user_agent::{
client,
user_agent::{
ClientConnectionCancel, ClientConnectionRequest, UnsealEncryptedKey, UnsealResult, ClientConnectionCancel, ClientConnectionRequest, UnsealEncryptedKey, UnsealResult,
UnsealStart, UnsealStartResponse, UserAgentRequest, UserAgentResponse, UnsealStart, UnsealStartResponse, UserAgentRequest, UserAgentResponse,
user_agent_request::Payload as UserAgentRequestPayload, user_agent_request::Payload as UserAgentRequestPayload,
user_agent_response::Payload as UserAgentResponsePayload, user_agent_response::Payload as UserAgentResponsePayload,
}, };
};
use chacha20poly1305::{AeadInPlace, XChaCha20Poly1305, XNonce, aead::KeyInit}; use chacha20poly1305::{AeadInPlace, XChaCha20Poly1305, XNonce, aead::KeyInit};
use ed25519_dalek::VerifyingKey; use ed25519_dalek::VerifyingKey;
use kameo::{Actor, error::SendError, message, messages, prelude::Context}; use kameo::{Actor, error::SendError, messages, prelude::Context};
use memsafe::MemSafe; use memsafe::MemSafe;
use tokio::{select, sync::watch}; use tokio::{select, sync::watch};
use tracing::{error, info}; use tracing::{error, info};
@@ -62,7 +59,7 @@ impl UserAgentSession {
async fn send_msg<Reply: kameo::Reply>( async fn send_msg<Reply: kameo::Reply>(
&mut self, &mut self,
msg: UserAgentResponsePayload, msg: UserAgentResponsePayload,
ctx: &mut Context<Self, Reply>, _ctx: &mut Context<Self, Reply>,
) -> Result<(), Error> { ) -> Result<(), Error> {
self.props self.props
.transport .transport
@@ -111,7 +108,6 @@ impl UserAgentSession {
#[messages] #[messages]
impl UserAgentSession { impl UserAgentSession {
// TODO: Think about refactoring it to state-machine based flow, as we already have one // TODO: Think about refactoring it to state-machine based flow, as we already have one
#[message(ctx)] #[message(ctx)]
pub async fn request_new_client_approval( pub async fn request_new_client_approval(

View File

@@ -1,12 +1,13 @@
#![allow(unused)] #![allow(unused)]
#![allow(clippy::all)] #![allow(clippy::all)]
use crate::db::schema::{self, aead_encrypted, arbiter_settings, root_key_history, tls_history}; use crate::db::{ schema::{self, aead_encrypted, arbiter_settings, root_key_history, tls_history}};
use diesel::{prelude::*, sqlite::Sqlite}; use diesel::{prelude::*, sql_types::Bool, sqlite::Sqlite};
use restructed::Models; use restructed::Models;
pub mod types { pub mod types {
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use diesel::{deserialize::FromSql, expression::AsExpression, serialize::{IsNull, ToSql}, sql_types::Integer, sqlite::Sqlite};
pub struct SqliteTimestamp(DateTime<Utc>); pub struct SqliteTimestamp(DateTime<Utc>);
} }
@@ -58,8 +59,8 @@ pub struct TlsHistory {
pub id: i32, pub id: i32,
pub cert: String, pub cert: String,
pub cert_key: String, // PEM Encoded private key pub cert_key: String, // PEM Encoded private key
pub ca_cert: String, // PEM Encoded certificate for cert signing pub ca_cert: String, // PEM Encoded certificate for cert signing
pub ca_key: String, // PEM Encoded public key for cert signing pub ca_key: String, // PEM Encoded public key for cert signing
pub created_at: i32, pub created_at: i32,
} }
@@ -68,10 +69,10 @@ pub struct TlsHistory {
pub struct ArbiterSettings { pub struct ArbiterSettings {
pub id: i32, pub id: i32,
pub root_key_id: Option<i32>, // references root_key_history.id pub root_key_id: Option<i32>, // references root_key_history.id
pub tls_id: Option<i32>, // references tls_history.id pub tls_id: Option<i32>, // references tls_history.id
} }
#[derive(Queryable, Debug)] #[derive(Queryable, Debug, Insertable, Selectable)]
#[diesel(table_name = schema::program_client, check_for_backend(Sqlite))] #[diesel(table_name = schema::program_client, check_for_backend(Sqlite))]
pub struct ProgramClient { pub struct ProgramClient {
pub id: i32, pub id: i32,

View File

@@ -77,13 +77,12 @@ fn client_auth_error_status(value: &client::auth::Error) -> Status {
Error::InvalidAuthPubkeyEncoding => { Error::InvalidAuthPubkeyEncoding => {
Status::invalid_argument("Failed to convert pubkey to VerifyingKey") Status::invalid_argument("Failed to convert pubkey to VerifyingKey")
} }
Error::InvalidSignatureLength => Status::invalid_argument("Invalid signature length"), Error::InvalidChallengeSolution => Status::unauthenticated(value.to_string()),
Error::PublicKeyNotRegistered | Error::InvalidChallengeSolution => { Error::ApproveError(_) => Status::permission_denied(value.to_string()),
Status::unauthenticated(value.to_string())
}
Error::Transport => Status::internal("Transport error"), Error::Transport => Status::internal("Transport error"),
Error::DatabasePoolUnavailable => Status::internal("Database pool error"), Error::DatabasePoolUnavailable => Status::internal("Database pool error"),
Error::DatabaseOperationFailed => Status::internal("Database error"), Error::DatabaseOperationFailed => Status::internal("Database error"),
Error::InternalError => Status::internal("Internal error"),
} }
} }