doc: further comments on server architecture

This commit is contained in:
Wojciech Nawrocki 2020-09-13 19:39:21 -04:00 committed by Sebastian Ullrich
parent 8decfb6b1d
commit c3b333b9e7
4 changed files with 111 additions and 73 deletions

View file

@ -12,13 +12,9 @@ import Lean.Server.Snapshots
import Lean.Data.Lsp
import Lean.Data.Json.FromToJson
namespace Lean
namespace Server
/-
Each file worker process manages a single file. File workers are launched and terminated
by a watchdog process. See Watchdog.lean for a description of how file workers are expected
to interact with the watchdog process.
/-!
For general server architecture, see `README.md`. For details of IPC communication, see `Watchdog.lean`.
This module implements per-file worker processes.
File processing and requests+notifications against a file should be concurrent for two reasons:
- By the LSP standard, requests should be cancellable.
@ -32,12 +28,15 @@ On didChange notifications, we search for the task in which the change occured.
a task that has not yet finished before finding the task we're looking for, we terminate it
and start the elaboration there, otherwise we start the elaboration at the task where the change occured.
Requests iterate over tasks until they find the command that they need to answer the request.
Requests iterate over tasks until they find the command that they need to answer the request.
In order to not block the main thread, this is done in a request task.
If a task that the request task waits for is terminated, a change occured somewhere before the
command that the request is looking for and the request sends a "content changed" error.
-/
namespace Lean
namespace Server
open Lsp
open IO
open Snapshots
@ -69,7 +68,7 @@ t ← asTask act;
pure $ t.map $ fun error => match error with
| Except.ok e => e
| Except.error ioError => Except.error (TaskError.ioError ioError)
private partial def runCore (h : FS.Stream) (uri : DocumentUri) (version : Nat) (contents : FileMap) : Snapshot → IO (Except TaskError ElabTask)
| parent => do
result ← compileNextCmd contents.source parent;
@ -78,7 +77,7 @@ private partial def runCore (h : FS.Stream) (uri : DocumentUri) (version : Nat)
sendDiagnosticsCore h uri version contents snap.msgLog;
-- TODO(MH): check for interrupt (maybe with increased precision even in compileNextCmd, but not after runTask!)
t ← runTask (runCore snap);
pure (Except.ok ⟨snap, t⟩)
pure (Except.ok ⟨snap, t⟩)
| Sum.inr msgLog => pure (Except.error TaskError.eof)
def run (h : FS.Stream) (uri : DocumentUri) (version : Nat) (contents : FileMap) (parent : Snapshot) : IO ElabTask := do
@ -135,7 +134,7 @@ partial def branchOffAt (h : FS.Stream) (uri : DocumentUri) (version : Nat) (con
-- next might finish before it sees the interrupt, so we must chase down the chain of tasks
-- interruptAfter task; -- TODO(MH): we may not need to interrupt since tasks without refs are marked as cancelled by the GC
pure newNext
end ElabTask
/-- A document editable in the sense that we track the environment

View file

@ -9,7 +9,7 @@ import Lean.Data.Lsp
namespace Lean
namespace Lsp
class HasFileSource (α : Type*) :=
class HasFileSource (α : Type*) :=
(fileSource : α → DocumentUri)
export HasFileSource (fileSource)

View file

@ -2,15 +2,11 @@
(Assuming `lean4` is the `elan` toolchain for stage 0.5)
```
leanmake +lean4 bin PKG=ServerBin LINK_OPTS=-rdynamic
cd $LEAN4_HOME/src/Lean/Server/
leanmake +lean4 bin PKG=Watchdog LINK_OPTS=-rdynamic
leanmake +lean4 bin PKG=FileWorker LINK_OPTS=-rdynamic
```
## Server code style
Comments should exist to denote specifics of our implementation but, for
the most part, we shouldn't copy comments over from the LSP specification
to avoid unnecessary duplication.
## Connecting clients
### Emacs with lsp-mode
@ -25,9 +21,14 @@ An easy way to get an LSP client is to build the [sample extension](https://gith
```typescript
let serverOptions: ServerOptions = {
command: "/path/to/build/bin/ServerBin",
command: "$LEAN4_HOME/src/Lean/Server/build/bin/Watchdog",
args: [],
options: null
options: {
env: {
LEAN_PATH: "$LEAN4_HOME/build/$RELEASE_OR_DEBUG/stage0.5/lib/lean/",
LEAN_WORKER_PATH: "$LEAN4_HOME/src/Lean/Server/build/bin/FileWorker"
}
}
};
```
@ -46,22 +47,22 @@ or if logging LSP requests using Netcat (below):
### In `bash` with Netcat:
```
cd $LEAN4_HOME/src/Lean/Server/
mkfifo pipe
# So that the server can find and import packages
export LEAN_PATH=$LEAN4_HOME/build/$RELEASE_OR_DEBUG/stage0.5/lib/lean/
nc -l -p 12345 < pipe | tee client.log | ./build/bin/ServerBin 2> stderr | tee pipe server.log
export LEAN_WORKER_PATH=$LEAN4_HOME/src/Lean/Server/build/bin/FileWorker
nc -l -p 12345 < pipe | tee client.log | ./build/bin/Watchdog 2> stderr | tee pipe server.log
```
will create three files to follow with `tail -f` -- `client.log` for client messages, `server.log` for server messages and `stderr` for server `IO.stderr` debugging.
### In VSCode
Set `$extension.trace.server` to `verbose` as described in the *Usage* section of [LSP Inspector](https://microsoft.github.io/language-server-protocol/inspector/).
Set `$extension.trace.server` to `verbose` as described in the *Usage* section of [LSP Inspector](https://microsoft.github.io/language-server-protocol/inspector/**.
## Known issues affecting development
## Server design
- https://leanprover.zulipchat.com/#narrow/stream/147302-lean4/topic/segfaulting.20lean.20binary
## (Very incomplete) notes on Lean 3
### (Very incomplete) notes on Lean 3
How the Lean 3 server provides info for mouse hovers, tactic states, widgets states and autocompletion:
- When compiling a file, the module manager stores *snapshots* of the environment and options
@ -72,3 +73,32 @@ How the Lean 3 server provides info for mouse hovers, tactic states, widgets sta
It finds the closest snapshot to the position at which info is requested and passes
that to `report_info` at `src/frontends/lean/interactive.cpp:141`. `report_info` queries
the `info_manager` as well as the environment at that point for relevant information.
### Architecture
#### Process separation
The server consists of a single *watchdog* process coordinating per-file *worker* processes.
The watchdog's only purpose is to:
- manage a worker process for each open file;
- keep track of minimal persistent state;
- coalesce and coordinate the workers' communication with the LSP client.
Almost all of the actual computation (elaboration, `#eval`uation, autocompletion, ..) happens in the workers.
Why would we settle on such an architecture? The crucial point is that corruption of a single per-file worker cannot affect the stability of the whole server. A similar idea drove the design of per-tab sandbox processes in web browsers such as Chromium Site Isolation or Firefox Electrolysis. In our case though, possible corruption is not due to malicious behaviour (we assume Lean code opened in an editor is trusted) but rather due to arbitrary computation in metaprograms and `#eval` statements which users write. If the user code for one file causes a stack overflow, we would not want the entire server to die. Thanks to the separation, the offending file can be recompiled while keeping the state of other open files intact. To facilitate restarting workers in this fashion, the watchdog needs to keep track of a minimal amount of state - the contents of open files and possibly the place at which it crashed.
Another important consideration is the *compacted region* memory used by imported modules. For efficiency, these regions are not subject to the reference-counting GC and as such need to be freed manually when the imports change. But doing this safely is pretty much impossible, as safe freeing is the very problem GCs are supposed to solve. It is far easier to simply nuke and restart the worker process whenever this needs to be done, as it only happens in cases in which all of the worker's state would have to be recomputed anyway.
#### Recompilation of opened files
When the user has two or more files in a single dependency chain open, it is desirable for changes in imports to propagate to modules importing them. That is, when `B.lean` depends on `A.lean` and both are open, changes to `A` should eventually be observable in `B`. But a major problem with Lean 3 is how it does this much too eagerly. Often `B` will be recompiled needlessly as soon as `A` is opened, even if no changes have been made to `A`. For heavyweight modules which take up to several minutes to compile, this causes frustration when `A` is opened merely for inspection e.g. via go-to-definition.
In Lean 4, the situation is different as `.olean` artifacts are required to exist for all imported modules -- one cannot import a `.lean` file without compiling it first. In the running example, when a user opens and edits `A`, nothing is going to happen to `B`. They can continue to interact with it as if `A` kept its previous contents. But when `A` is saved with changes, `.olean` compilation will be triggered (user-configurable, can be disabled) and correspondingly `B` will be recompiled. This being a conscious action, users will be aware of having to then wait for reverse dependencies to recompile.
### Code style
Comments should exist to denote specifics of our implementation but, for
the most part, we shouldn't copy comments over from the LSP specification
to avoid unnecessary duplication.

View file

@ -12,47 +12,52 @@ import Lean.Elab.Import
import Lean.Data.Lsp
import Lean.Server.HasFileSource
/-!
For general server architecture, see `README.md`. This module implements the watchdog process.
## Watchdog state
Most LSP clients only send us file diffs, so to facilitate sending entire file contents to freshly restarted workers,
the watchdog needs to maintain the current state of each file. It can also use this state to detect changes
to the header and thus restart the corresponding worker, freeing its imports.
TODO(WN):
We may eventually want to keep track of approximately (since this isn't knowable exactly) where in the file a worker
crashed. Then on restart, we said worker to only parse up to that point and query the user about how to proceed
(continue OR allow the user to fix the bug and then continue OR ..). Without this, if the crash is deterministic,
the worker could get into a restart loop.
## Watchdog <-> worker communication
The watchdog process and its file worker processes communicate via LSP. If the necessity arises,
we might add non-standard commands similarly based on JSON-RPC. Most requests and notifications
are forwarded to the corresponding file worker process, with the exception of these notifications:
- textDocument/didOpen: Launch the file worker, create the associated watchdog state and launch a task to asynchronously
receive LSP packets from the worker (e.g. request responses).
- textDocument/didChange: Update the local file state. If the header was mutated,
signal a shutdown to the file worker by closing the I/O channels.
Then restart the file worker. Otherwise, forward the `didChange` notification.
- textDocument/didClose: Signal a shutdown to the file worker and remove the associated watchdog state.
Moreover, we don't implement the full protocol at this level:
- Upon starting, the `initialize` request is forwarded to the worker, but it must not respond with its server capabilities.
Consequently, the watchdog will not send an `initialized` notification to the worker.
- After `initialize`, the watchdog sends the corresponding `didOpen` notification with the full current state of the file.
No additional `didOpen` notifications will be forwarded to the worker process.
- File workers will never receive a `shutdown` request or an `exit` notification. File workers are always terminated
by closing their I/O channels. Similarly, they never receive a `didClose` notification.
## Watchdog <-> client communication
The watchdog itself should implement the LSP standard as closely as possible. However we reserve the right to add non-standard
extensions in case they're needed, for example to communicate tactic state.
-/
namespace Lean
namespace Server
/-
The server architecture consists of a watchdog process that communicates with the user
and one file worker process for each open file.
This is because processing the header of a file creates objects that need to be freed manually.
Unfortunately, it is very difficult to ensure that these objects are not still in use by some user code,
potentially leading to segfaults when freeing these objects.
To contain this issue, each open file is processed by one dedicated process.
When the header is mutated, the process is killed and restarted by the watchdog process.
Lean elaboration can also be very expensive due to the tactic framework essentially allowing for arbitrary user
programs: If the user code for one file causes a stack overflow, we would not want the entire server to die.
Hence, isolating user code at a file-level and potentially restarting file worker processes upon error has the added
benefit of increased stability.
To communicate the file state to the file worker upon restarting, the watchdog needs to maintain
the current state of the file, which it can also use to detect changes to the header and thus terminate
the corresponding file worker.
The watchdog process and its file worker processes communicate via LSP. Most requests and notifications
are forwarded to the corresponding file worker process, with the exception of these notifications:
- didOpen: launch the file worker, create the associated watchdog state and launch a task to forward
incoming packets from the watchdog (e.g. request responses).
- didChange: update the local file state. if the header was mutated,
signal a shutdown to the file worker by closing the I/O channels and restart the file worker.
otherwise, forward the didChange notification.
- didClose: signal a shutdown to the file worker and remove the associated watchdog state.
Writes to Lean I/O channels are atomic, and hence we do not need additional synchronization for the tasks
that read file worker responses.
While the communication between the watchdog and its file workers uses LSP packets, it does not implement the
full protocol:
- Upon starting, the initialize request is forwarded to the file worker, but it must not respond with its server capabilities.
Consequently, the watchdog will not send an initialized notification to the file worker.
- After initialize, the watch dog sends the corresponding didOpen notification with the full current state of the file.
No additional didOpen notifications will be forwarded to the file worker process.
- File workers will never receive a shutdown request or an exit notification. File workers are always terminated
by closing their I/O channels. Similarly, they never receive a didClose notification.
-/
open IO
open Std (RBMap RBMap.empty)
open Lsp
@ -65,13 +70,12 @@ let pre := text.source.extract 0 start;
let post := text.source.extract «end» text.source.bsize;
(pre ++ newText ++ post).toFileMap
def parsedImportsEndPos (input : String) : IO String.Pos := do
private def parsedImportsEndPos (input : String) : IO String.Pos := do
env ← mkEmptyEnvironment;
let fileName := "<input>";
let inputCtx := Parser.mkInputContext input fileName;
match Parser.parseHeader env inputCtx with
| (header, parserState, messages) => do
pure parserState.pos
let (_, parserState, _) := Parser.parseHeader env inputCtx;
pure parserState.pos
structure EditableDocument :=
(version : Nat)
@ -86,7 +90,7 @@ structure FileWorker :=
namespace FileWorker
def spawnArgs : Process.SpawnArgs := {workerCfg with cmd := "fileworker"}
def spawnArgs : Process.SpawnArgs := {workerCfg with cmd := "fileworker"}
def spawn (doc : EditableDocument) : IO FileWorker := do
proc ← Process.spawn spawnArgs;
@ -131,7 +135,7 @@ def readUserLspMessage : ServerM JsonRpc.Message :=
fun st => monadLift $ readLspMessage st.hIn
def readWorkerLspRequestAs (key : DocumentUri) (expectedMethod : String) (α : Type*) [HasFromJson α] : ServerM (Request α) :=
findFileWorker key >>= fun fw => monadLift $ readLspRequestAs fw.stdout expectedMethod α
findFileWorker key >>= fun fw => monadLift $ readLspRequestAs fw.stdout expectedMethod α
def readUserLspRequestAs (expectedMethod : String) (α : Type*) [HasFromJson α] : ServerM (Request α) :=
fun st => monadLift $ readLspRequestAs st.hIn expectedMethod α
@ -171,7 +175,7 @@ st ← read;
writeWorkerLspRequest key (0 : Nat) "initialize" st.initParams
def writeWorkerDidOpenNotification (key : DocumentUri) : ServerM Unit := do
findFileWorker key >>= fun fw => writeWorkerLspNotification key "textDocument/didOpen"
findFileWorker key >>= fun fw => writeWorkerLspNotification key "textDocument/didOpen"
(DidOpenTextDocumentParams.mk ⟨key, "lean", fw.doc.version, fw.doc.text.source⟩)
def parseParams (paramType : Type*) [HasFromJson paramType] (params : Json) : ServerM paramType :=
@ -187,6 +191,7 @@ partial def forwardFileWorkerPackets (fw : FileWorker) : Unit → ServerM Unit
-- TODO(MH): detect closed stream somehow and terminate gracefully
-- TODO(MH): potentially catch unintended termination (e.g. due to stack overflow) and restart process
msg ← monadLift $ readLspMessage fw.stdout;
-- NOTE: Writes to Lean I/O streams are atomic, so we don't need to synchronise these in principle.
writeUserLspMessage msg;
forwardFileWorkerPackets ⟨⟩
@ -200,7 +205,7 @@ writeWorkerDidOpenNotification key;
-- TODO(MH): Sebastian said something about this better being implemented as threads
-- (due to the long running nature of these tasks) but i did not yet have time to
-- look into this.
let _ := Task.mk (forwardFileWorkerPackets fw);
let _ := Task.mk (forwardFileWorkerPackets fw);
pure ⟨⟩
-- TODO(MH)
@ -230,6 +235,10 @@ else changes.forM $ fun change =>
let newDocText := replaceLspRange oldDoc.text range newText;
let oldHeaderEndPos := oldDoc.headerEndPos;
if startOff < oldHeaderEndPos then do
/- The header changed, restart worker. -/
-- TODO(WN): we should amortize this somehow;
-- when the user is typing in an import, this
-- may rapidly destroy/create new processes
terminateFileWorker fw;
startFileWorker doc.uri newVersion newDocText
else