doc: further comments on server architecture

2020-09-13 19:39:21 -04:00 · 2020-09-13 19:39:21 -04:00 · c3b333b9e7
commit c3b333b9e7
parent 8decfb6b1d
4 changed files with 111 additions and 73 deletions
--- a/src/Lean/Server/FileWorker.lean
+++ b/src/Lean/Server/FileWorker.lean
@ -12,13 +12,9 @@ import Lean.Server.Snapshots
 import Lean.Data.Lsp
 import Lean.Data.Json.FromToJson

-namespace Lean
-namespace Server
-
-/-
-Each file worker process manages a single file. File workers are launched and terminated
-by a watchdog process. See Watchdog.lean for a description of how file workers are expected 
-to interact with the watchdog process.
+/-!
+For general server architecture, see `README.md`. For details of IPC communication, see `Watchdog.lean`.
+This module implements per-file worker processes.

 File processing and requests+notifications against a file should be concurrent for two reasons:
 - By the LSP standard, requests should be cancellable.
@ -32,12 +28,15 @@ On didChange notifications, we search for the task in which the change occured.
 a task that has not yet finished before finding the task we're looking for, we terminate it
 and start the elaboration there, otherwise we start the elaboration at the task where the change occured.

-Requests iterate over tasks until they find the command that they need to answer the request. 
+Requests iterate over tasks until they find the command that they need to answer the request.
 In order to not block the main thread, this is done in a request task.
 If a task that the request task waits for is terminated, a change occured somewhere before the
 command that the request is looking for and the request sends a "content changed" error.
 -/

+namespace Lean
+namespace Server
+
 open Lsp
 open IO
 open Snapshots
@ -69,7 +68,7 @@ t ← asTask act;
 pure $ t.map $ fun error => match error with
 | Except.ok e => e
 | Except.error ioError => Except.error (TaskError.ioError ioError)
-  
+
 private partial def runCore (h : FS.Stream) (uri : DocumentUri) (version : Nat) (contents : FileMap) : Snapshot → IO (Except TaskError ElabTask)
 | parent => do
  result ← compileNextCmd contents.source parent;
@ -78,7 +77,7 @@ private partial def runCore (h : FS.Stream) (uri : DocumentUri) (version : Nat)
    sendDiagnosticsCore h uri version contents snap.msgLog;
    -- TODO(MH): check for interrupt (maybe with increased precision even in compileNextCmd, but not after runTask!)
    t ← runTask (runCore snap);
-    pure (Except.ok ⟨snap, t⟩)  
+    pure (Except.ok ⟨snap, t⟩)
  | Sum.inr msgLog => pure (Except.error TaskError.eof)

 def run (h : FS.Stream) (uri : DocumentUri) (version : Nat) (contents : FileMap) (parent : Snapshot) : IO ElabTask := do
@ -135,7 +134,7 @@ partial def branchOffAt (h : FS.Stream) (uri : DocumentUri) (version : Nat) (con
    -- next might finish before it sees the interrupt, so we must chase down the chain of tasks
    -- interruptAfter task; -- TODO(MH): we may not need to interrupt since tasks without refs are marked as cancelled by the GC
    pure newNext
-  
+
 end ElabTask

 /-- A document editable in the sense that we track the environment
--- a/src/Lean/Server/HasFileSource.lean
+++ b/src/Lean/Server/HasFileSource.lean
@ -9,7 +9,7 @@ import Lean.Data.Lsp
 namespace Lean
 namespace Lsp

-class HasFileSource (α : Type*) := 
+class HasFileSource (α : Type*) :=
 (fileSource : α → DocumentUri)
 export HasFileSource (fileSource)

--- a/src/Lean/Server/README.md
+++ b/src/Lean/Server/README.md
@ -2,15 +2,11 @@

 (Assuming `lean4` is the `elan` toolchain for stage 0.5)
 ```
-leanmake +lean4 bin PKG=ServerBin LINK_OPTS=-rdynamic
+cd $LEAN4_HOME/src/Lean/Server/
+leanmake +lean4 bin PKG=Watchdog LINK_OPTS=-rdynamic
+leanmake +lean4 bin PKG=FileWorker LINK_OPTS=-rdynamic
 ```

-## Server code style
-
-Comments should exist to denote specifics of our implementation but, for
-the most part, we shouldn't copy comments over from the LSP specification
-to avoid unnecessary duplication.
-
 ## Connecting clients

 ### Emacs with lsp-mode
@ -25,9 +21,14 @@ An easy way to get an LSP client is to build the [sample extension](https://gith

 ```typescript
  let serverOptions: ServerOptions = {
-    command: "/path/to/build/bin/ServerBin",
+    command: "$LEAN4_HOME/src/Lean/Server/build/bin/Watchdog",
    args: [],
-    options: null
+    options: {
+      env: {
+        LEAN_PATH: "$LEAN4_HOME/build/$RELEASE_OR_DEBUG/stage0.5/lib/lean/",
+        LEAN_WORKER_PATH: "$LEAN4_HOME/src/Lean/Server/build/bin/FileWorker"
+      }
+    }
  };
 ```

@ -46,22 +47,22 @@ or if logging LSP requests using Netcat (below):
 ### In `bash` with Netcat:

 ```
+cd $LEAN4_HOME/src/Lean/Server/
 mkfifo pipe
 # So that the server can find and import packages
 export LEAN_PATH=$LEAN4_HOME/build/$RELEASE_OR_DEBUG/stage0.5/lib/lean/
-nc -l -p 12345 < pipe | tee client.log | ./build/bin/ServerBin 2> stderr | tee pipe server.log
+export LEAN_WORKER_PATH=$LEAN4_HOME/src/Lean/Server/build/bin/FileWorker
+nc -l -p 12345 < pipe | tee client.log | ./build/bin/Watchdog 2> stderr | tee pipe server.log
 ```
 will create three files to follow with `tail -f` -- `client.log` for client messages, `server.log` for server messages and `stderr` for server `IO.stderr` debugging.

 ### In VSCode

-Set `$extension.trace.server` to `verbose` as described in the *Usage* section of [LSP Inspector](https://microsoft.github.io/language-server-protocol/inspector/).
+Set `$extension.trace.server` to `verbose` as described in the *Usage* section of [LSP Inspector](https://microsoft.github.io/language-server-protocol/inspector/**.

-## Known issues affecting development
+## Server design

- https://leanprover.zulipchat.com/#narrow/stream/147302-lean4/topic/segfaulting.20lean.20binary
-
-## (Very incomplete) notes on Lean 3
+### (Very incomplete) notes on Lean 3

 How the Lean 3 server provides info for mouse hovers, tactic states, widgets states and autocompletion:
 - When compiling a file, the module manager stores *snapshots* of the environment and options
@ -72,3 +73,32 @@ How the Lean 3 server provides info for mouse hovers, tactic states, widgets sta
  It finds the closest snapshot to the position at which info is requested and passes
  that to `report_info` at `src/frontends/lean/interactive.cpp:141`. `report_info` queries
  the `info_manager` as well as the environment at that point for relevant information.
+
+### Architecture
+
+#### Process separation
+
+The server consists of a single *watchdog* process coordinating per-file *worker* processes.
+
+The watchdog's only purpose is to:
+- manage a worker process for each open file;
+- keep track of minimal persistent state;
+- coalesce and coordinate the workers' communication with the LSP client.
+
+Almost all of the actual computation (elaboration, `#eval`uation, autocompletion, ..) happens in the workers.
+
+Why would we settle on such an architecture? The crucial point is that corruption of a single per-file worker cannot affect the stability of the whole server. A similar idea drove the design of per-tab sandbox processes in web browsers such as Chromium Site Isolation or Firefox Electrolysis. In our case though, possible corruption is not due to malicious behaviour (we assume Lean code opened in an editor is trusted) but rather due to arbitrary computation in metaprograms and `#eval` statements which users write. If the user code for one file causes a stack overflow, we would not want the entire server to die. Thanks to the separation, the offending file can be recompiled while keeping the state of other open files intact. To facilitate restarting workers in this fashion, the watchdog needs to keep track of a minimal amount of state - the contents of open files and possibly the place at which it crashed.
+
+Another important consideration is the *compacted region* memory used by imported modules. For efficiency, these regions are not subject to the reference-counting GC and as such need to be freed manually when the imports change. But doing this safely is pretty much impossible, as safe freeing is the very problem GCs are supposed to solve. It is far easier to simply nuke and restart the worker process whenever this needs to be done, as it only happens in cases in which all of the worker's state would have to be recomputed anyway.
+
+#### Recompilation of opened files
+
+When the user has two or more files in a single dependency chain open, it is desirable for changes in imports to propagate to modules importing them. That is, when `B.lean` depends on `A.lean` and both are open, changes to `A` should eventually be observable in `B`. But a major problem with Lean 3 is how it does this much too eagerly. Often `B` will be recompiled needlessly as soon as `A` is opened, even if no changes have been made to `A`. For heavyweight modules which take up to several minutes to compile, this causes frustration when `A` is opened merely for inspection e.g. via go-to-definition.
+
+In Lean 4, the situation is different as `.olean` artifacts are required to exist for all imported modules -- one cannot import a `.lean` file without compiling it first. In the running example, when a user opens and edits `A`, nothing is going to happen to `B`. They can continue to interact with it as if `A` kept its previous contents. But when `A` is saved with changes, `.olean` compilation will be triggered (user-configurable, can be disabled) and correspondingly `B` will be recompiled. This being a conscious action, users will be aware of having to then wait for reverse dependencies to recompile.
+
+### Code style
+
+Comments should exist to denote specifics of our implementation but, for
+the most part, we shouldn't copy comments over from the LSP specification
+to avoid unnecessary duplication.
--- a/src/Lean/Server/Watchdog.lean
+++ b/src/Lean/Server/Watchdog.lean
@ -12,47 +12,52 @@ import Lean.Elab.Import
 import Lean.Data.Lsp
 import Lean.Server.HasFileSource

+/-!
+For general server architecture, see `README.md`. This module implements the watchdog process.
+
+## Watchdog state
+
+Most LSP clients only send us file diffs, so to facilitate sending entire file contents to freshly restarted workers,
+the watchdog needs to maintain the current state of each file. It can also use this state to detect changes
+to the header and thus restart the corresponding worker, freeing its imports.
+
+TODO(WN):
+We may eventually want to keep track of approximately (since this isn't knowable exactly) where in the file a worker
+crashed. Then on restart, we said worker to only parse up to that point and query the user about how to proceed
+(continue OR allow the user to fix the bug and then continue OR ..). Without this, if the crash is deterministic,
+the worker could get into a restart loop.
+
+## Watchdog <-> worker communication
+
+The watchdog process and its file worker processes communicate via LSP. If the necessity arises,
+we might add non-standard commands similarly based on JSON-RPC. Most requests and notifications
+are forwarded to the corresponding file worker process, with the exception of these notifications:
+
+- textDocument/didOpen: Launch the file worker, create the associated watchdog state and launch a task to asynchronously
+                        receive LSP packets from the worker (e.g. request responses).
+- textDocument/didChange: Update the local file state. If the header was mutated,
+                          signal a shutdown to the file worker by closing the I/O channels.
+                          Then restart the file worker. Otherwise, forward the `didChange` notification.
+- textDocument/didClose: Signal a shutdown to the file worker and remove the associated watchdog state.
+
+Moreover, we don't implement the full protocol at this level:
+
+- Upon starting, the `initialize` request is forwarded to the worker, but it must not respond with its server capabilities.
+  Consequently, the watchdog will not send an `initialized` notification to the worker.
+- After `initialize`, the watchdog sends the corresponding `didOpen` notification with the full current state of the file.
+  No additional `didOpen` notifications will be forwarded to the worker process.
+- File workers will never receive a `shutdown` request or an `exit` notification. File workers are always terminated
+  by closing their I/O channels. Similarly, they never receive a `didClose` notification.
+
+## Watchdog <-> client communication
+
+The watchdog itself should implement the LSP standard as closely as possible. However we reserve the right to add non-standard
+extensions in case they're needed, for example to communicate tactic state.
+-/
+
 namespace Lean
 namespace Server

-/-
-The server architecture consists of a watchdog process that communicates with the user
-and one file worker process for each open file. 
-This is because processing the header of a file creates objects that need to be freed manually. 
-Unfortunately, it is very difficult to ensure that these objects are not still in use by some user code,
-potentially leading to segfaults when freeing these objects. 
-To contain this issue, each open file is processed by one dedicated process. 
-When the header is mutated, the process is killed and restarted by the watchdog process.
-Lean elaboration can also be very expensive due to the tactic framework essentially allowing for arbitrary user
-programs: If the user code for one file causes a stack overflow, we would not want the entire server to die.
-Hence, isolating user code at a file-level and potentially restarting file worker processes upon error has the added
-benefit of increased stability. 
-
-To communicate the file state to the file worker upon restarting, the watchdog needs to maintain
-the current state of the file, which it can also use to detect changes to the header and thus terminate
-the corresponding file worker.
-
-The watchdog process and its file worker processes communicate via LSP. Most requests and notifications
-are forwarded to the corresponding file worker process, with the exception of these notifications:
- didOpen: launch the file worker, create the associated watchdog state and launch a task to forward
-           incoming packets from the watchdog (e.g. request responses).
- didChange: update the local file state. if the header was mutated, 
-             signal a shutdown to the file worker by closing the I/O channels and restart the file worker.
-             otherwise, forward the didChange notification.
- didClose: signal a shutdown to the file worker and remove the associated watchdog state.
-Writes to Lean I/O channels are atomic, and hence we do not need additional synchronization for the tasks
-that read file worker responses.
-
-While the communication between the watchdog and its file workers uses LSP packets, it does not implement the
-full protocol:
- Upon starting, the initialize request is forwarded to the file worker, but it must not respond with its server capabilities.
-  Consequently, the watchdog will not send an initialized notification to the file worker.
- After initialize, the watch dog sends the corresponding didOpen notification with the full current state of the file.
-  No additional didOpen notifications will be forwarded to the file worker process.
- File workers will never receive a shutdown request or an exit notification. File workers are always terminated
-  by closing their I/O channels. Similarly, they never receive a didClose notification. 
-/
-
 open IO
 open Std (RBMap RBMap.empty)
 open Lsp
@ -65,13 +70,12 @@ let pre := text.source.extract 0 start;
 let post := text.source.extract «end» text.source.bsize;
 (pre ++ newText ++ post).toFileMap

-def parsedImportsEndPos (input : String) : IO String.Pos := do
+private def parsedImportsEndPos (input : String) : IO String.Pos := do
 env ← mkEmptyEnvironment;
 let fileName := "<input>";
 let inputCtx := Parser.mkInputContext input fileName;
-match Parser.parseHeader env inputCtx with
-| (header, parserState, messages) => do
-  pure parserState.pos
+let (_, parserState, _) := Parser.parseHeader env inputCtx;
+pure parserState.pos

 structure EditableDocument :=
 (version : Nat)
@ -86,7 +90,7 @@ structure FileWorker :=

 namespace FileWorker

-def spawnArgs : Process.SpawnArgs := {workerCfg with cmd := "fileworker"} 
+def spawnArgs : Process.SpawnArgs := {workerCfg with cmd := "fileworker"}

 def spawn (doc : EditableDocument) : IO FileWorker := do
 proc ← Process.spawn spawnArgs;
@ -131,7 +135,7 @@ def readUserLspMessage : ServerM JsonRpc.Message :=
 fun st => monadLift $ readLspMessage st.hIn

 def readWorkerLspRequestAs (key : DocumentUri) (expectedMethod : String) (α : Type*) [HasFromJson α] : ServerM (Request α) :=
-findFileWorker key >>= fun fw => monadLift $ readLspRequestAs fw.stdout expectedMethod α 
+findFileWorker key >>= fun fw => monadLift $ readLspRequestAs fw.stdout expectedMethod α

 def readUserLspRequestAs (expectedMethod : String) (α : Type*) [HasFromJson α] : ServerM (Request α) :=
 fun st => monadLift $ readLspRequestAs st.hIn expectedMethod α
@ -171,7 +175,7 @@ st ← read;
 writeWorkerLspRequest key (0 : Nat) "initialize" st.initParams

 def writeWorkerDidOpenNotification (key : DocumentUri) : ServerM Unit := do
-findFileWorker key >>= fun fw => writeWorkerLspNotification key "textDocument/didOpen" 
+findFileWorker key >>= fun fw => writeWorkerLspNotification key "textDocument/didOpen"
  (DidOpenTextDocumentParams.mk ⟨key, "lean", fw.doc.version, fw.doc.text.source⟩)

 def parseParams (paramType : Type*) [HasFromJson paramType] (params : Json) : ServerM paramType :=
@ -187,6 +191,7 @@ partial def forwardFileWorkerPackets (fw : FileWorker) : Unit → ServerM Unit
  -- TODO(MH): detect closed stream somehow and terminate gracefully
  -- TODO(MH): potentially catch unintended termination (e.g. due to stack overflow) and restart process
  msg ← monadLift $ readLspMessage fw.stdout;
+  -- NOTE: Writes to Lean I/O streams are atomic, so we don't need to synchronise these in principle.
  writeUserLspMessage msg;
  forwardFileWorkerPackets ⟨⟩

@ -200,7 +205,7 @@ writeWorkerDidOpenNotification key;
 -- TODO(MH): Sebastian said something about this better being implemented as threads
 -- (due to the long running nature of these tasks) but i did not yet have time to
 -- look into this.
-let _ := Task.mk (forwardFileWorkerPackets fw); 
+let _ := Task.mk (forwardFileWorkerPackets fw);
 pure ⟨⟩

 -- TODO(MH)
@ -230,6 +235,10 @@ else changes.forM $ fun change =>
    let newDocText := replaceLspRange oldDoc.text range newText;
    let oldHeaderEndPos := oldDoc.headerEndPos;
    if startOff < oldHeaderEndPos then do
+      /- The header changed, restart worker. -/
+      -- TODO(WN): we should amortize this somehow;
+      -- when the user is typing in an import, this
+      -- may rapidly destroy/create new processes
      terminateFileWorker fw;
      startFileWorker doc.uri newVersion newDocText
    else