diff --git a/.gitignore b/.gitignore index 32610f71..145c3069 100644 --- a/.gitignore +++ b/.gitignore @@ -181,6 +181,9 @@ kubeconfig # Documentation build output _build/ +# Gateway microVM rootfs build artifacts +rootfs/ + # Docker build artifacts (image tarballs, packaged helm charts) deploy/docker/.build/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f558cdeb..bdfae592 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -140,6 +140,41 @@ These are the primary `mise` tasks for day-to-day development: | `mise run docs` | Build and serve documentation locally | | `mise run clean` | Clean build artifacts | +### MicroVM runtime + +To build and run the standalone `gateway` microVM from `crates/openshell-vm`: + +```bash +mise run vm +``` + +That task builds `openshell-vm`, stages `gateway.runtime/`, builds the default rootfs under `$XDG_DATA_HOME/openshell/gateway/rootfs`, codesigns `target/debug/gateway` on macOS, and then launches the VM. + +Once the VM is running, you can run cluster debug commands against its kubeconfig directly: + +```bash +target/debug/gateway exec -- kubectl get pods -A +target/debug/gateway exec -- kubectl -n openshell logs statefulset/openshell +``` + +If you only want to stage the sidecar runtime bundle without launching the VM: + +```bash +mise run vm:bundle-runtime +``` + +To force a fresh rebuild of the binary, bundled runtime, and rootfs without launching the VM: + +```bash +mise run vm:build +``` + +To create a local tarball that contains both `gateway` and `gateway.runtime/`: + +```bash +mise run vm:package:gateway +``` + ## Project Structure | Path | Purpose | diff --git a/Cargo.lock b/Cargo.lock index 3d01356a..76d0b3c6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -91,9 +91,9 @@ dependencies = [ [[package]] name = "anstream" -version = "1.0.0" +version = "0.6.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +checksum = 
"43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" dependencies = [ "anstyle", "anstyle-parse", @@ -112,9 +112,9 @@ checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" [[package]] name = "anstyle-parse" -version = "1.0.0" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" dependencies = [ "utf8parse", ] @@ -141,9 +141,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.102" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" +checksum = "5f0e0fee31ef5ed1ba1316088939cea399010ed7731dba877ed44aeb407a75ea" [[package]] name = "argon2" @@ -186,7 +186,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -197,7 +197,7 @@ checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -433,9 +433,9 @@ dependencies = [ [[package]] name = "bitflags" -version = "2.11.0" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" +checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" dependencies = [ "serde_core", ] @@ -542,9 +542,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.20.2" +version = "3.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" +checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" [[package]] name = "byteorder" @@ 
-584,9 +584,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.56" +version = "1.2.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" +checksum = "47b26a0954ae34af09b50f0de26458fa95369a0d478d8236d3f93082b219bd29" dependencies = [ "find-msvc-tools", "jobserver", @@ -643,9 +643,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.6.0" +version = "4.5.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" +checksum = "6899ea499e3fb9305a65d5ebf6e3d2248c5fab291f300ad0a704fbe142eae31a" dependencies = [ "clap_builder", "clap_derive", @@ -653,45 +653,51 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.6.0" +version = "4.5.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +checksum = "7b12c8b680195a62a8364d16b8447b01b6c2c8f9aaf68bee653be34d4245e238" dependencies = [ "anstream", "anstyle", - "clap_lex", + "clap_lex 0.7.7", "strsim", ] [[package]] name = "clap_complete" -version = "4.6.0" +version = "4.5.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19c9f1dde76b736e3681f28cec9d5a61299cbaae0fce80a68e43724ad56031eb" +checksum = "c757a3b7e39161a4e56f9365141ada2a6c915a8622c408ab6bb4b5d047371031" dependencies = [ "clap", - "clap_lex", + "clap_lex 1.0.0", "is_executable", "shlex", ] [[package]] name = "clap_derive" -version = "4.6.0" +version = "4.5.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" +checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "clap_lex" -version = "1.1.0" +version = "0.7.7" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" +checksum = "c3e64b0cc0439b12df2fa678eae89a1c56a529fd067a9115f7827f1fffd22b32" + +[[package]] +name = "clap_lex" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" [[package]] name = "cmake" @@ -983,7 +989,7 @@ checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -1007,7 +1013,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -1018,7 +1024,7 @@ checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ "darling_core", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -1053,7 +1059,7 @@ checksum = "780eb241654bf097afb00fc5f054a09b687dad862e485fdcf8399bb056565370" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -1080,9 +1086,9 @@ dependencies = [ [[package]] name = "deranged" -version = "0.5.8" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587" dependencies = [ "powerfmt", ] @@ -1142,7 +1148,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -1247,7 +1253,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -1267,7 +1273,7 @@ checksum = "67c78a4d8fdf9953a5c9d458f9efe940fd97a0cab0941c075a813ac594733827" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] 
@@ -1403,9 +1409,9 @@ checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" [[package]] name = "futures" -version = "0.3.32" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ "futures-channel", "futures-core", @@ -1418,9 +1424,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.32" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -1428,15 +1434,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.32" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] name = "futures-executor" -version = "0.3.32" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" dependencies = [ "futures-core", "futures-task", @@ -1456,38 +1462,38 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.32" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-macro" -version = "0.3.32" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "futures-sink" -version = "0.3.32" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.32" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-util" -version = "0.3.32" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ "futures-channel", "futures-core", @@ -1497,6 +1503,7 @@ dependencies = [ "futures-task", "memchr", "pin-project-lite", + "pin-utils", "slab", ] @@ -1683,7 +1690,7 @@ dependencies = [ "proc-macro-error2", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -1797,9 +1804,9 @@ checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" [[package]] name = "hybrid-array" -version = "0.4.8" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8655f91cd07f2b9d0c24137bd650fe69617773435ee5ec83022377777ce65ef1" +checksum = "e1b229d73f5803b562cc26e4da0396c8610a4ee209f4fac8fa4f8d709166dc45" dependencies = [ "typenum", ] @@ -1891,7 +1898,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.3", + "socket2 0.6.2", "tokio", "tower-service", 
"tracing", @@ -2137,9 +2144,9 @@ dependencies = [ [[package]] name = "ipnet" -version = "2.12.0" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" [[package]] name = "iri-string" @@ -2217,9 +2224,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.91" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" +checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" dependencies = [ "once_cell", "wasm-bindgen", @@ -2341,7 +2348,7 @@ dependencies = [ "proc-macro2", "quote", "serde_json", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -2398,9 +2405,9 @@ checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" [[package]] name = "libc" -version = "0.2.183" +version = "0.2.180" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" +checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc" [[package]] name = "libcrux-intrinsics" @@ -2468,6 +2475,16 @@ dependencies = [ "rand 0.9.2", ] +[[package]] +name = "libloading" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +dependencies = [ + "cfg-if", + "windows-link", +] + [[package]] name = "libm" version = "0.2.16" @@ -2476,14 +2493,13 @@ checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" [[package]] name = "libredox" -version = "0.1.14" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a" +checksum = 
"3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616" dependencies = [ "bitflags", "libc", - "plain", - "redox_syscall 0.7.3", + "redox_syscall 0.7.0", ] [[package]] @@ -2505,9 +2521,9 @@ checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" [[package]] name = "linux-raw-sys" -version = "0.12.1" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" +checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" [[package]] name = "litemap" @@ -2584,9 +2600,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "memchr" -version = "2.8.0" +version = "2.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" +checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" [[package]] name = "miette" @@ -2615,7 +2631,7 @@ checksum = "db5b29714e950dbb20d5e6f74f9dcec4edbcc1067bb7f8ed198c097b8c1a818b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -2785,9 +2801,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.21.4" +version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" [[package]] name = "once_cell_polyfill" @@ -3034,6 +3050,22 @@ dependencies = [ "url", ] +[[package]] +name = "openshell-vm" +version = "0.1.0" +dependencies = [ + "base64 0.22.1", + "clap", + "libc", + "libloading", + "miette", + "openshell-bootstrap", + "serde_json", + "thiserror 2.0.18", + "tracing", + "tracing-subscriber", +] + [[package]] name = "openssh" version = "0.11.6" @@ -3065,9 +3097,9 @@ dependencies = [ [[package]] name = "owo-colors" 
-version = "4.3.0" +version = "4.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d211803b9b6b570f68772237e415a029d5a50c65d382910b879fb19d3271f94d" +checksum = "9c6901729fa79e91a0913333229e9ca5dc725089d1c363b2f4b4760709dc4a52" [[package]] name = "p256" @@ -3252,7 +3284,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -3291,29 +3323,29 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.11" +version = "1.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517" +checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.11" +version = "1.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6" +checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "pin-project-lite" -version = "0.2.17" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" [[package]] name = "pin-utils" @@ -3385,12 +3417,6 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" -[[package]] -name = "plain" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" - [[package]] name = "poly1305" version = "0.8.0" @@ -3451,7 +3477,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" dependencies = [ "proc-macro2", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -3482,7 +3508,7 @@ dependencies = [ "proc-macro-error-attr2", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -3520,7 +3546,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.117", + "syn 2.0.114", "tempfile", ] @@ -3534,7 +3560,7 @@ dependencies = [ "itertools 0.14.0", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -3568,7 +3594,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls", - "socket2 0.6.3", + "socket2 0.6.2", "thiserror 2.0.18", "tokio", "tracing", @@ -3577,9 +3603,9 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.14" +version = "0.11.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" +checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" dependencies = [ "bytes", "getrandom 0.3.4", @@ -3605,16 +3631,16 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.6.3", + "socket2 0.6.2", "tracing", "windows-sys 0.60.2", ] [[package]] name = "quote" -version = "1.0.45" +version = "1.0.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" dependencies = [ "proc-macro2", ] @@ -3740,9 +3766,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.7.3" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16" +checksum = "49f3fe0889e69e2ae9e41f4d6c4c0181701d00e4697b356fb1f74173a5e0ee27" dependencies = [ "bitflags", ] @@ -3772,9 +3798,9 @@ 
dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.10" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c" [[package]] name = "regorus" @@ -4020,22 +4046,22 @@ dependencies = [ [[package]] name = "rustix" -version = "1.1.4" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys 0.12.1", + "linux-raw-sys 0.11.0", "windows-sys 0.61.2", ] [[package]] name = "rustls" -version = "0.23.37" +version = "0.23.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" +checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" dependencies = [ "log", "once_cell", @@ -4096,9 +4122,9 @@ checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "ryu" -version = "1.0.23" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" +checksum = "a50f4cf475b65d88e057964e0e9bb1f0aa9bbb2036dc65c64596b42932536984" [[package]] name = "salsa20" @@ -4111,9 +4137,9 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.29" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939" +checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" dependencies = [ "windows-sys 0.61.2", ] @@ -4139,7 +4165,7 @@ dependencies = [ "proc-macro2", "quote", 
"serde_derive_internals", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -4194,9 +4220,9 @@ dependencies = [ [[package]] name = "security-framework" -version = "3.7.0" +version = "3.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" +checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" dependencies = [ "bitflags", "core-foundation", @@ -4207,9 +4233,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.17.0" +version = "2.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" +checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" dependencies = [ "core-foundation-sys", "libc", @@ -4258,7 +4284,7 @@ checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -4269,7 +4295,7 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -4304,7 +4330,7 @@ checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -4498,12 +4524,12 @@ dependencies = [ [[package]] name = "socket2" -version = "0.6.3" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" +checksum = "86f4aa3ad99f2088c990dfa82d367e19cb29268ed67c574d10d0a4bfe71f07e0" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.60.2", ] [[package]] @@ -4594,7 +4620,7 @@ dependencies = [ "quote", "sqlx-core", "sqlx-macros-core", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -4617,7 +4643,7 @@ 
dependencies = [ "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", - "syn 2.0.117", + "syn 2.0.114", "tokio", "url", ] @@ -4761,7 +4787,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d904e7009df136af5297832a3ace3370cd14ff1546a232f4f185036c2736fcac" dependencies = [ "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -4812,7 +4838,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -4855,9 +4881,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.117" +version = "2.0.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +checksum = "d4d107df263a3013ef9b1879b0df87d706ff80f65a86ea879bd9c31f9b307c2a" dependencies = [ "proc-macro2", "quote", @@ -4881,7 +4907,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -4906,14 +4932,14 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.27.0" +version = "3.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c" dependencies = [ "fastrand", - "getrandom 0.4.2", + "getrandom 0.3.4", "once_cell", - "rustix 1.1.4", + "rustix 1.1.3", "windows-sys 0.61.2", ] @@ -4949,7 +4975,7 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b8cb979cb11c32ce1603f8137b22262a9d131aaa5c37b5678025f22b8becd0" dependencies = [ - "rustix 1.1.4", + "rustix 1.1.3", "windows-sys 0.60.2", ] @@ -4989,7 +5015,7 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -5000,7 +5026,7 @@ checksum = 
"ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -5086,14 +5112,14 @@ checksum = "2d2e76690929402faae40aebdda620a2c0e25dd6d3b9afe48867dfd95991f4bd" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "tokio" -version = "1.50.0" +version = "1.49.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" +checksum = "72a2903cd7736441aac9df9d7688bd0ce48edccaadf181c3b90be801e81d3d86" dependencies = [ "bytes", "libc", @@ -5101,20 +5127,20 @@ dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2 0.6.3", + "socket2 0.6.2", "tokio-macros", "windows-sys 0.61.2", ] [[package]] name = "tokio-macros" -version = "2.6.1" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" +checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -5224,7 +5250,7 @@ dependencies = [ "prost-build", "prost-types", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -5345,7 +5371,7 @@ checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -5462,9 +5488,9 @@ checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" [[package]] name = "unicode-ident" -version = "1.0.24" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" [[package]] name = "unicode-linebreak" @@ -5582,9 +5608,9 @@ 
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.22.0" +version = "1.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37" +checksum = "b672338555252d43fd2240c714dc444b8c6fb0a5c5335e65a07bba7742735ddb" dependencies = [ "getrandom 0.4.2", "js-sys", @@ -5650,9 +5676,9 @@ checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" [[package]] name = "wasm-bindgen" -version = "0.2.114" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" +checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" dependencies = [ "cfg-if", "once_cell", @@ -5663,9 +5689,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.64" +version = "0.4.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8" +checksum = "70a6e77fd0ae8029c9ea0063f87c46fde723e7d887703d74ad2616d792e51e6f" dependencies = [ "cfg-if", "futures-util", @@ -5677,9 +5703,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.114" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" +checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -5687,22 +5713,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.114" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" +checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" 
dependencies = [ "bumpalo", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.114" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" +checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" dependencies = [ "unicode-ident", ] @@ -5743,9 +5769,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.91" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9" +checksum = "312e32e551d92129218ea9a2452120f4aabc03529ef03e4d0d82fb2780608598" dependencies = [ "js-sys", "wasm-bindgen", @@ -5864,7 +5890,7 @@ checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -5875,7 +5901,7 @@ checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -6205,7 +6231,7 @@ dependencies = [ "heck", "indexmap 2.13.0", "prettyplease", - "syn 2.0.117", + "syn 2.0.114", "wasm-metadata", "wit-bindgen-core", "wit-component", @@ -6221,7 +6247,7 @@ dependencies = [ "prettyplease", "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", "wit-bindgen-core", "wit-bindgen-rust", ] @@ -6276,7 +6302,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" dependencies = [ "libc", - "rustix 1.1.4", + "rustix 1.1.3", ] [[package]] @@ -6313,28 +6339,28 @@ checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", "synstructure", ] [[package]] name = "zerocopy" 
-version = "0.8.42" +version = "0.8.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2578b716f8a7a858b7f02d5bd870c14bf4ddbbcf3a4c05414ba6503640505e3" +checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.42" +version = "0.8.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e6cc098ea4d3bd6246687de65af3f920c430e236bee1e3bf2e441463f08a02f" +checksum = "4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -6354,7 +6380,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", "synstructure", ] @@ -6375,7 +6401,7 @@ checksum = "85a5b4158499876c763cb03bc4e49185d3cccbabb15b33c627f7884f43db852e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] @@ -6408,11 +6434,11 @@ checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.117", + "syn 2.0.114", ] [[package]] name = "zmij" -version = "1.0.21" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" +checksum = "3ff05f8caa9038894637571ae6b9e29466c1f4f829d26c9b28f869a29cbe3445" diff --git a/architecture/custom-vm-runtime.md b/architecture/custom-vm-runtime.md new file mode 100644 index 00000000..7d0be284 --- /dev/null +++ b/architecture/custom-vm-runtime.md @@ -0,0 +1,127 @@ +# Custom libkrunfw VM Runtime + +## Overview + +The OpenShell gateway VM uses [libkrun](https://github.com/containers/libkrun) to boot a +lightweight microVM with Apple Hypervisor.framework (macOS) or KVM (Linux). 
The kernel +is embedded inside `libkrunfw`, a companion library that packages a pre-built Linux kernel. + +The stock `libkrunfw` from Homebrew ships a minimal kernel without bridge, netfilter, or +conntrack support. This is insufficient for Kubernetes pod networking. + +The custom libkrunfw runtime adds bridge CNI, iptables/nftables, and conntrack support to +the VM kernel, enabling standard Kubernetes networking. + +## Architecture + +``` +Host (macOS/Linux) +├── gateway binary +│ ├── Loads libkrun.dylib (VMM) +│ ├── Preloads libkrunfw.dylib (kernel) +│ └── Logs runtime provenance +├── gateway.runtime/ (sidecar bundle) +│ ├── libkrun.dylib +│ ├── libkrunfw.dylib (stock or custom) +│ ├── gvproxy +│ ├── manifest.json +│ └── provenance.json (custom only) +└── gvproxy (networking) + +Guest VM +├── gateway-init.sh (PID 1) +│ ├── Validates kernel capabilities (fail-fast) +│ ├── Configures bridge CNI +│ └── Execs k3s server +└── check-vm-capabilities.sh (diagnostics) +``` + +## Network Profile + +The VM uses the bridge CNI profile, which requires a custom libkrunfw with bridge and +netfilter kernel support. The init script validates these capabilities at boot and fails +fast with an actionable error if they are missing. + +### Bridge Profile + +- CNI: bridge plugin with `cni0` interface +- IP masquerade: enabled (iptables-legacy via CNI bridge plugin) +- kube-proxy: enabled (nftables mode) +- Service VIPs: functional (ClusterIP, NodePort) +- hostNetwork workarounds: not required + +## Runtime Provenance + +At boot, the gateway binary logs provenance metadata about the loaded runtime bundle: + +- Library paths and SHA-256 hashes +- Whether the runtime is custom-built or stock +- For custom runtimes: libkrunfw commit, kernel version, build timestamp + +This information is sourced from `provenance.json` (generated by the build script) +and makes it straightforward to correlate VM behavior with a specific runtime artifact. 
+ +## Build Pipeline + +``` +crates/openshell-vm/runtime/ +├── build-custom-libkrunfw.sh # Clones libkrunfw, applies config, builds +├── kernel/ +│ └── bridge-cni.config # Kernel config fragment +└── README.md # Operator documentation + +Output: target/custom-runtime/ +├── libkrunfw.dylib # Custom library +├── provenance.json # Build metadata +├── bridge-cni.config # Config fragment used +└── kernel.config # Full kernel .config +``` + +## Kernel Config Fragment + +The `bridge-cni.config` fragment enables these kernel features on top of the stock +libkrunfw kernel: + +| Feature | Config | Purpose | +|---------|--------|---------| +| Bridge device | `CONFIG_BRIDGE` | cni0 bridge for pod networking | +| Bridge netfilter | `CONFIG_BRIDGE_NETFILTER` | kube-proxy visibility into bridge traffic | +| Netfilter | `CONFIG_NETFILTER` | iptables/nftables framework | +| Connection tracking | `CONFIG_NF_CONNTRACK` | NAT state tracking | +| NAT | `CONFIG_NF_NAT` | Service VIP DNAT/SNAT | +| iptables | `CONFIG_IP_NF_IPTABLES` | CNI bridge masquerade | +| nftables | `CONFIG_NF_TABLES` | kube-proxy nftables mode (primary) | +| veth | `CONFIG_VETH` | Pod network namespace pairs | +| IPVS | `CONFIG_IP_VS` | kube-proxy IPVS mode (optional) | +| Landlock | `CONFIG_SECURITY_LANDLOCK` | Filesystem sandboxing support | +| Seccomp filter | `CONFIG_SECCOMP_FILTER` | Syscall filtering support | + +## Verification + +Two verification tools are provided: + +1. **Capability checker** (`check-vm-capabilities.sh`): Runs inside the VM to verify + kernel capabilities. Produces pass/fail results for each required feature. + +2. **Verification matrix** (`verify-vm.sh`): Runs from the host against a running VM. + Checks node health, pod status, networking, service reachability, and event logs. + +## Rollout Strategy + +1. Custom runtime support is opt-in via `OPENSHELL_VM_RUNTIME_SOURCE_DIR`. +2. The init script validates kernel capabilities at boot and fails fast if missing. +3. 
Rollback: unset the env var and re-bundle with stock libraries (note: stock + libraries lack bridge/netfilter and pod networking will not work). + +## Related Files + +| File | Purpose | +|------|---------| +| `crates/openshell-vm/src/ffi.rs` | Runtime loading, provenance capture | +| `crates/openshell-vm/src/lib.rs` | VM launch, provenance logging | +| `crates/openshell-vm/scripts/gateway-init.sh` | Guest init, network profile selection | +| `crates/openshell-vm/scripts/check-vm-capabilities.sh` | Kernel capability checker | +| `crates/openshell-vm/scripts/verify-vm.sh` | Host-side verification matrix | +| `crates/openshell-vm/runtime/` | Build pipeline and kernel config | +| `tasks/scripts/bundle-vm-runtime.sh` | Runtime bundling (stock + custom) | +| `tasks/vm.toml` | Mise task definitions | diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 8bcb60fd..2d7db436 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -6,12 +6,12 @@ pub mod edge_token; pub mod errors; pub mod image; -mod constants; +pub mod constants; mod docker; mod metadata; -mod mtls; -mod paths; -mod pki; +pub mod mtls; +pub mod paths; +pub mod pki; pub(crate) mod push; mod runtime; diff --git a/crates/openshell-bootstrap/src/paths.rs b/crates/openshell-bootstrap/src/paths.rs index cd3cb769..ff31b021 100644 --- a/crates/openshell-bootstrap/src/paths.rs +++ b/crates/openshell-bootstrap/src/paths.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use miette::Result; -use openshell_core::paths::xdg_config_dir; +use openshell_core::paths::{xdg_config_dir, xdg_data_dir}; use std::path::PathBuf; /// Path to the file that stores the active gateway name. @@ -26,6 +26,16 @@ pub fn last_sandbox_path(gateway: &str) -> Result { Ok(gateways_dir()?.join(gateway).join("last_sandbox")) } +/// Default rootfs directory for gateway microVMs. 
+/// +/// Location: `$XDG_DATA_HOME/openshell/gateway/rootfs` +pub fn default_rootfs_dir() -> Result { + Ok(xdg_data_dir()? + .join("openshell") + .join("gateway") + .join("rootfs")) +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/openshell-core/src/paths.rs b/crates/openshell-core/src/paths.rs index bd9ce23d..fd0a141b 100644 --- a/crates/openshell-core/src/paths.rs +++ b/crates/openshell-core/src/paths.rs @@ -29,6 +29,19 @@ pub fn openshell_config_dir() -> Result { Ok(xdg_config_dir()?.join("openshell")) } +/// Resolve the XDG data base directory. +/// +/// Returns `$XDG_DATA_HOME` if set, otherwise `$HOME/.local/share`. +pub fn xdg_data_dir() -> Result { + if let Ok(path) = std::env::var("XDG_DATA_HOME") { + return Ok(PathBuf::from(path)); + } + let home = std::env::var("HOME") + .into_diagnostic() + .wrap_err("HOME is not set")?; + Ok(PathBuf::from(home).join(".local").join("share")) +} + /// Create a directory (and parents) with owner-only permissions (`0o700`) on /// Unix. On non-Unix platforms, falls back to default permissions. /// diff --git a/crates/openshell-vm/Cargo.toml b/crates/openshell-vm/Cargo.toml new file mode 100644 index 00000000..c8319765 --- /dev/null +++ b/crates/openshell-vm/Cargo.toml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +[package] +name = "openshell-vm" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +repository.workspace = true +description = "MicroVM runtime using libkrun for hardware-isolated execution" + +[lib] +name = "openshell_vm" +path = "src/lib.rs" + +[[bin]] +name = "gateway" +path = "src/main.rs" + +[dependencies] +base64 = "0.22" +clap = { workspace = true } +libc = "0.2" +libloading = "0.8" +miette = { workspace = true } +openshell-bootstrap = { path = "../openshell-bootstrap" } +serde_json = "1" +thiserror = { workspace = true } +tracing = { workspace = true } +tracing-subscriber = { workspace = true } + +[lints] +workspace = true diff --git a/crates/openshell-vm/entitlements.plist b/crates/openshell-vm/entitlements.plist new file mode 100644 index 00000000..154f3308 --- /dev/null +++ b/crates/openshell-vm/entitlements.plist @@ -0,0 +1,8 @@ + + + + + com.apple.security.hypervisor + + + diff --git a/crates/openshell-vm/runtime/README.md b/crates/openshell-vm/runtime/README.md new file mode 100644 index 00000000..e8289685 --- /dev/null +++ b/crates/openshell-vm/runtime/README.md @@ -0,0 +1,179 @@ +# Custom libkrunfw Runtime + +This directory contains the build infrastructure for a custom `libkrunfw` runtime +that enables bridge CNI and netfilter support in the OpenShell gateway VM. + +## Why + +The stock `libkrunfw` (from Homebrew) ships a kernel without bridge, netfilter, +or conntrack support. This means the VM cannot: + +- Create `cni0` bridge interfaces (required by the bridge CNI plugin) +- Run kube-proxy (requires nftables) +- Route service VIP traffic (requires NAT/conntrack) + +The custom runtime builds libkrunfw with an additional kernel config fragment +that enables these networking and sandboxing features. 
+ +## Directory Structure + +``` +runtime/ + build-custom-libkrunfw.sh # Build script for custom libkrunfw + kernel/ + bridge-cni.config # Kernel config fragment (networking + sandboxing) +``` + +## Building + +### Prerequisites + +- Rust toolchain +- make, git, curl +- On macOS: Xcode command line tools and cross-compilation tools for aarch64 + +### Quick Build + +```bash +# Build custom libkrunfw (clones libkrunfw repo, applies config, builds) +./crates/openshell-vm/runtime/build-custom-libkrunfw.sh + +# Or via mise task: +mise run vm:build-custom-runtime +``` + +### Output + +Build artifacts are placed in `target/custom-runtime/`: + +``` +target/custom-runtime/ + libkrunfw.dylib # The custom library + libkrunfw..dylib # Version-suffixed copy + provenance.json # Build metadata (commit, hash, timestamp) + bridge-cni.config # The config fragment used + kernel.config # Full kernel .config (for debugging) +``` + +### Using the Custom Runtime + +```bash +# Point the bundle script at the custom build: +export OPENSHELL_VM_RUNTIME_SOURCE_DIR=target/custom-runtime +mise run vm:bundle-runtime + +# Then boot the VM as usual: +mise run vm +``` + +## Networking + +The VM uses bridge CNI for pod networking with nftables-mode kube-proxy for +service VIP / ClusterIP support. The kernel config fragment enables both +iptables (for CNI bridge masquerade) and nftables (for kube-proxy). + +k3s is started with `--kube-proxy-arg=proxy-mode=nftables` because the +bundled iptables binaries in k3s have revision-negotiation issues with the +libkrun kernel's xt_MARK module. nftables mode uses the kernel's nf_tables +subsystem directly and avoids this entirely. + +## Runtime Provenance + +At VM boot, the gateway binary logs provenance information about the loaded +runtime: + +``` +runtime: /path/to/gateway.runtime + libkrunfw: libkrunfw.dylib + sha256: a1b2c3d4e5f6... 
+ type: custom (OpenShell-built) + libkrunfw-commit: abc1234 + kernel-version: 6.6.30 + build-timestamp: 2026-03-23T10:00:00Z +``` + +For stock runtimes: +``` +runtime: /path/to/gateway.runtime + libkrunfw: libkrunfw.dylib + sha256: f6e5d4c3b2a1... + type: stock (system/homebrew) +``` + +## Verification + +### Capability Check (inside VM) + +```bash +# Run inside the VM to verify kernel capabilities: +/srv/check-vm-capabilities.sh + +# JSON output for CI: +/srv/check-vm-capabilities.sh --json +``` + +### Full Verification Matrix + +```bash +# Run from the host with a running VM: +./crates/openshell-vm/scripts/verify-vm.sh + +# Or via mise task: +mise run vm:verify +``` + +## Rollback + +To revert to the stock runtime: + +```bash +# Unset the custom runtime source: +unset OPENSHELL_VM_RUNTIME_SOURCE_DIR + +# Re-bundle with stock libraries: +mise run vm:bundle-runtime + +# Boot — will auto-detect legacy-vm-net profile: +mise run vm +``` + +## Troubleshooting + +### "FailedCreatePodSandBox" bridge errors + +The kernel does not have bridge support. Verify: +```bash +# Inside VM: +ip link add test0 type bridge && echo "bridge OK" && ip link del test0 +``` + +If this fails, you are running the stock runtime. Build and use the custom one. + +### kube-proxy CrashLoopBackOff + +kube-proxy runs in nftables mode. If it crashes, verify nftables support: +```bash +# Inside VM: +nft list ruleset +``` + +If this fails, the kernel may lack `CONFIG_NF_TABLES`. Use the custom runtime. + +Common errors: +- `unknown option "--xor-mark"`: kube-proxy is running in iptables mode instead + of nftables. Verify `--kube-proxy-arg=proxy-mode=nftables` is in the k3s args. + +### Runtime mismatch after upgrade + +If libkrunfw is updated (e.g., via `brew upgrade`), the stock runtime may +change. 
Check provenance: +```bash +# Look for provenance info in VM boot output +grep "runtime:" ~/.local/share/openshell/gateway/console.log +``` + +Re-build the custom runtime if needed: +```bash +mise run vm:build-custom-runtime +mise run vm:bundle-runtime +``` diff --git a/crates/openshell-vm/runtime/build-custom-libkrunfw.sh b/crates/openshell-vm/runtime/build-custom-libkrunfw.sh new file mode 100755 index 00000000..a69fc0c1 --- /dev/null +++ b/crates/openshell-vm/runtime/build-custom-libkrunfw.sh @@ -0,0 +1,385 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Build a custom libkrunfw with bridge/netfilter kernel support. +# +# This script clones libkrunfw, applies the OpenShell kernel config +# fragment (bridge CNI, iptables, conntrack), builds the library, and +# stages the artifact with provenance metadata. +# +# Prerequisites: +# - Rust toolchain (cargo) +# - make, git, curl +# - Cross-compilation toolchain for aarch64 (if building on x86_64) +# - On macOS: Xcode command line tools +# +# Usage: +# ./build-custom-libkrunfw.sh [--output-dir DIR] [--libkrunfw-ref REF] +# +# Environment: +# LIBKRUNFW_REF - git ref to check out (default: main) +# LIBKRUNFW_REPO - git repo URL (default: github.com/containers/libkrunfw) +# OPENSHELL_RUNTIME_OUTPUT_DIR - output directory for built artifacts + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." 
&& pwd)" +KERNEL_CONFIG_FRAGMENT="${SCRIPT_DIR}/kernel/bridge-cni.config" + +# Defaults +LIBKRUNFW_REPO="${LIBKRUNFW_REPO:-https://github.com/containers/libkrunfw.git}" +LIBKRUNFW_REF="${LIBKRUNFW_REF:-main}" +OUTPUT_DIR="${OPENSHELL_RUNTIME_OUTPUT_DIR:-${PROJECT_ROOT}/target/custom-runtime}" +BUILD_DIR="${PROJECT_ROOT}/target/libkrunfw-build" + +# Parse arguments +while [[ $# -gt 0 ]]; do + case "$1" in + --output-dir) + OUTPUT_DIR="$2"; shift 2 ;; + --libkrunfw-ref) + LIBKRUNFW_REF="$2"; shift 2 ;; + --help|-h) + echo "Usage: $0 [--output-dir DIR] [--libkrunfw-ref REF]" + echo "" + echo "Build a custom libkrunfw with bridge/netfilter kernel support." + echo "" + echo "Options:" + echo " --output-dir DIR Output directory for built artifacts" + echo " --libkrunfw-ref REF Git ref to check out (default: main)" + echo "" + echo "Environment:" + echo " LIBKRUNFW_REPO Git repo URL" + echo " LIBKRUNFW_REF Git ref (branch/tag/commit)" + echo " OPENSHELL_RUNTIME_OUTPUT_DIR Output directory" + exit 0 + ;; + *) + echo "Unknown argument: $1" >&2; exit 1 ;; + esac +done + +echo "==> Building custom libkrunfw" +echo " Repo: ${LIBKRUNFW_REPO}" +echo " Ref: ${LIBKRUNFW_REF}" +echo " Config fragment: ${KERNEL_CONFIG_FRAGMENT}" +echo " Output: ${OUTPUT_DIR}" +echo "" + +# ── Clone / update libkrunfw ──────────────────────────────────────────── + +if [ -d "${BUILD_DIR}/libkrunfw/.git" ]; then + echo "==> Updating existing libkrunfw checkout..." + git -C "${BUILD_DIR}/libkrunfw" fetch origin + git -C "${BUILD_DIR}/libkrunfw" checkout "${LIBKRUNFW_REF}" + git -C "${BUILD_DIR}/libkrunfw" pull --ff-only 2>/dev/null || true +else + echo "==> Cloning libkrunfw..." 
+ mkdir -p "${BUILD_DIR}" + git clone "${LIBKRUNFW_REPO}" "${BUILD_DIR}/libkrunfw" + git -C "${BUILD_DIR}/libkrunfw" checkout "${LIBKRUNFW_REF}" +fi + +LIBKRUNFW_DIR="${BUILD_DIR}/libkrunfw" +LIBKRUNFW_COMMIT=$(git -C "${LIBKRUNFW_DIR}" rev-parse HEAD) +LIBKRUNFW_SHORT=$(git -C "${LIBKRUNFW_DIR}" rev-parse --short HEAD) + +echo " Commit: ${LIBKRUNFW_COMMIT}" + +# ── Detect the kernel version libkrunfw targets ──────────────────────── + +# libkrunfw's Makefile typically sets KERNEL_VERSION or has it in a +# config file. Try to detect it. +KERNEL_VERSION="" +if [ -f "${LIBKRUNFW_DIR}/Makefile" ]; then + KERNEL_VERSION=$(grep -oE 'KERNEL_VERSION\s*=\s*linux-[^\s]+' "${LIBKRUNFW_DIR}/Makefile" 2>/dev/null | head -1 | sed 's/.*= *//' || true) +fi +if [ -z "$KERNEL_VERSION" ] && [ -f "${LIBKRUNFW_DIR}/kernel_version" ]; then + KERNEL_VERSION=$(cat "${LIBKRUNFW_DIR}/kernel_version") +fi +echo " Kernel version: ${KERNEL_VERSION:-unknown}" + +# ── Apply kernel config fragment ──────────────────────────────────────── + +echo "==> Applying OpenShell kernel config fragment..." + +# libkrunfw builds the kernel with a config generated from its own +# sources. The config merge happens after `make olddefconfig` runs +# on the base config. We use the kernel's scripts/kconfig/merge_config.sh +# when available, otherwise do a simple append+olddefconfig. + +MERGE_HOOK="${LIBKRUNFW_DIR}/openshell-kconfig-hook.sh" +cat > "${MERGE_HOOK}" << 'HOOKEOF' +#!/usr/bin/env bash +# Hook called by the libkrunfw build after extracting the kernel source. +# Merges the OpenShell kernel config fragment into .config. +set -euo pipefail + +KERNEL_DIR="$1" +FRAGMENT="$2" + +if [ ! -d "$KERNEL_DIR" ]; then + echo "ERROR: kernel source dir not found: $KERNEL_DIR" >&2 + exit 1 +fi + +if [ ! 
-f "$FRAGMENT" ]; then + echo "ERROR: config fragment not found: $FRAGMENT" >&2 + exit 1 +fi + +cd "$KERNEL_DIR" + +if [ -f scripts/kconfig/merge_config.sh ]; then + echo " Using kernel merge_config.sh" + KCONFIG_CONFIG=.config ./scripts/kconfig/merge_config.sh -m .config "$FRAGMENT" +else + echo " Appending fragment and running olddefconfig" + cat "$FRAGMENT" >> .config +fi + +make ARCH=arm64 olddefconfig + +# Verify critical configs are set +REQUIRED=( + CONFIG_BRIDGE + CONFIG_BRIDGE_NETFILTER + CONFIG_NETFILTER + CONFIG_NF_CONNTRACK + CONFIG_NF_NAT + CONFIG_IP_NF_IPTABLES + CONFIG_IP_NF_FILTER + CONFIG_IP_NF_NAT + CONFIG_NF_TABLES + CONFIG_NFT_NUMGEN + CONFIG_NFT_FIB_IPV4 + CONFIG_NFT_FIB_IPV6 + CONFIG_NFT_CT + CONFIG_NFT_NAT + CONFIG_NFT_MASQ + CONFIG_VETH + CONFIG_NET_NS +) + +MISSING=() +for cfg in "${REQUIRED[@]}"; do + if ! grep -q "^${cfg}=[ym]" .config; then + MISSING+=("$cfg") + fi +done + +if [ ${#MISSING[@]} -gt 0 ]; then + echo "ERROR: Required kernel configs not set after merge:" >&2 + printf " %s\n" "${MISSING[@]}" >&2 + exit 1 +fi + +echo " All required kernel configs verified." +HOOKEOF +chmod +x "${MERGE_HOOK}" + +# ── Build libkrunfw ──────────────────────────────────────────────────── + +echo "==> Building libkrunfw (this may take 10-30 minutes)..." + +cd "${LIBKRUNFW_DIR}" + +# Detect macOS vs Linux and pick the right library extension / target +if [ "$(uname -s)" = "Darwin" ]; then + LIB_EXT="dylib" +else + LIB_EXT="so" +fi + +# Detect the kernel source directory name from the Makefile +KERNEL_DIR_NAME=$(grep -oE 'KERNEL_VERSION\s*=\s*linux-[^\s]+' Makefile | head -1 | sed 's/KERNEL_VERSION *= *//') +if [ -z "$KERNEL_DIR_NAME" ]; then + echo "ERROR: Could not detect KERNEL_VERSION from Makefile" >&2 + exit 1 +fi +echo " Kernel source dir: ${KERNEL_DIR_NAME}" + +if [ "$(uname -s)" = "Darwin" ]; then + # On macOS, use krunvm to build the kernel inside a lightweight Linux VM. 
+ # This matches the upstream libkrunfw build approach and avoids all the + # issues with Docker emulation and APFS filesystem limitations. + # + # Prerequisites: brew tap slp/krun && brew install krunvm + + if ! command -v krunvm &>/dev/null; then + echo "ERROR: krunvm is required to build the kernel on macOS" >&2 + echo " Install with: brew tap slp/krun && brew install krunvm" >&2 + exit 1 + fi + + echo "==> Building kernel inside krunvm (macOS detected)..." + + VM_NAME="libkrunfw-openshell" + + # Clean up any leftover VM from a previous failed run + krunvm delete "${VM_NAME}" 2>/dev/null || true + + # Copy the config fragment into the libkrunfw tree so the VM can see it. + # The merge hook (MERGE_HOOK) is already written there by the cat above. + cp -f "${KERNEL_CONFIG_FRAGMENT}" "${LIBKRUNFW_DIR}/openshell-bridge-cni.config" + + echo " Creating VM..." + # krunvm may print "The volume has been configured" on first use of a + # volume path and exit non-zero. Retry once if that happens. + if ! krunvm create fedora \ + --name "${VM_NAME}" \ + --cpus 4 \ + --mem 4096 \ + -v "${LIBKRUNFW_DIR}:/work" \ + -w /work; then + echo " Retrying VM creation..." + krunvm create fedora \ + --name "${VM_NAME}" \ + --cpus 4 \ + --mem 4096 \ + -v "${LIBKRUNFW_DIR}:/work" \ + -w /work + fi + + echo " Installing build dependencies..." + krunvm start "${VM_NAME}" /usr/bin/dnf -- install -y \ + 'dnf-command(builddep)' python3-pyelftools + + krunvm start "${VM_NAME}" /usr/bin/dnf -- builddep -y kernel + + # Step 1: prepare kernel sources (download, extract, patch, base config) + echo " Preparing kernel sources..." + krunvm start "${VM_NAME}" /usr/bin/make -- "${KERNEL_DIR_NAME}" + + # Step 2: merge the OpenShell config fragment + echo " Merging OpenShell kernel config fragment..." 
+ krunvm start "${VM_NAME}" /usr/bin/bash -- \ + /work/openshell-kconfig-hook.sh "/work/${KERNEL_DIR_NAME}" /work/openshell-bridge-cni.config + + # Step 3: build the kernel and generate the C bundle + echo " Building kernel (this is the slow part)..." + krunvm start "${VM_NAME}" /usr/bin/make -- -j4 + + echo " Cleaning up VM..." + krunvm delete "${VM_NAME}" + + # Clean up temp files from the libkrunfw tree + rm -f "${LIBKRUNFW_DIR}/openshell-bridge-cni.config" + + if [ ! -f "${LIBKRUNFW_DIR}/kernel.c" ]; then + echo "ERROR: kernel.c was not produced — build failed" >&2 + exit 1 + fi + + # Compile the shared library on the host (uses host cc for a .dylib) + echo "==> Compiling libkrunfw.dylib on host..." + ABI_VERSION=$(grep -oE 'ABI_VERSION\s*=\s*[0-9]+' Makefile | head -1 | sed 's/[^0-9]//g') + cc -fPIC -DABI_VERSION="${ABI_VERSION}" -shared -o "libkrunfw.${ABI_VERSION}.dylib" kernel.c +else + # On Linux, we can do everything natively in three steps: + + # Step 1: prepare kernel sources + echo " Preparing kernel sources..." + make "${KERNEL_DIR_NAME}" + + # Step 2: merge config fragment + echo "==> Merging OpenShell kernel config fragment..." + bash "${MERGE_HOOK}" "${LIBKRUNFW_DIR}/${KERNEL_DIR_NAME}" "${KERNEL_CONFIG_FRAGMENT}" + + # Step 3: build the kernel and shared library + make -j"$(nproc)" "$(grep -oE 'KRUNFW_BINARY_Linux\s*=\s*\S+' Makefile | head -1 | sed 's/[^=]*= *//')" || \ + make -j"$(nproc)" libkrunfw.so +fi + +# ── Stage output artifacts ────────────────────────────────────────────── + +echo "==> Staging artifacts..." +mkdir -p "${OUTPUT_DIR}" + +# Find the built library — check versioned names (e.g. 
libkrunfw.5.dylib) first +BUILT_LIB="" +for candidate in \ + "${LIBKRUNFW_DIR}"/libkrunfw*.${LIB_EXT} \ + "${LIBKRUNFW_DIR}/libkrunfw.${LIB_EXT}" \ + "${LIBKRUNFW_DIR}/target/release/libkrunfw.${LIB_EXT}" \ + "${LIBKRUNFW_DIR}/build/libkrunfw.${LIB_EXT}"; do + if [ -f "$candidate" ]; then + BUILT_LIB="$candidate" + break + fi +done + +if [ -z "$BUILT_LIB" ]; then + echo "ERROR: Could not find built libkrunfw.${LIB_EXT}" >&2 + echo " Searched in ${LIBKRUNFW_DIR}/ for libkrunfw*.${LIB_EXT}" + exit 1 +fi + +echo " Found library: ${BUILT_LIB}" + +# Compute SHA-256 (shasum on macOS, sha256sum on Linux) +if command -v sha256sum &>/dev/null; then + ARTIFACT_HASH=$(sha256sum "${BUILT_LIB}" | cut -d' ' -f1) +else + ARTIFACT_HASH=$(shasum -a 256 "${BUILT_LIB}" | cut -d' ' -f1) +fi +ARTIFACT_HASH_SHORT="${ARTIFACT_HASH:0:12}" + +# Copy the library — always stage as libkrunfw.dylib / libkrunfw.so +# (the base name the runtime loader expects) plus the original name +cp "${BUILT_LIB}" "${OUTPUT_DIR}/libkrunfw.${LIB_EXT}" +BUILT_BASENAME="$(basename "${BUILT_LIB}")" +if [ "${BUILT_BASENAME}" != "libkrunfw.${LIB_EXT}" ]; then + cp "${BUILT_LIB}" "${OUTPUT_DIR}/${BUILT_BASENAME}" +fi + +# Copy the kernel config that was actually used (for reproducibility) +KERNEL_SRC_DIR="" +for candidate in \ + "${LIBKRUNFW_DIR}/linux-"* \ + "${LIBKRUNFW_DIR}/build/linux-"* \ + "${LIBKRUNFW_DIR}/kernel/linux-"*; do + if [ -d "$candidate" ] && [ -f "${candidate}/.config" ]; then + KERNEL_SRC_DIR="$candidate" + break + fi +done + +if [ -n "$KERNEL_SRC_DIR" ] && [ -f "${KERNEL_SRC_DIR}/.config" ]; then + cp "${KERNEL_SRC_DIR}/.config" "${OUTPUT_DIR}/kernel.config" +fi + +# Copy our fragment for reference +cp "${KERNEL_CONFIG_FRAGMENT}" "${OUTPUT_DIR}/bridge-cni.config" + +# ── Write provenance metadata ────────────────────────────────────────── + +cat > "${OUTPUT_DIR}/provenance.json" << EOF +{ + "artifact": "libkrunfw-custom", + "version": "0.1.0-openshell", + "build_timestamp": "$(date -u 
+%Y-%m-%dT%H:%M:%SZ)", + "libkrunfw_repo": "${LIBKRUNFW_REPO}", + "libkrunfw_ref": "${LIBKRUNFW_REF}", + "libkrunfw_commit": "${LIBKRUNFW_COMMIT}", + "kernel_version": "${KERNEL_VERSION:-unknown}", + "kernel_config_fragment": "bridge-cni.config", + "artifact_sha256": "${ARTIFACT_HASH}", + "host_os": "$(uname -s)", + "host_arch": "$(uname -m)", + "builder": "build-custom-libkrunfw.sh" +} +EOF + +echo "" +echo "==> Build complete" +echo " Library: ${OUTPUT_DIR}/libkrunfw.${LIB_EXT}" +echo " SHA256: ${ARTIFACT_HASH_SHORT}..." +echo " Provenance: ${OUTPUT_DIR}/provenance.json" +echo " Commit: ${LIBKRUNFW_SHORT}" +echo "" +echo "To use this runtime:" +echo " export OPENSHELL_VM_RUNTIME_SOURCE_DIR=${OUTPUT_DIR}" +echo " mise run vm:bundle-runtime" diff --git a/crates/openshell-vm/runtime/kernel/bridge-cni.config b/crates/openshell-vm/runtime/kernel/bridge-cni.config new file mode 100644 index 00000000..7b9610e3 --- /dev/null +++ b/crates/openshell-vm/runtime/kernel/bridge-cni.config @@ -0,0 +1,119 @@ +# Custom kernel config fragment for libkrunfw (OpenShell VM) +# +# This fragment is applied on top of libkrunfw's base kernel config +# to enable bridge CNI, netfilter/iptables, and conntrack support +# required for Kubernetes pod networking in the VM. +# +# Apply with: scripts/merge-kconfig.sh +# +# See also: check-vm-capabilities.sh for runtime verification. 
+ +# ── Network Namespaces (required for pod isolation) ───────────────────── +CONFIG_NET_NS=y +CONFIG_NAMESPACES=y + +# ── Virtual Ethernet (veth pairs for pod networking) ──────────────────── +CONFIG_VETH=y + +# ── Linux Bridge (required for bridge CNI plugin) ────────────────────── +CONFIG_BRIDGE=y +CONFIG_BRIDGE_NETFILTER=y +CONFIG_BRIDGE_IGMP_SNOOPING=y + +# ── Netfilter framework ──────────────────────────────────────────────── +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_NETFILTER_INGRESS=y +CONFIG_NETFILTER_NETLINK=y +CONFIG_NETFILTER_NETLINK_QUEUE=y +CONFIG_NETFILTER_NETLINK_LOG=y + +# ── Connection tracking (required for NAT and kube-proxy) ────────────── +CONFIG_NF_CONNTRACK=y +CONFIG_NF_CT_NETLINK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_TIMEOUT=y +CONFIG_NF_CONNTRACK_TIMESTAMP=y + +# ── Netfilter xtables match modules (required by kube-proxy & kubelet) ─ +# kube-proxy uses xt_conntrack for stateful rules and xt_comment for +# labeling chains. Without these, iptables fails with: +# "Couldn't load match 'conntrack': No such file or directory" +CONFIG_NETFILTER_XTABLES=y +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y +CONFIG_NETFILTER_XT_MATCH_COMMENT=y +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=y +CONFIG_NETFILTER_XT_MATCH_MARK=y +CONFIG_NETFILTER_XT_MATCH_STATISTIC=y +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=y +CONFIG_NETFILTER_XT_MATCH_RECENT=y +CONFIG_NETFILTER_XT_TARGET_MARK=y +CONFIG_NETFILTER_XT_TARGET_CONNMARK=y +CONFIG_NETFILTER_XT_MATCH_CONNMARK=y + +# ── NAT (required for service VIP / DNAT / SNAT) ────────────────────── +CONFIG_NF_NAT=y +CONFIG_NF_NAT_MASQUERADE_IPV4=y + +# ── iptables (CNI bridge masquerade + compat) ────────────────────────── +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_FILTER=y +CONFIG_IP_NF_NAT=y +CONFIG_IP_NF_MANGLE=y +CONFIG_IP_NF_TARGET_MASQUERADE=y +CONFIG_IP_NF_TARGET_REJECT=y + +# ── nftables (kube-proxy nftables mode — primary proxy backend) ───────── +# kube-proxy nftables proxier requires: numgen (random LB), fib 
(local +# address detection), counter, ct, nat, masq, reject, limit, redir. +CONFIG_NF_TABLES=y +CONFIG_NF_TABLES_INET=y +CONFIG_NFT_CT=y +CONFIG_NFT_NAT=y +CONFIG_NFT_MASQ=y +CONFIG_NFT_REJECT=y +CONFIG_NFT_COMPAT=y +CONFIG_NFT_NUMGEN=y +CONFIG_NFT_FIB_IPV4=y +CONFIG_NFT_FIB_IPV6=y +CONFIG_NFT_LIMIT=y +CONFIG_NFT_REDIR=y +CONFIG_NFT_TPROXY=y + +# ── IP forwarding and routing (required for pod-to-pod) ──────────────── +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_NET_IP_TUNNEL=y + +# ── IPVS (optional: kube-proxy IPVS mode) ───────────────────────────── +CONFIG_IP_VS=y +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_RR=y +CONFIG_IP_VS_WRR=y +CONFIG_IP_VS_SH=y +CONFIG_IP_VS_NFCT=y + +# ── Misc networking required by Kubernetes ───────────────────────────── +CONFIG_NET_SCH_HTB=y +CONFIG_NET_CLS_CGROUP=y +CONFIG_CGROUP_NET_PRIO=y +CONFIG_CGROUP_NET_CLASSID=y + +# ── Dummy interface (fallback networking) ────────────────────────────── +CONFIG_DUMMY=y + +# ── TUN/TAP (used by some CNI plugins) ──────────────────────────────── +CONFIG_TUN=y + +# ── Cgroups (already in base, ensure v2 is available) ────────────────── +CONFIG_CGROUPS=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_PIDS=y +CONFIG_MEMCG=y + +# ── Security features required by the sandbox runtime ─────────────────── +CONFIG_SECURITY_LANDLOCK=y +CONFIG_SECCOMP_FILTER=y diff --git a/crates/openshell-vm/scripts/api-proxy.py b/crates/openshell-vm/scripts/api-proxy.py new file mode 100644 index 00000000..6da224f1 --- /dev/null +++ b/crates/openshell-vm/scripts/api-proxy.py @@ -0,0 +1,132 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +TCP proxy that waits for the k3s apiserver to be ready on 127.0.0.1:6444, +then accepts connections on 0.0.0.0:6443 and forwards them to the apiserver. 
+ +This decouples the TSI-exposed port from k3s's internal dynamiclistener, +which has TLS handshake issues when accessed through TSI. +""" + +import os +import socket +import sys +import threading +import time + +LISTEN_HOST = "0.0.0.0" +LISTEN_PORT = int(os.environ.get("PROXY_LISTEN_PORT", "6443")) +UPSTREAM_HOST = "127.0.0.1" +UPSTREAM_PORT = int(os.environ.get("PROXY_UPSTREAM_PORT", "6444")) +BUFFER_SIZE = 65536 + + +def wait_for_upstream(): + """Block until the upstream apiserver completes a TLS handshake. + + A raw TCP connect succeeds as soon as the port is bound, but the TLS + server may not be ready yet. We do a full TLS handshake to confirm. + """ + import ssl + + ctx = ssl.create_default_context() + ctx.check_hostname = False + ctx.verify_mode = ssl.CERT_NONE + + attempt = 0 + while True: + attempt += 1 + try: + sock = socket.create_connection((UPSTREAM_HOST, UPSTREAM_PORT), timeout=5) + ssock = ctx.wrap_socket(sock, server_hostname="localhost") + ssock.close() + print(f"[proxy] upstream TLS ready after {attempt} attempts", flush=True) + return + except ( + ConnectionRefusedError, + ConnectionResetError, + OSError, + ssl.SSLError, + ) as e: + if attempt % 5 == 0: + print( + f"[proxy] waiting for upstream (attempt {attempt}): {e}", flush=True + ) + time.sleep(1) + + +def forward(src, dst, label): + """Forward data between two sockets until one closes.""" + try: + while True: + data = src.recv(BUFFER_SIZE) + if not data: + break + dst.sendall(data) + except (BrokenPipeError, ConnectionResetError, OSError): + pass + finally: + try: + dst.shutdown(socket.SHUT_WR) + except OSError: + pass + + +def handle_client(client_sock, client_addr): + """Connect to upstream and forward bidirectionally.""" + print(f"[proxy] accepted connection from {client_addr}", flush=True) + try: + upstream = socket.create_connection((UPSTREAM_HOST, UPSTREAM_PORT), timeout=5) + print(f"[proxy] connected to upstream for {client_addr}", flush=True) + except OSError as e: + print( + 
f"[proxy] failed to connect to upstream for {client_addr}: {e}", flush=True + ) + client_sock.close() + return + + # Forward in both directions + t1 = threading.Thread( + target=forward, args=(client_sock, upstream, "client->upstream"), daemon=True + ) + t2 = threading.Thread( + target=forward, args=(upstream, client_sock, "upstream->client"), daemon=True + ) + t1.start() + t2.start() + t1.join() + t2.join() + print(f"[proxy] connection closed for {client_addr}", flush=True) + client_sock.close() + upstream.close() + + +def main(): + # Wait for the real apiserver to be ready before accepting connections + print( + f"[proxy] waiting for upstream at {UPSTREAM_HOST}:{UPSTREAM_PORT}...", + flush=True, + ) + wait_for_upstream() + + # Start listening + server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + server.bind((LISTEN_HOST, LISTEN_PORT)) + server.listen(64) + print( + f"[proxy] listening on {LISTEN_HOST}:{LISTEN_PORT} -> {UPSTREAM_HOST}:{UPSTREAM_PORT}", + flush=True, + ) + + while True: + client_sock, client_addr = server.accept() + threading.Thread( + target=handle_client, args=(client_sock, client_addr), daemon=True + ).start() + + +if __name__ == "__main__": + main() diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh new file mode 100755 index 00000000..e79a9c7d --- /dev/null +++ b/crates/openshell-vm/scripts/build-rootfs.sh @@ -0,0 +1,737 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Build an aarch64 Ubuntu rootfs for the gateway microVM. +# +# Produces a rootfs with k3s pre-installed, the OpenShell helm chart and +# manifests baked in, container images pre-loaded, AND a fully initialized +# k3s cluster state (database, TLS, images imported, all services deployed). 
+# +# On first VM boot, k3s resumes from this pre-baked state instead of +# cold-starting, achieving ~3-5s startup times. +# +# Usage: +# ./crates/openshell-vm/scripts/build-rootfs.sh [output_dir] +# +# Requires: Docker (or compatible container runtime), curl, helm, zstd + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +DEFAULT_ROOTFS="${XDG_DATA_HOME:-${HOME}/.local/share}/openshell/gateway/rootfs" +ROOTFS_DIR="${1:-${DEFAULT_ROOTFS}}" +CONTAINER_NAME="krun-rootfs-builder" +INIT_CONTAINER_NAME="krun-k3s-init" +BASE_IMAGE_TAG="krun-rootfs:gateway" +# K3S_VERSION uses the semver "+" form for GitHub releases. +# The mise env may provide the Docker-tag form with "-" instead of "+"; +# normalise to "+" so the GitHub download URL works. +K3S_VERSION="${K3S_VERSION:-v1.35.2+k3s1}" +K3S_VERSION="${K3S_VERSION//-k3s/+k3s}" + +# Project root (two levels up from crates/openshell-vm/scripts/) +PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" + +# Container images to pre-load into k3s (arm64). +IMAGE_REPO_BASE="${IMAGE_REPO_BASE:-openshell}" +IMAGE_TAG="${IMAGE_TAG:-dev}" +SERVER_IMAGE="${IMAGE_REPO_BASE}/gateway:${IMAGE_TAG}" +SANDBOX_IMAGE="${IMAGE_REPO_BASE}/sandbox:${IMAGE_TAG}" +AGENT_SANDBOX_IMAGE="registry.k8s.io/agent-sandbox/agent-sandbox-controller:v0.1.0" + +echo "==> Building gateway rootfs" +echo " k3s version: ${K3S_VERSION}" +echo " Images: ${SERVER_IMAGE}, ${SANDBOX_IMAGE}" +echo " Output: ${ROOTFS_DIR}" + +# ── Download k3s binary (outside Docker — much faster) ───────────────── + +K3S_BIN="/tmp/k3s-arm64-${K3S_VERSION}" +if [ -f "${K3S_BIN}" ]; then + echo "==> Using cached k3s binary: ${K3S_BIN}" +else + echo "==> Downloading k3s ${K3S_VERSION} for arm64..." 
+ curl -fSL "https://github.com/k3s-io/k3s/releases/download/${K3S_VERSION}/k3s-arm64" \ + -o "${K3S_BIN}" + chmod +x "${K3S_BIN}" +fi + +# ── Build base image with dependencies ───────────────────────────────── + +# Clean up any previous run +docker rm -f "${CONTAINER_NAME}" 2>/dev/null || true +docker rm -f "${INIT_CONTAINER_NAME}" 2>/dev/null || true + +echo "==> Building base image..." +docker build --platform linux/arm64 -t "${BASE_IMAGE_TAG}" -f - . <<'DOCKERFILE' +FROM nvcr.io/nvidia/base/ubuntu:noble-20251013 +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + ca-certificates \ + iptables \ + iproute2 \ + python3 \ + busybox-static \ + zstd \ + && rm -rf /var/lib/apt/lists/* +# busybox-static provides udhcpc for DHCP inside the VM. +RUN mkdir -p /usr/share/udhcpc && \ + ln -sf /bin/busybox /sbin/udhcpc +RUN mkdir -p /var/lib/rancher/k3s /etc/rancher/k3s +DOCKERFILE + +# Create a container and export the filesystem +echo "==> Creating container..." +docker create --platform linux/arm64 --name "${CONTAINER_NAME}" "${BASE_IMAGE_TAG}" /bin/true + +echo "==> Exporting filesystem..." +# Previous builds may leave overlayfs work/ dirs with permissions that +# prevent rm on macOS. Force-fix permissions before removing. +if [ -d "${ROOTFS_DIR}" ]; then + chmod -R u+rwx "${ROOTFS_DIR}" 2>/dev/null || true + rm -rf "${ROOTFS_DIR}" +fi +mkdir -p "${ROOTFS_DIR}" +docker export "${CONTAINER_NAME}" | tar -C "${ROOTFS_DIR}" -xf - + +docker rm "${CONTAINER_NAME}" + +# ── Inject k3s binary ──────────────────────────────────────────────── + +echo "==> Injecting k3s binary..." +cp "${K3S_BIN}" "${ROOTFS_DIR}/usr/local/bin/k3s" +chmod +x "${ROOTFS_DIR}/usr/local/bin/k3s" +ln -sf /usr/local/bin/k3s "${ROOTFS_DIR}/usr/local/bin/kubectl" + +# ── Inject scripts ──────────────────────────────────────────────────── + +echo "==> Injecting gateway-init.sh..." 
+mkdir -p "${ROOTFS_DIR}/srv" +cp "${SCRIPT_DIR}/gateway-init.sh" "${ROOTFS_DIR}/srv/gateway-init.sh" +chmod +x "${ROOTFS_DIR}/srv/gateway-init.sh" + +# Keep the hello server around for debugging +cp "${SCRIPT_DIR}/hello-server.py" "${ROOTFS_DIR}/srv/hello-server.py" +chmod +x "${ROOTFS_DIR}/srv/hello-server.py" + +# Inject VM capability checker for runtime diagnostics. +cp "${SCRIPT_DIR}/check-vm-capabilities.sh" "${ROOTFS_DIR}/srv/check-vm-capabilities.sh" +chmod +x "${ROOTFS_DIR}/srv/check-vm-capabilities.sh" + +# ── Package and inject helm chart ──────────────────────────────────── + +HELM_CHART_DIR="${PROJECT_ROOT}/deploy/helm/openshell" +CHART_DEST="${ROOTFS_DIR}/var/lib/rancher/k3s/server/static/charts" + +if [ -d "${HELM_CHART_DIR}" ]; then + echo "==> Packaging helm chart..." + mkdir -p "${CHART_DEST}" + helm package "${HELM_CHART_DIR}" -d "${CHART_DEST}" + echo " $(ls "${CHART_DEST}"/*.tgz 2>/dev/null | xargs -I{} basename {})" +else + echo "WARNING: Helm chart not found at ${HELM_CHART_DIR}, skipping" +fi + +# ── Inject Kubernetes manifests ────────────────────────────────────── +# These are copied to /opt/openshell/manifests/ (staging). gateway-init.sh +# moves them to /var/lib/rancher/k3s/server/manifests/ at boot so the +# k3s Helm Controller auto-deploys them. + +MANIFEST_SRC="${PROJECT_ROOT}/deploy/kube/manifests" +MANIFEST_DEST="${ROOTFS_DIR}/opt/openshell/manifests" + +echo "==> Injecting Kubernetes manifests..." +mkdir -p "${MANIFEST_DEST}" + +for manifest in openshell-helmchart.yaml agent-sandbox.yaml; do + if [ -f "${MANIFEST_SRC}/${manifest}" ]; then + cp "${MANIFEST_SRC}/${manifest}" "${MANIFEST_DEST}/" + echo " ${manifest}" + else + echo "WARNING: ${manifest} not found in ${MANIFEST_SRC}" + fi +done + +# ── Pre-load container images ──────────────────────────────────────── +# Pull arm64 images and save as tarballs in the k3s airgap images +# directory. 
k3s auto-imports from /var/lib/rancher/k3s/agent/images/ +# on startup, so no internet access is needed at boot time. +# +# Tarballs are cached in a persistent directory outside the rootfs so +# they survive rebuilds. This avoids re-pulling and re-saving ~1 GiB +# of images each time. + +IMAGES_DIR="${ROOTFS_DIR}/var/lib/rancher/k3s/agent/images" +IMAGE_CACHE_DIR="${XDG_CACHE_HOME:-${HOME}/.cache}/openshell/gateway/images" +mkdir -p "${IMAGES_DIR}" "${IMAGE_CACHE_DIR}" + +echo "==> Pre-loading container images (arm64)..." + +pull_and_save() { + local image="$1" + local output="$2" + local cache="${IMAGE_CACHE_DIR}/$(basename "${output}")" + + # Use cached tarball if available. + if [ -f "${cache}" ]; then + echo " cached: $(basename "${output}")" + cp "${cache}" "${output}" + return 0 + fi + + # Try to pull; if the registry is unavailable, fall back to the + # local Docker image cache (image may exist from a previous pull). + echo " pulling: ${image}..." + if ! docker pull --platform linux/arm64 "${image}" --quiet 2>/dev/null; then + echo " pull failed, checking local Docker cache..." + if ! docker image inspect "${image}" >/dev/null 2>&1; then + echo "ERROR: image ${image} not available locally or from registry" + exit 1 + fi + echo " using locally cached image" + fi + + echo " saving: $(basename "${output}")..." + # Pipe through zstd for faster decompression and smaller tarballs. + # k3s auto-imports .tar.zst files from the airgap images directory. + # -T0 uses all CPU cores; -3 is a good speed/ratio tradeoff. + docker save "${image}" | zstd -T0 -3 -o "${output}" + # Cache for next rebuild. 
+ cp "${output}" "${cache}" +} + +pull_and_save "${SERVER_IMAGE}" "${IMAGES_DIR}/openshell-server.tar.zst" +pull_and_save "${SANDBOX_IMAGE}" "${IMAGES_DIR}/openshell-sandbox.tar.zst" +pull_and_save "${AGENT_SANDBOX_IMAGE}" "${IMAGES_DIR}/agent-sandbox-controller.tar.zst" + +# ── Pre-initialize k3s cluster state ───────────────────────────────── +# Boot k3s inside a Docker container using the rootfs we just built. +# Wait for it to fully initialize (import images, deploy manifests, +# create database), then capture the state back into the rootfs. +# +# This eliminates cold-start latency: on VM boot, k3s finds existing +# state and resumes in ~3-5 seconds instead of 30-60s. + +echo "" +echo "==> Pre-initializing k3s cluster state..." +echo " This boots k3s in a container, waits for full readiness," +echo " then captures the initialized state into the rootfs." + +# Patch the HelmChart manifest for the init container (same patches +# gateway-init.sh applies at runtime). +INIT_MANIFESTS="${ROOTFS_DIR}/var/lib/rancher/k3s/server/manifests" +mkdir -p "${INIT_MANIFESTS}" + +# Copy manifests from staging to the k3s manifest directory. +for manifest in "${MANIFEST_DEST}"/*.yaml; do + [ -f "$manifest" ] || continue + cp "$manifest" "${INIT_MANIFESTS}/" +done + +# Patch HelmChart for local images and VM settings. +HELMCHART="${INIT_MANIFESTS}/openshell-helmchart.yaml" +if [ -f "$HELMCHART" ]; then + # Use local images — explicitly imported into containerd. + sed -i '' 's|pullPolicy: Always|pullPolicy: IfNotPresent|' "$HELMCHART" 2>/dev/null \ + || sed -i 's|pullPolicy: Always|pullPolicy: IfNotPresent|' "$HELMCHART" + # Use the locally imported image references. 
+ sed -i '' -E "s|repository:[[:space:]]*[^[:space:]]+|repository: ${SERVER_IMAGE%:*}|" "$HELMCHART" 2>/dev/null \ + || sed -i -E "s|repository:[[:space:]]*[^[:space:]]+|repository: ${SERVER_IMAGE%:*}|" "$HELMCHART" + sed -i '' -E "s|tag:[[:space:]]*\"?[^\"[:space:]]+\"?|tag: \"${IMAGE_TAG}\"|" "$HELMCHART" 2>/dev/null \ + || sed -i -E "s|tag:[[:space:]]*\"?[^\"[:space:]]+\"?|tag: \"${IMAGE_TAG}\"|" "$HELMCHART" + sed -i '' "s|server:[[:space:]]*sandboxImage: ghcr.io/nvidia/openshell-community/sandboxes/base:latest|server:\n sandboxImage: ${SANDBOX_IMAGE}|g" "$HELMCHART" 2>/dev/null || true + sed -i '' "s|sandboxImage: ghcr.io/nvidia/openshell-community/sandboxes/base:latest|sandboxImage: ${SANDBOX_IMAGE}|g" "$HELMCHART" 2>/dev/null \ + || sed -i "s|sandboxImage: ghcr.io/nvidia/openshell-community/sandboxes/base:latest|sandboxImage: ${SANDBOX_IMAGE}|g" "$HELMCHART" + # Bridge CNI: pods use normal pod networking, not hostNetwork. + # This must match what gateway-init.sh applies at runtime so the + # HelmChart manifest is unchanged at boot — preventing a helm + # upgrade job that would cycle the pre-baked pod. + sed -i '' 's|__HOST_NETWORK__|false|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|__HOST_NETWORK__|false|g' "$HELMCHART" + # Enable SA token automount for bridge CNI mode. Must match + # gateway-init.sh runtime value to avoid manifest delta. + sed -i '' 's|__AUTOMOUNT_SA_TOKEN__|true|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|__AUTOMOUNT_SA_TOKEN__|true|g' "$HELMCHART" + # Mount the k3s kubeconfig into the pod for VM mode. + sed -i '' 's|__KUBECONFIG_HOST_PATH__|"/etc/rancher/k3s"|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|__KUBECONFIG_HOST_PATH__|"/etc/rancher/k3s"|g' "$HELMCHART" + # Disable persistence — use /tmp for the SQLite database. PVC mounts + # are unreliable on virtiofs. 
+ sed -i '' 's|__PERSISTENCE_ENABLED__|false|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|__PERSISTENCE_ENABLED__|false|g' "$HELMCHART" + sed -i '' 's|__DB_URL__|"sqlite:/tmp/openshell.db"|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|__DB_URL__|"sqlite:/tmp/openshell.db"|g' "$HELMCHART" + # Clear SSH gateway placeholders. + sed -i '' 's|sshGatewayHost: __SSH_GATEWAY_HOST__|sshGatewayHost: ""|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|sshGatewayHost: __SSH_GATEWAY_HOST__|sshGatewayHost: ""|g' "$HELMCHART" + sed -i '' 's|sshGatewayPort: __SSH_GATEWAY_PORT__|sshGatewayPort: 0|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|sshGatewayPort: __SSH_GATEWAY_PORT__|sshGatewayPort: 0|g' "$HELMCHART" + SSH_HANDSHAKE_SECRET="$(head -c 32 /dev/urandom | od -A n -t x1 | tr -d ' \n')" + sed -i '' "s|__SSH_HANDSHAKE_SECRET__|${SSH_HANDSHAKE_SECRET}|g" "$HELMCHART" 2>/dev/null \ + || sed -i "s|__SSH_HANDSHAKE_SECRET__|${SSH_HANDSHAKE_SECRET}|g" "$HELMCHART" + sed -i '' 's|__DISABLE_GATEWAY_AUTH__|false|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|__DISABLE_GATEWAY_AUTH__|false|g' "$HELMCHART" + sed -i '' 's|__DISABLE_TLS__|false|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|__DISABLE_TLS__|false|g' "$HELMCHART" + sed -i '' 's|hostGatewayIP: __HOST_GATEWAY_IP__|hostGatewayIP: ""|g' "$HELMCHART" 2>/dev/null \ + || sed -i 's|hostGatewayIP: __HOST_GATEWAY_IP__|hostGatewayIP: ""|g' "$HELMCHART" + sed -i '' '/__CHART_CHECKSUM__/d' "$HELMCHART" 2>/dev/null \ + || sed -i '/__CHART_CHECKSUM__/d' "$HELMCHART" +fi + +# Patch agent-sandbox manifest for VM networking constraints. +AGENT_MANIFEST="${INIT_MANIFESTS}/agent-sandbox.yaml" +if [ -f "$AGENT_MANIFEST" ]; then + # Keep agent-sandbox on pod networking to avoid host port clashes. + # Point in-cluster client traffic at the API server node IP because + # kube-proxy is disabled in VM mode. 
+ sed -i '' '/hostNetwork: true/d' "$AGENT_MANIFEST" 2>/dev/null \ + || sed -i '/hostNetwork: true/d' "$AGENT_MANIFEST" + sed -i '' '/dnsPolicy: ClusterFirstWithHostNet/d' "$AGENT_MANIFEST" 2>/dev/null \ + || sed -i '/dnsPolicy: ClusterFirstWithHostNet/d' "$AGENT_MANIFEST" + sed -i '' 's|image: registry.k8s.io/agent-sandbox/agent-sandbox-controller:v0.1.0|image: registry.k8s.io/agent-sandbox/agent-sandbox-controller:v0.1.0\ + args:\ + - -metrics-bind-address=:8082\ + env:\ + - name: KUBERNETES_SERVICE_HOST\ + value: 192.168.127.2\ + - name: KUBERNETES_SERVICE_PORT\ + value: "6443"|g' "$AGENT_MANIFEST" 2>/dev/null \ + || sed -i 's|image: registry.k8s.io/agent-sandbox/agent-sandbox-controller:v0.1.0|image: registry.k8s.io/agent-sandbox/agent-sandbox-controller:v0.1.0\ + args:\ + - -metrics-bind-address=:8082\ + env:\ + - name: KUBERNETES_SERVICE_HOST\ + value: 192.168.127.2\ + - name: KUBERNETES_SERVICE_PORT\ + value: "6443"|g' "$AGENT_MANIFEST" + if grep -q 'hostNetwork: true' "$AGENT_MANIFEST" \ + || grep -q 'ClusterFirstWithHostNet' "$AGENT_MANIFEST" \ + || ! grep -q 'KUBERNETES_SERVICE_HOST' "$AGENT_MANIFEST" \ + || ! grep -q 'metrics-bind-address=:8082' "$AGENT_MANIFEST"; then + echo "ERROR: failed to patch agent-sandbox manifest for VM networking constraints: $AGENT_MANIFEST" >&2 + exit 1 + fi +fi + +# local-storage implies local-path-provisioner, which requires CNI bridge +# networking that is unavailable in the VM kernel. +rm -f "${INIT_MANIFESTS}/local-storage.yaml" 2>/dev/null || true + +# Boot k3s in a privileged container. We use a Docker volume for the +# k3s data directory because kine (SQLite) creates Unix sockets that +# don't work over bind mounts from macOS. After k3s is ready, we +# copy the state back into the rootfs. 
+docker rm -f "${INIT_CONTAINER_NAME}" 2>/dev/null || true +docker volume rm krun-k3s-init-data 2>/dev/null || true +docker volume create krun-k3s-init-data >/dev/null + +# Seed the volume with the airgap images and manifests from the rootfs. +echo " Seeding Docker volume with airgap images and manifests..." +docker run --rm \ + --platform linux/arm64 \ + -v krun-k3s-init-data:/var/lib/rancher/k3s \ + -v "${ROOTFS_DIR}/var/lib/rancher/k3s/agent/images:/src/images:ro" \ + -v "${ROOTFS_DIR}/var/lib/rancher/k3s/server/static/charts:/src/charts:ro" \ + -v "${ROOTFS_DIR}/var/lib/rancher/k3s/server/manifests:/src/manifests:ro" \ + "${BASE_IMAGE_TAG}" \ + sh -c ' + mkdir -p /var/lib/rancher/k3s/agent/images \ + /var/lib/rancher/k3s/server/static/charts \ + /var/lib/rancher/k3s/server/manifests && + cp /src/images/* /var/lib/rancher/k3s/agent/images/ 2>/dev/null || true && + cp /src/charts/* /var/lib/rancher/k3s/server/static/charts/ 2>/dev/null || true && + cp /src/manifests/* /var/lib/rancher/k3s/server/manifests/ 2>/dev/null || true + ' + +echo " Starting k3s in container..." +# Use --hostname=gateway so the k3s node name matches the VM's hostname. +# This ensures the pre-baked pod schedule (node affinity) is valid when +# the VM boots — avoiding a stale Docker-hostname node in the cluster. +docker run -d \ + --name "${INIT_CONTAINER_NAME}" \ + --hostname gateway \ + --platform linux/arm64 \ + --privileged \ + --tmpfs /run \ + --tmpfs /tmp \ + -v "${K3S_BIN}:/usr/local/bin/k3s:ro" \ + -v krun-k3s-init-data:/var/lib/rancher/k3s \ + "${BASE_IMAGE_TAG}" \ + /usr/local/bin/k3s server \ + --disable=traefik,servicelb,metrics-server,coredns,local-storage \ + --disable-network-policy \ + --write-kubeconfig-mode=644 \ + --flannel-backend=host-gw \ + --snapshotter=native + +# Wait for kubeconfig to appear. k3s writes it to +# /etc/rancher/k3s/k3s.yaml inside the container. +echo " Waiting for kubeconfig..." 
+for i in $(seq 1 90); do + if docker exec "${INIT_CONTAINER_NAME}" test -s /etc/rancher/k3s/k3s.yaml 2>/dev/null; then + echo " Kubeconfig ready (${i}s)" + break + fi + if [ "$i" -eq 90 ]; then + echo "ERROR: kubeconfig did not appear in 90s" + docker logs "${INIT_CONTAINER_NAME}" --tail 50 + docker rm -f "${INIT_CONTAINER_NAME}" 2>/dev/null || true + docker volume rm krun-k3s-init-data 2>/dev/null || true + exit 1 + fi + sleep 1 +done + +# Wait for containerd to be fully ready before importing images. +# The kubeconfig may appear before containerd's gRPC socket is +# accepting requests. `k3s ctr version` exercises the full path. +echo " Waiting for containerd..." +for i in $(seq 1 60); do + if docker exec "${INIT_CONTAINER_NAME}" /usr/local/bin/k3s ctr version >/dev/null 2>&1; then + echo " Containerd ready (${i}s)" + break + fi + if [ "$i" -eq 60 ]; then + echo "ERROR: containerd did not become ready in 60s" + docker logs "${INIT_CONTAINER_NAME}" --tail 30 + docker rm -f "${INIT_CONTAINER_NAME}" 2>/dev/null || true + docker volume rm krun-k3s-init-data 2>/dev/null || true + exit 1 + fi + sleep 1 +done + +# Explicitly import images into containerd's k8s.io namespace, then +# tag them with the docker.io/ prefix that kubelet expects. +# +# When Docker saves "openshell/gateway:dev", the tarball stores the +# reference as "openshell/gateway:dev". But kubelet normalises all +# short names to "docker.io/openshell/gateway:dev". Without the +# re-tag, kubelet can't find the image and falls back to pulling. +echo " Importing images into containerd..." +docker exec "${INIT_CONTAINER_NAME}" sh -c ' + # Prefer system zstd (installed in base image), fall back to k3s bundled. + if command -v zstd >/dev/null 2>&1; then + ZSTD=zstd + else + ZSTD=$(find /var/lib/rancher/k3s/data -name zstd -type f 2>/dev/null | head -1) + fi + + for f in /var/lib/rancher/k3s/agent/images/*.tar.zst; do + [ -f "$f" ] || continue + base=$(basename "$f") + echo " importing ${base}..." 
+ if [ -n "$ZSTD" ]; then + "$ZSTD" -d -c "$f" | /usr/local/bin/k3s ctr images import - + rc=$? + else + echo " ERROR: no zstd available, cannot decompress ${base}" + rc=1 + fi + if [ $rc -ne 0 ]; then + echo " ERROR: import failed for ${base} (rc=$rc)" + fi + done + + echo "" + echo " Images after import:" + /usr/local/bin/k3s ctr images list -q | grep -v "^sha256:" | sort + + # Re-tag short-name images with docker.io/ prefix so kubelet can + # find them. kubelet normalises "openshell/gateway:dev" to + # "docker.io/openshell/gateway:dev". Only re-tag images that look + # like short Docker Hub names (contain "/" but no "." before the + # first "/", i.e. not registry.k8s.io/... or ghcr.io/...). + echo "" + echo " Re-tagging short names with docker.io/ prefix..." + for ref in $(/usr/local/bin/k3s ctr images list -q | grep -v "^sha256:"); do + # Skip already-qualified names (contain a dot before the first slash). + case "$ref" in + *.*/*) continue ;; + esac + fqdn="docker.io/${ref}" + echo " ${ref} -> ${fqdn}" + /usr/local/bin/k3s ctr images tag "${ref}" "${fqdn}" 2>/dev/null || true + done + + echo "" + echo " Final image list:" + /usr/local/bin/k3s ctr images list -q | grep -v "^sha256:" | sort +' 2>&1 | sed 's/^/ /' + +# Wait for the openshell namespace (Helm controller creates it). +echo " Waiting for openshell namespace..." +for i in $(seq 1 120); do + if docker exec "${INIT_CONTAINER_NAME}" \ + /usr/local/bin/k3s kubectl get namespace openshell -o name 2>/dev/null | grep -q openshell; then + echo " Namespace ready (${i}s)" + break + fi + if [ "$i" -eq 120 ]; then + echo "ERROR: openshell namespace did not appear in 120s" + docker logs "${INIT_CONTAINER_NAME}" --tail 50 + docker rm -f "${INIT_CONTAINER_NAME}" 2>/dev/null || true + docker volume rm krun-k3s-init-data 2>/dev/null || true + exit 1 + fi + sleep 1 +done + +# Generate PKI and create TLS secrets inside the cluster. +echo " Generating TLS certificates and creating secrets..." 
+ +# We generate certs outside the container, then apply them via kubectl. +# Use openssl for cert generation at build time (simpler than pulling in +# the Rust PKI library). The bootstrap Rust code will detect +# these pre-baked secrets at runtime and skip its own generation. + +PKI_DIR=$(mktemp -d) +trap 'rm -rf "${PKI_DIR}"' EXIT + +# Generate CA +openssl req -x509 -newkey ec -pkeyopt ec_paramgen_curve:prime256v1 \ + -keyout "${PKI_DIR}/ca.key" -out "${PKI_DIR}/ca.crt" \ + -days 3650 -nodes -subj "/O=openshell/CN=openshell-ca" 2>/dev/null + +# Generate server cert with SANs +cat > "${PKI_DIR}/server.cnf" </dev/null +openssl x509 -req -in "${PKI_DIR}/server.csr" \ + -CA "${PKI_DIR}/ca.crt" -CAkey "${PKI_DIR}/ca.key" -CAcreateserial \ + -out "${PKI_DIR}/server.crt" -days 3650 \ + -extensions v3_req -extfile "${PKI_DIR}/server.cnf" 2>/dev/null + +# Generate client cert +openssl req -newkey ec -pkeyopt ec_paramgen_curve:prime256v1 \ + -keyout "${PKI_DIR}/client.key" -out "${PKI_DIR}/client.csr" \ + -nodes -subj "/CN=openshell-client" 2>/dev/null +openssl x509 -req -in "${PKI_DIR}/client.csr" \ + -CA "${PKI_DIR}/ca.crt" -CAkey "${PKI_DIR}/ca.key" -CAcreateserial \ + -out "${PKI_DIR}/client.crt" -days 3650 2>/dev/null + +# Apply TLS secrets to the cluster via kubectl inside the container. +# We create JSON manifests and pipe them in. 
+apply_secret() { + local name="$1" + local json="$2" + echo "$json" | docker exec -i "${INIT_CONTAINER_NAME}" \ + /usr/local/bin/k3s kubectl apply -f - 2>&1 | sed 's/^/ /' +} + +# Base64 encode the cert files +CA_CRT_B64=$(base64 < "${PKI_DIR}/ca.crt" | tr -d '\n') +SERVER_CRT_B64=$(base64 < "${PKI_DIR}/server.crt" | tr -d '\n') +SERVER_KEY_B64=$(base64 < "${PKI_DIR}/server.key" | tr -d '\n') +CLIENT_CRT_B64=$(base64 < "${PKI_DIR}/client.crt" | tr -d '\n') +CLIENT_KEY_B64=$(base64 < "${PKI_DIR}/client.key" | tr -d '\n') + +apply_secret "openshell-server-tls" "$(cat </dev/null || echo "0") + if [ "$ready" = "1" ]; then + echo " OpenShell pod ready (${i}s)" + break + fi + if [ "$i" -eq 120 ]; then + echo "WARNING: openshell pod not ready after 120s, continuing anyway" + docker exec "${INIT_CONTAINER_NAME}" \ + /usr/local/bin/k3s kubectl -n openshell get pods 2>/dev/null | sed 's/^/ /' || true + break + fi + sleep 1 +done + +# Bake PKI materials into the rootfs so the host-side bootstrap can +# find them without waiting for the cluster. This is the key to +# skipping the namespace wait + kubectl apply on every boot. +echo " Baking PKI into rootfs..." +PKI_DEST="${ROOTFS_DIR}/opt/openshell/pki" +mkdir -p "${PKI_DEST}" +cp "${PKI_DIR}/ca.crt" "${PKI_DEST}/ca.crt" +cp "${PKI_DIR}/ca.key" "${PKI_DEST}/ca.key" +cp "${PKI_DIR}/server.crt" "${PKI_DEST}/server.crt" +cp "${PKI_DIR}/server.key" "${PKI_DEST}/server.key" +cp "${PKI_DIR}/client.crt" "${PKI_DEST}/client.crt" +cp "${PKI_DIR}/client.key" "${PKI_DEST}/client.key" + +# Stop k3s gracefully so the kine SQLite DB is flushed. +echo " Stopping k3s..." +docker stop "${INIT_CONTAINER_NAME}" --timeout 10 + +# Surgically clean the kine SQLite DB. While k3s was running, +# controllers maintained pods, events, leases, and endpoints. These +# runtime objects would cause the VM's kubelet to reconcile against an +# empty containerd (SandboxChanged) on boot. 
With k3s stopped, we can +# safely strip them directly from the DB — no race condition, no auth. +echo " Cleaning runtime objects from kine DB..." +CLEANUP_SQL=$(mktemp) +cat > "$CLEANUP_SQL" << 'EOSQL' +DELETE FROM kine WHERE name LIKE '/registry/pods/%'; +DELETE FROM kine WHERE name LIKE '/registry/events/%'; +DELETE FROM kine WHERE name LIKE '/registry/leases/%'; +DELETE FROM kine WHERE name LIKE '/registry/endpointslices/%'; +DELETE FROM kine WHERE name LIKE '/registry/masterleases/%'; +PRAGMA wal_checkpoint(TRUNCATE); +VACUUM; +EOSQL +docker run --rm \ + -v krun-k3s-init-data:/data \ + -v "${CLEANUP_SQL}:/tmp/clean.sql:ro" \ + alpine:latest \ + sh -c ' + apk add --no-cache sqlite >/dev/null 2>&1 + DB=/data/server/db/state.db + if [ ! -f "$DB" ]; then echo "ERROR: state.db not found"; exit 1; fi + echo " Before: $(sqlite3 "$DB" "SELECT COUNT(*) FROM kine;") kine records" + sqlite3 "$DB" < /tmp/clean.sql + echo " After: $(sqlite3 "$DB" "SELECT COUNT(*) FROM kine;") kine records" + ' 2>&1 | sed 's/^/ /' +rm -f "$CLEANUP_SQL" + +# Copy the initialized k3s state from the Docker volume back into the +# rootfs. We use a helper container to access the volume. +echo " Extracting k3s state from Docker volume..." +if [ -d "${ROOTFS_DIR}/var/lib/rancher/k3s" ]; then + chmod -R u+rwx "${ROOTFS_DIR}/var/lib/rancher/k3s" 2>/dev/null || true + rm -rf "${ROOTFS_DIR}/var/lib/rancher/k3s" +fi +mkdir -p "${ROOTFS_DIR}/var/lib/rancher/k3s" +# Use tar instead of cp to handle special files that can't be created +# on the macOS-backed bind mount. tar's --ignore-failed-read and +# warning suppression let us capture everything that matters (database, +# TLS, containerd image store in native snapshotter format) while +# skipping uncopiable metadata. +# +# Exclude the overlayfs snapshotter — Docker's init container uses it +# but we use the native snapshotter in the VM. 
The overlayfs snapshots +# contain full image layer trees that are massive and create files with +# Docker Desktop VirtioFS ownership xattrs that are undeletable on macOS. +# Also exclude runtime task state (stale shim PIDs, sockets) and the +# containerd bolt database (we'll wipe it in the surgical cleanup below). +# Use alpine (native platform) instead of the arm64 base image to avoid +# QEMU emulation overhead. tar doesn't need ARM — it's just copying files. +# Include the containerd native snapshotter, content store, and metadata +# database (meta.db) so the VM doesn't need to re-extract image layers +# at boot time. Exclude the overlayfs snapshotter (Docker's init uses +# overlayfs internally but the VM uses native), runtime task state (stale +# PIDs/sockets), and airgap tarballs (restored from cache below). +# +# The native snapshotter data is ~1-3 GB depending on images. Copying +# through Docker Desktop VirtioFS is slower than native but necessary +# for fast boot times — without it, each boot spends >2 min extracting +# layers on virtio-fs, causing kubelet CreateContainer timeouts. +docker run --rm \ + -v krun-k3s-init-data:/src:ro \ + -v "${ROOTFS_DIR}/var/lib/rancher/k3s:/dst" \ + alpine:latest \ + sh -c 'cd /src && tar cf - \ + --exclude="./agent/containerd/io.containerd.snapshotter.v1.overlayfs" \ + --exclude="./agent/containerd/io.containerd.runtime.v2.task" \ + --exclude="./agent/containerd/io.containerd.sandbox.controller.v1.shim" \ + --exclude="./agent/containerd/tmpmounts" \ + --exclude="./agent/containerd/containerd.log" \ + --exclude="./agent/images" \ + . 2>/dev/null | (cd /dst && tar xf - 2>/dev/null); true' + +# Clean up runtime artifacts that shouldn't persist (same cleanup +# gateway-init.sh does on warm boot). +echo " Cleaning runtime artifacts..." 
+rm -rf "${ROOTFS_DIR}/var/lib/rancher/k3s/server/tls/temporary-certs" 2>/dev/null || true +rm -f "${ROOTFS_DIR}/var/lib/rancher/k3s/server/kine.sock" 2>/dev/null || true +find "${ROOTFS_DIR}/var/lib/rancher/k3s" -name '*.sock' -delete 2>/dev/null || true +find "${ROOTFS_DIR}/run" -name '*.sock' -delete 2>/dev/null || true + +# Restore airgap image tarballs. The extraction above excluded +# ./agent/images (to avoid pulling them from the Docker volume) and the +# rm -rf earlier wiped the pre-loaded copies. Copy them back from the +# persistent cache so k3s can import them on first VM boot. +echo " Restoring airgap image tarballs..." +mkdir -p "${IMAGES_DIR}" +for f in "${IMAGE_CACHE_DIR}"/*.tar.zst; do + [ -f "$f" ] || continue + cp "$f" "${IMAGES_DIR}/" +done +echo " Images: $(ls "${IMAGES_DIR}"/*.tar.zst 2>/dev/null | wc -l | tr -d ' ') tarballs ($(du -sh "${IMAGES_DIR}" 2>/dev/null | cut -f1))" + +# Write sentinel file so gateway-init.sh and the host-side bootstrap +# know this rootfs has pre-initialized state. +echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "${ROOTFS_DIR}/opt/openshell/.initialized" + +docker rm "${INIT_CONTAINER_NAME}" 2>/dev/null || true +docker volume rm krun-k3s-init-data 2>/dev/null || true + +echo " Pre-initialization complete." + +# ── Verify ──────────────────────────────────────────────────────────── + +if [ ! -f "${ROOTFS_DIR}/usr/local/bin/k3s" ]; then + echo "ERROR: k3s binary not found in rootfs. Something went wrong." + exit 1 +fi + +if [ ! -f "${ROOTFS_DIR}/opt/openshell/.initialized" ]; then + echo "WARNING: Pre-initialization sentinel not found. Cold starts will be slow." 
+fi + +echo "" +echo "==> Rootfs ready at: ${ROOTFS_DIR}" +echo " Size: $(du -sh "${ROOTFS_DIR}" | cut -f1)" +echo " Pre-initialized: $(cat "${ROOTFS_DIR}/opt/openshell/.initialized" 2>/dev/null || echo 'no')" + +# Show k3s data size +K3S_DATA="${ROOTFS_DIR}/var/lib/rancher/k3s" +if [ -d "${K3S_DATA}" ]; then + echo " k3s state: $(du -sh "${K3S_DATA}" | cut -f1)" +fi + +# Show PKI +if [ -d "${ROOTFS_DIR}/opt/openshell/pki" ]; then + echo " PKI: baked ($(ls "${ROOTFS_DIR}/opt/openshell/pki/" | wc -l | tr -d ' ') files)" +fi + +echo "" +echo "Next steps:" +echo " 1. Run: openshell gateway" +echo " Expected startup time: ~3-5 seconds (pre-initialized)" diff --git a/crates/openshell-vm/scripts/check-vm-capabilities.sh b/crates/openshell-vm/scripts/check-vm-capabilities.sh new file mode 100755 index 00000000..2e758f5e --- /dev/null +++ b/crates/openshell-vm/scripts/check-vm-capabilities.sh @@ -0,0 +1,234 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# VM Kernel Capability Checker +# +# Runs inside the guest VM (or a container with the same rootfs) to +# verify that the kernel has the capabilities required for bridge CNI +# networking, kube-proxy, and Kubernetes pod networking. 
+# +# Usage: +# ./check-vm-capabilities.sh [--json] +# +# Exit codes: +# 0 = all required capabilities present +# 1 = one or more required capabilities missing +# 2 = script error + +set -euo pipefail + +JSON_OUTPUT=false +if [ "${1:-}" = "--json" ]; then + JSON_OUTPUT=true +fi + +PASS=0 +FAIL=0 +WARN=0 +RESULTS=() + +# ── Helpers ───────────────────────────────────────────────────────────── + +check() { + local name="$1" + local category="$2" + local required="$3" # "required" or "optional" + local description="$4" + shift 4 + local cmd=("$@") + + if eval "${cmd[@]}" >/dev/null 2>&1; then + RESULTS+=("{\"name\":\"$name\",\"category\":\"$category\",\"status\":\"pass\",\"required\":\"$required\",\"description\":\"$description\"}") + PASS=$((PASS + 1)) + if [ "$JSON_OUTPUT" = false ]; then + printf " ✓ %-40s %s\n" "$name" "$description" + fi + else + if [ "$required" = "required" ]; then + RESULTS+=("{\"name\":\"$name\",\"category\":\"$category\",\"status\":\"fail\",\"required\":\"$required\",\"description\":\"$description\"}") + FAIL=$((FAIL + 1)) + if [ "$JSON_OUTPUT" = false ]; then + printf " ✗ %-40s %s (REQUIRED)\n" "$name" "$description" + fi + else + RESULTS+=("{\"name\":\"$name\",\"category\":\"$category\",\"status\":\"warn\",\"required\":\"$required\",\"description\":\"$description\"}") + WARN=$((WARN + 1)) + if [ "$JSON_OUTPUT" = false ]; then + printf " ~ %-40s %s (optional)\n" "$name" "$description" + fi + fi + fi +} + +check_module() { + local module="$1" + # Check /proc/modules (loaded), /proc/config.gz (builtin), or /sys/module + if [ -d "/sys/module/$module" ]; then + return 0 + fi + if grep -q "^${module} " /proc/modules 2>/dev/null; then + return 0 + fi + # Check if compiled in via /proc/config.gz or /boot/config + local config_key + config_key="CONFIG_$(echo "$module" | tr '[:lower:]-' '[:upper:]_')" + if [ -f /proc/config.gz ]; then + if zcat /proc/config.gz 2>/dev/null | grep -q "^${config_key}=[ym]"; then + return 0 + fi + fi + return 1 +} + +# 
── Capability Checks ────────────────────────────────────────────────── + +if [ "$JSON_OUTPUT" = false ]; then + echo "VM Kernel Capability Check" + echo "==========================" + echo "" + echo "Kernel: $(uname -r)" + echo "" +fi + +# --- Network Namespaces --- +if [ "$JSON_OUTPUT" = false ]; then echo "[Network Namespaces]"; fi + +check "net_namespace" "netns" "required" \ + "network namespace support (CONFIG_NET_NS)" \ + "test -d /proc/self/ns && ls /proc/self/ns/net" + +check "veth_pair" "netns" "required" \ + "veth pair creation (CONFIG_VETH)" \ + "ip link add _chk0 type veth peer name _chk1 && ip link del _chk0" + +# --- Linux Bridge --- +if [ "$JSON_OUTPUT" = false ]; then echo ""; echo "[Linux Bridge]"; fi + +check "bridge_module" "bridge" "required" \ + "bridge device support (CONFIG_BRIDGE)" \ + "ip link add _chkbr0 type bridge && ip link del _chkbr0" + +check "bridge_nf_call" "bridge" "required" \ + "bridge netfilter (CONFIG_BRIDGE_NETFILTER)" \ + "check_module bridge && test -f /proc/sys/net/bridge/bridge-nf-call-iptables 2>/dev/null || check_module br_netfilter" + +# --- Netfilter / iptables --- +if [ "$JSON_OUTPUT" = false ]; then echo ""; echo "[Netfilter / iptables]"; fi + +check "netfilter" "netfilter" "required" \ + "netfilter framework (CONFIG_NETFILTER)" \ + "check_module nf_conntrack || check_module ip_tables || test -d /proc/sys/net/netfilter" + +check "nf_conntrack" "netfilter" "required" \ + "connection tracking (CONFIG_NF_CONNTRACK)" \ + "check_module nf_conntrack" + +check "nf_nat" "netfilter" "required" \ + "NAT support (CONFIG_NF_NAT)" \ + "check_module nf_nat" + +check "iptables_filter" "netfilter" "required" \ + "iptables filter (CONFIG_IP_NF_FILTER)" \ + "check_module ip_tables || iptables -L -n >/dev/null 2>&1" + +check "iptables_nat" "netfilter" "required" \ + "iptables NAT (CONFIG_IP_NF_NAT)" \ + "check_module iptable_nat || iptables -t nat -L -n >/dev/null 2>&1" + +check "iptables_mangle" "netfilter" "optional" \ + "iptables 
mangle (CONFIG_IP_NF_MANGLE)" \ + "check_module iptable_mangle || iptables -t mangle -L -n >/dev/null 2>&1" + +check "nf_conntrack_netlink" "netfilter" "optional" \ + "conntrack netlink (CONFIG_NF_CT_NETLINK)" \ + "check_module nf_conntrack_netlink" + +check "nftables" "netfilter" "optional" \ + "nftables (CONFIG_NF_TABLES)" \ + "check_module nf_tables || nft list ruleset >/dev/null 2>&1" + +# --- IP Forwarding / Routing --- +if [ "$JSON_OUTPUT" = false ]; then echo ""; echo "[IP Forwarding]"; fi + +check "ip_forward" "routing" "required" \ + "IP forwarding (sysctl)" \ + "test -f /proc/sys/net/ipv4/ip_forward" + +check "ip_route" "routing" "required" \ + "IP routing" \ + "ip route show >/dev/null 2>&1" + +# --- CNI Plugin Dependencies --- +if [ "$JSON_OUTPUT" = false ]; then echo ""; echo "[CNI Plugins]"; fi + +check "cni_bridge_bin" "cni" "required" \ + "bridge CNI plugin binary" \ + "test -x /opt/cni/bin/bridge || find /var/lib/rancher/k3s/data -name bridge -type f 2>/dev/null | head -1 | grep -q ." + +check "cni_host_local_bin" "cni" "required" \ + "host-local IPAM plugin binary" \ + "test -x /opt/cni/bin/host-local || find /var/lib/rancher/k3s/data -name host-local -type f 2>/dev/null | head -1 | grep -q ." + +check "cni_loopback_bin" "cni" "required" \ + "loopback CNI plugin binary" \ + "test -x /opt/cni/bin/loopback || find /var/lib/rancher/k3s/data -name loopback -type f 2>/dev/null | head -1 | grep -q ." + +check "cni_portmap_bin" "cni" "optional" \ + "portmap CNI plugin binary (needs iptables)" \ + "test -x /opt/cni/bin/portmap || find /var/lib/rancher/k3s/data -name portmap -type f 2>/dev/null | head -1 | grep -q ." 
+ +# --- Userspace Tools --- +if [ "$JSON_OUTPUT" = false ]; then echo ""; echo "[Userspace Tools]"; fi + +check "iptables_bin" "userspace" "required" \ + "iptables binary" \ + "command -v iptables" + +check "conntrack_bin" "userspace" "optional" \ + "conntrack binary" \ + "command -v conntrack" + +check "ip_bin" "userspace" "required" \ + "iproute2 (ip command)" \ + "command -v ip" + +# ── Summary ──────────────────────────────────────────────────────────── + +if [ "$JSON_OUTPUT" = true ]; then + echo "{" + echo " \"kernel\": \"$(uname -r)\"," + echo " \"timestamp\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"," + echo " \"pass\": $PASS," + echo " \"fail\": $FAIL," + echo " \"warn\": $WARN," + echo " \"results\": [" + local_first=true + for r in "${RESULTS[@]}"; do + if [ "$local_first" = true ]; then + local_first=false + else + echo "," + fi + printf " %s" "$r" + done + echo "" + echo " ]" + echo "}" +else + echo "" + echo "─────────────────────────────────────────" + printf "Results: %d passed, %d failed, %d warnings\n" "$PASS" "$FAIL" "$WARN" + + if [ "$FAIL" -gt 0 ]; then + echo "" + echo "FAIL: $FAIL required capabilities missing." + echo "The VM kernel needs to be rebuilt with the missing features." + echo "See: crates/openshell-vm/runtime/kernel/README.md" + exit 1 + else + echo "" + echo "PASS: All required capabilities present." + exit 0 + fi +fi diff --git a/crates/openshell-vm/scripts/gateway-init.sh b/crates/openshell-vm/scripts/gateway-init.sh new file mode 100755 index 00000000..ab1b8c08 --- /dev/null +++ b/crates/openshell-vm/scripts/gateway-init.sh @@ -0,0 +1,416 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# Init script for the gateway microVM. Runs as PID 1 inside the libkrun VM. +# +# Mounts essential virtual filesystems, configures networking, then execs +# k3s server. 
If the rootfs was pre-initialized by build-rootfs.sh (sentinel +# at /opt/openshell/.initialized), the full manifest setup is skipped and +# k3s resumes from its persisted state (~3-5s startup). + +set -e + +BOOT_START=$(date +%s%3N 2>/dev/null || date +%s) + +ts() { + local now + now=$(date +%s%3N 2>/dev/null || date +%s) + local elapsed=$(( (now - BOOT_START) )) + printf "[%d.%03ds] %s\n" $((elapsed / 1000)) $((elapsed % 1000)) "$*" +} + +PRE_INITIALIZED=false +if [ -f /opt/openshell/.initialized ]; then + PRE_INITIALIZED=true + ts "pre-initialized rootfs detected (fast path)" +fi + +# ── Mount essential filesystems (parallel) ────────────────────────────── +# These are independent; mount them concurrently. + +mount -t proc proc /proc 2>/dev/null & +mount -t sysfs sysfs /sys 2>/dev/null & +mount -t tmpfs tmpfs /tmp 2>/dev/null & +mount -t tmpfs tmpfs /run 2>/dev/null & +mount -t devtmpfs devtmpfs /dev 2>/dev/null & +wait + +# These depend on /dev being mounted. +mkdir -p /dev/pts /dev/shm +mount -t devpts devpts /dev/pts 2>/dev/null & +mount -t tmpfs tmpfs /dev/shm 2>/dev/null & + +# cgroup2 (unified hierarchy) — required by k3s/containerd. +mkdir -p /sys/fs/cgroup +mount -t cgroup2 cgroup2 /sys/fs/cgroup 2>/dev/null & +wait + +ts "filesystems mounted" + +# ── Networking ────────────────────────────────────────────────────────── + +hostname gateway 2>/dev/null || true + +# Ensure loopback is up (k3s binds to 127.0.0.1). +ip link set lo up 2>/dev/null || true + +# Detect whether we have a real network interface (gvproxy) or need a +# dummy interface (TSI / no networking). +if ip link show eth0 >/dev/null 2>&1; then + # gvproxy networking — bring up eth0 and get an IP via DHCP. + # gvproxy has a built-in DHCP server that assigns 192.168.127.2/24 + # with gateway 192.168.127.1 and configures ARP properly. + ts "detected eth0 (gvproxy networking)" + ip link set eth0 up 2>/dev/null || true + + # Use DHCP to get IP and configure routes. 
gvproxy's DHCP server + # handles ARP resolution which static config does not. + if command -v udhcpc >/dev/null 2>&1; then + # udhcpc needs a script to apply the lease. Use the busybox + # default script if available, otherwise write a minimal one. + UDHCPC_SCRIPT="/usr/share/udhcpc/default.script" + if [ ! -f "$UDHCPC_SCRIPT" ]; then + mkdir -p /usr/share/udhcpc + cat > "$UDHCPC_SCRIPT" << 'DHCP_SCRIPT' +#!/bin/sh +case "$1" in + bound|renew) + ip addr flush dev "$interface" + ip addr add "$ip/$mask" dev "$interface" + if [ -n "$router" ]; then + ip route add default via $router dev "$interface" + fi + if [ -n "$dns" ]; then + echo -n > /etc/resolv.conf + for d in $dns; do + echo "nameserver $d" >> /etc/resolv.conf + done + fi + ;; +esac +DHCP_SCRIPT + chmod +x "$UDHCPC_SCRIPT" + fi + # -f: stay in foreground, -q: quit after obtaining lease, + # -n: exit if no lease, -T 1: 1s between retries, -t 3: 3 retries + # -A 1: wait 1s before first retry (aggressive for local gvproxy) + udhcpc -i eth0 -f -q -n -T 1 -t 3 -A 1 -s "$UDHCPC_SCRIPT" 2>&1 || true + else + # Fallback to static config if no DHCP client available. + ts "no DHCP client, using static config" + ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true + ip route add default via 192.168.127.1 2>/dev/null || true + fi + + # Ensure DNS is configured. DHCP should have set /etc/resolv.conf, + # but if it didn't (or static fallback was used), provide a default. + if [ ! -s /etc/resolv.conf ]; then + echo "nameserver 8.8.8.8" > /etc/resolv.conf + echo "nameserver 8.8.4.4" >> /etc/resolv.conf + fi + + # Read back the IP we got (from DHCP or static). + NODE_IP=$(ip -4 addr show eth0 | grep -oP 'inet \K[^/]+' || echo "192.168.127.2") + ts "eth0 IP: $NODE_IP" +else + # TSI or no networking — create a dummy interface for k3s. 
+ ts "no eth0 found, using dummy interface (TSI mode)" + ip link add dummy0 type dummy 2>/dev/null || true + ip addr add 10.0.2.15/24 dev dummy0 2>/dev/null || true + ip link set dummy0 up 2>/dev/null || true + ip route add default dev dummy0 2>/dev/null || true + + NODE_IP="10.0.2.15" +fi + +# ── k3s data directories ─────────────────────────────────────────────── + +mkdir -p /var/lib/rancher/k3s +mkdir -p /etc/rancher/k3s + +# Clean stale runtime artifacts from previous boots (virtio-fs persists +# the rootfs between VM restarts). +rm -rf /var/lib/rancher/k3s/server/tls/temporary-certs 2>/dev/null || true +rm -f /var/lib/rancher/k3s/server/kine.sock 2>/dev/null || true +# Clean stale node password so k3s doesn't fail validation on reboot. +# Each k3s start generates a new random node password; the old hash in +# the database will not match. Removing the local password file forces +# k3s to re-register with a fresh one. +rm -f /var/lib/rancher/k3s/server/cred/node-passwd 2>/dev/null || true +# Also clean any stale pid files and unix sockets +find /var/lib/rancher/k3s -name '*.sock' -delete 2>/dev/null || true +find /run -name '*.sock' -delete 2>/dev/null || true + +# Clean stale containerd runtime state from previous boots. +# +# The rootfs persists across VM restarts via virtio-fs. We PRESERVE the +# bolt metadata database (meta.db) because it contains snapshot and image +# metadata that containerd needs to avoid re-extracting all image layers +# on every boot. The native snapshotter on virtio-fs takes ~2 min to +# extract the openshell/gateway image; keeping meta.db lets containerd +# know the snapshots already exist. +# +# The kine (SQLite) DB cleanup in build-rootfs.sh already removes stale +# pod/sandbox records from k3s etcd, preventing kubelet from reconciling +# against stale sandboxes. +CONTAINERD_DIR="/var/lib/rancher/k3s/agent/containerd" +if [ -d "$CONTAINERD_DIR" ]; then + # Remove runtime task state (stale shim PIDs, sockets from dead processes). 
+ rm -rf "${CONTAINERD_DIR}/io.containerd.runtime.v2.task" 2>/dev/null || true + # Remove sandbox controller shim state. Stale sandbox records cause + # containerd to reuse network namespaces from previous boots, which + # already have routes configured. The CNI bridge plugin then fails + # with "file exists" when adding the default route on retry. + rm -rf "${CONTAINERD_DIR}/io.containerd.sandbox.controller.v1.shim" 2>/dev/null || true + # Clean stale ingest temp files from the content store. + rm -rf "${CONTAINERD_DIR}/io.containerd.content.v1.content/ingest" 2>/dev/null || true + mkdir -p "${CONTAINERD_DIR}/io.containerd.content.v1.content/ingest" + # Preserve meta.db — snapshot/image metadata avoids re-extraction. + ts "cleaned containerd runtime state (preserved meta.db + content store + snapshotter)" +fi +rm -rf /run/k3s 2>/dev/null || true + +ts "stale artifacts cleaned" + +# ── Clean stale CNI / pod networking state ────────────────────────────── +# The rootfs persists across VM restarts via virtio-fs. Previous pod +# sandboxes leave behind veth pairs, bridge routes, host-local IPAM +# allocations, and network namespaces. If not cleaned, the bridge CNI +# plugin fails with: +# "failed to add route ... file exists" +# because the default route via cni0 already exists from the prior boot, +# or a stale network namespace already has the route configured. + +# Tear down the CNI bridge and its associated routes. +if ip link show cni0 >/dev/null 2>&1; then + ip link set cni0 down 2>/dev/null || true + ip link delete cni0 2>/dev/null || true + ts "deleted stale cni0 bridge" +fi + +# Remove any leftover veth pairs (CNI bridge plugin creates vethXXXX). +for veth in $(ip -o link show type veth 2>/dev/null | awk -F': ' '{print $2}' | cut -d'@' -f1); do + ip link delete "$veth" 2>/dev/null || true +done + +# Flush host-local IPAM allocations so IPs can be reassigned cleanly. 
+rm -rf /var/lib/cni/networks 2>/dev/null || true +rm -rf /var/lib/cni/results 2>/dev/null || true + +# Flush any stale CNI-added routes for the pod CIDR. These can conflict +# with routes the bridge plugin tries to add on the next boot. +ip route flush 10.42.0.0/24 2>/dev/null || true + +# Clean up stale pod network namespaces from previous boots. Containerd +# creates named netns under /var/run/netns/ for each pod sandbox. If +# these persist across VM restarts, the CNI bridge plugin fails when +# adding routes because the stale netns already has the default route +# configured from the prior boot. Removing all named network namespaces +# forces containerd to create fresh ones. +if [ -d /var/run/netns ]; then + for ns in $(ip netns list 2>/dev/null | awk '{print $1}'); do + ip netns delete "$ns" 2>/dev/null || true + done +fi +# Also clean the netns bind-mount directory used by containerd/CRI. +# Containerd may use /run/netns/ or /var/run/netns/ (same via tmpfs). +rm -rf /run/netns/* 2>/dev/null || true +rm -rf /var/run/netns/* 2>/dev/null || true + +ts "stale CNI networking state cleaned" + +# ── Network profile detection ─────────────────────────────────────────── +# Detect early so manifest patching and k3s flags both use the same value. +# +# "bridge" is the only supported profile. It requires a custom libkrunfw +# with CONFIG_BRIDGE, CONFIG_NETFILTER, CONFIG_NF_NAT built in. If the +# kernel lacks these capabilities the VM cannot run pod networking and we +# fail fast with an actionable error. + +NET_PROFILE="bridge" + +ts "network profile: ${NET_PROFILE}" + +# Validate that the kernel actually has the required capabilities. +_caps_ok=true +if ! ip link add _cap_br0 type bridge 2>/dev/null; then + echo "ERROR: kernel lacks bridge support (CONFIG_BRIDGE). Use a custom libkrunfw." >&2 + _caps_ok=false +else + ip link del _cap_br0 2>/dev/null || true +fi +if [ ! -d /proc/sys/net/netfilter ] && [ ! 
-f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then + echo "ERROR: kernel lacks netfilter support (CONFIG_NETFILTER). Use a custom libkrunfw." >&2 + _caps_ok=false +fi +if [ "$_caps_ok" = false ]; then + echo "FATAL: required kernel capabilities missing — cannot configure pod networking." >&2 + echo "See: architecture/custom-vm-runtime.md for build instructions." >&2 + exit 1 +fi + +# ── Deploy bundled manifests (cold boot only) ─────────────────────────── +# On pre-initialized rootfs, manifests are already in place from the +# build-time k3s boot. Skip this entirely for fast startup. + +K3S_MANIFESTS="/var/lib/rancher/k3s/server/manifests" +BUNDLED_MANIFESTS="/opt/openshell/manifests" + +if [ "$PRE_INITIALIZED" = false ]; then + + mkdir -p "$K3S_MANIFESTS" + + if [ -d "$BUNDLED_MANIFESTS" ]; then + ts "deploying bundled manifests (cold boot)..." + for manifest in "$BUNDLED_MANIFESTS"/*.yaml; do + [ ! -f "$manifest" ] && continue + cp "$manifest" "$K3S_MANIFESTS/" + done + + # Remove stale OpenShell-managed manifests from previous boots. + for existing in "$K3S_MANIFESTS"/openshell-*.yaml \ + "$K3S_MANIFESTS"/agent-*.yaml; do + [ ! -f "$existing" ] && continue + basename=$(basename "$existing") + if [ ! -f "$BUNDLED_MANIFESTS/$basename" ]; then + rm -f "$existing" + fi + done + fi + + ts "manifests deployed" +else + ts "skipping manifest deploy (pre-initialized)" +fi + +# Patch manifests for VM deployment constraints. +HELMCHART="$K3S_MANIFESTS/openshell-helmchart.yaml" +if [ -f "$HELMCHART" ]; then + # Use pre-loaded images — don't pull from registry. + sed -i 's|pullPolicy: Always|pullPolicy: IfNotPresent|' "$HELMCHART" + + # Bridge CNI: pods use normal pod networking, not hostNetwork. + # The pre-init in build-rootfs.sh replaces __HOST_NETWORK__ with "true" + # for Docker container networking. At VM boot with bridge CNI we need + # to override it back to "false" so pods use the CNI bridge network. 
+ sed -i 's|hostNetwork: true|hostNetwork: false|g' "$HELMCHART" + sed -i 's|__HOST_NETWORK__|false|g' "$HELMCHART" + sed -i 's|__AUTOMOUNT_SA_TOKEN__|true|g' "$HELMCHART" + + sed -i 's|__KUBECONFIG_HOST_PATH__|"/etc/rancher/k3s"|g' "$HELMCHART" + sed -i 's|__PERSISTENCE_ENABLED__|false|g' "$HELMCHART" + sed -i 's|__DB_URL__|"sqlite:/tmp/openshell.db"|g' "$HELMCHART" + # Clear SSH gateway placeholders (default 127.0.0.1 is correct for local VM). + sed -i 's|sshGatewayHost: __SSH_GATEWAY_HOST__|sshGatewayHost: ""|g' "$HELMCHART" + sed -i 's|sshGatewayPort: __SSH_GATEWAY_PORT__|sshGatewayPort: 0|g' "$HELMCHART" +fi + +AGENT_MANIFEST="$K3S_MANIFESTS/agent-sandbox.yaml" +if [ -f "$AGENT_MANIFEST" ]; then + # Bridge CNI: agent-sandbox uses normal pod networking. + # kube-proxy is enabled so kubernetes.default.svc is reachable + # via ClusterIP — no need for KUBERNETES_SERVICE_HOST override. + sed -i '/hostNetwork: true/d' "$AGENT_MANIFEST" + sed -i '/dnsPolicy: ClusterFirstWithHostNet/d' "$AGENT_MANIFEST" + ts "agent-sandbox: using pod networking (bridge profile)" +fi + +# ── CNI configuration (bridge) ────────────────────────────────────────── +# Uses the bridge CNI plugin with iptables masquerade. Requires +# CONFIG_BRIDGE, CONFIG_NETFILTER, CONFIG_NF_NAT in the VM kernel +# (validated above at boot). kube-proxy uses nftables mode for service +# VIP routing. + +CNI_CONF_DIR="/etc/cni/net.d" +CNI_BIN_DIR="/opt/cni/bin" +mkdir -p "$CNI_CONF_DIR" "$CNI_BIN_DIR" + +# Enable IP forwarding (required for masquerade). +echo 1 > /proc/sys/net/ipv4/ip_forward 2>/dev/null || true + +# Enable bridge netfilter call (required for CNI bridge masquerade to +# see bridged traffic). 
+if [ -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then + echo 1 > /proc/sys/net/bridge/bridge-nf-call-iptables 2>/dev/null || true +fi + +cat > "$CNI_CONF_DIR/10-bridge.conflist" << 'CNICFG' +{ + "cniVersion": "1.0.0", + "name": "bridge", + "plugins": [ + { + "type": "bridge", + "bridge": "cni0", + "isGateway": true, + "isDefaultGateway": true, + "ipMasq": true, + "hairpinMode": true, + "ipam": { + "type": "host-local", + "ranges": [[{ "subnet": "10.42.0.0/24" }]] + } + }, + { + "type": "portmap", + "capabilities": { "portMappings": true }, + "snat": true + }, + { + "type": "loopback" + } + ] +} +CNICFG + +# Remove any stale legacy ptp config. +rm -f "$CNI_CONF_DIR/10-ptp.conflist" 2>/dev/null || true + +ts "bridge CNI configured (cni0 + iptables masquerade)" + +# Symlink k3s-bundled CNI binaries to the default containerd bin path. +# k3s extracts its tools to /var/lib/rancher/k3s/data//bin/. +# Use -e (not -f) because k3s ships these as symlinks to a `cni` multicall +# binary. +K3S_DATA_BIN=$(find /var/lib/rancher/k3s/data -maxdepth 2 -name bin -type d 2>/dev/null | head -1) +if [ -n "$K3S_DATA_BIN" ]; then + for plugin in bridge host-local loopback bandwidth portmap; do + [ -e "$K3S_DATA_BIN/$plugin" ] && ln -sf "$K3S_DATA_BIN/$plugin" "$CNI_BIN_DIR/$plugin" + done + ts "CNI binaries linked from $K3S_DATA_BIN" +else + ts "WARNING: k3s data bin dir not found, CNI binaries may be missing" +fi + +# Also clean up any flannel config from the k3s-specific CNI directory +# (pre-baked state from the Docker build used host-gw flannel). +rm -f "/var/lib/rancher/k3s/agent/etc/cni/net.d/10-flannel.conflist" 2>/dev/null || true + +# ── Start k3s ────────────────────────────────────────────────────────── +# Flags tuned for fast single-node startup. Bridge CNI handles pod +# networking; kube-proxy runs in nftables mode for service VIP / ClusterIP +# support. 
+# +# nftables mode: k3s bundles its own iptables binaries whose MARK target +# doesn't negotiate xt_MARK revision 2 correctly with the libkrun kernel, +# causing --xor-mark failures. nftables mode uses the kernel's nf_tables +# subsystem directly and sidesteps the issue entirely. The kernel is +# configured with CONFIG_NF_TABLES=y and related modules. + +K3S_ARGS=( + --disable=traefik,servicelb,metrics-server,coredns + --disable-network-policy + --write-kubeconfig-mode=644 + --node-ip="$NODE_IP" + --kube-apiserver-arg=bind-address=0.0.0.0 + --resolv-conf=/etc/resolv.conf + --tls-san=localhost,127.0.0.1,10.0.2.15,192.168.127.2 + --flannel-backend=none + --snapshotter=native + --kube-proxy-arg=proxy-mode=nftables +) + +ts "starting k3s server (bridge CNI + nftables kube-proxy)" + +exec /usr/local/bin/k3s server "${K3S_ARGS[@]}" diff --git a/crates/openshell-vm/scripts/hello-server.py b/crates/openshell-vm/scripts/hello-server.py new file mode 100644 index 00000000..f02d7d72 --- /dev/null +++ b/crates/openshell-vm/scripts/hello-server.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Minimal HTTP server that responds with 'Hello from libkrun VM!' 
on port 8080.""" + +import json +import os +import platform +from http.server import HTTPServer, BaseHTTPRequestHandler + + +class HelloHandler(BaseHTTPRequestHandler): + def do_GET(self): + body = json.dumps( + { + "message": "Hello from libkrun VM!", + "hostname": platform.node(), + "platform": platform.platform(), + "arch": platform.machine(), + "pid": os.getpid(), + "path": self.path, + }, + indent=2, + ) + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body.encode()) + + def log_message(self, format, *args): + print(f"[hello-server] {args[0]}") + + +def main(): + host = "0.0.0.0" + port = 8080 + server = HTTPServer((host, port), HelloHandler) + print(f"Hello server listening on {host}:{port}") + try: + server.serve_forever() + except KeyboardInterrupt: + print("\nShutting down.") + server.server_close() + + +if __name__ == "__main__": + main() diff --git a/crates/openshell-vm/scripts/verify-vm.sh b/crates/openshell-vm/scripts/verify-vm.sh new file mode 100755 index 00000000..a314301f --- /dev/null +++ b/crates/openshell-vm/scripts/verify-vm.sh @@ -0,0 +1,217 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# VM Verification Matrix +# +# Runs a comprehensive set of checks against a running gateway VM to +# validate networking, service reachability, and overall health. +# +# This script is designed to run both locally and in CI as a pass/fail +# gate for merge readiness. 
+# +# Usage: +# ./verify-vm.sh [--kubeconfig PATH] [--timeout SECS] +# +# Prerequisites: +# - A running gateway VM (`mise run vm`) +# - kubectl available in PATH +# +# Exit codes: +# 0 = all checks passed +# 1 = one or more checks failed +# 2 = script error / prerequisites not met + +set -euo pipefail + +KUBECONFIG="${KUBECONFIG:-${HOME}/.kube/gateway.yaml}" +TIMEOUT="${TIMEOUT:-120}" +PASS=0 +FAIL=0 +WARN=0 + +# Parse arguments +while [[ $# -gt 0 ]]; do + case "$1" in + --kubeconfig) KUBECONFIG="$2"; shift 2 ;; + --timeout) TIMEOUT="$2"; shift 2 ;; + --help|-h) + echo "Usage: $0 [--kubeconfig PATH] [--timeout SECS]" + exit 0 + ;; + *) echo "Unknown argument: $1" >&2; exit 2 ;; + esac +done + +export KUBECONFIG + +# ── Helpers ───────────────────────────────────────────────────────────── + +check() { + local name="$1" + local category="$2" + shift 2 + local cmd=("$@") + + printf " %-50s " "$name" + if output=$(eval "${cmd[@]}" 2>&1); then + echo "PASS" + PASS=$((PASS + 1)) + else + echo "FAIL" + if [ -n "$output" ]; then + echo " $output" | head -3 + fi + FAIL=$((FAIL + 1)) + fi +} + +wait_for_api() { + local deadline=$((SECONDS + TIMEOUT)) + while [ $SECONDS -lt $deadline ]; do + if kubectl get nodes -o name >/dev/null 2>&1; then + return 0 + fi + sleep 2 + done + return 1 +} + +echo "VM Verification Matrix" +echo "======================" +echo "" +echo "Kubeconfig: ${KUBECONFIG}" +echo "Timeout: ${TIMEOUT}s" +echo "" + +# ── Prerequisites ────────────────────────────────────────────────────── + +if [ ! -f "$KUBECONFIG" ]; then + echo "ERROR: Kubeconfig not found at ${KUBECONFIG}" + echo "Is the gateway VM running? Start with: mise run vm" + exit 2 +fi + +if ! command -v kubectl >/dev/null 2>&1; then + echo "ERROR: kubectl not found in PATH" + exit 2 +fi + +echo "[Waiting for API server...]" +if ! 
wait_for_api; then + echo "ERROR: API server not reachable after ${TIMEOUT}s" + exit 2 +fi +echo "" + +# ── Node Health ──────────────────────────────────────────────────────── + +echo "[Node Health]" + +check "node exists" "node" \ + "kubectl get nodes -o name | grep -q 'node/'" + +check "node is Ready" "node" \ + "kubectl get nodes -o jsonpath='{.items[0].status.conditions[?(@.type==\"Ready\")].status}' | grep -q True" + +echo "" + +# ── System Pods ──────────────────────────────────────────────────────── + +echo "[System Pods]" + +check "kube-system pods running" "pods" \ + "kubectl -n kube-system get pods -o jsonpath='{.items[*].status.phase}' | grep -qv Pending" + +check "no FailedCreatePodSandBox events" "pods" \ + "! kubectl get events -A --field-selector reason=FailedCreatePodSandBox -o name 2>/dev/null | grep -q ." + +check "no CrashLoopBackOff pods" "pods" \ + "! kubectl get pods -A -o jsonpath='{.items[*].status.containerStatuses[*].state.waiting.reason}' 2>/dev/null | grep -q CrashLoopBackOff" + +echo "" + +# ── OpenShell Namespace ──────────────────────────────────────────────── + +echo "[OpenShell Namespace]" + +check "openshell namespace exists" "openshell" \ + "kubectl get namespace openshell -o name" + +check "openshell-0 pod exists" "openshell" \ + "kubectl -n openshell get pod openshell-0 -o name" + +check "openshell-0 pod is Ready" "openshell" \ + "kubectl -n openshell get pod openshell-0 -o jsonpath='{.status.conditions[?(@.type==\"Ready\")].status}' | grep -q True" + +echo "" + +# ── Networking ───────────────────────────────────────────────────────── + +echo "[Networking]" + +check "services exist" "networking" \ + "kubectl get svc -A -o name | grep -q ." + +check "kubernetes service has ClusterIP" "networking" \ + "kubectl get svc kubernetes -o jsonpath='{.spec.clusterIP}' | grep -q ." 
+ +# Check if bridge CNI is in use (cni0 bridge exists) +if kubectl exec -n openshell openshell-0 -- ip link show cni0 >/dev/null 2>&1; then + echo " CNI profile detected: bridge" +else + echo " WARNING: cni0 bridge not detected — bridge CNI may not be running yet" +fi + +check "cni0 bridge exists in pod" "networking" \ + "kubectl exec -n openshell openshell-0 -- ip link show cni0 2>/dev/null" + +# With bridge CNI, kubernetes.default.svc should be reachable. +check "kubernetes.default.svc reachable from pod" "networking" \ + "kubectl exec -n openshell openshell-0 -- wget -q -O /dev/null --timeout=5 https://kubernetes.default.svc/healthz 2>/dev/null || kubectl exec -n openshell openshell-0 -- curl -sk --connect-timeout 5 https://kubernetes.default.svc/healthz 2>/dev/null" + +check "no bridge creation errors in events" "networking" \ + "! kubectl get events -A 2>/dev/null | grep -qi 'bridge.*fail\\|cni0.*error\\|FailedCreatePodSandBox.*bridge'" + +echo "" + +# ── Host Port Connectivity ───────────────────────────────────────────── + +echo "[Host Connectivity]" + +check "port 30051 (gateway service) reachable" "host" \ + "timeout 5 bash -c 'echo > /dev/tcp/127.0.0.1/30051' 2>/dev/null || nc -z -w5 127.0.0.1 30051 2>/dev/null" + +echo "" + +# ── Event / Log Checks ──────────────────────────────────────────────── + +echo "[Events / Logs]" + +check "no repeated bind/listen conflicts" "events" \ + "! kubectl get events -A 2>/dev/null | grep -ci 'bind.*address already in use\\|listen.*address already in use' | grep -qv '^0$'" + +check "no hostNetwork fallback warnings" "events" \ + "! 
/// Runtime provenance information extracted from the bundle.
#[derive(Debug, Clone)]
pub struct RuntimeProvenance {
    /// Path to the libkrun library that was loaded.
    pub libkrun_path: PathBuf,
    /// Paths to all libkrunfw libraries that were preloaded (sorted).
    pub libkrunfw_paths: Vec<PathBuf>,
    /// Hex-encoded SHA-256 of the first libkrunfw artifact (if computable).
    pub libkrunfw_sha256: Option<String>,
    /// Raw contents of `provenance.json` if present in the runtime bundle.
    pub provenance_json: Option<String>,
    /// Whether this is a custom (OpenShell-built) runtime; inferred from
    /// the presence of `provenance.json` in the bundle.
    pub is_custom: bool,
}
KrunSetWorkdir, + pub krun_set_exec: KrunSetExec, + pub krun_set_port_map: KrunSetPortMap, + pub krun_set_console_output: KrunSetConsoleOutput, + pub krun_add_vsock_port2: KrunAddVsockPort2, + pub krun_start_enter: KrunStartEnter, + pub krun_disable_implicit_vsock: KrunDisableImplicitVsock, + pub krun_add_vsock: KrunAddVsock, + pub krun_add_net_unixgram: KrunAddNetUnixgram, +} + +static LIBKRUN: OnceLock = OnceLock::new(); +static RUNTIME_PROVENANCE: OnceLock = OnceLock::new(); + +pub fn libkrun() -> Result<&'static LibKrun, VmError> { + if let Some(lib) = LIBKRUN.get() { + return Ok(lib); + } + + let loaded = LibKrun::load()?; + let _ = LIBKRUN.set(loaded); + Ok(LIBKRUN.get().expect("libkrun should be initialized")) +} + +/// Return the provenance information for the loaded runtime. +/// +/// Only available after [`libkrun()`] has been called successfully. +pub fn runtime_provenance() -> Option<&'static RuntimeProvenance> { + RUNTIME_PROVENANCE.get() +} + +impl LibKrun { + fn load() -> Result { + let path = runtime_libkrun_path()?; + let runtime_dir = path.parent().ok_or_else(|| { + VmError::HostSetup(format!("libkrun has no parent dir: {}", path.display())) + })?; + let krunfw_paths = preload_runtime_support_libraries(runtime_dir)?; + + // Build and store provenance information. + let provenance_json_path = runtime_dir.join("provenance.json"); + let provenance_json = fs::read_to_string(&provenance_json_path).ok(); + let is_custom = provenance_json.is_some(); + + let libkrunfw_sha256 = krunfw_paths.first().and_then(|p| compute_sha256(p).ok()); + + let provenance = RuntimeProvenance { + libkrun_path: path.clone(), + libkrunfw_paths: krunfw_paths, + libkrunfw_sha256, + provenance_json, + is_custom, + }; + let _ = RUNTIME_PROVENANCE.set(provenance); + + let library = Box::leak(Box::new(unsafe { + Library::new(&path).map_err(|e| { + VmError::HostSetup(format!("load libkrun from {}: {e}", path.display())) + })? 
+ })); + + Ok(Self { + krun_init_log: load_symbol(library, b"krun_init_log\0", &path)?, + krun_create_ctx: load_symbol(library, b"krun_create_ctx\0", &path)?, + krun_free_ctx: load_symbol(library, b"krun_free_ctx\0", &path)?, + krun_set_vm_config: load_symbol(library, b"krun_set_vm_config\0", &path)?, + krun_set_root: load_symbol(library, b"krun_set_root\0", &path)?, + krun_set_workdir: load_symbol(library, b"krun_set_workdir\0", &path)?, + krun_set_exec: load_symbol(library, b"krun_set_exec\0", &path)?, + krun_set_port_map: load_symbol(library, b"krun_set_port_map\0", &path)?, + krun_set_console_output: load_symbol(library, b"krun_set_console_output\0", &path)?, + krun_add_vsock_port2: load_symbol(library, b"krun_add_vsock_port2\0", &path)?, + krun_start_enter: load_symbol(library, b"krun_start_enter\0", &path)?, + krun_disable_implicit_vsock: load_symbol( + library, + b"krun_disable_implicit_vsock\0", + &path, + )?, + krun_add_vsock: load_symbol(library, b"krun_add_vsock\0", &path)?, + krun_add_net_unixgram: load_symbol(library, b"krun_add_net_unixgram\0", &path)?, + }) + } +} + +fn runtime_libkrun_path() -> Result { + Ok(crate::configured_runtime_dir()?.join(required_runtime_lib_name())) +} + +fn preload_runtime_support_libraries(runtime_dir: &Path) -> Result, VmError> { + let entries = fs::read_dir(runtime_dir) + .map_err(|e| VmError::HostSetup(format!("read {}: {e}", runtime_dir.display())))?; + + let mut support_libs: Vec = entries + .filter_map(Result::ok) + .map(|entry| entry.path()) + .filter(|path| { + path.file_name() + .and_then(|name| name.to_str()) + .map(|name| { + #[cfg(target_os = "macos")] + { + name.starts_with("libkrunfw") && name.ends_with(".dylib") + } + #[cfg(not(target_os = "macos"))] + { + name.starts_with("libkrunfw") && name.contains(".so") + } + }) + .unwrap_or(false) + }) + .collect(); + + support_libs.sort(); + + for path in &support_libs { + let path_cstr = std::ffi::CString::new(path.to_string_lossy().as_bytes()).map_err(|e| { + 
/// Compute the SHA-256 hash of a file, returning a lowercase hex string.
///
/// Reads the file in 8 KiB chunks; hashing is delegated to an external
/// host tool (see [`hasher_finalize`]) to avoid a crypto crate
/// dependency.
fn compute_sha256(path: &Path) -> Result<String, std::io::Error> {
    use std::io::Read;
    let mut file = fs::File::open(path)?;
    let mut hasher = sha2_hasher();
    let mut buf = [0u8; 8192];
    loop {
        let n = file.read(&mut buf)?;
        if n == 0 {
            break;
        }
        hasher_update(&mut hasher, &buf[..n]);
    }
    Ok(hasher_finalize(hasher))
}

// Minimal SHA-256 front-end: buffers the input and shells out to a host
// hashing tool, avoiding a crate dependency for this single use case.
fn sha2_hasher() -> Sha256State {
    Sha256State {
        data: Vec::with_capacity(1024 * 1024),
    }
}

/// Accumulated input bytes pending hashing (the whole input is buffered
/// because the hash is computed by an external process at finalize time).
struct Sha256State {
    data: Vec<u8>,
}

/// Append `bytes` to the pending input.
fn hasher_update(state: &mut Sha256State, bytes: &[u8]) {
    state.data.extend_from_slice(bytes);
}

/// Hash the buffered input via an external process and return the hex
/// digest, or the literal string `"unknown"` if no hashing tool works.
///
/// Tries `shasum -a 256` first (always present on macOS, needs perl),
/// then falls back to `sha256sum` (coreutils) so minimal Linux hosts
/// without perl still produce a digest.
fn hasher_finalize(state: Sha256State) -> String {
    for (cmd, args) in [("shasum", &["-a", "256"][..]), ("sha256sum", &[][..])] {
        if let Some(digest) = pipe_digest(cmd, args, &state.data) {
            return digest;
        }
    }
    "unknown".to_string()
}

/// Pipe `data` through `cmd args...` and return the first whitespace-
/// separated token of stdout (the hex digest for sha tools), or `None`
/// on any spawn/IO/exit failure.
fn pipe_digest(cmd: &str, args: &[&str], data: &[u8]) -> Option<String> {
    use std::io::Write;
    use std::process::{Command, Stdio};

    let mut child = Command::new(cmd)
        .args(args)
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::null())
        .spawn()
        .ok()?;

    if let Some(mut stdin) = child.stdin.take() {
        // Ignore write errors (e.g. EPIPE if the child died); the exit
        // status check below catches real failures.
        let _ = stdin.write_all(data);
    }

    let output = child.wait_with_output().ok()?;
    if !output.status.success() {
        return None;
    }
    let stdout = String::from_utf8_lossy(&output.stdout);
    stdout.split_whitespace().next().map(str::to_string)
}
+ +#![allow(unsafe_code)] + +mod ffi; + +use std::ffi::CString; +use std::path::{Path, PathBuf}; +use std::ptr; +use std::time::Instant; + +// ── Error type ───────────────────────────────────────────────────────── + +/// Errors that can occur when configuring or launching a microVM. +#[derive(Debug, thiserror::Error, miette::Diagnostic)] +pub enum VmError { + /// A libkrun FFI call returned a negative error code. + #[error("{func} failed with error code {code}")] + Krun { func: &'static str, code: i32 }, + + /// The rootfs directory does not exist. + #[error( + "rootfs directory not found: {path}\nRun: ./crates/openshell-vm/scripts/build-rootfs.sh" + )] + RootfsNotFound { path: String }, + + /// A path contained invalid UTF-8. + #[error("path is not valid UTF-8: {0}")] + InvalidPath(String), + + /// `CString::new` failed (embedded NUL byte). + #[error("invalid C string: {0}")] + CString(#[from] std::ffi::NulError), + + /// A required host binary was not found. + #[error("required binary not found: {path}\n{hint}")] + BinaryNotFound { path: String, hint: String }, + + /// Host-side VM setup failed before boot. + #[error("host setup failed: {0}")] + HostSetup(String), + + /// `fork()` failed. + #[error("fork() failed: {0}")] + Fork(String), + + /// Post-boot bootstrap failed. + #[error("bootstrap failed: {0}")] + Bootstrap(String), +} + +/// Check a libkrun return code; negative values are errors. +fn check(ret: i32, func: &'static str) -> Result<(), VmError> { + if ret < 0 { + Err(VmError::Krun { func, code: ret }) + } else { + Ok(()) + } +} + +// ── Configuration ────────────────────────────────────────────────────── + +/// Networking backend for the microVM. +#[derive(Debug, Clone)] +pub enum NetBackend { + /// TSI (Transparent Socket Impersonation) — default libkrun networking. + /// Simple but intercepts guest loopback connections, breaking k3s. + Tsi, + + /// No networking — disable vsock/TSI entirely. For debugging only. 
+ None, + + /// gvproxy (vfkit mode) — real `eth0` interface via virtio-net. + /// Requires gvproxy binary on the host. Port forwarding is done + /// through gvproxy's HTTP API. + Gvproxy { + /// Path to the gvproxy binary. + binary: PathBuf, + }, +} + +/// Host Unix socket bridged into the guest as a vsock port. +#[derive(Debug, Clone)] +pub struct VsockPort { + pub port: u32, + pub socket_path: PathBuf, + pub listen: bool, +} + +/// Configuration for a libkrun microVM. +pub struct VmConfig { + /// Path to the extracted rootfs directory (aarch64 Linux). + pub rootfs: PathBuf, + + /// Number of virtual CPUs. + pub vcpus: u8, + + /// RAM in MiB. + pub mem_mib: u32, + + /// Executable path inside the VM. + pub exec_path: String, + + /// Arguments to the executable (argv, excluding argv\[0\]). + pub args: Vec, + + /// Environment variables in `KEY=VALUE` form. + /// If empty, a minimal default set is used. + pub env: Vec, + + /// Working directory inside the VM. + pub workdir: String, + + /// TCP port mappings in `"host_port:guest_port"` form. + /// Only used with TSI networking. + pub port_map: Vec, + + /// Optional host Unix sockets exposed to the guest over vsock. + pub vsock_ports: Vec, + + /// libkrun log level (0=Off .. 5=Trace). + pub log_level: u32, + + /// Optional file path for VM console output. If `None`, console output + /// goes to the parent directory of the rootfs as `console.log`. + pub console_output: Option, + + /// Networking backend. + pub net: NetBackend, +} + +impl VmConfig { + /// Default gateway configuration: boots k3s server inside the VM. + /// + /// Runs `/srv/gateway-init.sh` which mounts essential filesystems, + /// deploys the OpenShell helm chart, and execs `k3s server`. + /// Exposes the OpenShell gateway on port 30051. 
+ pub fn gateway(rootfs: PathBuf) -> Self { + Self { + rootfs, + vcpus: 4, + mem_mib: 8192, + exec_path: "/srv/gateway-init.sh".to_string(), + args: vec![], + env: vec![ + "HOME=/root".to_string(), + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(), + "TERM=xterm".to_string(), + ], + workdir: "/".to_string(), + port_map: vec![ + // OpenShell server — with bridge CNI the pod listens on + // 8080 inside its own network namespace (10.42.0.x), not + // on the VM's root namespace. The NodePort service + // (kube-proxy nftables) forwards VM:30051 → pod:8080. + // gvproxy maps host:30051 → VM:30051 to complete the path. + "30051:30051".to_string(), + ], + vsock_ports: vec![], + log_level: 3, // Info — for debugging + console_output: None, + net: NetBackend::Gvproxy { + binary: default_runtime_gvproxy_path(), + }, + } + } +} + +// ── Helpers ───────────────────────────────────────────────────────────── + +/// Build a null-terminated C string array from a slice of strings. +/// +/// Returns both the `CString` owners (to keep them alive) and the pointer array. 
+fn c_string_array(strings: &[&str]) -> Result<(Vec, Vec<*const libc::c_char>), VmError> { + let owned: Vec = strings + .iter() + .map(|s| CString::new(*s)) + .collect::, _>>()?; + let mut ptrs: Vec<*const libc::c_char> = owned.iter().map(|c| c.as_ptr()).collect(); + ptrs.push(ptr::null()); // null terminator + Ok((owned, ptrs)) +} + +const VM_RUNTIME_DIR_NAME: &str = "gateway.runtime"; +const VM_RUNTIME_DIR_ENV: &str = "OPENSHELL_VM_RUNTIME_DIR"; + +pub(crate) fn configured_runtime_dir() -> Result { + if let Some(path) = std::env::var_os(VM_RUNTIME_DIR_ENV) { + return Ok(PathBuf::from(path)); + } + + let exe = std::env::current_exe().map_err(|e| VmError::HostSetup(e.to_string()))?; + let exe_dir = exe.parent().ok_or_else(|| { + VmError::HostSetup(format!( + "executable has no parent directory: {}", + exe.display() + )) + })?; + Ok(exe_dir.join(VM_RUNTIME_DIR_NAME)) +} + +fn required_runtime_lib_name() -> &'static str { + #[cfg(target_os = "macos")] + { + "libkrun.dylib" + } + #[cfg(not(target_os = "macos"))] + { + "libkrun.so" + } +} + +fn validate_runtime_dir(dir: &Path) -> Result { + if !dir.is_dir() { + return Err(VmError::BinaryNotFound { + path: dir.display().to_string(), + hint: format!( + "stage the VM runtime bundle with `mise run vm:bundle-runtime` or set {VM_RUNTIME_DIR_ENV}" + ), + }); + } + + let libkrun = dir.join(required_runtime_lib_name()); + if !libkrun.is_file() { + return Err(VmError::BinaryNotFound { + path: libkrun.display().to_string(), + hint: "runtime bundle is incomplete: missing libkrun".to_string(), + }); + } + + let has_krunfw = std::fs::read_dir(dir) + .map_err(|e| VmError::HostSetup(format!("read {}: {e}", dir.display())))? 
+ .filter_map(Result::ok) + .any(|entry| { + entry + .file_name() + .to_string_lossy() + .starts_with("libkrunfw.") + }); + if !has_krunfw { + return Err(VmError::BinaryNotFound { + path: dir.display().to_string(), + hint: "runtime bundle is incomplete: missing libkrunfw".to_string(), + }); + } + + let gvproxy = dir.join("gvproxy"); + if !gvproxy.is_file() { + return Err(VmError::BinaryNotFound { + path: gvproxy.display().to_string(), + hint: "runtime bundle is incomplete: missing gvproxy".to_string(), + }); + } + + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt as _; + + let mode = std::fs::metadata(&gvproxy) + .map_err(|e| VmError::HostSetup(format!("stat {}: {e}", gvproxy.display())))? + .permissions() + .mode(); + if mode & 0o111 == 0 { + return Err(VmError::HostSetup(format!( + "gvproxy is not executable: {}", + gvproxy.display() + ))); + } + } + + // Validate manifest.json if present — warn but don't fail if files + // listed in the manifest are missing (backwards compatibility). + let manifest_path = dir.join("manifest.json"); + if manifest_path.is_file() { + if let Ok(contents) = std::fs::read_to_string(&manifest_path) { + // Simple check: verify all listed files exist. + // The manifest lists files as JSON strings in a "files" array. 
/// Raise the soft `RLIMIT_NOFILE` limit to the hard limit (best-effort).
///
/// No-op on non-Unix targets; errors from getrlimit/setrlimit are
/// silently ignored.
// NOTE(review): presumably needed because the VM runtime keeps many fds
// open (virtio-fs, sockets) — confirm against libkrun requirements.
fn raise_nofile_limit() {
    #[cfg(unix)]
    unsafe {
        let mut rlim = libc::rlimit {
            rlim_cur: 0,
            rlim_max: 0,
        };
        // Only bump the soft limit if the current limits could be read.
        if libc::getrlimit(libc::RLIMIT_NOFILE, &raw mut rlim) == 0 {
            rlim.rlim_cur = rlim.rlim_max;
            // Best-effort: failure leaves the default limit in place.
            let _ = libc::setrlimit(libc::RLIMIT_NOFILE, &rlim);
        }
    }
}
/// Extract the string value for `key` from a JSON document (avoids a
/// serde_json dependency for this single use case).
///
/// Finds the first occurrence of `"key"`, skips whitespace and the
/// colon, and returns the following string literal. Backslash-escaped
/// quotes are honoured when locating the closing quote (escapes are
/// returned verbatim, not decoded). Returns `None` when the key is
/// absent or its value is not a string.
///
/// Note: the first textual match wins, so a key name that also appears
/// inside another string value can be mis-detected — acceptable for the
/// trusted provenance.json this is used on.
fn extract_json_string(json: &str, key: &str) -> Option<String> {
    let pattern = format!("\"{}\"", key);
    let idx = json.find(&pattern)?;
    let after_key = &json[idx + pattern.len()..];
    // Skip whitespace and the key/value colon.
    let after_colon = after_key.trim_start().strip_prefix(':')?;
    let value = after_colon.trim_start().strip_prefix('"')?;
    // Scan for the closing quote, skipping `\"` (and `\\`) escapes so
    // values such as `"a \"quoted\" word"` are returned intact.
    let mut escaped = false;
    for (i, c) in value.char_indices() {
        match c {
            '\\' if !escaped => escaped = true,
            '"' if !escaped => return Some(value[..i].to_string()),
            _ => escaped = false,
        }
    }
    None // unterminated string literal
}
disable_implicit_vsock(&self) -> Result<(), VmError> { + unsafe { + check( + (self.krun.krun_disable_implicit_vsock)(self.ctx_id), + "krun_disable_implicit_vsock", + ) + } + } + + fn add_vsock(&self, tsi_features: u32) -> Result<(), VmError> { + unsafe { + check( + (self.krun.krun_add_vsock)(self.ctx_id, tsi_features), + "krun_add_vsock", + ) + } + } + + fn add_net_unixgram( + &self, + socket_path: &Path, + mac: &[u8; 6], + features: u32, + flags: u32, + ) -> Result<(), VmError> { + let sock_c = path_to_cstring(socket_path)?; + unsafe { + check( + (self.krun.krun_add_net_unixgram)( + self.ctx_id, + sock_c.as_ptr(), + -1, + mac.as_ptr(), + features, + flags, + ), + "krun_add_net_unixgram", + ) + } + } + + fn set_port_map(&self, port_map: &[String]) -> Result<(), VmError> { + let port_strs: Vec<&str> = port_map.iter().map(String::as_str).collect(); + let (_port_owners, port_ptrs) = c_string_array(&port_strs)?; + unsafe { + check( + (self.krun.krun_set_port_map)(self.ctx_id, port_ptrs.as_ptr()), + "krun_set_port_map", + ) + } + } + + fn add_vsock_port(&self, port: &VsockPort) -> Result<(), VmError> { + let socket_c = path_to_cstring(&port.socket_path)?; + unsafe { + check( + (self.krun.krun_add_vsock_port2)( + self.ctx_id, + port.port, + socket_c.as_ptr(), + port.listen, + ), + "krun_add_vsock_port2", + ) + } + } + + fn set_console_output(&self, path: &Path) -> Result<(), VmError> { + let console_c = path_to_cstring(path)?; + unsafe { + check( + (self.krun.krun_set_console_output)(self.ctx_id, console_c.as_ptr()), + "krun_set_console_output", + ) + } + } + + fn set_exec(&self, exec_path: &str, args: &[String], env: &[String]) -> Result<(), VmError> { + let exec_c = CString::new(exec_path)?; + let argv_strs: Vec<&str> = args.iter().map(String::as_str).collect(); + let (_argv_owners, argv_ptrs) = c_string_array(&argv_strs)?; + let env_strs: Vec<&str> = env.iter().map(String::as_str).collect(); + let (_env_owners, env_ptrs) = c_string_array(&env_strs)?; + + unsafe { + 
/// Issue a gvproxy port-forward "expose" call via its HTTP API (unix socket).
///
/// Sends a raw HTTP/1.1 POST request over the unix socket to avoid
/// depending on `curl` being installed on the host. Returns `Ok(())` on
/// HTTP 200/204, otherwise an error carrying the response status line.
fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> {
    use std::io::{Read, Write};
    use std::os::unix::net::UnixStream;

    let mut stream =
        UnixStream::connect(api_sock).map_err(|e| format!("connect to gvproxy API socket: {e}"))?;

    let request = format!(
        "POST /services/forwarder/expose HTTP/1.1\r\n\
         Host: localhost\r\n\
         Content-Type: application/json\r\n\
         Content-Length: {}\r\n\
         Connection: close\r\n\
         \r\n\
         {}",
        body.len(),
        body,
    );

    stream
        .write_all(request.as_bytes())
        .map_err(|e| format!("write to gvproxy API: {e}"))?;

    // Read until we have at least the full status line: a single read()
    // may legally return a partial response. Bounded at 4 KiB since we
    // only need the first line.
    let mut buf: Vec<u8> = Vec::with_capacity(1024);
    let mut chunk = [0u8; 256];
    while !buf.windows(2).any(|w| w == b"\r\n") && buf.len() < 4096 {
        let n = stream
            .read(&mut chunk)
            .map_err(|e| format!("read from gvproxy API: {e}"))?;
        if n == 0 {
            break; // EOF — server closed after (or before) responding
        }
        buf.extend_from_slice(&chunk[..n]);
    }
    let response = String::from_utf8_lossy(&buf);

    // Parse the HTTP status code from the first line (e.g. "HTTP/1.1 200 OK").
    let status = response
        .lines()
        .next()
        .and_then(|line| line.split_whitespace().nth(1))
        .unwrap_or("0");

    match status {
        "200" | "204" => Ok(()),
        _ => {
            let first_line = response.lines().next().unwrap_or("");
            Err(format!("gvproxy API: {first_line}"))
        }
    }
}
+fn kill_stale_gvproxy() { + let output = std::process::Command::new("pkill") + .args(["-x", "gvproxy"]) + .output(); + if let Ok(o) = output { + if o.status.success() { + eprintln!("Killed stale gvproxy process"); + // Brief pause for the port to be released. + std::thread::sleep(std::time::Duration::from_millis(200)); + } + } +} + +fn path_to_cstring(path: &Path) -> Result { + let s = path + .to_str() + .ok_or_else(|| VmError::InvalidPath(path.display().to_string()))?; + Ok(CString::new(s)?) +} + +// ── Launch ────────────────────────────────────────────────────────────── + +/// Configure and launch a libkrun microVM. +/// +/// This forks the process. The child enters the VM (never returns); the +/// parent blocks until the VM exits or a signal is received. +/// +/// Returns the VM exit code (from `waitpid`). +#[allow(clippy::similar_names)] +pub fn launch(config: &VmConfig) -> Result { + // Validate rootfs + if !config.rootfs.is_dir() { + return Err(VmError::RootfsNotFound { + path: config.rootfs.display().to_string(), + }); + } + + let launch_start = Instant::now(); + eprintln!("rootfs: {}", config.rootfs.display()); + eprintln!("vm: {} vCPU(s), {} MiB RAM", config.vcpus, config.mem_mib); + + // The runtime must already be staged as a sidecar bundle next to the + // binary (or explicitly pointed to via OPENSHELL_VM_RUNTIME_DIR). + let runtime_gvproxy = resolve_runtime_bundle()?; + let runtime_dir = runtime_gvproxy.parent().ok_or_else(|| { + VmError::HostSetup(format!( + "runtime bundle file has no parent directory: {}", + runtime_gvproxy.display() + )) + })?; + configure_runtime_loader_env(runtime_dir)?; + raise_nofile_limit(); + + // ── Log runtime provenance ───────────────────────────────────── + // After configuring the loader, trigger library loading so that + // provenance is captured before we proceed with VM configuration. 
+ let _ = ffi::libkrun()?; + log_runtime_provenance(runtime_dir); + + // ── Configure the microVM ────────────────────────────────────── + + let vm = VmContext::create(config.log_level)?; + vm.set_vm_config(config.vcpus, config.mem_mib)?; + vm.set_root(&config.rootfs)?; + vm.set_workdir(&config.workdir)?; + + // Networking setup + let mut gvproxy_child: Option = None; + let mut gvproxy_api_sock: Option = None; + + match &config.net { + NetBackend::Tsi => { + // Default TSI — no special setup needed. + } + NetBackend::None => { + vm.disable_implicit_vsock()?; + vm.add_vsock(0)?; + eprintln!("Networking: disabled (no TSI, no virtio-net)"); + } + NetBackend::Gvproxy { binary } => { + if !binary.exists() { + return Err(VmError::BinaryNotFound { + path: binary.display().to_string(), + hint: "Install Podman Desktop or place gvproxy in PATH".to_string(), + }); + } + + // Create temp socket paths + let run_dir = config + .rootfs + .parent() + .unwrap_or(&config.rootfs) + .to_path_buf(); + let vfkit_sock = run_dir.join("gvproxy-vfkit.sock"); + let api_sock = run_dir.join("gvproxy-api.sock"); + + // Kill any stale gvproxy process from a previous run. + // If gvproxy is still holding port 2222, the new instance + // will fail with "bind: address already in use". + kill_stale_gvproxy(); + + // Clean stale sockets (including the -krun.sock file that + // libkrun creates as its datagram endpoint). 
+ let _ = std::fs::remove_file(&vfkit_sock); + let _ = std::fs::remove_file(&api_sock); + let krun_sock = run_dir.join("gvproxy-vfkit.sock-krun.sock"); + let _ = std::fs::remove_file(&krun_sock); + + // Start gvproxy + eprintln!("Starting gvproxy: {}", binary.display()); + let gvproxy_log = run_dir.join("gvproxy.log"); + let gvproxy_log_file = std::fs::File::create(&gvproxy_log) + .map_err(|e| VmError::Fork(format!("failed to create gvproxy log: {e}")))?; + let child = std::process::Command::new(binary) + .arg("-listen-vfkit") + .arg(format!("unixgram://{}", vfkit_sock.display())) + .arg("-listen") + .arg(format!("unix://{}", api_sock.display())) + .stdout(std::process::Stdio::null()) + .stderr(gvproxy_log_file) + .spawn() + .map_err(|e| VmError::Fork(format!("failed to start gvproxy: {e}")))?; + + eprintln!( + "gvproxy started (pid {}) [{:.1}s]", + child.id(), + launch_start.elapsed().as_secs_f64() + ); + + // Wait for the socket to appear (exponential backoff: 5ms → 100ms). + { + let deadline = Instant::now() + std::time::Duration::from_secs(5); + let mut interval = std::time::Duration::from_millis(5); + while !vfkit_sock.exists() { + if Instant::now() >= deadline { + return Err(VmError::Fork( + "gvproxy socket did not appear within 5s".to_string(), + )); + } + std::thread::sleep(interval); + interval = (interval * 2).min(std::time::Duration::from_millis(100)); + } + } + + // Disable implicit TSI and add virtio-net via gvproxy + vm.disable_implicit_vsock()?; + vm.add_vsock(0)?; + // This MAC matches gvproxy's default static DHCP lease for + // 192.168.127.2. Using a different MAC can cause the gVisor + // network stack to misroute or drop packets. 
+ let mac: [u8; 6] = [0x5a, 0x94, 0xef, 0xe4, 0x0c, 0xee]; + + // COMPAT_NET_FEATURES from libkrun.h + const NET_FEATURE_CSUM: u32 = 1 << 0; + const NET_FEATURE_GUEST_CSUM: u32 = 1 << 1; + const NET_FEATURE_GUEST_TSO4: u32 = 1 << 7; + const NET_FEATURE_GUEST_UFO: u32 = 1 << 10; + const NET_FEATURE_HOST_TSO4: u32 = 1 << 11; + const NET_FEATURE_HOST_UFO: u32 = 1 << 14; + const COMPAT_NET_FEATURES: u32 = NET_FEATURE_CSUM + | NET_FEATURE_GUEST_CSUM + | NET_FEATURE_GUEST_TSO4 + | NET_FEATURE_GUEST_UFO + | NET_FEATURE_HOST_TSO4 + | NET_FEATURE_HOST_UFO; + const NET_FLAG_VFKIT: u32 = 1 << 0; + + vm.add_net_unixgram(&vfkit_sock, &mac, COMPAT_NET_FEATURES, NET_FLAG_VFKIT)?; + + eprintln!( + "Networking: gvproxy (virtio-net) [{:.1}s]", + launch_start.elapsed().as_secs_f64() + ); + gvproxy_child = Some(child); + gvproxy_api_sock = Some(api_sock); + } + } + + // Port mapping (TSI only) + if !config.port_map.is_empty() && matches!(config.net, NetBackend::Tsi) { + vm.set_port_map(&config.port_map)?; + } + + for vsock_port in &config.vsock_ports { + vm.add_vsock_port(vsock_port)?; + } + + // Console output + let console_log = config.console_output.clone().unwrap_or_else(|| { + config + .rootfs + .parent() + .unwrap_or(&config.rootfs) + .join("console.log") + }); + vm.set_console_output(&console_log)?; + + // envp: use provided env or minimal defaults + let env: Vec = if config.env.is_empty() { + vec![ + "HOME=/root", + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm", + ] + .into_iter() + .map(ToOwned::to_owned) + .collect() + } else { + config.env.clone() + }; + vm.set_exec(&config.exec_path, &config.args, &env)?; + + // ── Fork and enter the VM ────────────────────────────────────── + // + // krun_start_enter() never returns — it calls exit() when the guest + // process exits. We fork so the parent can monitor and report. 
+ + let boot_start = Instant::now(); + eprintln!("Booting microVM..."); + + let pid = unsafe { libc::fork() }; + match pid { + -1 => Err(VmError::Fork(std::io::Error::last_os_error().to_string())), + 0 => { + // Child process: enter the VM (never returns on success) + let ret = vm.start_enter(); + eprintln!("krun_start_enter failed: {ret}"); + std::process::exit(1); + } + _ => { + // Parent: wait for child + eprintln!( + "VM started (child pid {pid}) [{:.1}s]", + boot_start.elapsed().as_secs_f64() + ); + for pm in &config.port_map { + let host_port = pm.split(':').next().unwrap_or(pm); + eprintln!(" port {pm} -> http://localhost:{host_port}"); + } + eprintln!("Console output: {}", console_log.display()); + + // Set up gvproxy port forwarding via its HTTP API. + // The port_map entries use the same "host:guest" format + // as TSI, but here we translate them into gvproxy expose + // calls targeting the guest IP (192.168.127.2). + // + // Instead of a fixed 500ms sleep, poll the API socket with + // exponential backoff (5ms → 200ms, ~1s total budget). + if let Some(ref api_sock) = gvproxy_api_sock { + let fwd_start = Instant::now(); + // Wait for the API socket to appear (it lags slightly + // behind the vfkit data socket). 
+ { + let deadline = Instant::now() + std::time::Duration::from_secs(2); + let mut interval = std::time::Duration::from_millis(5); + while !api_sock.exists() { + if Instant::now() >= deadline { + eprintln!( + "warning: gvproxy API socket not ready after 2s, attempting anyway" + ); + break; + } + std::thread::sleep(interval); + interval = (interval * 2).min(std::time::Duration::from_millis(200)); + } + } + + let guest_ip = "192.168.127.2"; + + for pm in &config.port_map { + let parts: Vec<&str> = pm.split(':').collect(); + let (host_port, guest_port) = match parts.len() { + 2 => (parts[0], parts[1]), + 1 => (parts[0], parts[0]), + _ => { + eprintln!(" skipping invalid port mapping: {pm}"); + continue; + } + }; + + let expose_body = format!( + r#"{{"local":":{host_port}","remote":"{guest_ip}:{guest_port}","protocol":"tcp"}}"# + ); + + match gvproxy_expose(api_sock, &expose_body) { + Ok(()) => { + eprintln!(" port {host_port} -> {guest_ip}:{guest_port}"); + } + Err(e) => { + eprintln!(" port {host_port}: {e}"); + } + } + } + eprintln!( + "Port forwarding ready [{:.1}s]", + fwd_start.elapsed().as_secs_f64() + ); + } + + // Bootstrap the OpenShell control plane and wait for the + // service to be reachable. Only for the gateway preset. + if config.exec_path == "/srv/gateway-init.sh" { + // Bootstrap stores host-side metadata and mTLS creds. + // With pre-baked rootfs (Path 1) this reads PKI directly + // from virtio-fs — no kubectl or port forwarding needed. + // Cold boot (Path 2) writes secret manifests into the + // k3s auto-deploy directory via virtio-fs. + if let Err(e) = bootstrap_gateway(&config.rootfs) { + eprintln!("Bootstrap failed: {e}"); + eprintln!(" The VM is running but OpenShell may not be fully operational."); + } + + // Wait for the gRPC service to be reachable via TCP + // probe on host:30051. This confirms the full path + // (gvproxy → kube-proxy nftables → pod:8080) is working. 
+ wait_for_gateway_service(); + + // Best-effort: copy kubeconfig for manual debugging. + // Not blocking — the boot pipeline doesn't need it. + copy_kubeconfig_best_effort(&config.rootfs); + } + + eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); + eprintln!("Press Ctrl+C to stop."); + + // Forward signals to child + unsafe { + libc::signal( + libc::SIGINT, + forward_signal as *const () as libc::sighandler_t, + ); + libc::signal( + libc::SIGTERM, + forward_signal as *const () as libc::sighandler_t, + ); + CHILD_PID.store(pid, std::sync::atomic::Ordering::Relaxed); + } + + let mut status: libc::c_int = 0; + unsafe { + libc::waitpid(pid, &raw mut status, 0); + } + + // Clean up gvproxy + if let Some(mut child) = gvproxy_child { + let _ = child.kill(); + let _ = child.wait(); + eprintln!("gvproxy stopped"); + } + + if libc::WIFEXITED(status) { + let code = libc::WEXITSTATUS(status); + eprintln!("VM exited with code {code}"); + return Ok(code); + } else if libc::WIFSIGNALED(status) { + let sig = libc::WTERMSIG(status); + eprintln!("VM killed by signal {sig}"); + return Ok(128 + sig); + } + + Ok(status) + } + } +} + +// ── Post-boot bootstrap ──────────────────────────────────────────────── + +/// Cluster name used for metadata and mTLS storage. +const GATEWAY_CLUSTER_NAME: &str = "gateway"; + +/// Gateway port: the host port mapped to the OpenShell `NodePort` (30051). +const GATEWAY_PORT: u16 = 30051; + +/// Bootstrap the OpenShell control plane after k3s is ready. +/// +/// All operations use the virtio-fs rootfs — no kubectl or API server +/// port forwarding required. This avoids exposing port 6443 outside the +/// VM. +/// +/// Three paths, in priority order: +/// +/// 1. **Pre-baked rootfs** (from `build-rootfs.sh`): PKI files at +/// `rootfs/opt/openshell/pki/`. TLS secrets already exist in the k3s +/// database. Reads certs from the filesystem and stores metadata on the +/// host. +/// +/// 2. 
**Warm boot**: host-side metadata + mTLS certs survive across VM +/// restarts. Nothing to do — service readiness is confirmed by the TCP +/// probe in `wait_for_gateway_service()`. +/// +/// 3. **Cold boot**: generates fresh PKI and writes TLS secret manifests +/// into the k3s auto-deploy directory (`/var/lib/rancher/k3s/server/manifests/`) +/// via virtio-fs. k3s picks them up automatically. +fn bootstrap_gateway(rootfs: &Path) -> Result<(), VmError> { + let bootstrap_start = Instant::now(); + + // Build gateway metadata early — it only depends on knowing the port and + // gateway name, not on the cluster being ready. + let metadata = openshell_bootstrap::GatewayMetadata { + name: GATEWAY_CLUSTER_NAME.to_string(), + gateway_endpoint: format!("https://127.0.0.1:{GATEWAY_PORT}"), + is_remote: false, + gateway_port: GATEWAY_PORT, + remote_host: None, + resolved_host: None, + auth_mode: None, + edge_team_domain: None, + edge_auth_url: None, + }; + + // ── Path 1: Pre-baked PKI from build-rootfs.sh ───────────────── + // + // If the rootfs was pre-initialized, PKI files are baked into + // /opt/openshell/pki/. Read them directly — no cluster interaction + // needed. The TLS secrets already exist inside the cluster from + // the build-time k3s boot. + let pki_dir = rootfs.join("opt/openshell/pki"); + if pki_dir.join("ca.crt").is_file() { + eprintln!("Pre-baked PKI detected — fast bootstrap"); + + let read = |name: &str| -> Result { + std::fs::read_to_string(pki_dir.join(name)) + .map_err(|e| VmError::Bootstrap(format!("failed to read {name}: {e}"))) + }; + + let pki_bundle = openshell_bootstrap::pki::PkiBundle { + ca_cert_pem: read("ca.crt")?, + ca_key_pem: read("ca.key")?, + server_cert_pem: read("server.crt")?, + server_key_pem: read("server.key")?, + client_cert_pem: read("client.crt")?, + client_key_pem: read("client.key")?, + }; + + // Store metadata and credentials on the host. 
+ openshell_bootstrap::store_gateway_metadata(GATEWAY_CLUSTER_NAME, &metadata) + .map_err(|e| VmError::Bootstrap(format!("failed to store metadata: {e}")))?; + + openshell_bootstrap::mtls::store_pki_bundle(GATEWAY_CLUSTER_NAME, &pki_bundle) + .map_err(|e| VmError::Bootstrap(format!("failed to store mTLS creds: {e}")))?; + + openshell_bootstrap::save_active_gateway(GATEWAY_CLUSTER_NAME) + .map_err(|e| VmError::Bootstrap(format!("failed to set active cluster: {e}")))?; + + eprintln!( + "Bootstrap complete [{:.1}s]", + bootstrap_start.elapsed().as_secs_f64() + ); + eprintln!(" Cluster: {GATEWAY_CLUSTER_NAME}"); + eprintln!(" Gateway: https://127.0.0.1:{GATEWAY_PORT}"); + eprintln!(" mTLS: ~/.config/openshell/gateways/{GATEWAY_CLUSTER_NAME}/mtls/"); + return Ok(()); + } + + // ── Path 2: Warm boot ────────────────────────────────────────── + // + // Host-side metadata + mTLS certs survive from a previous boot. + // Service readiness is confirmed by the TCP probe in + // `wait_for_gateway_service()` — no kubectl needed here. + if is_warm_boot() { + eprintln!("Warm boot detected — reusing existing PKI and metadata."); + eprintln!( + "Warm boot ready [{:.1}s]", + bootstrap_start.elapsed().as_secs_f64() + ); + eprintln!(" Cluster: {GATEWAY_CLUSTER_NAME}"); + eprintln!(" Gateway: https://127.0.0.1:{GATEWAY_PORT}"); + eprintln!(" mTLS: ~/.config/openshell/gateways/{GATEWAY_CLUSTER_NAME}/mtls/"); + return Ok(()); + } + + // ── Path 3: Cold boot (no pre-baked state) ───────────────────── + // + // Generate fresh PKI and write TLS secret manifests into the k3s + // auto-deploy directory via virtio-fs. k3s watches this directory + // and applies any YAML files automatically. 
+ eprintln!("Generating TLS certificates..."); + let pki_bundle = openshell_bootstrap::pki::generate_pki(&[]) + .map_err(|e| VmError::Bootstrap(format!("PKI generation failed: {e}")))?; + + openshell_bootstrap::store_gateway_metadata(GATEWAY_CLUSTER_NAME, &metadata) + .map_err(|e| VmError::Bootstrap(format!("failed to store cluster metadata: {e}")))?; + + // Write TLS secrets as k3s auto-deploy manifests via virtio-fs. + // k3s watches /var/lib/rancher/k3s/server/manifests/ and applies + // any YAML files dropped there, eliminating the need for kubectl + // or API server port forwarding. + eprintln!("Writing TLS secret manifests via virtio-fs..."); + write_tls_secret_manifests(rootfs, &pki_bundle)?; + + openshell_bootstrap::mtls::store_pki_bundle(GATEWAY_CLUSTER_NAME, &pki_bundle) + .map_err(|e| VmError::Bootstrap(format!("failed to store mTLS credentials: {e}")))?; + + openshell_bootstrap::save_active_gateway(GATEWAY_CLUSTER_NAME) + .map_err(|e| VmError::Bootstrap(format!("failed to set active cluster: {e}")))?; + + eprintln!( + "Bootstrap complete [{:.1}s]", + bootstrap_start.elapsed().as_secs_f64() + ); + eprintln!(" Cluster: {GATEWAY_CLUSTER_NAME}"); + eprintln!(" Gateway: https://127.0.0.1:{GATEWAY_PORT}"); + eprintln!(" mTLS: ~/.config/openshell/gateways/{GATEWAY_CLUSTER_NAME}/mtls/"); + + Ok(()) +} + +/// Check whether a previous bootstrap left valid state on disk. +/// +/// A warm boot is detected when both: +/// - Cluster metadata exists: `$XDG_CONFIG_HOME/openshell/gateways/gateway/metadata.json` +/// - mTLS certs exist: `$XDG_CONFIG_HOME/openshell/gateways/gateway/mtls/{ca.crt,tls.crt,tls.key}` +/// +/// When true, the host-side bootstrap (PKI generation, secret manifest writing, +/// metadata storage) can be skipped because the virtio-fs rootfs persists k3s +/// state (TLS certs, kine/sqlite, containerd images, helm releases) across VM +/// restarts. 
+fn is_warm_boot() -> bool { + let Ok(home) = std::env::var("HOME") else { + return false; + }; + + let config_base = + std::env::var("XDG_CONFIG_HOME").unwrap_or_else(|_| format!("{home}/.config")); + + let config_dir = PathBuf::from(&config_base) + .join("openshell") + .join("gateways"); + + // Check metadata file. + let metadata_path = config_dir.join(GATEWAY_CLUSTER_NAME).join("metadata.json"); + if !metadata_path.is_file() { + return false; + } + + // Check mTLS cert files. + let mtls_dir = config_dir.join(GATEWAY_CLUSTER_NAME).join("mtls"); + for name in &["ca.crt", "tls.crt", "tls.key"] { + let path = mtls_dir.join(name); + match std::fs::metadata(&path) { + Ok(m) if m.is_file() && m.len() > 0 => {} + _ => return false, + } + } + + true +} + +/// Wait for the openshell pod to become Ready inside the k3s cluster +/// and verify the gRPC service is reachable from the host. +/// +/// Stale pod/lease records are cleaned from the kine DB at build time +/// (see `build-rootfs.sh`). Containerd metadata (meta.db) is preserved +/// across boots so the native snapshotter doesn't re-extract image layers. +/// Runtime task state is cleaned by `gateway-init.sh` on each boot. +/// +/// Wait for the OpenShell gRPC service to be reachable from the host. +/// +/// Polls `host_tcp_probe()` on `127.0.0.1:30051` with 1s intervals. +/// The probe confirms the full networking path: gvproxy → kube-proxy +/// nftables → pod:8080. A successful probe means the pod is running, +/// the NodePort service is routing, and the server is accepting +/// connections. No kubectl or API server access required. 
+fn wait_for_gateway_service() { + let start = Instant::now(); + let timeout = std::time::Duration::from_secs(90); + let poll_interval = std::time::Duration::from_secs(1); + + eprintln!("Waiting for gateway service..."); + + loop { + if host_tcp_probe() { + eprintln!("Service healthy [{:.1}s]", start.elapsed().as_secs_f64()); + return; + } + + if start.elapsed() >= timeout { + eprintln!( + " gateway service not ready after {:.0}s, continuing anyway", + timeout.as_secs_f64() + ); + return; + } + + std::thread::sleep(poll_interval); + } +} + +/// Probe `127.0.0.1:30051` from the host to verify the full +/// gvproxy → VM → pod path is working. +/// +/// gvproxy accepts TCP connections even when the guest port is closed, +/// but those connections are immediately reset. A server that is truly +/// listening will hold the connection open (waiting for a TLS +/// ClientHello). We exploit this: connect, then try a short read. If +/// the read **times out** the server is alive; if it returns an error +/// (reset/EOF) the server is down. +fn host_tcp_probe() -> bool { + use std::io::Read; + use std::net::{SocketAddr, TcpStream}; + use std::time::Duration; + + let addr: SocketAddr = ([127, 0, 0, 1], GATEWAY_PORT).into(); + let Ok(mut stream) = TcpStream::connect_timeout(&addr, Duration::from_secs(2)) else { + return false; + }; + + // A short read timeout: if the server is alive it will wait for us + // to send a TLS ClientHello, so the read will time out (= good). + // If the connection resets or closes, the server is dead. + stream + .set_read_timeout(Some(Duration::from_millis(200))) + .ok(); + let mut buf = [0u8; 1]; + match stream.read(&mut buf) { + Err(e) + if e.kind() == std::io::ErrorKind::WouldBlock + || e.kind() == std::io::ErrorKind::TimedOut => + { + true // Timeout = server alive, waiting for ClientHello. + } + _ => false, // Reset, EOF, or unexpected data = not healthy. + } +} + +/// Write TLS secret manifests into the k3s auto-deploy directory via virtio-fs. 
+/// +/// k3s watches `/var/lib/rancher/k3s/server/manifests/` and automatically +/// applies any YAML files placed there. This avoids the need for kubectl +/// or API server port forwarding from the host. +fn write_tls_secret_manifests( + rootfs: &Path, + bundle: &openshell_bootstrap::pki::PkiBundle, +) -> Result<(), VmError> { + use base64::Engine; + use base64::engine::general_purpose::STANDARD; + + let manifests_dir = rootfs.join("var/lib/rancher/k3s/server/manifests"); + std::fs::create_dir_all(&manifests_dir) + .map_err(|e| VmError::Bootstrap(format!("failed to create manifests dir: {e}")))?; + + let server_tls_name = openshell_bootstrap::constants::SERVER_TLS_SECRET_NAME; + let client_ca_name = openshell_bootstrap::constants::SERVER_CLIENT_CA_SECRET_NAME; + let client_tls_name = openshell_bootstrap::constants::CLIENT_TLS_SECRET_NAME; + + // Combine all three secrets into a single multi-document YAML file. + // k3s applies the entire file atomically. + let manifest = format!( + r#"--- +apiVersion: v1 +kind: Namespace +metadata: + name: openshell +--- +apiVersion: v1 +kind: Secret +metadata: + name: {server_tls_name} + namespace: openshell +type: kubernetes.io/tls +data: + tls.crt: {server_crt} + tls.key: {server_key} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {client_ca_name} + namespace: openshell +type: Opaque +data: + ca.crt: {ca_crt} +--- +apiVersion: v1 +kind: Secret +metadata: + name: {client_tls_name} + namespace: openshell +type: Opaque +data: + tls.crt: {client_crt} + tls.key: {client_key} + ca.crt: {ca_crt} +"#, + server_crt = STANDARD.encode(&bundle.server_cert_pem), + server_key = STANDARD.encode(&bundle.server_key_pem), + ca_crt = STANDARD.encode(&bundle.ca_cert_pem), + client_crt = STANDARD.encode(&bundle.client_cert_pem), + client_key = STANDARD.encode(&bundle.client_key_pem), + ); + + let dest = manifests_dir.join("openshell-tls-secrets.yaml"); + std::fs::write(&dest, manifest) + .map_err(|e| VmError::Bootstrap(format!("failed to 
write TLS manifest: {e}")))?; + + eprintln!(" TLS secret manifests written to {}", dest.display()); + Ok(()) +} + +/// Best-effort: copy the k3s kubeconfig to `~/.kube/gateway.yaml` for +/// manual debugging. Not required for the boot pipeline — runs after +/// the service is already confirmed healthy. +fn copy_kubeconfig_best_effort(rootfs: &Path) { + let kubeconfig_src = rootfs.join("etc/rancher/k3s/k3s.yaml"); + if !kubeconfig_src.is_file() { + return; + } + + let home = std::env::var("HOME").unwrap_or_else(|_| "/tmp".to_string()); + let kube_dir = PathBuf::from(&home).join(".kube"); + let _ = std::fs::create_dir_all(&kube_dir); + let dest = kube_dir.join("gateway.yaml"); + + match std::fs::read_to_string(&kubeconfig_src) { + Ok(contents) => { + if let Err(e) = std::fs::write(&dest, &contents) { + eprintln!(" failed to write kubeconfig: {e}"); + } else { + eprintln!("Kubeconfig: {}", dest.display()); + eprintln!(" export KUBECONFIG={}", dest.display()); + } + } + Err(e) => { + eprintln!(" failed to read kubeconfig: {e}"); + } + } +} + +static CHILD_PID: std::sync::atomic::AtomicI32 = std::sync::atomic::AtomicI32::new(0); + +extern "C" fn forward_signal(_sig: libc::c_int) { + let pid = CHILD_PID.load(std::sync::atomic::Ordering::Relaxed); + if pid > 0 { + unsafe { + libc::kill(pid, libc::SIGTERM); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use std::time::{SystemTime, UNIX_EPOCH}; + + fn temp_runtime_dir() -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("time went backwards") + .as_nanos(); + std::env::temp_dir().join(format!( + "openshell-vm-runtime-{}-{nanos}", + std::process::id() + )) + } + + fn write_runtime_file(path: &Path) { + fs::write(path, b"test").expect("failed to write runtime file"); + } + + #[test] + fn validate_runtime_dir_accepts_minimal_bundle() { + let dir = temp_runtime_dir(); + fs::create_dir_all(&dir).expect("failed to create runtime dir"); + + 
write_runtime_file(&dir.join(required_runtime_lib_name())); + write_runtime_file(&dir.join("libkrunfw.test")); + let gvproxy = dir.join("gvproxy"); + write_runtime_file(&gvproxy); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt as _; + + let mut perms = fs::metadata(&gvproxy).expect("stat gvproxy").permissions(); + perms.set_mode(0o755); + fs::set_permissions(&gvproxy, perms).expect("chmod gvproxy"); + } + + let resolved_gvproxy = validate_runtime_dir(&dir).expect("runtime bundle should validate"); + assert_eq!(resolved_gvproxy, gvproxy); + + let _ = fs::remove_dir_all(&dir); + } + + #[test] + fn validate_runtime_dir_requires_gvproxy() { + let dir = temp_runtime_dir(); + fs::create_dir_all(&dir).expect("failed to create runtime dir"); + + write_runtime_file(&dir.join(required_runtime_lib_name())); + write_runtime_file(&dir.join("libkrunfw.test")); + + let err = validate_runtime_dir(&dir).expect_err("missing gvproxy should fail"); + match err { + VmError::BinaryNotFound { hint, .. } => { + assert!(hint.contains("missing gvproxy")); + } + other => panic!("unexpected error: {other:?}"), + } + + let _ = fs::remove_dir_all(&dir); + } +} diff --git a/crates/openshell-vm/src/main.rs b/crates/openshell-vm/src/main.rs new file mode 100644 index 00000000..6e84c489 --- /dev/null +++ b/crates/openshell-vm/src/main.rs @@ -0,0 +1,291 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Standalone gateway binary. +//! +//! Boots a libkrun microVM running the OpenShell control plane (k3s + +//! openshell-server). By default it uses the pre-built rootfs at +//! `~/.local/share/openshell/gateway/rootfs`. +//! +//! # Codesigning (macOS) +//! +//! This binary must be codesigned with the `com.apple.security.hypervisor` +//! entitlement. See `entitlements.plist` in this crate. +//! +//! ```sh +//! 
codesign --entitlements crates/openshell-vm/entitlements.plist --force -s - target/debug/gateway +//! ``` + +use std::ffi::OsString; +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; + +use clap::{Parser, Subcommand, ValueHint}; + +/// Boot the OpenShell gateway microVM. +/// +/// Starts a libkrun microVM running a k3s Kubernetes cluster with the +/// OpenShell control plane. Use `--exec` to run a custom process instead. +#[derive(Parser)] +#[command(name = "gateway", version)] +struct Cli { + #[command(subcommand)] + command: Option, + + #[command(flatten)] + run: RunArgs, +} + +#[derive(Subcommand)] +enum GatewayCommand { + /// Run a command with the gateway kubeconfig pre-configured. + /// + /// Examples: + /// gateway exec -- kubectl get pods -A + /// gateway exec -- kubectl -n openshell logs statefulset/openshell + /// gateway exec -- sh + Exec { + /// Command and arguments to run on the host with KUBECONFIG pointing + /// at the VM-backed gateway cluster. + #[arg(trailing_var_arg = true, required = true)] + command: Vec, + }, +} + +#[derive(clap::Args)] +struct RunArgs { + /// Path to the rootfs directory (aarch64 Linux). + /// Defaults to `~/.local/share/openshell/gateway/rootfs`. + #[arg(long, value_hint = ValueHint::DirPath)] + rootfs: Option, + + /// Executable path inside the VM. When set, runs this instead of + /// the default k3s server. + #[arg(long)] + exec: Option, + + /// Arguments to the executable (requires `--exec`). + #[arg(long, num_args = 1..)] + args: Vec, + + /// Environment variables in `KEY=VALUE` form (requires `--exec`). + #[arg(long, num_args = 1..)] + env: Vec, + + /// Working directory inside the VM. + #[arg(long, default_value = "/")] + workdir: String, + + /// Port mappings (`host_port:guest_port`). + #[arg(long, short, num_args = 1..)] + port: Vec, + + /// Number of virtual CPUs (default: 4 for gateway, 2 for --exec). 
+ #[arg(long)] + vcpus: Option, + + /// RAM in MiB (default: 8192 for gateway, 2048 for --exec). + #[arg(long)] + mem: Option, + + /// libkrun log level (0=Off .. 5=Trace). + #[arg(long, default_value_t = 1)] + krun_log_level: u32, + + /// Networking backend: "gvproxy" (default), "tsi", or "none". + #[arg(long, default_value = "gvproxy")] + net: String, +} + +fn main() { + tracing_subscriber::fmt::init(); + + let cli = Cli::parse(); + + let code = match run(cli) { + Ok(code) => code, + Err(e) => { + eprintln!("Error: {e}"); + 1 + } + }; + + if code != 0 { + std::process::exit(code); + } +} + +fn run(cli: Cli) -> Result> { + if let Some(command) = cli.command { + return match command { + GatewayCommand::Exec { command } => exec_with_gateway_kubeconfig(&command), + }; + } + + let cli = cli.run; + + let net_backend = match cli.net.as_str() { + "tsi" => openshell_vm::NetBackend::Tsi, + "none" => openshell_vm::NetBackend::None, + "gvproxy" => openshell_vm::NetBackend::Gvproxy { + binary: openshell_vm::default_runtime_gvproxy_path(), + }, + other => { + return Err( + format!("unknown --net backend: {other} (expected: gvproxy, tsi, none)").into(), + ); + } + }; + + let rootfs = match cli.rootfs { + Some(p) => p, + None => openshell_bootstrap::paths::default_rootfs_dir()?, + }; + + let mut config = if let Some(exec_path) = cli.exec { + openshell_vm::VmConfig { + rootfs, + vcpus: cli.vcpus.unwrap_or(2), + mem_mib: cli.mem.unwrap_or(2048), + exec_path, + args: cli.args, + env: cli.env, + workdir: cli.workdir, + port_map: cli.port, + vsock_ports: vec![], + log_level: cli.krun_log_level, + console_output: None, + net: net_backend.clone(), + } + } else { + let mut c = openshell_vm::VmConfig::gateway(rootfs); + if !cli.port.is_empty() { + c.port_map = cli.port; + } + if let Some(v) = cli.vcpus { + c.vcpus = v; + } + if let Some(m) = cli.mem { + c.mem_mib = m; + } + c.net = net_backend; + c + }; + config.log_level = cli.krun_log_level; + + Ok(openshell_vm::launch(&config)?) 
+} + +fn gateway_kubeconfig_path() -> Result> { + let home = std::env::var("HOME")?; + Ok(PathBuf::from(home).join(".kube").join("gateway.yaml")) +} + +fn workspace_root() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .ancestors() + .nth(2) + .expect("workspace root") + .to_path_buf() +} + +fn openshell_kubectl_wrapper_path() -> PathBuf { + workspace_root().join("scripts/bin/kubectl") +} + +fn is_openshell_kubectl_wrapper(path: &Path) -> bool { + path.canonicalize().ok() == openshell_kubectl_wrapper_path().canonicalize().ok() +} + +fn filtered_path() -> OsString { + let wrapper_dir = openshell_kubectl_wrapper_path() + .parent() + .map(Path::to_path_buf) + .unwrap_or_default(); + let entries = std::env::var_os("PATH") + .map(|path| { + std::env::split_paths(&path) + .filter(|entry| entry != &wrapper_dir) + .collect::>() + }) + .unwrap_or_default(); + + std::env::join_paths(entries).unwrap_or_else(|_| OsString::from("/usr/bin:/bin")) +} + +fn resolve_kubectl_binary() -> Result> { + if let Some(path) = std::env::var_os("OPENSHELL_GATEWAY_KUBECTL") { + return Ok(PathBuf::from(path)); + } + + let path = std::env::var_os("PATH").unwrap_or_default(); + for dir in std::env::split_paths(&path) { + let candidate = dir.join("kubectl"); + if candidate.is_file() && !is_openshell_kubectl_wrapper(&candidate) { + return Ok(candidate); + } + } + + Err( + "could not find a real kubectl binary on PATH; install kubectl or set OPENSHELL_GATEWAY_KUBECTL" + .into(), + ) +} + +fn configure_clean_env(cmd: &mut Command, kubeconfig: &Path) { + cmd.env_clear().env("KUBECONFIG", kubeconfig); + + for key in [ + "HOME", + "TERM", + "COLORTERM", + "NO_COLOR", + "LANG", + "LC_ALL", + "LC_CTYPE", + "TMPDIR", + ] { + if let Some(value) = std::env::var_os(key) { + cmd.env(key, value); + } + } + + cmd.env("PATH", filtered_path()); +} + +fn exec_with_gateway_kubeconfig(command: &[String]) -> Result> { + let kubeconfig = gateway_kubeconfig_path()?; + if !kubeconfig.is_file() { + return 
Err(format!( + "gateway kubeconfig not found: {}\nStart the VM first with `gateway` or `mise run vm`.", + kubeconfig.display() + ) + .into()); + } + + let program = &command[0]; + let mut cmd = if program == "kubectl" { + let mut kubectl = Command::new(resolve_kubectl_binary()?); + let has_kubeconfig = command + .iter() + .skip(1) + .any(|arg| arg == "--kubeconfig" || arg.starts_with("--kubeconfig=")); + if !has_kubeconfig { + kubectl.arg("--kubeconfig").arg(&kubeconfig); + } + kubectl.args(&command[1..]); + kubectl + } else { + let mut other = Command::new(program); + other.args(&command[1..]); + other + }; + + cmd.stdin(Stdio::inherit()) + .stdout(Stdio::inherit()) + .stderr(Stdio::inherit()); + configure_clean_env(&mut cmd, &kubeconfig); + + let status = cmd.status()?; + Ok(status.code().unwrap_or(1)) +} diff --git a/crates/openshell-vm/tests/gateway_integration.rs b/crates/openshell-vm/tests/gateway_integration.rs new file mode 100644 index 00000000..03419021 --- /dev/null +++ b/crates/openshell-vm/tests/gateway_integration.rs @@ -0,0 +1,117 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Integration tests for the standalone `gateway` binary. +//! +//! These tests require: +//! - libkrun installed (e.g. `brew tap slp/krun && brew install libkrun`) +//! - macOS ARM64 with Apple Hypervisor.framework +//! - A pre-built rootfs at `~/.local/share/openshell/gateway/rootfs` +//! +//! All tests are `#[ignore]` — run them explicitly: +//! +//! ```sh +//! cargo test -p openshell-vm --test gateway_integration -- --ignored +//! ``` + +#![allow(unsafe_code)] + +use std::net::{SocketAddr, TcpStream}; +use std::process::{Command, Stdio}; +use std::time::{Duration, Instant}; + +/// Path to the built `gateway` binary (resolved by Cargo at compile time). 
+const GATEWAY: &str = env!("CARGO_BIN_EXE_gateway"); + +// ── Helpers ──────────────────────────────────────────────────────────── + +/// Codesign the binary on macOS so it can access Hypervisor.framework. +fn codesign_if_needed() { + if cfg!(target_os = "macos") { + let entitlements = format!("{}/entitlements.plist", env!("CARGO_MANIFEST_DIR")); + let status = Command::new("codesign") + .args([ + "--entitlements", + &entitlements, + "--force", + "-s", + "-", + GATEWAY, + ]) + .status() + .expect("codesign command failed to execute"); + assert!(status.success(), "failed to codesign gateway binary"); + } +} + +fn assert_runtime_bundle_staged() { + let bundle_dir = std::path::Path::new(GATEWAY) + .parent() + .expect("gateway binary has no parent") + .join("gateway.runtime"); + assert!( + bundle_dir.is_dir(), + "gateway.runtime is missing next to the test binary: {}. Run `mise run vm:bundle-runtime` first.", + bundle_dir.display() + ); +} + +// ── Tests ────────────────────────────────────────────────────────────── + +/// Boot the full OpenShell gateway and verify the gRPC service becomes +/// reachable on port 30051. +#[test] +#[ignore] // requires libkrun + rootfs +fn gateway_boots_and_service_becomes_reachable() { + codesign_if_needed(); + assert_runtime_bundle_staged(); + + let mut cmd = Command::new(GATEWAY); + cmd.stdout(Stdio::null()).stderr(Stdio::piped()); + + let mut child = cmd.spawn().expect("failed to start gateway"); + + // Poll for the OpenShell gRPC service. + let addr: SocketAddr = ([127, 0, 0, 1], 30051).into(); + let timeout = Duration::from_secs(180); + let start = Instant::now(); + let mut reachable = false; + + while start.elapsed() < timeout { + if TcpStream::connect_timeout(&addr, Duration::from_secs(1)).is_ok() { + reachable = true; + break; + } + std::thread::sleep(Duration::from_secs(2)); + } + + // Tear down regardless of result. 
+ let _ = unsafe { libc::kill(child.id() as i32, libc::SIGTERM) }; + let _ = child.wait(); + + assert!( + reachable, + "gateway service on port 30051 not reachable within {timeout:?}" + ); +} + +/// Run a trivial command inside the VM via `--exec` and verify it exits +/// successfully, proving the VM boots and can execute guest processes. +#[test] +#[ignore] // requires libkrun + rootfs +fn gateway_exec_runs_guest_command() { + codesign_if_needed(); + assert_runtime_bundle_staged(); + + let mut cmd = Command::new(GATEWAY); + cmd.args(["--exec", "/bin/true"]); + + let output = cmd.output().expect("failed to run gateway --exec"); + + assert!( + output.status.success(), + "gateway --exec /bin/true failed with status {:?}\nstderr: {}", + output.status, + String::from_utf8_lossy(&output.stderr), + ); +} diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index 83ece499..ee0b38e0 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -26,11 +26,16 @@ spec: {{- end }} spec: terminationGracePeriodSeconds: {{ .Values.podLifecycle.terminationGracePeriodSeconds }} + {{- if .Values.hostNetwork }} + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + {{- end }} {{- with .Values.imagePullSecrets }} imagePullSecrets: {{- toYaml . | nindent 8 }} {{- end }} serviceAccountName: {{ include "openshell.serviceAccountName" . 
}} + automountServiceAccountToken: {{ .Values.automountServiceAccountToken }} {{- if .Values.server.hostGatewayIP }} hostAliases: - ip: {{ .Values.server.hostGatewayIP | quote }} @@ -94,10 +99,16 @@ spec: - name: OPENSHELL_DISABLE_GATEWAY_AUTH value: "true" {{- end }} + {{- if and (not .Values.automountServiceAccountToken) .Values.kubeconfig.hostPath }} + - name: KUBECONFIG + value: /etc/openshell/kubeconfig + {{- end }} {{- end }} volumeMounts: + {{- if .Values.persistence.enabled }} - name: openshell-data mountPath: /var/openshell + {{- end }} {{- if not .Values.server.disableTls }} - name: tls-cert mountPath: /etc/openshell-tls/server @@ -105,6 +116,12 @@ spec: - name: tls-client-ca mountPath: /etc/openshell-tls/client-ca readOnly: true + {{- if and (not .Values.automountServiceAccountToken) .Values.kubeconfig.hostPath }} + - name: kubeconfig + mountPath: /etc/openshell/kubeconfig + subPath: k3s.yaml + readOnly: true + {{- end }} {{- end }} ports: - name: grpc @@ -134,6 +151,16 @@ spec: - name: tls-client-ca secret: secretName: {{ .Values.server.tls.clientCaSecretName }} + {{- if not .Values.persistence.enabled }} + - name: openshell-data + emptyDir: {} + {{- end }} + {{- if and (not .Values.automountServiceAccountToken) .Values.kubeconfig.hostPath }} + - name: kubeconfig + hostPath: + path: {{ .Values.kubeconfig.hostPath }} + type: Directory + {{- end }} {{- end }} {{- with .Values.nodeSelector }} nodeSelector: @@ -147,6 +174,7 @@ spec: tolerations: {{- toYaml . 
| nindent 8 }} {{- end }} + {{- if .Values.persistence.enabled }} volumeClaimTemplates: - metadata: name: openshell-data @@ -155,3 +183,4 @@ spec: resources: requests: storage: 1Gi + {{- end }} diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index 2691fc48..2ff42a49 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -19,6 +19,19 @@ serviceAccount: annotations: {} name: "" +# Whether to auto-mount the ServiceAccount token into the pod. Disabled +# in microVM gateway mode because the projected volume mount at +# /var/run/secrets/kubernetes.io/serviceaccount hits a containerd +# native-snapshotter + virtiofs incompatibility on sandbox re-creation. +automountServiceAccountToken: true + +# When automountServiceAccountToken is false, the OpenShell gateway needs +# a kubeconfig to reach the API server. Point this to the directory +# containing the k3s kubeconfig (k3s.yaml). Only used when +# automountServiceAccountToken is false. +kubeconfig: + hostPath: "" + podAnnotations: {} podLabels: {} @@ -56,6 +69,19 @@ probes: resources: {} +# Persistent storage for the OpenShell database. When disabled, an +# emptyDir volume is used instead of a PVC. This is useful in microVM +# environments where overlayfs-on-virtiofs doesn't support PVC mounts +# reliably. +persistence: + enabled: true + +# Run the pod directly on the host network. Useful in microVM +# environments where kube-proxy is unavailable (no iptables). +# When true, the pod binds to the VM's eth0 and NodePort is +# unnecessary — gvproxy forwards host ports to the pod directly. 
+hostNetwork: false + nodeSelector: {} tolerations: [] diff --git a/deploy/kube/manifests/openshell-helmchart.yaml b/deploy/kube/manifests/openshell-helmchart.yaml index 2245c72e..cf07bf00 100644 --- a/deploy/kube/manifests/openshell-helmchart.yaml +++ b/deploy/kube/manifests/openshell-helmchart.yaml @@ -28,11 +28,18 @@ spec: repository: ghcr.io/nvidia/openshell/gateway tag: latest pullPolicy: Always + hostNetwork: __HOST_NETWORK__ + automountServiceAccountToken: __AUTOMOUNT_SA_TOKEN__ + kubeconfig: + hostPath: __KUBECONFIG_HOST_PATH__ + persistence: + enabled: __PERSISTENCE_ENABLED__ server: sandboxImage: ghcr.io/nvidia/openshell-community/sandboxes/base:latest sshGatewayHost: __SSH_GATEWAY_HOST__ sshGatewayPort: __SSH_GATEWAY_PORT__ sshHandshakeSecret: __SSH_HANDSHAKE_SECRET__ + dbUrl: __DB_URL__ grpcEndpoint: "https://openshell.openshell.svc.cluster.local:8080" hostGatewayIP: __HOST_GATEWAY_IP__ disableGatewayAuth: __DISABLE_GATEWAY_AUTH__ diff --git a/scripts/bin/gateway b/scripts/bin/gateway new file mode 100755 index 00000000..8438dfdf --- /dev/null +++ b/scripts/bin/gateway @@ -0,0 +1,25 @@ +#!/usr/bin/env bash +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" +BINARY="$PROJECT_ROOT/target/debug/gateway" + +cargo build --package openshell-vm --bin gateway --quiet + +# On macOS, codesign with the hypervisor entitlement so libkrun can use +# Apple's Hypervisor.framework. Re-sign after every build. +ENTITLEMENTS="$PROJECT_ROOT/crates/openshell-vm/entitlements.plist" +if [[ "$(uname)" == "Darwin" ]] && [[ -f "$ENTITLEMENTS" ]]; then + codesign --entitlements "$ENTITLEMENTS" --force -s - "$BINARY" 2>/dev/null +fi + +# Ensure libkrunfw is discoverable by libkrun's dlopen on macOS. +# dyld only reads DYLD_FALLBACK_LIBRARY_PATH at process startup, so we +# set it here before exec. 
+if [[ "$(uname)" == "Darwin" ]]; then + HOMEBREW_LIB="$(brew --prefix 2>/dev/null || echo /opt/homebrew)/lib" + export DYLD_FALLBACK_LIBRARY_PATH="${HOMEBREW_LIB}${DYLD_FALLBACK_LIBRARY_PATH:+:$DYLD_FALLBACK_LIBRARY_PATH}" +fi + +exec "$BINARY" "$@" diff --git a/scripts/bin/openshell b/scripts/bin/openshell index 8b8a9c21..19a55c2e 100755 --- a/scripts/bin/openshell +++ b/scripts/bin/openshell @@ -90,6 +90,8 @@ fi if [[ "$needs_build" == "1" ]]; then echo "Recompiling openshell..." >&2 cargo build --package openshell-cli --quiet + + # Persist state after successful build mkdir -p "$(dirname "$STATE_FILE")" cd "$PROJECT_ROOT" diff --git a/tasks/rust.toml b/tasks/rust.toml index 69214ce7..dfa4068f 100644 --- a/tasks/rust.toml +++ b/tasks/rust.toml @@ -5,12 +5,12 @@ ["rust:check"] description = "Check all Rust crates for errors" -run = "cargo check --workspace" +run = "cargo check --workspace --exclude openshell-vm" hide = true ["rust:lint"] description = "Lint Rust code with Clippy" -run = "cargo clippy --workspace --all-targets" +run = "cargo clippy --workspace --all-targets --exclude openshell-vm" hide = true ["rust:format"] diff --git a/tasks/scripts/bundle-vm-runtime.sh b/tasks/scripts/bundle-vm-runtime.sh new file mode 100755 index 00000000..f391363a --- /dev/null +++ b/tasks/scripts/bundle-vm-runtime.sh @@ -0,0 +1,114 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +if [ "$(uname -s)" != "Darwin" ]; then + echo "vm:bundle-runtime currently supports macOS only" >&2 + exit 1 +fi + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" +LIB_DIR="${OPENSHELL_VM_RUNTIME_SOURCE_DIR:-}" +GVPROXY_BIN="${OPENSHELL_VM_GVPROXY:-}" + +if [ -z "$LIB_DIR" ]; then + BREW_PREFIX="$(brew --prefix 2>/dev/null || true)" + if [ -n "$BREW_PREFIX" ]; then + LIB_DIR="${BREW_PREFIX}/lib" + else + LIB_DIR="/opt/homebrew/lib" + fi +fi + +if [ -z "$GVPROXY_BIN" ]; then + if command -v gvproxy >/dev/null 2>&1; then + GVPROXY_BIN="$(command -v gvproxy)" + elif [ -x /opt/homebrew/bin/gvproxy ]; then + GVPROXY_BIN="/opt/homebrew/bin/gvproxy" + elif [ -x /opt/podman/bin/gvproxy ]; then + GVPROXY_BIN="/opt/podman/bin/gvproxy" + else + echo "gvproxy not found; set OPENSHELL_VM_GVPROXY or install gvproxy" >&2 + exit 1 + fi +fi + +# libkrun.dylib: prefer the custom runtime dir, fall back to Homebrew. +# libkrun is the VMM and does not need a custom build; only libkrunfw +# carries the custom kernel. +LIBKRUN="${LIB_DIR}/libkrun.dylib" +if [ ! -e "$LIBKRUN" ]; then + BREW_PREFIX="${BREW_PREFIX:-$(brew --prefix 2>/dev/null || true)}" + if [ -n "$BREW_PREFIX" ] && [ -e "${BREW_PREFIX}/lib/libkrun.dylib" ]; then + LIBKRUN="${BREW_PREFIX}/lib/libkrun.dylib" + echo "using Homebrew libkrun at ${LIBKRUN}" + else + echo "libkrun not found at ${LIB_DIR}/libkrun.dylib or Homebrew; install libkrun or set OPENSHELL_VM_RUNTIME_SOURCE_DIR" >&2 + exit 1 + fi +fi + +KRUNFW_FILES=() +while IFS= read -r line; do + KRUNFW_FILES+=("$line") +done < <(find "$LIB_DIR" -maxdepth 1 \( -type f -o -type l \) \( -name 'libkrunfw.dylib' -o -name 'libkrunfw.*.dylib' \) | sort -u) + +if [ "${#KRUNFW_FILES[@]}" -eq 0 ]; then + echo "libkrunfw not found under ${LIB_DIR}; set OPENSHELL_VM_RUNTIME_SOURCE_DIR" >&2 + exit 1 +fi + +# Check for provenance.json (custom runtime indicator) +PROVENANCE_FILE="${LIB_DIR}/provenance.json" +IS_CUSTOM="false" +if [ -f "$PROVENANCE_FILE" ]; then + IS_CUSTOM="true" + echo "custom runtime detected (provenance.json present)" +fi + +TARGETS=( + "${ROOT}/target/debug" + "${ROOT}/target/release" + 
"${ROOT}/target/aarch64-apple-darwin/debug" + "${ROOT}/target/aarch64-apple-darwin/release" +) + +for target_dir in "${TARGETS[@]}"; do + runtime_dir="${target_dir}/gateway.runtime" + mkdir -p "$runtime_dir" + + install -m 0644 "$LIBKRUN" "${runtime_dir}/libkrun.dylib" + install -m 0755 "$GVPROXY_BIN" "${runtime_dir}/gvproxy" + for krunfw in "${KRUNFW_FILES[@]}"; do + install -m 0644 "$krunfw" "${runtime_dir}/$(basename "$krunfw")" + done + + # Copy provenance.json if this is a custom runtime. + if [ "$IS_CUSTOM" = "true" ] && [ -f "$PROVENANCE_FILE" ]; then + install -m 0644 "$PROVENANCE_FILE" "${runtime_dir}/provenance.json" + fi + + manifest_entries=() + manifest_entries+=(' "libkrun.dylib"') + manifest_entries+=(' "gvproxy"') + for krunfw in "${KRUNFW_FILES[@]}"; do + manifest_entries+=(" \"$(basename "$krunfw")\"") + done + if [ "$IS_CUSTOM" = "true" ]; then + manifest_entries+=(' "provenance.json"') + fi + + cat > "${runtime_dir}/manifest.json" <&2 + exit 1 +fi + +if [ ! -d "${TARGET_DIR}/gateway.runtime" ]; then + echo "target/release/gateway.runtime not found; run mise run vm:bundle-runtime first" >&2 + exit 1 +fi + +mkdir -p "${ARTIFACT_DIR}" +tar -czf "${ARTIFACT_DIR}/gateway-aarch64-apple-darwin.tar.gz" \ + -C "${TARGET_DIR}" \ + gateway \ + gateway.runtime + +ls -lh "${ARTIFACT_DIR}/gateway-aarch64-apple-darwin.tar.gz" diff --git a/tasks/scripts/run-vm.sh b/tasks/scripts/run-vm.sh new file mode 100755 index 00000000..229fd91c --- /dev/null +++ b/tasks/scripts/run-vm.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" +RUNTIME_DIR="${ROOT}/target/debug/gateway.runtime" +GATEWAY_BIN="${ROOT}/target/debug/gateway" + +if [ "$(uname -s)" = "Darwin" ]; then + export DYLD_FALLBACK_LIBRARY_PATH="${RUNTIME_DIR}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}" +fi + +exec "${GATEWAY_BIN}" "$@" diff --git a/tasks/test.toml b/tasks/test.toml index f53f9152..791ca2aa 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -17,7 +17,7 @@ depends = ["e2e:python:gpu"] ["test:rust"] description = "Run Rust tests" -run = "cargo test --workspace" +run = "cargo test --workspace --exclude openshell-vm" hide = true ["test:python"] @@ -43,3 +43,9 @@ description = "Run Python GPU e2e tests" depends = ["python:proto", "cluster"] env = { UV_NO_SYNC = "1", PYTHONPATH = "python" } run = "uv run pytest -o python_files='test_*.py' -m gpu -n ${E2E_PARALLEL:-1} e2e/python" + +["e2e:vm"] +description = "Run e2e tests against a gateway VM (macOS ARM64)" +depends = ["python:proto"] +env = { UV_NO_SYNC = "1", PYTHONPATH = "python", OPENSHELL_GATEWAY = "gateway" } +run = "uv run pytest -o python_files='test_*.py' e2e/python" diff --git a/tasks/vm.toml b/tasks/vm.toml new file mode 100644 index 00000000..7f8c3013 --- /dev/null +++ b/tasks/vm.toml @@ -0,0 +1,72 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# openshell-vm development helpers + +[vm] +description = "Build and run the standalone gateway microVM" +run = [ + "mise run vm:build:binary", + "tasks/scripts/bundle-vm-runtime.sh", + "tasks/scripts/ensure-vm-rootfs.sh", + "tasks/scripts/codesign-gateway.sh", + "tasks/scripts/run-vm.sh", +] +hide = false + +["vm:build"] +description = "Force a fresh gateway rebuild, including the rootfs" +run = [ + "mise run vm:build:binary", + "tasks/scripts/bundle-vm-runtime.sh", + "OPENSHELL_VM_FORCE_ROOTFS_REBUILD=1 tasks/scripts/ensure-vm-rootfs.sh", + "tasks/scripts/codesign-gateway.sh", +] +hide = false + +["vm:build:binary"] +description = "Build the standalone gateway binary" +run = "cargo build -p openshell-vm" +hide = true + +["vm:build:release"] +description = "Build the standalone gateway binary in release mode" +run = "cargo build -p openshell-vm --release" +hide = true + +["vm:rootfs"] +description = "Build the default gateway rootfs if needed" +run = "tasks/scripts/ensure-vm-rootfs.sh" +hide = true + +["vm:codesign"] +description = "Codesign the gateway binary for Hypervisor.framework access on macOS" +depends = ["vm:build:binary"] +run = "tasks/scripts/codesign-gateway.sh" +hide = true + +["vm:bundle-runtime"] +description = "Stage the gateway sidecar runtime bundle next to local build outputs" +run = "tasks/scripts/bundle-vm-runtime.sh" +hide = false + +["vm:build-custom-runtime"] +description = "Build a custom libkrunfw with bridge/netfilter kernel support" +run = "crates/openshell-vm/runtime/build-custom-libkrunfw.sh" +hide = false + +["vm:verify"] +description = "Run the VM verification matrix against a running gateway" +run = "crates/openshell-vm/scripts/verify-vm.sh" +hide = false + +["vm:check-capabilities"] +description = "Check VM kernel capabilities (run inside the VM)" +run = "echo 'This script must be run inside the VM. 
Copy it to the rootfs or exec into a running VM.'" +hide = false + +["vm:package:gateway"] +description = "Package the gateway binary with its sidecar runtime bundle" +run = "tasks/scripts/package-gateway-runtime.sh" +depends = ["vm:build:release", "vm:bundle-runtime"] +hide = false